// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package kernel

import (
	gocontext "context"
	"runtime/trace"
	"sync/atomic"

	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/bpf"
	"gvisor.dev/gvisor/pkg/errors/linuxerr"
	"gvisor.dev/gvisor/pkg/hostarch"
	"gvisor.dev/gvisor/pkg/sentry/fs"
	"gvisor.dev/gvisor/pkg/sentry/inet"
	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
	"gvisor.dev/gvisor/pkg/sentry/kernel/futex"
	"gvisor.dev/gvisor/pkg/sentry/kernel/sched"
	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
	"gvisor.dev/gvisor/pkg/sentry/platform"
	"gvisor.dev/gvisor/pkg/sentry/seccheck"
	"gvisor.dev/gvisor/pkg/sentry/usage"
	"gvisor.dev/gvisor/pkg/sentry/vfs"
	"gvisor.dev/gvisor/pkg/sync"
	"gvisor.dev/gvisor/pkg/waiter"
)

// Task represents a thread of execution in the untrusted app.  It
// includes registers and any thread-specific state that you would
// normally expect.
//
// Each task is associated with a goroutine, called the task goroutine, that
// executes code (application code, system calls, etc.) on behalf of that task.
// See Task.run (task_run.go).
//
// All fields that are "owned by the task goroutine" can only be mutated by the
// task goroutine while it is running. The task goroutine does not require
// synchronization to read these fields, although it still requires
// synchronization as described for those fields to mutate them.
//
// All fields that are "exclusive to the task goroutine" can only be accessed
// by the task goroutine while it is running. The task goroutine does not
// require synchronization to read or write these fields.
//
// +stateify savable
type Task struct {
	taskNode

	// goid is the task goroutine's ID. goid is owned by the task goroutine,
	// but since it's used to detect cases where non-task goroutines
	// incorrectly access state owned by, or exclusive to, the task goroutine,
	// goid is always accessed using atomic memory operations.
	goid int64 `state:"nosave"`

	// runState is what the task goroutine is executing if it is not stopped.
	// If runState is nil, the task goroutine should exit or has exited.
	// runState is exclusive to the task goroutine.
	runState taskRunState

	// taskWorkCount represents the current size of the task work queue. It is
	// used to avoid acquiring taskWorkMu when the queue is empty.
	//
	// Must be accessed with atomic memory operations.
	taskWorkCount int32

	// taskWorkMu protects taskWork.
	taskWorkMu sync.Mutex `state:"nosave"`

	// taskWork is a queue of work to be executed before resuming user execution.
	// It is similar to the task_work mechanism in Linux.
	//
	// taskWork is exclusive to the task goroutine.
	taskWork []TaskWorker

	// haveSyscallReturn is true if image.Arch().Return() represents a value
	// returned by a syscall (or set by ptrace after a syscall).
	//
	// haveSyscallReturn is exclusive to the task goroutine.
	haveSyscallReturn bool

	// interruptChan is notified whenever the task goroutine is interrupted
	// (usually by a pending signal). interruptChan is effectively a condition
	// variable that can be used in select statements.
	//
	// interruptChan is not saved; because saving interrupts all tasks,
	// interruptChan is always notified after restore (see Task.run).
	interruptChan chan struct{} `state:"nosave"`

	// gosched contains the current scheduling state of the task goroutine.
	//
	// gosched is protected by goschedSeq. gosched is owned by the task
	// goroutine.
	goschedSeq sync.SeqCount `state:"nosave"`
	gosched    TaskGoroutineSchedInfo

	// yieldCount is the number of times the task goroutine has called
	// Task.InterruptibleSleepStart, Task.UninterruptibleSleepStart, or
	// Task.Yield(), voluntarily ceasing execution.
	//
	// yieldCount is accessed using atomic memory operations. yieldCount is
	// owned by the task goroutine.
	yieldCount uint64

	// pendingSignals is the set of pending signals that may be handled only by
	// this task.
	//
	// pendingSignals is protected by (taskNode.)tg.signalHandlers.mu
	// (hereafter "the signal mutex"); see comment on
	// ThreadGroup.signalHandlers.
	pendingSignals pendingSignals

	// signalMask is the set of signals whose delivery is currently blocked.
	//
	// signalMask is accessed using atomic memory operations, and is protected
	// by the signal mutex (such that reading signalMask is safe if either the
	// signal mutex is locked or if atomic memory operations are used, while
	// writing signalMask requires both). signalMask is owned by the task
	// goroutine.
	signalMask linux.SignalSet

	// If the task goroutine is currently executing Task.sigtimedwait,
	// realSignalMask is the previous value of signalMask, which has temporarily
	// been replaced by Task.sigtimedwait. Otherwise, realSignalMask is 0.
	//
	// realSignalMask is exclusive to the task goroutine.
	realSignalMask linux.SignalSet

	// If haveSavedSignalMask is true, savedSignalMask is the signal mask that
	// should be applied after the task has either delivered one signal to a
	// user handler or is about to resume execution in the untrusted
	// application.
	//
	// Both haveSavedSignalMask and savedSignalMask are exclusive to the task
	// goroutine.
	haveSavedSignalMask bool
	savedSignalMask     linux.SignalSet

	// signalStack is the alternate signal stack used by signal handlers for
	// which the SA_ONSTACK flag is set.
	//
	// signalStack is exclusive to the task goroutine.
	signalStack linux.SignalStack

	// signalQueue is a set of registered waiters for signal-related events.
	//
	// signalQueue is protected by the signal mutex. Note that the task does
	// not implement all queue methods, specifically the readiness checks.
	// The task only broadcasts a notification on signal delivery.
	signalQueue waiter.Queue `state:"zerovalue"`

	// If groupStopPending is true, the task should participate in a group
	// stop in the interrupt path.
	//
	// groupStopPending is analogous to JOBCTL_STOP_PENDING in Linux.
	//
	// groupStopPending is protected by the signal mutex.
	groupStopPending bool

	// If groupStopAcknowledged is true, the task has already acknowledged that
	// it is entering the most recent group stop that has been initiated on its
	// thread group.
	//
	// groupStopAcknowledged is analogous to !JOBCTL_STOP_CONSUME in Linux.
	//
	// groupStopAcknowledged is protected by the signal mutex.
	groupStopAcknowledged bool

	// If trapStopPending is true, the task goroutine should enter a
	// PTRACE_INTERRUPT-induced stop from the interrupt path.
	//
	// trapStopPending is analogous to JOBCTL_TRAP_STOP in Linux, except that
	// Linux also sets JOBCTL_TRAP_STOP when a ptraced task detects
	// JOBCTL_STOP_PENDING.
	//
	// trapStopPending is protected by the signal mutex.
	trapStopPending bool

	// If trapNotifyPending is true, this task is PTRACE_SEIZEd, and a group
	// stop has begun or ended since the last time the task entered a
	// ptrace-stop from the group-stop path.
	//
	// trapNotifyPending is analogous to JOBCTL_TRAP_NOTIFY in Linux.
	//
	// trapNotifyPending is protected by the signal mutex.
	trapNotifyPending bool

	// If stop is not nil, it is the internally-initiated condition that
	// currently prevents the task goroutine from running.
	//
	// stop is protected by the signal mutex.
	stop TaskStop

	// stopCount is the number of active external stops (calls to
	// Task.BeginExternalStop that have not been paired with a call to
	// Task.EndExternalStop), plus 1 if stop is not nil. Hence stopCount is
	// non-zero if the task goroutine should stop.
	//
	// Mutating stopCount requires both locking the signal mutex and using
	// atomic memory operations. Reading stopCount requires either locking the
	// signal mutex or using atomic memory operations. This allows Task.doStop
	// to require only a single atomic read in the common case where stopCount
	// is 0.
	//
	// stopCount is not saved, because external stops cannot be retained across
	// a save/restore cycle. (Suppose a sentryctl command issues an external
	// stop; after a save/restore cycle, the restored sentry has no knowledge
	// of the pre-save sentryctl command, and the stopped task would remain
	// stopped forever.)
	stopCount int32 `state:"nosave"`

	// endStopCond is signaled when stopCount transitions to 0. The combination
	// of stopCount and endStopCond effectively form a sync.WaitGroup, but
	// WaitGroup provides no way to read its counter value.
	//
	// Invariant: endStopCond.L is the signal mutex. (This is not racy because
	// sync.Cond.Wait is the only user of sync.Cond.L; only the task goroutine
	// calls sync.Cond.Wait; and only the task goroutine can change the
	// identity of the signal mutex, in Task.finishExec.)
	endStopCond sync.Cond `state:"nosave"`

	// exitStatus is the task's exit status.
	//
	// exitStatus is protected by the signal mutex.
	exitStatus linux.WaitStatus

	// syscallRestartBlock represents a custom restart function to run in
	// restart_syscall(2) to resume an interrupted syscall.
	//
	// syscallRestartBlock is exclusive to the task goroutine.
	syscallRestartBlock SyscallRestartBlock

	// p provides the mechanism by which the task runs code in userspace. The p
	// interface object is immutable.
	p platform.Context `state:"nosave"`

	// k is the Kernel that this task belongs to. The k pointer is immutable.
	k *Kernel

	// containerID has no equivalent in Linux; it's used by runsc to track all
	// tasks that belong to a given container since cgroups aren't implemented.
	// It's inherited by children, is immutable, and may be empty.
	//
	// NOTE: cgroups can be used to track this when implemented.
	containerID string

	// mu protects some of the following fields.
	mu sync.Mutex `state:"nosave"`

	// image holds task data provided by the ELF loader.
	//
	// image is protected by mu, and is owned by the task goroutine.
	image TaskImage

	// fsContext is the task's filesystem context.
	//
	// fsContext is protected by mu, and is owned by the task goroutine.
	fsContext *FSContext

	// fdTable is the task's file descriptor table.
	//
	// fdTable is protected by mu, and is owned by the task goroutine.
	fdTable *FDTable

	// If vforkParent is not nil, it is the task that created this task with
	// vfork() or clone(CLONE_VFORK), and should have its vforkStop ended when
	// this TaskImage is released.
	//
	// vforkParent is protected by the TaskSet mutex.
	vforkParent *Task

	// exitState is the task's progress through the exit path.
	//
	// exitState is protected by the TaskSet mutex. exitState is owned by the
	// task goroutine.
	exitState TaskExitState

	// exitTracerNotified is true if the exit path has either signaled the
	// task's tracer to indicate the exit, or determined that no such signal is
	// needed. exitTracerNotified can only be true if exitState is
	// TaskExitZombie or TaskExitDead.
	//
	// exitTracerNotified is protected by the TaskSet mutex.
	exitTracerNotified bool

	// exitTracerAcked is true if exitTracerNotified is true and either the
	// task's tracer has acknowledged the exit notification, or the exit path
	// has determined that no such notification is needed.
	//
	// exitTracerAcked is protected by the TaskSet mutex.
	exitTracerAcked bool

	// exitParentNotified is true if the exit path has either signaled the
	// task's parent to indicate the exit, or determined that no such signal is
	// needed. exitParentNotified can only be true if exitState is
	// TaskExitZombie or TaskExitDead.
	//
	// exitParentNotified is protected by the TaskSet mutex.
	exitParentNotified bool

	// exitParentAcked is true if exitParentNotified is true and either the
	// task's parent has acknowledged the exit notification, or the exit path
	// has determined that no such acknowledgment is needed.
	//
	// exitParentAcked is protected by the TaskSet mutex.
	exitParentAcked bool

	// goroutineStopped is a WaitGroup whose counter value is 1 when the task
	// goroutine is running and 0 when the task goroutine is stopped or has
	// exited.
	goroutineStopped sync.WaitGroup `state:"nosave"`

	// ptraceTracer is the task that is ptrace-attached to this one. If
	// ptraceTracer is nil, this task is not being traced. Note that due to
	// atomic.Value limitations (atomic.Value.Store(nil) panics), a nil
	// ptraceTracer is always represented as a typed nil (i.e. (*Task)(nil)).
	//
	// ptraceTracer is protected by the TaskSet mutex, and accessed with atomic
	// operations. This allows paths that wouldn't otherwise lock the TaskSet
	// mutex, notably the syscall path, to check if ptraceTracer is nil without
	// additional synchronization.
	ptraceTracer atomic.Value `state:".(*Task)"`

	// ptraceTracees is the set of tasks that this task is ptrace-attached to.
	//
	// ptraceTracees is protected by the TaskSet mutex.
	ptraceTracees map[*Task]struct{}

	// ptraceSeized is true if ptraceTracer attached to this task with
	// PTRACE_SEIZE.
	//
	// ptraceSeized is protected by the TaskSet mutex.
	ptraceSeized bool

	// ptraceOpts contains ptrace options explicitly set by the tracer. If
	// ptraceTracer is nil, ptraceOpts is expected to be the zero value.
	//
	// ptraceOpts is protected by the TaskSet mutex.
	ptraceOpts ptraceOptions

	// ptraceSyscallMode controls ptrace behavior around syscall entry and
	// exit.
	//
	// ptraceSyscallMode is protected by the TaskSet mutex.
	ptraceSyscallMode ptraceSyscallMode

	// If ptraceSinglestep is true, the next time the task executes application
	// code, single-stepping should be enabled. ptraceSinglestep is stored
	// independently of the architecture-specific trap flag because tracer
	// detaching (which can happen concurrently with the tracee's execution if
	// the tracer exits) must disable single-stepping, and the task's
	// architectural state is implicitly exclusive to the task goroutine (no
	// synchronization occurs before passing registers to SwitchToApp).
	//
	// ptraceSinglestep is analogous to Linux's TIF_SINGLESTEP.
	//
	// ptraceSinglestep is protected by the TaskSet mutex.
	ptraceSinglestep bool

	// If t is ptrace-stopped, ptraceCode is a ptrace-defined value set at the
	// time that t entered the ptrace stop, reset to 0 when the tracer
	// acknowledges the stop with a wait*() syscall. Otherwise, it is the
	// signal number passed to the ptrace operation that ended the last ptrace
	// stop on this task. In the latter case, the effect of ptraceCode depends
	// on the nature of the ptrace stop; signal-delivery-stop uses it to
	// conditionally override ptraceSiginfo, syscall-entry/exit-stops send the
	// signal to the task after leaving the stop, and PTRACE_EVENT stops and
	// traced group stops ignore it entirely.
	//
	// Linux contextually stores the equivalent of ptraceCode in
	// task_struct::exit_code.
	//
	// ptraceCode is protected by the TaskSet mutex.
	ptraceCode int32

	// ptraceSiginfo is the value returned to the tracer by
	// ptrace(PTRACE_GETSIGINFO) and modified by ptrace(PTRACE_SETSIGINFO).
	// (Despite the name, PTRACE_PEEKSIGINFO is completely unrelated.)
	// ptraceSiginfo is nil if the task is in a ptraced group-stop (this is
	// required for PTRACE_GETSIGINFO to return EINVAL during such stops, which
	// is in turn required to distinguish group stops from other ptrace stops,
	// per subsection "Group-stop" in ptrace(2)).
	//
	// ptraceSiginfo is analogous to Linux's task_struct::last_siginfo.
	//
	// ptraceSiginfo is protected by the TaskSet mutex.
	ptraceSiginfo *linux.SignalInfo

	// ptraceEventMsg is the value set by PTRACE_EVENT stops and returned to
	// the tracer by ptrace(PTRACE_GETEVENTMSG).
	//
	// ptraceEventMsg is protected by the TaskSet mutex.
	ptraceEventMsg uint64

	// ptraceYAMAExceptionAdded is true if a YAMA exception involving the task has
	// been added before. This is used during task exit to decide whether we need
	// to clean up YAMA exceptions.
	//
	// ptraceYAMAExceptionAdded is protected by the TaskSet mutex.
	ptraceYAMAExceptionAdded bool

	// The struct that holds the IO-related usage. The ioUsage pointer is
	// immutable.
	ioUsage *usage.IO

	// logPrefix is a string containing the task's thread ID in the root PID
	// namespace, and is prepended to log messages emitted by Task.Infof etc.
	logPrefix atomic.Value `state:"nosave"`

	// traceContext and traceTask are both used for tracing, and are
	// updated along with the logPrefix in updateInfoLocked.
	//
	// These are exclusive to the task goroutine.
	traceContext gocontext.Context `state:"nosave"`
	traceTask    *trace.Task       `state:"nosave"`

	// creds is the task's credentials.
	//
	// creds.Load() may be called without synchronization. creds.Store() is
	// serialized by mu. creds is owned by the task goroutine. All
	// auth.Credentials objects that creds may point to, or have pointed to
	// in the past, must be treated as immutable.
	creds auth.AtomicPtrCredentials

	// utsns is the task's UTS namespace.
	//
	// utsns is protected by mu. utsns is owned by the task goroutine.
	utsns *UTSNamespace

	// ipcns is the task's IPC namespace.
	//
	// ipcns is protected by mu. ipcns is owned by the task goroutine.
	ipcns *IPCNamespace

	// abstractSockets tracks abstract sockets that are in use.
	//
	// abstractSockets is protected by mu.
	abstractSockets *AbstractSocketNamespace

	// mountNamespaceVFS2 is the task's mount namespace.
	//
	// It is protected by mu. It is owned by the task goroutine.
	mountNamespaceVFS2 *vfs.MountNamespace

	// parentDeathSignal is sent to this task's thread group when its parent exits.
	//
	// parentDeathSignal is protected by mu.
	parentDeathSignal linux.Signal

	// syscallFilters is all seccomp-bpf syscall filters applicable to the
	// task, in the order in which they were installed. The type of the atomic
	// is []bpf.Program. Writing needs to be protected by the signal mutex.
	//
	// syscallFilters is owned by the task goroutine.
	syscallFilters atomic.Value `state:".([]bpf.Program)"`

	// If cleartid is non-zero, treat it as a pointer to a ThreadID in the
	// task's virtual address space; when the task exits, set the pointed-to
	// ThreadID to 0, and wake any futex waiters.
	//
	// cleartid is exclusive to the task goroutine.
	cleartid hostarch.Addr

	// This is mostly a fake cpumask just for sched_set/getaffinity as we
	// don't really control the affinity.
	//
	// Invariant: allowedCPUMask.Size() ==
	// sched.CPUMaskSize(Kernel.applicationCores).
	//
	// allowedCPUMask is protected by mu.
	allowedCPUMask sched.CPUSet

	// cpu is the fake cpu number returned by getcpu(2). cpu is ignored
	// entirely if Kernel.useHostCores is true.
	//
	// cpu is accessed using atomic memory operations.
	cpu int32

	// This is used to keep track of changes made to a process' priority/niceness.
	// It is mostly used to provide some reasonable return value from
	// getpriority(2) after a call to setpriority(2) has been made.
	// We currently do not actually modify a process' scheduling priority.
	// NOTE: This represents the userspace view of priority (nice).
	// This means that the value should be in the range [-20, 19].
	//
	// niceness is protected by mu.
	niceness int

	// This is used to track the numa policy for the current thread. This can be
	// modified through a set_mempolicy(2) syscall. Since we always report a
	// single numa node, all policies are no-ops. We only track this information
	// so that we can return reasonable values if the application calls
	// get_mempolicy(2) after setting a non-default policy. Note that in the
	// real syscall, nodemask can be longer than a single unsigned long, but we
	// always report a single node so never need to save more than a single
	// bit.
	//
	// numaPolicy and numaNodeMask are protected by mu.
	numaPolicy   linux.NumaPolicy
	numaNodeMask uint64

	// netns is the task's network namespace. netns is never nil.
	//
	// netns is protected by mu.
	netns *inet.Namespace

	// If rseqPreempted is true, before the next call to p.Switch(),
	// interrupt rseq critical regions as defined by rseqAddr and
	// tg.oldRSeqCritical and write the task goroutine's CPU number to
	// rseqAddr/oldRSeqCPUAddr.
	//
	// We support two ABIs for restartable sequences:
	//
	//  1. The upstream interface added in v4.18,
	//  2. An "old" interface never merged upstream. In the implementation,
	//     this is referred to as "old rseq".
	//
	// rseqPreempted is exclusive to the task goroutine.
	rseqPreempted bool `state:"nosave"`

	// rseqCPU is the last CPU number written to rseqAddr/oldRSeqCPUAddr.
	//
	// If rseq is unused, rseqCPU is -1 for convenient use in
	// platform.Context.Switch.
	//
	// rseqCPU is exclusive to the task goroutine.
	rseqCPU int32

	// oldRSeqCPUAddr is a pointer to the userspace old rseq CPU variable.
	//
	// oldRSeqCPUAddr is exclusive to the task goroutine.
	oldRSeqCPUAddr hostarch.Addr

	// rseqAddr is a pointer to the userspace linux.RSeq structure.
	//
	// rseqAddr is exclusive to the task goroutine.
	rseqAddr hostarch.Addr

	// rseqSignature is the signature that the rseq abort IP must be signed
	// with.
	//
	// rseqSignature is exclusive to the task goroutine.
	rseqSignature uint32

	// copyScratchBuffer is a buffer available to CopyIn/CopyOut
	// implementations that require an intermediate buffer to copy data
	// into/out of. It prevents these buffers from being allocated/zeroed in
	// each syscall and eventually garbage collected.
	//
	// copyScratchBuffer is exclusive to the task goroutine.
	copyScratchBuffer [copyScratchBufferLen]byte `state:"nosave"`

	// blockingTimer is used for blocking timeouts. blockingTimerChan is the
	// channel that is sent to when blockingTimer fires.
	//
	// blockingTimer is exclusive to the task goroutine.
	blockingTimer     *ktime.Timer    `state:"nosave"`
	blockingTimerChan <-chan struct{} `state:"nosave"`

	// futexWaiter is used for futex(FUTEX_WAIT) syscalls.
	//
	// futexWaiter is exclusive to the task goroutine.
	futexWaiter *futex.Waiter `state:"nosave"`

	// robustList is a pointer to the head of the task's robust futex
	// list.
	robustList hostarch.Addr

	// startTime is the real time at which the task started. It is set when
	// a Task is created or invokes execve(2).
	//
	// startTime is protected by mu.
	startTime ktime.Time

	// kcov is the kcov instance providing code coverage owned by this task.
	//
	// kcov is exclusive to the task goroutine.
	kcov *Kcov

	// cgroups is the set of cgroups this task belongs to. This may be empty if
	// no cgroup controllers are enabled. Protected by mu.
	//
	// +checklocks:mu
	cgroups map[Cgroup]struct{}
}

func (t *Task) savePtraceTracer() *Task {
	return t.ptraceTracer.Load().(*Task)
}

func (t *Task) loadPtraceTracer(tracer *Task) {
	t.ptraceTracer.Store(tracer)
}

func (t *Task) saveSyscallFilters() []bpf.Program {
	if f := t.syscallFilters.Load(); f != nil {
		return f.([]bpf.Program)
	}
	return nil
}

func (t *Task) loadSyscallFilters(filters []bpf.Program) {
	t.syscallFilters.Store(filters)
}

// afterLoad is invoked by stateify.
func (t *Task) afterLoad() {
	t.updateInfoLocked()
	t.interruptChan = make(chan struct{}, 1)
	t.gosched.State = TaskGoroutineNonexistent
	if t.stop != nil {
		t.stopCount = 1
	}
	t.endStopCond.L = &t.tg.signalHandlers.mu
	t.p = t.k.Platform.NewContext()
	t.rseqPreempted = true
	t.futexWaiter = futex.NewWaiter()
}

// copyScratchBufferLen is the length of Task.copyScratchBuffer.
const copyScratchBufferLen = 144 // sizeof(struct stat)

// CopyScratchBuffer returns a scratch buffer to be used in CopyIn/CopyOut
// functions. It must only be used within those functions and can only be used
// by the task goroutine; it exists to improve performance and thus
// intentionally lacks any synchronization.
//
// Callers should pass a constant value as an argument if possible, which will
// allow the compiler to inline and optimize out the if statement below.
func (t *Task) CopyScratchBuffer(size int) []byte {
	if size > copyScratchBufferLen {
		return make([]byte, size)
	}
	return t.copyScratchBuffer[:size]
}
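
// A minimal usage sketch (illustrative only; CopyOutBytes and the addr value
// are assumptions about a typical caller, not definitions in this file): a
// syscall writing a fixed-size structure back to userspace can reuse the
// per-task scratch buffer instead of allocating.
//
//	buf := t.CopyScratchBuffer(copyScratchBufferLen) // constant size: no allocation
//	// ... marshal the structure into buf ...
//	if _, err := t.CopyOutBytes(addr, buf); err != nil {
//		return err
//	}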

// FutexWaiter returns the Task's futex.Waiter.
func (t *Task) FutexWaiter() *futex.Waiter {
	return t.futexWaiter
}

// Kernel returns the Kernel containing t.
func (t *Task) Kernel() *Kernel {
	return t.k
}

// SetClearTID sets t's cleartid.
//
// Preconditions: The caller must be running on the task goroutine.
func (t *Task) SetClearTID(addr hostarch.Addr) {
	t.cleartid = addr
}

// SetSyscallRestartBlock sets the restart block for use in
// restart_syscall(2). After registering a restart block, a syscall should
// return ERESTART_RESTARTBLOCK to request a restart using the block.
//
// Precondition: The caller must be running on the task goroutine.
func (t *Task) SetSyscallRestartBlock(r SyscallRestartBlock) {
	t.syscallRestartBlock = r
}

// SyscallRestartBlock returns the currently registered restart block for use in
// restart_syscall(2). This function is *not* idempotent and may only be called
// once per syscall. This function must not be called if a restart block has not
// been registered for the current syscall.
//
// Precondition: The caller must be running on the task goroutine.
func (t *Task) SyscallRestartBlock() SyscallRestartBlock {
	r := t.syscallRestartBlock
	// Explicitly set the restart block to nil so that a future syscall can't
	// accidentally reuse it.
	t.syscallRestartBlock = nil
	return r
}
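
// A hedged sketch of the round trip (timeoutRestartBlock is a hypothetical
// SyscallRestartBlock implementation, not defined in this file): an
// interrupted blocking syscall registers the block and returns
// ERESTART_RESTARTBLOCK (the error constant is assumed to live in linuxerr);
// a later restart_syscall(2) then retrieves the block exactly once via
// SyscallRestartBlock and resumes the work it describes.
//
//	t.SetSyscallRestartBlock(&timeoutRestartBlock{remaining: timeout})
//	return 0, linuxerr.ERESTART_RESTARTBLOCK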

// IsChrooted returns true if the root directory of t's FSContext is not the
// root directory of t's MountNamespace.
//
// Preconditions: The caller must be running on the task goroutine, or t.mu
// must be locked.
func (t *Task) IsChrooted() bool {
	if VFS2Enabled {
		realRoot := t.mountNamespaceVFS2.Root()
		root := t.fsContext.RootDirectoryVFS2()
		defer root.DecRef(t)
		return root != realRoot
	}

	realRoot := t.tg.mounts.Root()
	defer realRoot.DecRef(t)
	root := t.fsContext.RootDirectory()
	if root != nil {
		defer root.DecRef(t)
	}
	return root != realRoot
}

// TaskImage returns t's TaskImage.
//
// Precondition: The caller must be running on the task goroutine, or t.mu must
// be locked.
func (t *Task) TaskImage() *TaskImage {
	return &t.image
}

// FSContext returns t's FSContext. FSContext does not take an additional
// reference on the returned FSContext.
//
// Precondition: The caller must be running on the task goroutine, or t.mu must
// be locked.
func (t *Task) FSContext() *FSContext {
	return t.fsContext
}

// FDTable returns t's FDTable. FDTable does not take an additional reference
// on the returned FDTable.
//
// Precondition: The caller must be running on the task goroutine, or t.mu must
// be locked.
func (t *Task) FDTable() *FDTable {
	return t.fdTable
}

// GetFile is a convenience wrapper for t.FDTable().Get.
//
// Precondition: same as FDTable.Get.
func (t *Task) GetFile(fd int32) *fs.File {
	f, _ := t.fdTable.Get(fd)
	return f
}

// GetFileVFS2 is a convenience wrapper for t.FDTable().GetVFS2.
//
// Precondition: same as FDTable.Get.
func (t *Task) GetFileVFS2(fd int32) *vfs.FileDescription {
	f, _ := t.fdTable.GetVFS2(fd)
	return f
}
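
// A hedged usage sketch (fd is illustrative; the DecRef obligation is assumed
// from FDTable.Get's contract of returning a referenced file):
//
//	file := t.GetFileVFS2(fd)
//	if file == nil {
//		return linuxerr.EBADF // no such descriptor
//	}
//	defer file.DecRef(t)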

// NewFDs is a convenience wrapper for t.FDTable().NewFDs.
//
// This automatically passes the task as the context.
//
// Precondition: same as FDTable.NewFDs.
func (t *Task) NewFDs(fd int32, files []*fs.File, flags FDFlags) ([]int32, error) {
	return t.fdTable.NewFDs(t, fd, files, flags)
}

// NewFDsVFS2 is a convenience wrapper for t.FDTable().NewFDsVFS2.
//
// This automatically passes the task as the context.
//
// Precondition: same as FDTable.NewFDsVFS2.
func (t *Task) NewFDsVFS2(fd int32, files []*vfs.FileDescription, flags FDFlags) ([]int32, error) {
	return t.fdTable.NewFDsVFS2(t, fd, files, flags)
}

// NewFDFrom is a convenience wrapper for t.FDTable().NewFDs with a single file.
//
// This automatically passes the task as the context.
//
// Precondition: same as FDTable.NewFDs.
func (t *Task) NewFDFrom(fd int32, file *fs.File, flags FDFlags) (int32, error) {
	fds, err := t.fdTable.NewFDs(t, fd, []*fs.File{file}, flags)
	if err != nil {
		return 0, err
	}
	return fds[0], nil
}

// NewFDFromVFS2 is a convenience wrapper for t.FDTable().NewFDVFS2.
//
// This automatically passes the task as the context.
//
// Precondition: same as FDTable.NewFDVFS2.
func (t *Task) NewFDFromVFS2(fd int32, file *vfs.FileDescription, flags FDFlags) (int32, error) {
	return t.fdTable.NewFDVFS2(t, fd, file, flags)
}

// NewFDAt is a convenience wrapper for t.FDTable().NewFDAt.
//
// This automatically passes the task as the context.
//
// Precondition: same as FDTable.NewFDAt.
func (t *Task) NewFDAt(fd int32, file *fs.File, flags FDFlags) error {
	return t.fdTable.NewFDAt(t, fd, file, flags)
}

// NewFDAtVFS2 is a convenience wrapper for t.FDTable().NewFDAtVFS2.
//
// This automatically passes the task as the context.
//
// Precondition: same as FDTable.NewFDAtVFS2.
func (t *Task) NewFDAtVFS2(fd int32, file *vfs.FileDescription, flags FDFlags) error {
	return t.fdTable.NewFDAtVFS2(t, fd, file, flags)
}

// WithMuLocked executes f with t.mu locked.
func (t *Task) WithMuLocked(f func(*Task)) {
	t.mu.Lock()
	f(t)
	t.mu.Unlock()
}
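
// A small usage sketch (illustrative only): the closure satisfies the
// "t.mu must be locked" precondition of accessors such as FDTable, so a
// non-task goroutine can safely inspect those fields, e.g.
//
//	t.WithMuLocked(func(t *Task) {
//		if fdTable := t.FDTable(); fdTable != nil {
//			// inspect the table while t.mu is held
//		}
//	})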

// MountNamespace returns t's MountNamespace. MountNamespace does not take an
// additional reference on the returned MountNamespace.
func (t *Task) MountNamespace() *fs.MountNamespace {
	return t.tg.mounts
}

// MountNamespaceVFS2 returns t's MountNamespace. A reference is taken on the
// returned mount namespace.
func (t *Task) MountNamespaceVFS2() *vfs.MountNamespace {
	t.mu.Lock()
	defer t.mu.Unlock()
	return t.mountNamespaceVFS2
}

// AbstractSockets returns t's AbstractSocketNamespace.
func (t *Task) AbstractSockets() *AbstractSocketNamespace {
	return t.abstractSockets
}

// ContainerID returns t's container ID.
func (t *Task) ContainerID() string {
	return t.containerID
}

// OOMScoreAdj gets the task's thread group's OOM score adjustment.
func (t *Task) OOMScoreAdj() int32 {
	return atomic.LoadInt32(&t.tg.oomScoreAdj)
}

// SetOOMScoreAdj sets the task's thread group's OOM score adjustment. The
// value should be between -1000 and 1000 inclusive.
func (t *Task) SetOOMScoreAdj(adj int32) error {
	if adj > 1000 || adj < -1000 {
		return linuxerr.EINVAL
	}
	atomic.StoreInt32(&t.tg.oomScoreAdj, adj)
	return nil
}

// KUID returns t's kuid.
func (t *Task) KUID() uint32 {
	return uint32(t.Credentials().EffectiveKUID)
}

// KGID returns t's kgid.
func (t *Task) KGID() uint32 {
	return uint32(t.Credentials().EffectiveKGID)
}

// SetKcov sets the kcov instance associated with t.
func (t *Task) SetKcov(k *Kcov) {
	t.kcov = k
}

// ResetKcov clears the kcov instance associated with t.
func (t *Task) ResetKcov() {
	if t.kcov != nil {
		t.kcov.OnTaskExit()
		t.kcov = nil
	}
}

// Preconditions: The TaskSet mutex must be locked.
func (t *Task) loadSeccheckInfoLocked(req seccheck.TaskFieldSet, mask *seccheck.TaskFieldSet, info *seccheck.TaskInfo) {
	if req.Contains(seccheck.TaskFieldThreadID) {
		info.ThreadID = int32(t.k.tasks.Root.tids[t])
		mask.Add(seccheck.TaskFieldThreadID)
	}
	if req.Contains(seccheck.TaskFieldThreadStartTime) {
		info.ThreadStartTime = t.startTime
		mask.Add(seccheck.TaskFieldThreadStartTime)
	}
	if req.Contains(seccheck.TaskFieldThreadGroupID) {
		info.ThreadGroupID = int32(t.k.tasks.Root.tgids[t.tg])
		mask.Add(seccheck.TaskFieldThreadGroupID)
	}
	if req.Contains(seccheck.TaskFieldThreadGroupStartTime) {
		info.ThreadGroupStartTime = t.tg.leader.startTime
		mask.Add(seccheck.TaskFieldThreadGroupStartTime)
	}
}