summaryrefslogtreecommitdiffhomepage
path: root/pkg/sentry/kernel/task_signals.go
blob: 6a204aa5974a06cc51f434efecb798050d33e271 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
// Copyright 2018 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package kernel

// This file defines the behavior of task signal handling.

import (
	"fmt"
	"sync/atomic"
	"time"

	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
	"gvisor.googlesource.com/gvisor/pkg/eventchannel"
	"gvisor.googlesource.com/gvisor/pkg/sentry/arch"
	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
	ucspb "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/uncaught_signal_go_proto"
	"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
	"gvisor.googlesource.com/gvisor/pkg/syserror"
)

// SignalAction identifies what the kernel does with a signal once its
// disposition has been resolved.
type SignalAction int

// The internal signal actions. The full set is used inside the kernel, but
// through the system call interface an application can only select the
// default action or the ignore action directly.
const (
	// SignalActionTerm terminates the thread group.
	SignalActionTerm = SignalAction(iota)
	// SignalActionCore is handled like SignalActionTerm by deliverSignal.
	SignalActionCore
	// SignalActionStop initiates a group stop.
	SignalActionStop
	// SignalActionIgnore drops the signal.
	SignalActionIgnore
	// SignalActionHandler runs the user-configured handler.
	SignalActionHandler
)

// defaultActions maps each standard signal to the action taken when its
// handler is left at the default. The application can override these for
// most signals; SIGKILL and SIGSTOP are the exceptions.
//
// Entries are grouped by resulting action. A signal that has no entry yields
// the zero SignalAction, which is SignalActionTerm.
var defaultActions = map[linux.Signal]SignalAction{
	// Terminate.
	linux.SIGHUP:    SignalActionTerm,
	linux.SIGINT:    SignalActionTerm,
	linux.SIGKILL:   SignalActionTerm, // but see ThreadGroup.applySignalSideEffects
	linux.SIGPIPE:   SignalActionTerm,
	linux.SIGALRM:   SignalActionTerm,
	linux.SIGTERM:   SignalActionTerm,
	linux.SIGUSR1:   SignalActionTerm,
	linux.SIGUSR2:   SignalActionTerm,
	linux.SIGPROF:   SignalActionTerm,
	linux.SIGVTALRM: SignalActionTerm,
	linux.SIGSTKFLT: SignalActionTerm,
	linux.SIGIO:     SignalActionTerm,
	linux.SIGPWR:    SignalActionTerm,
	// Core.
	linux.SIGQUIT: SignalActionCore,
	linux.SIGILL:  SignalActionCore,
	linux.SIGABRT: SignalActionCore,
	linux.SIGFPE:  SignalActionCore,
	linux.SIGSEGV: SignalActionCore,
	linux.SIGBUS:  SignalActionCore,
	linux.SIGSYS:  SignalActionCore,
	linux.SIGTRAP: SignalActionCore,
	linux.SIGXCPU: SignalActionCore,
	linux.SIGXFSZ: SignalActionCore,
	// Stop.
	linux.SIGSTOP: SignalActionStop,
	linux.SIGTSTP: SignalActionStop,
	linux.SIGTTIN: SignalActionStop,
	linux.SIGTTOU: SignalActionStop,
	// Ignore.
	linux.SIGCHLD:  SignalActionIgnore,
	linux.SIGCONT:  SignalActionIgnore, // but see ThreadGroup.applySignalSideEffects
	linux.SIGURG:   SignalActionIgnore,
	linux.SIGWINCH: SignalActionIgnore,
}

// computeAction resolves the action to take for signal sig given its
// registered arch.SignalAct.
//
// Three signal numbers have a fixed outcome regardless of act:
//   - SIGSTOP always yields SignalActionStop;
//   - SIGKILL always yields SignalActionTerm;
//   - signal 0 is always ignored, since many programs use it for internal
//     purposes and do not expect it to do anything.
//
// For every other signal act.Handler decides: arch.SignalActDefault selects
// the entry in defaultActions, arch.SignalActIgnore ignores the signal, and
// any other value means a user handler is installed (SignalActionHandler).
func computeAction(sig linux.Signal, act arch.SignalAct) SignalAction {
	// Fixed dispositions come first; act is not consulted for these.
	if sig == linux.SIGSTOP {
		return SignalActionStop
	}
	if sig == linux.SIGKILL {
		return SignalActionTerm
	}
	if sig == linux.Signal(0) {
		return SignalActionIgnore
	}

	if act.Handler == arch.SignalActDefault {
		return defaultActions[sig]
	}
	if act.Handler == arch.SignalActIgnore {
		return SignalActionIgnore
	}
	return SignalActionHandler
}

// Signal sets that receive special treatment in this file.
var (
	// UnblockableSignals contains the set of signals which cannot be blocked.
	UnblockableSignals = linux.MakeSignalSet(linux.SIGKILL, linux.SIGSTOP)

	// StopSignals is the set of signals whose default action is
	// SignalActionStop.
	StopSignals = linux.MakeSignalSet(linux.SIGSTOP, linux.SIGTSTP, linux.SIGTTIN, linux.SIGTTOU)
)

// dequeueSignalLocked removes and returns one pending signal that is not a
// member of mask. The task's own queue is tried first; the thread group's
// queue is only consulted when the task queue yields nothing. The result is
// nil when neither queue holds an unmasked signal.
//
// Preconditions: t.tg.signalHandlers.mu must be locked.
func (t *Task) dequeueSignalLocked(mask linux.SignalSet) *arch.SignalInfo {
	info := t.pendingSignals.dequeue(mask)
	if info == nil {
		info = t.tg.pendingSignals.dequeue(mask)
	}
	return info
}

// discardSpecificLocked drops every queued instance of sig, both from the
// thread group's shared queue and from the private queue of each task in tg.
//
// Preconditions: The signal mutex must be locked.
func (tg *ThreadGroup) discardSpecificLocked(sig linux.Signal) {
	tg.pendingSignals.discardSpecific(sig)

	task := tg.tasks.Front()
	for task != nil {
		task.pendingSignals.discardSpecific(sig)
		task = task.Next()
	}
}

// PendingSignals returns the union of the signals pending on t itself and the
// signals pending on t's thread group. The TaskSet mutex is held for reading
// and the signal mutex is held while the two sets are sampled.
func (t *Task) PendingSignals() linux.SignalSet {
	t.tg.pidns.owner.mu.RLock()
	defer t.tg.pidns.owner.mu.RUnlock()
	sh := t.tg.signalHandlers
	sh.mu.Lock()
	defer sh.mu.Unlock()

	private := t.pendingSignals.pendingSet
	shared := t.tg.pendingSignals.pendingSet
	return private | shared
}

// deliverSignal applies the disposition of the dequeued signal described by
// info, using the registered action act, and returns the run state the task
// should enter next: a runExit state when the signal terminates the thread
// group, and a runInterrupt state in every other case.
//
// It panics if computeAction yields an action it does not know.
func (t *Task) deliverSignal(info *arch.SignalInfo, act arch.SignalAct) taskRunState {
	sigact := computeAction(linux.Signal(info.Signo), act)

	if t.haveSyscallReturn {
		sre, restartable := SyscallRestartErrnoFromReturn(t.Arch().Return())
		// Signals that are ignored, cause a thread group stop, or terminate
		// the thread group do not interact with interrupted syscalls; in
		// Linux terms, they are never returned to the signal handling path
		// from get_signal => get_signal_to_deliver. The behavior of an
		// interrupted syscall is determined by the first signal that is
		// actually handled (by userspace).
		if restartable && sigact == SignalActionHandler {
			// ERESTARTSYS only restarts when the handler asked for it.
			failWithEINTR := sre == ERESTARTNOHAND ||
				sre == ERESTART_RESTARTBLOCK ||
				(sre == ERESTARTSYS && !act.IsRestart())
			if failWithEINTR {
				t.Debugf("Not restarting syscall %d after errno %d: interrupted by signal %d", t.Arch().SyscallNo(), sre, info.Signo)
				t.Arch().SetReturn(uintptr(-t.ExtractErrno(syserror.EINTR, -1)))
			} else {
				t.Debugf("Restarting syscall %d after errno %d: interrupted by signal %d", t.Arch().SyscallNo(), sre, info.Signo)
				t.Arch().RestartSyscall()
			}
		}
	}

	// Fatal dispositions leave through the exit path rather than the
	// interrupt path, so handle them before everything else.
	if sigact == SignalActionTerm || sigact == SignalActionCore {
		// "Default action is to terminate the process." - signal(7)
		t.Debugf("Signal %d: terminating thread group", info.Signo)

		// Publish an event channel message describing the uncaught signal.
		ucs := &ucspb.UncaughtSignal{
			Tid:          int32(t.Kernel().TaskSet().Root.IDOfTask(t)),
			Pid:          int32(t.Kernel().TaskSet().Root.IDOfThreadGroup(t.ThreadGroup())),
			Registers:    t.Arch().StateData().Proto(),
			SignalNumber: info.Signo,
		}

		// Only these signals get a fault address attached.
		switch linux.Signal(info.Signo) {
		case linux.SIGSEGV, linux.SIGFPE, linux.SIGILL, linux.SIGTRAP, linux.SIGBUS:
			ucs.FaultAddr = info.Addr()
		}

		eventchannel.Emit(ucs)

		t.PrepareGroupExit(ExitStatus{Signo: int(info.Signo)})
		return (*runExit)(nil)
	}

	switch sigact {
	case SignalActionIgnore:
		// "Default action is to ignore the signal."
		t.Debugf("Signal %d: ignored", info.Signo)

	case SignalActionStop:
		// "Default action is to stop the process."
		t.initiateGroupStop(info)

	case SignalActionHandler:
		// Try to enter the user-configured handler.
		t.Debugf("Signal %d: delivering to handler", info.Signo)
		err := t.deliverSignalToHandler(info, act)
		if err != nil {
			// This is not a warning, it can occur during normal operation.
			t.Debugf("Failed to deliver signal %+v to user handler: %v", info, err)

			// Send a forced SIGSEGV. If the signal that couldn't be delivered
			// was a SIGSEGV, force the handler to SIG_DFL.
			t.forceSignal(linux.SIGSEGV, linux.Signal(info.Signo) == linux.SIGSEGV /* unconditional */)
			t.SendSignal(sigPriv(linux.SIGSEGV))
		}

	default:
		panic(fmt.Sprintf("Unknown signal action %+v, %d?", info, sigact))
	}
	return (*runInterrupt)(nil)
}

// deliverSignalToHandler rewrites the task's userspace state so that it
// enters the user-configured handler described by act for the signal
// described by info. It returns the error from the architecture's
// SignalSetup, if any; in that case the task's signal mask and saved-mask
// flag are left unchanged.
func (t *Task) deliverSignalToHandler(info *arch.SignalInfo, act arch.SignalAct) error {
	// Entering an application handler interrupts restartable sequences.
	t.rseqInterrupt()

	// Start from the current stack pointer and decide below whether the
	// handler frame belongs on the alternate stack instead.
	sp := usermem.Addr(t.Arch().Stack())

	// N.B. This is a *copy* of the alternate stack that the user's signal
	// handler expects to see in its ucontext (even if it's not in use).
	alt := t.signalStack
	useAlt := act.IsOnStack() && alt.IsEnabled()
	if useAlt {
		alt.SetOnStack()
		// Switch to the top of the alternate stack unless we are already
		// running somewhere inside it.
		if !alt.Contains(sp) {
			sp = usermem.Addr(alt.Top())
		}
	}

	// Build the handler frame. If a saved signal mask exists, the handler
	// runs with the current mask, but sigreturn must restore the saved one,
	// so the saved mask is what gets recorded in the frame.
	st := &arch.Stack{t.Arch(), t.MemoryManager(), sp}
	restoreMask := t.signalMask
	if t.haveSavedSignalMask {
		restoreMask = t.savedSignalMask
	}
	err := t.Arch().SignalSetup(st, &act, info, &alt, restoreMask)
	if err != nil {
		return err
	}
	t.haveSavedSignalMask = false

	// While the handler runs, additionally block act.Mask and, unless the
	// action is marked no-defer, the signal being delivered.
	handlerMask := t.signalMask | act.Mask
	if !act.IsNoDefer() {
		handlerMask |= linux.SignalSetOf(linux.Signal(info.Signo))
	}
	t.SetSignalMask(handlerMask)

	return nil
}

// ctrlResume is the shared SyscallControl returned by SignalReturn on
// success. It has ignoreReturn set.
// NOTE(review): presumably ignoreReturn keeps the syscall return path from
// overwriting the register state restored by sigreturn — confirm against
// SyscallControl.
var ctrlResume = &SyscallControl{ignoreReturn: true}

// SignalReturn implements sigreturn(2) when rt is false and rt_sigreturn(2)
// when rt is true. On success it returns ctrlResume; if the architecture
// fails to restore the signal frame, that error is returned with a nil
// control.
func (t *Task) SignalReturn(rt bool) (*SyscallControl, error) {
	stack := t.Stack()
	restoredMask, restoredAlt, err := t.Arch().SignalRestore(stack, rt)
	if err != nil {
		return nil, err
	}

	// Try to record the restored signal stack. A failure is deliberately
	// ignored, as Linux does: only an EFAULT could be generated, and
	// SignalRestore has already deserialized the entire frame successfully.
	t.SetSignalStack(restoredAlt)

	// Reinstate the signal mask, never blocking SIGKILL or SIGSTOP.
	t.SetSignalMask(restoredMask &^ UnblockableSignals)

	return ctrlResume, nil
}

// Sigtimedwait implements the semantics of sigtimedwait(2).
//
// set is the set of signals the caller wants to wait for. SIGKILL and SIGSTOP
// are silently removed from it. timeout bounds the wait; a timeout of zero
// only polls the pending queues.
//
// It returns the dequeued signal on success, syserror.EAGAIN if no signal in
// set arrived before the timeout (or none was pending and timeout is zero),
// or the error returned by BlockWithTimeout if the wait was otherwise
// interrupted.
//
// Preconditions: The caller must be running on the task goroutine. t.exitState
// < TaskExitZombie.
func (t *Task) Sigtimedwait(set linux.SignalSet, timeout time.Duration) (*arch.SignalInfo, error) {
	// set is the set of signals we're interested in; invert it to get the set
	// of signals to block. SIGKILL and SIGSTOP must be stripped from set
	// *before* inverting so that they always end up in mask: otherwise this
	// call could dequeue them (swallowing a pending SIGSTOP) and would treat
	// them as waited-for signals. Linux's do_sigtimedwait does the same
	// (sigdelsetmask followed by signotset).
	mask := ^(set &^ UnblockableSignals)

	// The deferred Unlock pairs with whichever Lock is held at return: the
	// mutex is dropped and re-taken around the blocking wait below.
	t.tg.signalHandlers.mu.Lock()
	defer t.tg.signalHandlers.mu.Unlock()
	if info := t.dequeueSignalLocked(mask); info != nil {
		return info, nil
	}

	if timeout == 0 {
		return nil, syserror.EAGAIN
	}

	// Unblock signals we're waiting for. Remember the original signal mask so
	// that Task.sendSignalTimerLocked doesn't discard ignored signals that
	// we're temporarily unblocking.
	t.realSignalMask = t.signalMask
	t.setSignalMaskLocked(t.signalMask & mask)

	// Wait for a timeout or new signal.
	t.tg.signalHandlers.mu.Unlock()
	_, err := t.BlockWithTimeout(nil, true, timeout)
	t.tg.signalHandlers.mu.Lock()

	// Restore the original signal mask.
	t.setSignalMaskLocked(t.realSignalMask)
	t.realSignalMask = 0

	// A waited-for signal takes priority over any error from the wait.
	if info := t.dequeueSignalLocked(mask); info != nil {
		return info, nil
	}
	if err == syserror.ETIMEDOUT {
		return nil, syserror.EAGAIN
	}
	return nil, err
}

// SendSignal sends the given signal to t, queueing it on t's private signal
// queue rather than on the thread group's.
//
// The following errors may be returned:
//
//	syserror.ESRCH - The task has exited.
//	syserror.EINVAL - The signal is not valid.
//	syserror.EAGAIN - The signal is realtime, and cannot be queued.
//
func (t *Task) SendSignal(info *arch.SignalInfo) error {
	t.tg.pidns.owner.mu.RLock()
	defer t.tg.pidns.owner.mu.RUnlock()
	sh := t.tg.signalHandlers
	sh.mu.Lock()
	defer sh.mu.Unlock()

	const group = false
	return t.sendSignalLocked(info, group)
}

// SendGroupSignal sends the given signal to t's thread group, queueing it on
// the group's shared signal queue. It returns the same errors as
// sendSignalLocked.
func (t *Task) SendGroupSignal(info *arch.SignalInfo) error {
	t.tg.pidns.owner.mu.RLock()
	defer t.tg.pidns.owner.mu.RUnlock()
	sh := t.tg.signalHandlers
	sh.mu.Lock()
	defer sh.mu.Unlock()

	const group = true
	return t.sendSignalLocked(info, group)
}

// SendSignal sends the given signal to tg as a group signal. The send is
// routed through tg's leader, so it is the leader's state that determines
// whether the signal is blocked.
func (tg *ThreadGroup) SendSignal(info *arch.SignalInfo) error {
	tg.pidns.owner.mu.RLock()
	defer tg.pidns.owner.mu.RUnlock()
	sh := tg.signalHandlers
	sh.mu.Lock()
	defer sh.mu.Unlock()

	const group = true
	return tg.leader.sendSignalLocked(info, group)
}

// sendSignalLocked sends a signal that is not associated with any interval
// timer; it forwards to sendSignalTimerLocked with a nil timer.
func (t *Task) sendSignalLocked(info *arch.SignalInfo, group bool) error {
	var noTimer *IntervalTimer
	return t.sendSignalTimerLocked(info, group, noTimer)
}

// sendSignalTimerLocked sends the signal described by info to t, or to t's
// thread group if group is true. timer, if non-nil, is recorded with the
// queued signal and is told via signalRejectedLocked when the signal is
// discarded instead of queued.
//
// It returns syserror.ESRCH if t is dead, syserror.EINVAL if the signal
// number is invalid, and syserror.EAGAIN if a realtime signal could not be
// queued. Signal 0 and discarded signals return nil.
func (t *Task) sendSignalTimerLocked(info *arch.SignalInfo, group bool, timer *IntervalTimer) error {
	if t.exitState == TaskExitDead {
		return syserror.ESRCH
	}
	sig := linux.Signal(info.Signo)
	switch {
	case sig == 0:
		return nil
	case !sig.IsValid():
		return syserror.EINVAL
	}

	// Signal side effects apply even if the signal is ultimately discarded.
	t.tg.applySignalSideEffectsLocked(sig)

	// TODO: "Only signals for which the "init" process has established a
	// signal handler can be sent to the "init" process by other members of the
	// PID namespace. This restriction applies even to privileged processes,
	// and prevents other members of the PID namespace from accidentally
	// killing the "init" process." - pid_namespaces(7). We don't currently do
	// this for child namespaces, though we should; we also don't do this for
	// the root namespace (the same restriction applies to global init on
	// Linux), where whether or not we should is much murkier. In practice,
	// most sandboxed applications are not prepared to function as an init
	// process.

	// Unmasked, ignored signals are discarded without being queued, unless
	// they will be visible to a tracer. Even for group signals, it's the
	// originally-targeted task's signal mask and tracer that matter; compare
	// Linux's kernel/signal.c:__send_signal() => prepare_signal() =>
	// sig_ignored().
	ignored := computeAction(sig, t.tg.signalHandlers.actions[sig]) == SignalActionIgnore
	bit := linux.SignalSetOf(sig)
	if bit&(t.signalMask|t.realSignalMask) == 0 && ignored && !t.hasTracer() {
		t.Debugf("Discarding ignored signal %d", sig)
		if timer != nil {
			timer.signalRejectedLocked()
		}
		return nil
	}

	// Group signals go on the shared queue, everything else on t's own.
	q := &t.pendingSignals
	if group {
		q = &t.tg.pendingSignals
	}
	if queued := q.enqueue(info, timer); !queued {
		if sig.IsRealtime() {
			return syserror.EAGAIN
		}
		t.Debugf("Discarding duplicate signal %d", sig)
		if timer != nil {
			timer.signalRejectedLocked()
		}
		return nil
	}

	// Find a receiver to notify. Note that the task we choose to notify, if
	// any, may not be the task that actually dequeues and handles the signal;
	// e.g. a racing signal mask change may cause the notified task to become
	// ineligible, or a racing sibling task may dequeue the signal first.
	if t.canReceiveSignalLocked(sig) {
		t.Debugf("Notified of signal %d", sig)
		t.interrupt()
		return nil
	}

	// t itself cannot take it; for group signals any other task will do.
	var nt *Task
	if group {
		nt = t.tg.findSignalReceiverLocked(sig)
	}
	if nt == nil {
		t.Debugf("No task notified of signal %d", sig)
		return nil
	}
	nt.Debugf("Notified of group signal %d", sig)
	nt.interrupt()
	return nil
}

// applySignalSideEffectsLocked performs the effects that sending sig has on
// tg at send time, independent of whether the signal is later queued or
// delivered: stop signals discard queued SIGCONTs, SIGCONT ends any group
// stop, and SIGKILL marks the group as exiting and kills every task in it.
// Other signals have no side effects here.
func (tg *ThreadGroup) applySignalSideEffectsLocked(sig linux.Signal) {
	if linux.SignalSetOf(sig)&StopSignals != 0 {
		// Stop signals cause all prior SIGCONT to be discarded. (This is
		// despite the fact this has little effect since SIGCONT's most
		// important effect is applied when the signal is sent in the branch
		// below, not when the signal is delivered.)
		tg.discardSpecificLocked(linux.SIGCONT)
	} else if sig == linux.SIGCONT {
		// "The SIGCONT signal has a side effect of waking up (all threads of)
		// a group-stopped process. This side effect happens before
		// signal-delivery-stop. The tracer can't suppress this side effect (it
		// can only suppress signal injection, which only causes the SIGCONT
		// handler to not be executed in the tracee, if such a handler is
		// installed." - ptrace(2)
		tg.endGroupStopLocked(true)
	} else if sig == linux.SIGKILL {
		// "SIGKILL does not generate signal-delivery-stop and therefore the
		// tracer can't suppress it. SIGKILL kills even within system calls
		// (syscall-exit-stop is not generated prior to death by SIGKILL)." -
		// ptrace(2)
		//
		// Note that this differs from ThreadGroup.requestExit in that it
		// ignores tg.execing.
		if !tg.exiting {
			tg.exiting = true
			tg.exitStatus = ExitStatus{Signo: int(linux.SIGKILL)}
		}
		victim := tg.tasks.Front()
		for victim != nil {
			victim.killLocked()
			victim = victim.Next()
		}
	}
}

// canReceiveSignalLocked reports whether t should be interrupted to receive
// sig. It is analogous to Linux's kernel/signal.c:wants_signal(); the
// divergences are noted inline.
//
// Preconditions: The signal mutex must be locked.
func (t *Task) canReceiveSignalLocked(sig linux.Signal) bool {
	switch {
	case linux.SignalSetOf(sig)&t.signalMask != 0:
		// - Do not choose tasks that are blocking the signal.
		// - No need to check Task.exitState, as the exit path sets every bit
		// in the signal mask when it transitions from TaskExitNone to
		// TaskExitInitiated.
		// - No special case for SIGKILL: SIGKILL already interrupted all
		// tasks in the task group via applySignalSideEffects => killLocked.
		return false
	case t.stop != nil:
		// - Do not choose stopped tasks, which cannot handle signals.
		return false
	case len(t.interruptChan) != 0:
		// - TODO: No special case for when t is also the sending task,
		// because the identity of the sender is unknown.
		// - Do not choose tasks that have already been interrupted, as they
		// may be busy handling another signal.
		return false
	}
	return true
}

// findSignalReceiverLocked scans tg's tasks in list order and returns the
// first one for which canReceiveSignalLocked(sig) holds, or nil if there is
// none.
//
// Linux actually records curr_target to balance the group signal targets;
// no such balancing is done here.
//
// Preconditions: The signal mutex must be locked.
func (tg *ThreadGroup) findSignalReceiverLocked(sig linux.Signal) *Task {
	candidate := tg.tasks.Front()
	for candidate != nil {
		if candidate.canReceiveSignalLocked(sig) {
			return candidate
		}
		candidate = candidate.Next()
	}
	return nil
}

// forceSignal makes sure that t is neither ignoring nor blocking sig. When
// unconditional is true the action is reset even if the signal is currently
// neither ignored nor blocked. It takes the TaskSet mutex (for reading) and
// the signal mutex around forceSignalLocked.
func (t *Task) forceSignal(sig linux.Signal, unconditional bool) {
	t.tg.pidns.owner.mu.RLock()
	defer t.tg.pidns.owner.mu.RUnlock()
	sh := t.tg.signalHandlers
	sh.mu.Lock()
	defer sh.mu.Unlock()

	t.forceSignalLocked(sig, unconditional)
}

// forceSignalLocked resets sig's handler to the default action and unblocks
// sig in t's mask. It does nothing unless sig is blocked by t, sig's handler
// is the ignore action, or unconditional is true. The mask is only touched
// when sig was actually blocked.
func (t *Task) forceSignalLocked(sig linux.Signal, unconditional bool) {
	bit := linux.SignalSetOf(sig)
	blocked := bit&t.signalMask != 0
	act := t.tg.signalHandlers.actions[sig]

	if !blocked && act.Handler != arch.SignalActIgnore && !unconditional {
		// Already deliverable and not forced: nothing to change.
		return
	}

	act.Handler = arch.SignalActDefault
	t.tg.signalHandlers.actions[sig] = act
	if blocked {
		t.setSignalMaskLocked(t.signalMask &^ bit)
	}
}

// SignalMask returns a copy of t's signal mask. The mask is read with an
// atomic load, so no mutex is taken here.
func (t *Task) SignalMask() linux.SignalSet {
	raw := atomic.LoadUint64((*uint64)(&t.signalMask))
	return linux.SignalSet(raw)
}

// SetSignalMask replaces t's signal mask with mask, holding the signal mutex
// for the duration of the update.
//
// Preconditions: SetSignalMask can only be called by the task goroutine.
// t.exitState < TaskExitZombie.
func (t *Task) SetSignalMask(mask linux.SignalSet) {
	// By precondition, t prevents t.tg from completing an execve and mutating
	// t.tg.signalHandlers, so we can skip the TaskSet mutex.
	sh := t.tg.signalHandlers
	sh.mu.Lock()
	t.setSignalMaskLocked(mask)
	sh.mu.Unlock()
}

// setSignalMaskLocked stores mask as t's signal mask and then repairs
// interrupt state that the change may have invalidated: group signals that t
// can no longer take are offered to another task, and t interrupts itself if
// a pending signal has just become unblocked.
//
// Preconditions: The signal mutex must be locked.
func (t *Task) setSignalMaskLocked(mask linux.SignalSet) {
	prev := t.signalMask
	atomic.StoreUint64((*uint64)(&t.signalMask), uint64(mask))

	// If the new mask blocks any signals that were not blocked by the old
	// mask, and at least one such signal is pending in tg.pendingSignals, and
	// t has been woken, it could be the case that t was woken to handle that
	// signal, but will no longer do so as a result of its new signal mask, so
	// we have to pick a replacement.
	newlyBlocked := mask &^ prev
	if orphaned := newlyBlocked & t.tg.pendingSignals.pendingSet; orphaned != 0 && t.interrupted() {
		linux.ForEachSignal(orphaned, func(sig linux.Signal) {
			if nt := t.tg.findSignalReceiverLocked(sig); nt != nil {
				nt.interrupt()
			}
		})
		// We have to re-issue the interrupt consumed by t.interrupted() since
		// it might have been for a different reason.
		t.interruptSelf()
	}

	// Conversely, if the new mask unblocks any signals that were blocked by
	// the old mask, and at least one such signal is pending, we may now need
	// to handle that signal.
	newlyUnblocked := prev &^ mask
	pending := t.pendingSignals.pendingSet | t.tg.pendingSignals.pendingSet
	if newlyUnblocked&pending != 0 {
		t.interruptSelf()
	}
}

// SetSavedSignalMask records mask as the saved signal mask (see
// Task.savedSignalMask's comment) and marks it as present.
//
// Preconditions: SetSavedSignalMask can only be called by the task goroutine.
func (t *Task) SetSavedSignalMask(mask linux.SignalSet) {
	t.savedSignalMask, t.haveSavedSignalMask = mask, true
}

// SignalStack returns a copy of the task-private signal stack. The copy has
// arch.SignalStackFlagOnStack set when the task is currently executing on
// that stack.
func (t *Task) SignalStack() arch.SignalStack {
	stack := t.signalStack
	if !t.onSignalStack(stack) {
		return stack
	}
	stack.Flags |= arch.SignalStackFlagOnStack
	return stack
}

// onSignalStack reports whether the task's current stack pointer lies within
// the given signal stack.
func (t *Task) onSignalStack(alt arch.SignalStack) bool {
	return alt.Contains(usermem.Addr(t.Arch().Stack()))
}

// SetSignalStack installs alt as the task-private signal stack and returns
// true.
//
// The stack may not be changed while the task is executing on its current
// signal stack, i.e. while t.onSignalStack(t.signalStack) holds; in that case
// nothing is modified and false is returned.
func (t *Task) SetSignalStack(alt arch.SignalStack) bool {
	// Refuse to swap the stack out from under ourselves.
	if t.onSignalStack(t.signalStack) {
		return false
	}

	if alt.Flags&arch.SignalStackFlagDisable == 0 {
		// Mask out irrelevant parts: only disable matters.
		alt.Flags &= arch.SignalStackFlagDisable
		t.signalStack = alt
		return true
	}

	// Disabled: don't record anything beyond the flags.
	t.signalStack = arch.SignalStack{
		Flags: arch.SignalStackFlagDisable,
	}
	return true
}

// SetSignalAct atomically replaces the thread group's action for signal sig
// with *actptr, when actptr is non-nil, and returns the action that was in
// effect before the call. With a nil actptr it only reads the current action.
//
// It returns syserror.EINVAL if sig is not a valid signal, or if actptr is
// non-nil and sig is SIGKILL or SIGSTOP.
func (tg *ThreadGroup) SetSignalAct(sig linux.Signal, actptr *arch.SignalAct) (arch.SignalAct, error) {
	if !sig.IsValid() {
		return arch.SignalAct{}, syserror.EINVAL
	}

	tg.pidns.owner.mu.RLock()
	defer tg.pidns.owner.mu.RUnlock()
	sh := tg.signalHandlers
	sh.mu.Lock()
	defer sh.mu.Unlock()

	prev := sh.actions[sig]
	if actptr == nil {
		// Pure query.
		return prev, nil
	}
	if sig == linux.SIGKILL || sig == linux.SIGSTOP {
		return prev, syserror.EINVAL
	}

	// Store a private copy whose mask never blocks SIGKILL or SIGSTOP.
	updated := *actptr
	updated.Mask &^= UnblockableSignals
	sh.actions[sig] = updated

	// From POSIX, by way of Linux:
	//
	// "Setting a signal action to SIG_IGN for a signal that is pending
	// shall cause the pending signal to be discarded, whether or not it is
	// blocked."
	//
	// "Setting a signal action to SIG_DFL for a signal that is pending and
	// whose default action is to ignore the signal (for example, SIGCHLD),
	// shall cause the pending signal to be discarded, whether or not it is
	// blocked."
	if computeAction(sig, updated) == SignalActionIgnore {
		tg.discardSpecificLocked(sig)
	}
	return prev, nil
}

// CopyOutSignalAct serializes s into the architecture-specific sigaction
// representation and copies the result out to task memory at addr. It
// returns the error from the copy, if any.
func (t *Task) CopyOutSignalAct(addr usermem.Addr, s *arch.SignalAct) error {
	native := t.Arch().NewSignalAct()
	native.SerializeFrom(s)
	if _, err := t.CopyOut(addr, native); err != nil {
		return err
	}
	return nil
}

// CopyInSignalAct reads an architecture-specific sigaction from task memory
// at addr and converts it into an arch.SignalAct. If the copy fails, the
// zero SignalAct is returned together with the error.
func (t *Task) CopyInSignalAct(addr usermem.Addr) (arch.SignalAct, error) {
	native := t.Arch().NewSignalAct()
	if _, err := t.CopyIn(addr, native); err != nil {
		return arch.SignalAct{}, err
	}

	var act arch.SignalAct
	native.DeserializeTo(&act)
	return act, nil
}

// CopyOutSignalStack serializes s into the architecture-specific signal
// stack representation and copies the result out to task memory at addr. It
// returns the error from the copy, if any.
func (t *Task) CopyOutSignalStack(addr usermem.Addr, s *arch.SignalStack) error {
	native := t.Arch().NewSignalStack()
	native.SerializeFrom(s)
	if _, err := t.CopyOut(addr, native); err != nil {
		return err
	}
	return nil
}

// CopyInSignalStack reads an architecture-specific stack_t from task memory
// at addr and converts it into an arch.SignalStack. If the copy fails, the
// zero SignalStack is returned together with the error.
func (t *Task) CopyInSignalStack(addr usermem.Addr) (arch.SignalStack, error) {
	native := t.Arch().NewSignalStack()
	if _, err := t.CopyIn(addr, native); err != nil {
		return arch.SignalStack{}, err
	}

	var stack arch.SignalStack
	native.DeserializeTo(&stack)
	return stack, nil
}

// groupStop is the TaskStop placed on tasks that have received a stop signal
// (SIGSTOP, SIGTSTP, SIGTTIN, SIGTTOU). The term "group-stop" comes from the
// ptrace man page.
//
// +stateify savable
type groupStop struct{}

// Killable implements TaskStop.Killable. A group stop is always killable.
func (*groupStop) Killable() bool {
	return true
}

// initiateGroupStop tries to start a group stop on behalf of the
// previously-dequeued stop signal described by info. The attempt is abandoned
// (with a debug log) if it lost a race against another stop signal, a
// SIGCONT, a group exit, or an execve. Otherwise every task in the thread
// group that is neither killed nor exiting is marked as having a group stop
// pending and is interrupted.
//
// Preconditions: The caller must be running on the task goroutine.
func (t *Task) initiateGroupStop(info *arch.SignalInfo) {
	t.tg.pidns.owner.mu.RLock()
	defer t.tg.pidns.owner.mu.RUnlock()
	sh := t.tg.signalHandlers
	sh.mu.Lock()
	defer sh.mu.Unlock()

	// Each case below is a race this stop signal has already lost.
	switch {
	case t.groupStopPending:
		t.Debugf("Signal %d: not stopping thread group: lost to racing stop signal", info.Signo)
		return
	case !t.tg.groupStopDequeued:
		t.Debugf("Signal %d: not stopping thread group: lost to racing SIGCONT", info.Signo)
		return
	case t.tg.exiting:
		t.Debugf("Signal %d: not stopping thread group: lost to racing group exit", info.Signo)
		return
	case t.tg.execing != nil:
		t.Debugf("Signal %d: not stopping thread group: lost to racing execve", info.Signo)
		return
	}

	// Keep the signal of an already-completed stop; otherwise record ours.
	if !t.tg.groupStopComplete {
		t.tg.groupStopSignal = linux.Signal(info.Signo)
	}

	t.tg.groupStopPendingCount = 0
	for peer := t.tg.tasks.Front(); peer != nil; peer = peer.Next() {
		// Tasks that are dying do not take part in the stop.
		if peer.killedLocked() || peer.exitState >= TaskExitInitiated {
			peer.groupStopPending = false
			continue
		}
		peer.groupStopPending = true
		peer.groupStopAcknowledged = false
		if peer.ptraceSeized {
			peer.trapNotifyPending = true
			if s, ok := peer.stop.(*ptraceStop); ok && s.listen {
				peer.endInternalStopLocked()
			}
		}
		peer.interrupt()
		t.tg.groupStopPendingCount++
	}
	t.Debugf("Signal %d: stopping %d threads in thread group", info.Signo, t.tg.groupStopPendingCount)
}

// endGroupStopLocked makes sure that no stop signal tg has received so far is
// stopping tg now or will stop it later: queued stop signals are discarded,
// any group stop in progress or completed is ended, and tg's group-stop
// bookkeeping is reset. If broadcast is true, parent and tracer notification
// is scheduled by setting the groupCont* fields.
//
// Preconditions: The signal mutex must be locked.
func (tg *ThreadGroup) endGroupStopLocked(broadcast bool) {
	// Discard all previously-queued stop signals.
	linux.ForEachSignal(StopSignals, tg.discardSpecificLocked)

	// Nothing further to undo if no stop is pending or complete.
	if tg.groupStopPendingCount == 0 && !tg.groupStopComplete {
		return
	}

	var completeStr string
	if tg.groupStopComplete {
		completeStr = "complete"
	} else {
		completeStr = "incomplete"
	}
	tg.leader.Debugf("Ending %s group stop with %d threads pending", completeStr, tg.groupStopPendingCount)

	for task := tg.tasks.Front(); task != nil; task = task.Next() {
		task.groupStopPending = false
		if task.ptraceSeized {
			task.trapNotifyPending = true
			if s, ok := task.stop.(*ptraceStop); ok && s.listen {
				task.endInternalStopLocked()
			}
		} else if _, ok := task.stop.(*groupStop); ok {
			task.endInternalStopLocked()
		}
	}

	if broadcast {
		// Instead of notifying the parent here, set groupContNotify so that
		// one of the continuing tasks does so. (Linux does something similar.)
		// The reason we do this is to keep locking sane. In order to send a
		// signal to the parent, we need to lock its signal mutex, but we're
		// already holding tg's signal mutex, and the TaskSet mutex must be
		// locked for writing for us to hold two signal mutexes. Since we don't
		// want to require this for endGroupStopLocked (which is called from
		// signal-sending paths), nor do we want to lose atomicity by releasing
		// the mutexes we're already holding, just let the continuing thread
		// group deal with it.
		tg.groupContNotify = true
		tg.groupContInterrupted = !tg.groupStopComplete
		tg.groupContWaitable = true
	}

	// Unsetting groupStopDequeued will cause racing calls to initiateGroupStop
	// to recognize that the group stop has been cancelled.
	tg.groupStopDequeued = false
	tg.groupStopSignal = 0
	tg.groupStopPendingCount = 0
	tg.groupStopComplete = false
	tg.groupStopWaitable = false
}

// participateGroupStopLocked performs the thread-group bookkeeping required
// once t has cleared t.groupStopPending. Task-level effects (such as actually
// placing the task goroutine into the group stop) remain the caller's
// responsibility.
//
// It returns true only when this call completes the group stop, in which case
// the caller must notify t.tg.leader's parent; that notification cannot be
// issued from here because the wrong locks are held.
//
// Preconditions: The signal mutex must be locked.
func (t *Task) participateGroupStopLocked() bool {
	// Each task is counted at most once per group stop.
	if t.groupStopAcknowledged {
		return false
	}
	t.groupStopAcknowledged = true

	tg := t.tg
	tg.groupStopPendingCount--
	// Other tasks are still outstanding, or the stop was already completed
	// earlier: nothing more to do.
	if tg.groupStopPendingCount != 0 || tg.groupStopComplete {
		return false
	}

	t.Debugf("Completing group stop")
	tg.groupStopComplete = true
	tg.groupStopWaitable = true
	// A completed stop supersedes any group-continue state not yet consumed.
	tg.groupContNotify = false
	tg.groupContWaitable = false
	return true
}

// signalStop informs t's thread group, by means of a group-directed SIGCHLD,
// that target has entered a new group stop, group continue, or ptrace stop.
// code and status are stored in the signal that is sent, if any.
//
// Nothing is sent when t's thread group has a SIGCHLD action registered whose
// handler is arch.SignalActIgnore or whose flags include
// arch.SignalFlagNoCldStop.
//
// Preconditions: The TaskSet mutex must be locked (for reading or writing).
func (t *Task) signalStop(target *Task, code int32, status int32) {
	t.tg.signalHandlers.mu.Lock()
	defer t.tg.signalHandlers.mu.Unlock()

	if act, found := t.tg.signalHandlers.actions[linux.SIGCHLD]; found {
		if act.Handler == arch.SignalActIgnore || act.Flags&arch.SignalFlagNoCldStop != 0 {
			return
		}
	}

	sigchld := &arch.SignalInfo{Signo: int32(linux.SIGCHLD), Code: code}
	// target is identified by its ID in t's PID namespace and by its real UID
	// as mapped into t's user namespace.
	sigchld.SetPid(int32(t.tg.pidns.tids[target]))
	sigchld.SetUid(int32(target.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow()))
	sigchld.SetStatus(status)
	// TODO: Set utime, stime.
	t.sendSignalLocked(sigchld, true /* group */)
}

// runInterrupt is the task run state that handles conditions indicated by
// interrupts: pending group-continue notifications, pending group stops and
// ptrace traps, and pending signals (see its execute method).
//
// +stateify savable
type runInterrupt struct{}

// execute performs one step of interrupt handling for t. With the signal
// mutex held it checks, in order:
//
//  1. a pending group-continue notification (t.tg.groupContNotify);
//  2. a pending group stop or ptrace trap (t.groupStopPending,
//     t.trapStopPending, t.trapNotifyPending);
//  3. a dequeueable signal.
//
// Cases 1 and 2 return the runInterrupt state again so that any remaining
// conditions are re-checked. Case 3 either hands off to
// runInterruptAfterSignalDeliveryStop (when ptraceSignalLocked returns true)
// or returns the result of deliverSignal. If none of the cases apply, the
// runApp state is returned.
//
// The signal mutex is released on every path before execute returns.
func (*runInterrupt) execute(t *Task) taskRunState {
	// Interrupts are de-duplicated (if t is interrupted twice before
	// t.interrupted() is called, t.interrupted() will only return true once),
	// so early exits from this function must re-enter the runInterrupt state
	// to check for more interrupt-signaled conditions.

	t.tg.signalHandlers.mu.Lock()

	// Did we just leave a group stop?
	if t.tg.groupContNotify {
		// Consume the notification and snapshot the state it needs while the
		// signal mutex is still held.
		t.tg.groupContNotify = false
		// NOTE(review): endGroupStopLocked resets groupStopSignal to 0 in the
		// same critical section in which it sets groupContNotify, so sig is
		// presumably 0 here unless initiateGroupStop has recorded a new stop
		// signal since - confirm this is the intended status value.
		sig := t.tg.groupStopSignal
		intr := t.tg.groupContInterrupted
		// Drop the signal mutex before taking the TaskSet mutex; signalStop
		// locks the recipient's signal mutex itself.
		t.tg.signalHandlers.mu.Unlock()
		t.tg.pidns.owner.mu.RLock()
		// For consistency with Linux, if the parent and (thread group
		// leader's) tracer are in the same thread group, deduplicate
		// notifications.
		notifyParent := t.tg.leader.parent != nil
		if tracer := t.tg.leader.Tracer(); tracer != nil {
			if notifyParent && tracer.tg == t.tg.leader.parent.tg {
				notifyParent = false
			}
			// Sending CLD_STOPPED to the tracer doesn't really make any sense;
			// the thread group leader may have already entered the stop and
			// notified its tracer accordingly. But it's consistent with
			// Linux...
			if intr {
				tracer.signalStop(t.tg.leader, arch.CLD_STOPPED, int32(sig))
				// When the parent notification was deduplicated into this one
				// (or there is no parent), the tracer's event queue also
				// receives EventChildGroupStop.
				if !notifyParent {
					tracer.tg.eventQueue.Notify(EventGroupContinue | EventTraceeStop | EventChildGroupStop)
				} else {
					tracer.tg.eventQueue.Notify(EventGroupContinue | EventTraceeStop)
				}
			} else {
				tracer.signalStop(t.tg.leader, arch.CLD_CONTINUED, int32(sig))
				tracer.tg.eventQueue.Notify(EventGroupContinue)
			}
		}
		if notifyParent {
			// If groupContInterrupted, do as Linux does and pretend the group
			// stop completed just before it ended. The theoretical behavior in
			// this case would be to send a SIGCHLD indicating the completed
			// stop, followed by a SIGCHLD indicating the continue. However,
			// SIGCHLD is a standard signal, so the latter would always be
			// dropped. Hence sending only the former is equivalent.
			if intr {
				t.tg.leader.parent.signalStop(t.tg.leader, arch.CLD_STOPPED, int32(sig))
				t.tg.leader.parent.tg.eventQueue.Notify(EventGroupContinue | EventChildGroupStop)
			} else {
				t.tg.leader.parent.signalStop(t.tg.leader, arch.CLD_CONTINUED, int32(sig))
				t.tg.leader.parent.tg.eventQueue.Notify(EventGroupContinue)
			}
		}
		t.tg.pidns.owner.mu.RUnlock()
		// Re-enter runInterrupt to look for further conditions.
		return (*runInterrupt)(nil)
	}

	// Do we need to enter a group stop or related ptrace stop? This path is
	// analogous to Linux's kernel/signal.c:get_signal() => do_signal_stop()
	// (with ptrace enabled) and do_jobctl_trap().
	if t.groupStopPending || t.trapStopPending || t.trapNotifyPending {
		sig := t.tg.groupStopSignal
		notifyParent := false
		if t.groupStopPending {
			t.groupStopPending = false
			// We care about t.tg.groupStopSignal (for tracer notification)
			// even if this doesn't complete a group stop, so keep the
			// value of sig we've already read.
			notifyParent = t.participateGroupStopLocked()
		}
		t.trapStopPending = false
		t.trapNotifyPending = false
		// Drop the signal mutex so we can take the TaskSet mutex.
		t.tg.signalHandlers.mu.Unlock()

		t.tg.pidns.owner.mu.RLock()
		// Without a parent there is nobody to notify of a completed stop.
		if t.tg.leader.parent == nil {
			notifyParent = false
		}
		if tracer := t.Tracer(); tracer != nil {
			// Traced: enter a ptrace stop rather than a plain group stop.
			if t.ptraceSeized {
				// No stop signal is recorded (sig == 0): report SIGTRAP.
				if sig == 0 {
					sig = linux.SIGTRAP
				}
				// "If tracee was attached using PTRACE_SEIZE, group-stop is
				// indicated by PTRACE_EVENT_STOP: status>>16 ==
				// PTRACE_EVENT_STOP. This allows detection of group-stops
				// without requiring an extra PTRACE_GETSIGINFO call." -
				// "Group-stop", ptrace(2)
				t.ptraceCode = int32(sig) | linux.PTRACE_EVENT_STOP<<8
				t.ptraceSiginfo = &arch.SignalInfo{
					Signo: int32(sig),
					Code:  t.ptraceCode,
				}
				t.ptraceSiginfo.SetPid(int32(t.tg.pidns.tids[t]))
				t.ptraceSiginfo.SetUid(int32(t.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow()))
			} else {
				// Not seized: the ptrace code is just the signal number and
				// no siginfo is recorded.
				t.ptraceCode = int32(sig)
				t.ptraceSiginfo = nil
			}
			// The tracer is only notified if the ptrace stop was actually
			// entered.
			if t.beginPtraceStopLocked() {
				tracer.signalStop(t, arch.CLD_STOPPED, int32(sig))
				// For consistency with Linux, if the parent and tracer are in the
				// same thread group, deduplicate notification signals.
				if notifyParent && tracer.tg == t.tg.leader.parent.tg {
					notifyParent = false
					tracer.tg.eventQueue.Notify(EventChildGroupStop | EventTraceeStop)
				} else {
					tracer.tg.eventQueue.Notify(EventTraceeStop)
				}
			}
		} else {
			// Untraced: enter the group stop itself, unless t has been killed
			// in the meantime. The signal mutex is re-taken only for this
			// step.
			t.tg.signalHandlers.mu.Lock()
			if !t.killedLocked() {
				t.beginInternalStopLocked((*groupStop)(nil))
			}
			t.tg.signalHandlers.mu.Unlock()
		}
		if notifyParent {
			t.tg.leader.parent.signalStop(t.tg.leader, arch.CLD_STOPPED, int32(sig))
			t.tg.leader.parent.tg.eventQueue.Notify(EventChildGroupStop)
		}
		t.tg.pidns.owner.mu.RUnlock()

		// Re-enter runInterrupt to look for further conditions.
		return (*runInterrupt)(nil)
	}

	// Are there signals pending?
	if info := t.dequeueSignalLocked(t.signalMask); info != nil {
		if linux.SignalSetOf(linux.Signal(info.Signo))&StopSignals != 0 {
			// Indicate that we've dequeued a stop signal before unlocking the
			// signal mutex; initiateGroupStop will check for races with
			// endGroupStopLocked after relocking it.
			t.tg.groupStopDequeued = true
		}
		if t.ptraceSignalLocked(info) {
			// Dequeueing the signal action must wait until after the
			// signal-delivery-stop ends since the tracer can change or
			// suppress the signal.
			t.tg.signalHandlers.mu.Unlock()
			return (*runInterruptAfterSignalDeliveryStop)(nil)
		}
		// The action is dequeued under the signal mutex; delivery itself
		// happens after the mutex has been released.
		act := t.tg.signalHandlers.dequeueAction(linux.Signal(info.Signo))
		t.tg.signalHandlers.mu.Unlock()
		return t.deliverSignal(info, act)
	}

	// Nothing left to handle: resume running the application.
	t.tg.signalHandlers.mu.Unlock()
	return (*runApp)(nil)
}

// runInterruptAfterSignalDeliveryStop is the task run state that
// runInterrupt.execute returns when ptraceSignalLocked reports true for a
// dequeued signal. Its execute method finishes handling that signal, using
// t.ptraceCode as the signal number to act on.
//
// +stateify savable
type runInterruptAfterSignalDeliveryStop struct{}

// execute finishes handling of the signal recorded in t.ptraceSiginfo, taking
// the signal number to act on from t.ptraceCode.
//
// If t.ptraceCode is not a valid signal number, nothing is delivered and the
// runInterrupt state is returned. If the number differs from the one in the
// recorded siginfo, the siginfo is rewritten as a user-generated signal
// attributed to t's tracer, or to t's parent when there is no tracer. The
// signal is then re-queued if t currently masks it, and delivered via
// deliverSignal otherwise.
//
// t.ptraceSiginfo is reset to nil when execute returns.
func (*runInterruptAfterSignalDeliveryStop) execute(t *Task) taskRunState {
	// The recorded siginfo is discarded on every exit path.
	defer func() { t.ptraceSiginfo = nil }()

	// The TaskSet mutex is unlocked by hand rather than with defer, because
	// deliverSignal must be called without holding it.
	t.tg.pidns.owner.mu.Lock()
	sig := linux.Signal(t.ptraceCode)
	if !sig.IsValid() {
		t.tg.pidns.owner.mu.Unlock()
		return (*runInterrupt)(nil)
	}

	info := t.ptraceSiginfo
	if linux.Signal(info.Signo) != sig {
		// The signal number was changed: rewrite the siginfo to match.
		info.Signo = int32(sig)
		info.Errno = 0
		info.Code = arch.SignalInfoUser
		// pid isn't a valid field for all signal numbers, but Linux
		// doesn't care (kernel/signal.c:ptrace_signal()).
		//
		// Linux uses t->parent for the tid and uid here, which is the tracer
		// if it hasn't detached or the real parent otherwise.
		sender := t.Tracer()
		if sender == nil {
			sender = t.parent
		}
		if sender != nil {
			info.SetPid(int32(t.tg.pidns.tids[sender]))
			info.SetUid(int32(sender.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow()))
		} else {
			// Tracer has detached and t was created by Kernel.CreateProcess().
			// Pretend the parent is in an ancestor PID + user namespace.
			info.SetPid(0)
			info.SetUid(int32(auth.OverflowUID))
		}
	}

	// Take the signal mutex before letting go of the TaskSet mutex.
	t.tg.signalHandlers.mu.Lock()
	t.tg.pidns.owner.mu.Unlock()

	if linux.SignalSetOf(sig)&t.signalMask == 0 {
		// Not masked: deliver now, outside the signal mutex.
		act := t.tg.signalHandlers.dequeueAction(linux.Signal(info.Signo))
		t.tg.signalHandlers.mu.Unlock()
		return t.deliverSignal(info, act)
	}

	// The signal is masked, so re-queue it.
	t.sendSignalLocked(info, false /* group */)
	t.tg.signalHandlers.mu.Unlock()
	return (*runInterrupt)(nil)
}