summaryrefslogtreecommitdiffhomepage
path: root/pkg/sentry/kernel/futex/futex.go
blob: 3bd5c04afaee388743a67bbf97bd76d5ed5ec657 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package futex provides an implementation of the futex interface as found in
// the Linux kernel. It allows one to easily transform Wait() calls into waits
// on a channel, which is useful in a Go-based kernel, for example.
package futex

import (
	"sync"

	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/sentry/memmap"
	"gvisor.dev/gvisor/pkg/sentry/usermem"
	"gvisor.dev/gvisor/pkg/syserror"
)

// KeyKind indicates the type of a Key.
type KeyKind int

const (
	// KindPrivate indicates a private futex (a futex syscall with the
	// FUTEX_PRIVATE_FLAG set).
	KindPrivate KeyKind = iota

	// KindSharedPrivate indicates a shared futex on a private memory mapping.
	// Although KindPrivate and KindSharedPrivate futexes both use memory
	// addresses to identify futexes, they do not interoperate (in Linux, the
	// two are distinguished by the FUT_OFF_MMSHARED flag, which is used in key
	// comparison).
	KindSharedPrivate

	// KindSharedMappable indicates a shared futex on a memory mapping other
	// than a private anonymous memory mapping.
	KindSharedMappable
)

// Key represents something that a futex waiter may wait on.
type Key struct {
	// Kind is the type of the Key.
	Kind KeyKind

	// Mappable is the memory-mapped object that is represented by the Key.
	// Mappable is always nil if Kind is not KindSharedMappable, and may be nil
	// even if it is.
	Mappable memmap.Mappable

	// MappingIdentity is the MappingIdentity associated with Mappable.
	// MappingIdentity is always nil is Mappable is nil, and may be nil even if
	// it isn't.
	MappingIdentity memmap.MappingIdentity

	// If Kind is KindPrivate or KindSharedPrivate, Offset is the represented
	// memory address. Otherwise, Offset is the represented offset into
	// Mappable.
	Offset uint64
}

func (k *Key) release() {
	if k.MappingIdentity != nil {
		k.MappingIdentity.DecRef()
	}
	k.Mappable = nil
	k.MappingIdentity = nil
}

func (k *Key) clone() Key {
	if k.MappingIdentity != nil {
		k.MappingIdentity.IncRef()
	}
	return *k
}

// Preconditions: k.Kind == KindPrivate or KindSharedPrivate.
func (k *Key) addr() usermem.Addr {
	return usermem.Addr(k.Offset)
}

// matches returns true if a wakeup on k2 should wake a waiter waiting on k.
func (k *Key) matches(k2 *Key) bool {
	// k.MappingIdentity is ignored; it's only used for reference counting.
	return k.Kind == k2.Kind && k.Mappable == k2.Mappable && k.Offset == k2.Offset
}

// Target abstracts memory accesses and keys.
type Target interface {
	// SwapUint32 gives access to usermem.IO.SwapUint32.
	SwapUint32(addr usermem.Addr, new uint32) (uint32, error)

	// CompareAndSwap gives access to usermem.IO.CompareAndSwapUint32.
	CompareAndSwapUint32(addr usermem.Addr, old, new uint32) (uint32, error)

	// LoadUint32 gives access to usermem.IO.LoadUint32.
	LoadUint32(addr usermem.Addr) (uint32, error)

	// GetSharedKey returns a Key with kind KindSharedPrivate or
	// KindSharedMappable corresponding to the memory mapped at address addr.
	//
	// If GetSharedKey returns a Key with a non-nil MappingIdentity, a
	// reference is held on the MappingIdentity, which must be dropped by the
	// caller when the Key is no longer in use.
	GetSharedKey(addr usermem.Addr) (Key, error)
}

// check performs a basic equality check on the given address.
func check(t Target, addr usermem.Addr, val uint32) error {
	cur, err := t.LoadUint32(addr)
	if err != nil {
		return err
	}
	if cur != val {
		return syserror.EAGAIN
	}
	return nil
}

// atomicOp performs a complex operation on the given address.
func atomicOp(t Target, addr usermem.Addr, opIn uint32) (bool, error) {
	opType := (opIn >> 28) & 0xf
	cmp := (opIn >> 24) & 0xf
	opArg := (opIn >> 12) & 0xfff
	cmpArg := opIn & 0xfff

	if opType&linux.FUTEX_OP_OPARG_SHIFT != 0 {
		opArg = 1 << opArg
		opType &^= linux.FUTEX_OP_OPARG_SHIFT // Clear flag.
	}

	var (
		oldVal uint32
		err    error
	)
	if opType == linux.FUTEX_OP_SET {
		oldVal, err = t.SwapUint32(addr, opArg)
		if err != nil {
			return false, err
		}
	} else {
		for {
			oldVal, err = t.LoadUint32(addr)
			if err != nil {
				return false, err
			}
			var newVal uint32
			switch opType {
			case linux.FUTEX_OP_ADD:
				newVal = oldVal + opArg
			case linux.FUTEX_OP_OR:
				newVal = oldVal | opArg
			case linux.FUTEX_OP_ANDN:
				newVal = oldVal &^ opArg
			case linux.FUTEX_OP_XOR:
				newVal = oldVal ^ opArg
			default:
				return false, syserror.ENOSYS
			}
			prev, err := t.CompareAndSwapUint32(addr, oldVal, newVal)
			if err != nil {
				return false, err
			}
			if prev == oldVal {
				break // Success.
			}
		}
	}

	switch cmp {
	case linux.FUTEX_OP_CMP_EQ:
		return oldVal == cmpArg, nil
	case linux.FUTEX_OP_CMP_NE:
		return oldVal != cmpArg, nil
	case linux.FUTEX_OP_CMP_LT:
		return oldVal < cmpArg, nil
	case linux.FUTEX_OP_CMP_LE:
		return oldVal <= cmpArg, nil
	case linux.FUTEX_OP_CMP_GT:
		return oldVal > cmpArg, nil
	case linux.FUTEX_OP_CMP_GE:
		return oldVal >= cmpArg, nil
	default:
		return false, syserror.ENOSYS
	}
}

// Waiter is the struct which gets enqueued into buckets for wake up routines
// and requeue routines to scan and notify. Once a Waiter has been enqueued by
// WaitPrepare(), callers may listen on C for wake up events.
type Waiter struct {
	// Synchronization:
	//
	// - A Waiter that is not enqueued in a bucket is exclusively owned (no
	// synchronization applies).
	//
	// - A Waiter is enqueued in a bucket by calling WaitPrepare(). After this,
	// waiterEntry, bucket, and key are protected by the bucket.mu ("bucket
	// lock") of the containing bucket, and bitmask is immutable. Note that
	// since bucket is mutated using atomic memory operations, bucket.Load()
	// may be called without holding the bucket lock, although it may change
	// racily. See WaitComplete().
	//
	// - A Waiter is only guaranteed to be no longer queued after calling
	// WaitComplete().

	// waiterEntry links Waiter into bucket.waiters.
	waiterEntry

	// bucket is the bucket this waiter is queued in. If bucket is nil, the
	// waiter is not waiting and is not in any bucket.
	bucket AtomicPtrBucket

	// C is sent to when the Waiter is woken.
	C chan struct{}

	// key is what this waiter is waiting on.
	key Key

	// The bitmask we're waiting on.
	// This is used the case of a FUTEX_WAKE_BITSET.
	bitmask uint32

	// tid is the thread ID for the waiter in case this is a PI mutex.
	tid uint32
}

// NewWaiter returns a new unqueued Waiter.
func NewWaiter() *Waiter {
	return &Waiter{
		C: make(chan struct{}, 1),
	}
}

// woken returns true if w has been woken since the last call to WaitPrepare.
func (w *Waiter) woken() bool {
	return len(w.C) != 0
}

// bucket holds a list of waiters for a given address hash.
//
// +stateify savable
type bucket struct {
	// mu protects waiters and contained Waiter state. See comment in Waiter.
	mu sync.Mutex `state:"nosave"`

	waiters waiterList `state:"zerovalue"`
}

// wakeLocked wakes up to n waiters matching the bitmask at the addr for this
// bucket and returns the number of waiters woken.
//
// Preconditions: b.mu must be locked.
func (b *bucket) wakeLocked(key *Key, bitmask uint32, n int) int {
	done := 0
	for w := b.waiters.Front(); done < n && w != nil; {
		if !w.key.matches(key) || w.bitmask&bitmask == 0 {
			// Not matching.
			w = w.Next()
			continue
		}

		// Remove from the bucket and wake the waiter.
		woke := w
		w = w.Next() // Next iteration.
		b.wakeWaiterLocked(woke)
		done++
	}
	return done
}

func (b *bucket) wakeWaiterLocked(w *Waiter) {
	// Remove from the bucket and wake the waiter.
	b.waiters.Remove(w)
	w.C <- struct{}{}

	// NOTE: The above channel write establishes a write barrier according
	// to the memory model, so nothing may be ordered around it. Since
	// we've dequeued w and will never touch it again, we can safely
	// store nil to w.bucket here and allow the WaitComplete() to
	// short-circuit grabbing the bucket lock. If they somehow miss the
	// store, we are still holding the lock, so we can know that they won't
	// dequeue w, assume it's free and have the below operation
	// afterwards.
	w.bucket.Store(nil)
}

// requeueLocked takes n waiters from the bucket and moves them to naddr on the
// bucket "to".
//
// Preconditions: b and to must be locked.
func (b *bucket) requeueLocked(to *bucket, key, nkey *Key, n int) int {
	done := 0
	for w := b.waiters.Front(); done < n && w != nil; {
		if !w.key.matches(key) {
			// Not matching.
			w = w.Next()
			continue
		}

		requeued := w
		w = w.Next() // Next iteration.
		b.waiters.Remove(requeued)
		requeued.key.release()
		requeued.key = nkey.clone()
		to.waiters.PushBack(requeued)
		requeued.bucket.Store(to)
		done++
	}
	return done
}

const (
	// bucketCount is the number of buckets per Manager. By having many of
	// these we reduce contention when concurrent yet unrelated calls are made.
	bucketCount     = 1 << bucketCountBits
	bucketCountBits = 10
)

// getKey returns a Key representing address addr in c.
func getKey(t Target, addr usermem.Addr, private bool) (Key, error) {
	// Ensure the address is aligned.
	// It must be a DWORD boundary.
	if addr&0x3 != 0 {
		return Key{}, syserror.EINVAL
	}
	if private {
		return Key{Kind: KindPrivate, Offset: uint64(addr)}, nil
	}
	return t.GetSharedKey(addr)
}

// bucketIndexForAddr returns the index into Manager.buckets for addr.
func bucketIndexForAddr(addr usermem.Addr) uintptr {
	// - The bottom 2 bits of addr must be 0, per getKey.
	//
	// - On amd64, the top 16 bits of addr (bits 48-63) must be equal to bit 47
	// for a canonical address, and (on all existing platforms) bit 47 must be
	// 0 for an application address.
	//
	// Thus 19 bits of addr are "useless" for hashing, leaving only 45 "useful"
	// bits. We choose one of the simplest possible hash functions that at
	// least uses all 45 useful bits in the output, given that bucketCountBits
	// == 10. This hash function also has the property that it will usually map
	// adjacent addresses to adjacent buckets, slightly improving memory
	// locality when an application synchronization structure uses multiple
	// nearby futexes.
	//
	// Note that despite the large number of arithmetic operations in the
	// function, many components can be computed in parallel, such that the
	// critical path is 1 bit shift + 3 additions (2 in h1, then h1 + h2). This
	// is also why h1 and h2 are grouped separately; for "(addr >> 2) + ... +
	// (addr >> 42)" without any additional grouping, the compiler puts all 4
	// additions in the critical path.
	h1 := uintptr(addr>>2) + uintptr(addr>>12) + uintptr(addr>>22)
	h2 := uintptr(addr>>32) + uintptr(addr>>42)
	return (h1 + h2) % bucketCount
}

// Manager holds futex state for a single virtual address space.
//
// +stateify savable
type Manager struct {
	// privateBuckets holds buckets for KindPrivate and KindSharedPrivate
	// futexes.
	privateBuckets [bucketCount]bucket `state:"zerovalue"`

	// sharedBucket is the bucket for KindSharedMappable futexes. sharedBucket
	// may be shared by multiple Managers. The sharedBucket pointer is
	// immutable.
	sharedBucket *bucket
}

// NewManager returns an initialized futex manager.
func NewManager() *Manager {
	return &Manager{
		sharedBucket: &bucket{},
	}
}

// Fork returns a new Manager. Shared futex clients using the returned Manager
// may interoperate with those using m.
func (m *Manager) Fork() *Manager {
	return &Manager{
		sharedBucket: m.sharedBucket,
	}
}

// lockBucket returns a locked bucket for the given key.
func (m *Manager) lockBucket(k *Key) *bucket {
	var b *bucket
	if k.Kind == KindSharedMappable {
		b = m.sharedBucket
	} else {
		b = &m.privateBuckets[bucketIndexForAddr(k.addr())]
	}
	b.mu.Lock()
	return b
}

// lockBuckets returns locked buckets for the given keys.
func (m *Manager) lockBuckets(k1, k2 *Key) (*bucket, *bucket) {
	// Buckets must be consistently ordered to avoid circular lock
	// dependencies. We order buckets in m.privateBuckets by index (lowest
	// index first), and all buckets in m.privateBuckets precede
	// m.sharedBucket.

	// Handle the common case first:
	if k1.Kind != KindSharedMappable && k2.Kind != KindSharedMappable {
		i1 := bucketIndexForAddr(k1.addr())
		i2 := bucketIndexForAddr(k2.addr())
		b1 := &m.privateBuckets[i1]
		b2 := &m.privateBuckets[i2]
		switch {
		case i1 < i2:
			b1.mu.Lock()
			b2.mu.Lock()
		case i2 < i1:
			b2.mu.Lock()
			b1.mu.Lock()
		default:
			b1.mu.Lock()
		}
		return b1, b2
	}

	// At least one of b1 or b2 should be m.sharedBucket.
	b1 := m.sharedBucket
	b2 := m.sharedBucket
	if k1.Kind != KindSharedMappable {
		b1 = m.lockBucket(k1)
	} else if k2.Kind != KindSharedMappable {
		b2 = m.lockBucket(k2)
	}
	m.sharedBucket.mu.Lock()
	return b1, b2
}

// Wake wakes up to n waiters matching the bitmask on the given addr.
// The number of waiters woken is returned.
func (m *Manager) Wake(t Target, addr usermem.Addr, private bool, bitmask uint32, n int) (int, error) {
	// This function is very hot; avoid defer.
	k, err := getKey(t, addr, private)
	if err != nil {
		return 0, err
	}

	b := m.lockBucket(&k)
	r := b.wakeLocked(&k, bitmask, n)

	b.mu.Unlock()
	k.release()
	return r, nil
}

func (m *Manager) doRequeue(t Target, addr, naddr usermem.Addr, private bool, checkval bool, val uint32, nwake int, nreq int) (int, error) {
	k1, err := getKey(t, addr, private)
	if err != nil {
		return 0, err
	}
	defer k1.release()
	k2, err := getKey(t, naddr, private)
	if err != nil {
		return 0, err
	}
	defer k2.release()

	b1, b2 := m.lockBuckets(&k1, &k2)
	defer b1.mu.Unlock()
	if b2 != b1 {
		defer b2.mu.Unlock()
	}

	if checkval {
		if err := check(t, addr, val); err != nil {
			return 0, err
		}
	}

	// Wake the number required.
	done := b1.wakeLocked(&k1, ^uint32(0), nwake)

	// Requeue the number required.
	b1.requeueLocked(b2, &k1, &k2, nreq)

	return done, nil
}

// Requeue wakes up to nwake waiters on the given addr, and unconditionally
// requeues up to nreq waiters on naddr.
func (m *Manager) Requeue(t Target, addr, naddr usermem.Addr, private bool, nwake int, nreq int) (int, error) {
	return m.doRequeue(t, addr, naddr, private, false, 0, nwake, nreq)
}

// RequeueCmp atomically checks that the addr contains val (via the Target),
// wakes up to nwake waiters on addr and then unconditionally requeues nreq
// waiters on naddr.
func (m *Manager) RequeueCmp(t Target, addr, naddr usermem.Addr, private bool, val uint32, nwake int, nreq int) (int, error) {
	return m.doRequeue(t, addr, naddr, private, true, val, nwake, nreq)
}

// WakeOp atomically applies op to the memory address addr2, wakes up to nwake1
// waiters unconditionally from addr1, and, based on the original value at addr2
// and a comparison encoded in op, wakes up to nwake2 waiters from addr2.
// It returns the total number of waiters woken.
func (m *Manager) WakeOp(t Target, addr1, addr2 usermem.Addr, private bool, nwake1 int, nwake2 int, op uint32) (int, error) {
	k1, err := getKey(t, addr1, private)
	if err != nil {
		return 0, err
	}
	defer k1.release()
	k2, err := getKey(t, addr2, private)
	if err != nil {
		return 0, err
	}
	defer k2.release()

	b1, b2 := m.lockBuckets(&k1, &k2)
	defer b1.mu.Unlock()
	if b2 != b1 {
		defer b2.mu.Unlock()
	}

	done := 0
	cond, err := atomicOp(t, addr2, op)
	if err != nil {
		return 0, err
	}

	// Wake up up to nwake1 entries from the first bucket.
	done = b1.wakeLocked(&k1, ^uint32(0), nwake1)

	// Wake up up to nwake2 entries from the second bucket if the
	// operation yielded true.
	if cond {
		done += b2.wakeLocked(&k2, ^uint32(0), nwake2)
	}

	return done, nil
}

// WaitPrepare atomically checks that addr contains val (via the Checker), then
// enqueues w to be woken by a send to w.C. If WaitPrepare returns nil, the
// Waiter must be subsequently removed by calling WaitComplete, whether or not
// a wakeup is received on w.C.
func (m *Manager) WaitPrepare(w *Waiter, t Target, addr usermem.Addr, private bool, val uint32, bitmask uint32) error {
	k, err := getKey(t, addr, private)
	if err != nil {
		return err
	}
	// Ownership of k is transferred to w below.

	// Prepare the Waiter before taking the bucket lock.
	select {
	case <-w.C:
	default:
	}
	w.key = k
	w.bitmask = bitmask

	b := m.lockBucket(&k)
	// This function is very hot; avoid defer.

	// Perform our atomic check.
	if err := check(t, addr, val); err != nil {
		b.mu.Unlock()
		w.key.release()
		return err
	}

	// Add the waiter to the bucket.
	b.waiters.PushBack(w)
	w.bucket.Store(b)

	b.mu.Unlock()
	return nil
}

// WaitComplete must be called when a Waiter previously added by WaitPrepare is
// no longer eligible to be woken.
func (m *Manager) WaitComplete(w *Waiter) {
	// Remove w from the bucket it's in.
	for {
		b := w.bucket.Load()

		// If b is nil, the waiter isn't in any bucket anymore. This can't be
		// racy because the waiter can't be concurrently re-queued in another
		// bucket.
		if b == nil {
			break
		}

		// Take the bucket lock. Note that without holding the bucket lock, the
		// waiter is not guaranteed to stay in that bucket, so after we take
		// the bucket lock, we must ensure that the bucket hasn't changed: if
		// it happens to have changed, we release the old bucket lock and try
		// again with the new bucket; if it hasn't changed, we know it won't
		// change now because we hold the lock.
		b.mu.Lock()
		if b != w.bucket.Load() {
			b.mu.Unlock()
			continue
		}

		// Remove waiter from bucket.
		b.waiters.Remove(w)
		w.bucket.Store(nil)
		b.mu.Unlock()
		break
	}

	// Release references held by the waiter.
	w.key.release()
}

// LockPI attempts to lock the futex following the Priority-inheritance futex
// rules. The lock is acquired only when 'addr' points to 0. The TID of the
// calling task is set to 'addr' to indicate the futex is owned. It returns true
// if the futex was successfully acquired.
//
// FUTEX_OWNER_DIED is only set by the Linux when robust lists are in use (see
// exit_robust_list()). Given we don't support robust lists, although handled
// below, it's never set.
func (m *Manager) LockPI(w *Waiter, t Target, addr usermem.Addr, tid uint32, private, try bool) (bool, error) {
	k, err := getKey(t, addr, private)
	if err != nil {
		return false, err
	}
	// Ownership of k is transferred to w below.

	// Prepare the Waiter before taking the bucket lock.
	select {
	case <-w.C:
	default:
	}
	w.key = k
	w.tid = tid

	b := m.lockBucket(&k)
	// Hot function: avoid defers.

	success, err := m.lockPILocked(w, t, addr, tid, b, try)
	if err != nil {
		w.key.release()
		b.mu.Unlock()
		return false, err
	}
	if success || try {
		// Release waiter if it's not going to be a wait.
		w.key.release()
	}
	b.mu.Unlock()
	return success, nil
}

func (m *Manager) lockPILocked(w *Waiter, t Target, addr usermem.Addr, tid uint32, b *bucket, try bool) (bool, error) {
	for {
		cur, err := t.LoadUint32(addr)
		if err != nil {
			return false, err
		}
		if (cur & linux.FUTEX_TID_MASK) == tid {
			return false, syserror.EDEADLK
		}

		if (cur & linux.FUTEX_TID_MASK) == 0 {
			// No owner and no waiters, try to acquire the futex.

			// Set TID and preserve owner died status.
			val := tid
			val |= cur & linux.FUTEX_OWNER_DIED
			prev, err := t.CompareAndSwapUint32(addr, cur, val)
			if err != nil {
				return false, err
			}
			if prev != cur {
				// CAS failed, retry...
				// Linux reacquires the bucket lock on retries, which will re-lookup the
				// mapping at the futex address. However, retrying while holding the
				// lock is more efficient and reduces the chance of another conflict.
				continue
			}
			// Futex acquired.
			return true, nil
		}

		// Futex is already owned, prepare to wait.

		if try {
			// Caller doesn't want to wait.
			return false, nil
		}

		// Set waiters bit if not set yet.
		if cur&linux.FUTEX_WAITERS == 0 {
			prev, err := t.CompareAndSwapUint32(addr, cur, cur|linux.FUTEX_WAITERS)
			if err != nil {
				return false, err
			}
			if prev != cur {
				// CAS failed, retry...
				continue
			}
		}

		// Add the waiter to the bucket.
		b.waiters.PushBack(w)
		w.bucket.Store(b)
		return false, nil
	}
}

// UnlockPI unlock the futex following the Priority-inheritance futex
// rules. The address provided must contain the caller's TID. If there are
// waiters, TID of the next waiter (FIFO) is set to the given address, and the
// waiter woken up. If there are no waiters, 0 is set to the address.
func (m *Manager) UnlockPI(t Target, addr usermem.Addr, tid uint32, private bool) error {
	k, err := getKey(t, addr, private)
	if err != nil {
		return err
	}
	b := m.lockBucket(&k)

	err = m.unlockPILocked(t, addr, tid, b)

	k.release()
	b.mu.Unlock()
	return err
}

func (m *Manager) unlockPILocked(t Target, addr usermem.Addr, tid uint32, b *bucket) error {
	cur, err := t.LoadUint32(addr)
	if err != nil {
		return err
	}

	if (cur & linux.FUTEX_TID_MASK) != tid {
		return syserror.EPERM
	}

	if b.waiters.Empty() {
		// It's safe to set 0 because there are no waiters, no new owner, and the
		// executing task is the current owner (no owner died bit).
		prev, err := t.CompareAndSwapUint32(addr, cur, 0)
		if err != nil {
			return err
		}
		if prev != cur {
			// Let user mode handle CAS races. This is different than lock, which
			// retries when CAS fails.
			return syserror.EAGAIN
		}
		return nil
	}

	next := b.waiters.Front()

	// Set next owner's TID, waiters if there are any. Resets owner died bit, if
	// set, because the executing task takes over as the owner.
	val := next.tid
	if next.Next() != nil {
		val |= linux.FUTEX_WAITERS
	}

	prev, err := t.CompareAndSwapUint32(addr, cur, val)
	if err != nil {
		return err
	}
	if prev != cur {
		return syserror.EINVAL
	}

	b.wakeWaiterLocked(next)
	return nil
}