1 files changed, 783 insertions, 0 deletions
diff --git a/pkg/sentry/kernel/futex/futex.go b/pkg/sentry/kernel/futex/futex.go
new file mode 100644
index 000000000..bb38eb81e
--- /dev/null
+++ b/pkg/sentry/kernel/futex/futex.go
@@ -0,0 +1,783 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package futex provides an implementation of the futex interface as found in
+// the Linux kernel. It allows one to easily transform Wait() calls into waits
+// on a channel, which is useful in a Go-based kernel, for example.
+package futex
+
+import (
+	"sync"
+
+	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/memmap"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+	"gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+// KeyKind indicates the type of a Key.
+type KeyKind int
+
+const (
+	// KindPrivate indicates a private futex (a futex syscall with the
+	// FUTEX_PRIVATE_FLAG set).
+	KindPrivate KeyKind = iota
+
+	// KindSharedPrivate indicates a shared futex on a private memory mapping.
+	// Although KindPrivate and KindSharedPrivate futexes both use memory
+	// addresses to identify futexes, they do not interoperate (in Linux, the
+	// two are distinguished by the FUT_OFF_MMSHARED flag, which is used in key
+	// comparison).
+	KindSharedPrivate
+
+	// KindSharedMappable indicates a shared futex on a memory mapping other
+	// than a private anonymous memory mapping.
+	KindSharedMappable
+)
+
+// Key represents something that a futex waiter may wait on.
+type Key struct {
+	// Kind is the type of the Key.
+	Kind KeyKind
+
+	// Mappable is the memory-mapped object that is represented by the Key.
+	// Mappable is always nil if Kind is not KindSharedMappable, and may be nil
+	// even if it is.
+	Mappable memmap.Mappable
+
+	// MappingIdentity is the MappingIdentity associated with Mappable.
+	// MappingIdentity is always nil is Mappable is nil, and may be nil even if
+	// it isn't.
+	MappingIdentity memmap.MappingIdentity
+
+	// If Kind is KindPrivate or KindSharedPrivate, Offset is the represented
+	// memory address. Otherwise, Offset is the represented offset into
+	// Mappable.
+	Offset uint64
+}
+
+func (k *Key) release() {
+	if k.MappingIdentity != nil {
+		k.MappingIdentity.DecRef()
+	}
+	k.Mappable = nil
+	k.MappingIdentity = nil
+}
+
+func (k *Key) clone() Key {
+	if k.MappingIdentity != nil {
+		k.MappingIdentity.IncRef()
+	}
+	return *k
+}
+
+// Preconditions: k.Kind == KindPrivate or KindSharedPrivate.
+func (k *Key) addr() usermem.Addr {
+	return usermem.Addr(k.Offset)
+}
+
+// matches returns true if a wakeup on k2 should wake a waiter waiting on k.
+func (k *Key) matches(k2 *Key) bool {
+	// k.MappingIdentity is ignored; it's only used for reference counting.
+	return k.Kind == k2.Kind && k.Mappable == k2.Mappable && k.Offset == k2.Offset
+}
+
+// Target abstracts memory accesses and keys.
+type Target interface {
+	// SwapUint32 gives access to usermem.IO.SwapUint32.
+	SwapUint32(addr usermem.Addr, new uint32) (uint32, error)
+
+	// CompareAndSwap gives access to usermem.IO.CompareAndSwapUint32.
+	CompareAndSwapUint32(addr usermem.Addr, old, new uint32) (uint32, error)
+
+	// LoadUint32 gives access to usermem.IO.LoadUint32.
+	LoadUint32(addr usermem.Addr) (uint32, error)
+
+	// GetSharedKey returns a Key with kind KindSharedPrivate or
+	// KindSharedMappable corresponding to the memory mapped at address addr.
+	//
+	// If GetSharedKey returns a Key with a non-nil MappingIdentity, a
+	// reference is held on the MappingIdentity, which must be dropped by the
+	// caller when the Key is no longer in use.
+	GetSharedKey(addr usermem.Addr) (Key, error)
+}
+
+// check performs a basic equality check on the given address.
+func check(t Target, addr usermem.Addr, val uint32) error {
+	cur, err := t.LoadUint32(addr)
+	if err != nil {
+		return err
+	}
+	if cur != val {
+		return syserror.EAGAIN
+	}
+	return nil
+}
+
+// atomicOp performs a complex operation on the given address.
+func atomicOp(t Target, addr usermem.Addr, opIn uint32) (bool, error) {
+	opType := (opIn >> 28) & 0xf
+	cmp := (opIn >> 24) & 0xf
+	opArg := (opIn >> 12) & 0xfff
+	cmpArg := opIn & 0xfff
+
+	if opType&linux.FUTEX_OP_OPARG_SHIFT != 0 {
+		opArg = 1 << opArg
+		opType &^= linux.FUTEX_OP_OPARG_SHIFT // Clear flag.
+	}
+
+	var (
+		oldVal uint32
+		err    error
+	)
+	if opType == linux.FUTEX_OP_SET {
+		oldVal, err = t.SwapUint32(addr, opArg)
+		if err != nil {
+			return false, err
+		}
+	} else {
+		for {
+			oldVal, err = t.LoadUint32(addr)
+			if err != nil {
+				return false, err
+			}
+			var newVal uint32
+			switch opType {
+			case linux.FUTEX_OP_ADD:
+				newVal = oldVal + opArg
+			case linux.FUTEX_OP_OR:
+				newVal = oldVal | opArg
+			case linux.FUTEX_OP_ANDN:
+				newVal = oldVal &^ opArg
+			case linux.FUTEX_OP_XOR:
+				newVal = oldVal ^ opArg
+			default:
+				return false, syserror.ENOSYS
+			}
+			prev, err := t.CompareAndSwapUint32(addr, oldVal, newVal)
+			if err != nil {
+				return false, err
+			}
+			if prev == oldVal {
+				break // Success.
+			}
+		}
+	}
+
+	switch cmp {
+	case linux.FUTEX_OP_CMP_EQ:
+		return oldVal == cmpArg, nil
+	case linux.FUTEX_OP_CMP_NE:
+		return oldVal != cmpArg, nil
+	case linux.FUTEX_OP_CMP_LT:
+		return oldVal < cmpArg, nil
+	case linux.FUTEX_OP_CMP_LE:
+		return oldVal <= cmpArg, nil
+	case linux.FUTEX_OP_CMP_GT:
+		return oldVal > cmpArg, nil
+	case linux.FUTEX_OP_CMP_GE:
+		return oldVal >= cmpArg, nil
+	default:
+		return false, syserror.ENOSYS
+	}
+}
+
+// Waiter is the struct which gets enqueued into buckets for wake up routines
+// and requeue routines to scan and notify. Once a Waiter has been enqueued by
+// WaitPrepare(), callers may listen on C for wake up events.
+type Waiter struct {
+	// Synchronization:
+	//
+	// - A Waiter that is not enqueued in a bucket is exclusively owned (no
+	// synchronization applies).
+	//
+	// - A Waiter is enqueued in a bucket by calling WaitPrepare(). After this,
+	// waiterEntry, bucket, and key are protected by the bucket.mu ("bucket
+	// lock") of the containing bucket, and bitmask is immutable. Note that
+	// since bucket is mutated using atomic memory operations, bucket.Load()
+	// may be called without holding the bucket lock, although it may change
+	// racily. See WaitComplete().
+	//
+	// - A Waiter is only guaranteed to be no longer queued after calling
+	// WaitComplete().
+
+	// waiterEntry links Waiter into bucket.waiters.
+	waiterEntry
+
+	// bucket is the bucket this waiter is queued in. If bucket is nil, the
+	// waiter is not waiting and is not in any bucket.
+	bucket AtomicPtrBucket
+
+	// C is sent to when the Waiter is woken.
+	C chan struct{}
+
+	// key is what this waiter is waiting on.
+	key Key
+
+	// The bitmask we're waiting on.
+	// This is used the case of a FUTEX_WAKE_BITSET.
+	bitmask uint32
+
+	// tid is the thread ID for the waiter in case this is a PI mutex.
+	tid uint32
+}
+
+// NewWaiter returns a new unqueued Waiter.
+func NewWaiter() *Waiter {
+	return &Waiter{
+		C: make(chan struct{}, 1),
+	}
+}
+
+// woken returns true if w has been woken since the last call to WaitPrepare.
+func (w *Waiter) woken() bool {
+	return len(w.C) != 0
+}
+
+// bucket holds a list of waiters for a given address hash.
+//
+// +stateify savable
+type bucket struct {
+	// mu protects waiters and contained Waiter state. See comment in Waiter.
+	mu sync.Mutex `state:"nosave"`
+
+	waiters waiterList `state:"zerovalue"`
+}
+
+// wakeLocked wakes up to n waiters matching the bitmask at the addr for this
+// bucket and returns the number of waiters woken.
+//
+// Preconditions: b.mu must be locked.
+func (b *bucket) wakeLocked(key *Key, bitmask uint32, n int) int {
+	done := 0
+	for w := b.waiters.Front(); done < n && w != nil; {
+		if !w.key.matches(key) || w.bitmask&bitmask == 0 {
+			// Not matching.
+			w = w.Next()
+			continue
+		}
+
+		// Remove from the bucket and wake the waiter.
+		woke := w
+		w = w.Next() // Next iteration.
+		b.wakeWaiterLocked(woke)
+		done++
+	}
+	return done
+}
+
+func (b *bucket) wakeWaiterLocked(w *Waiter) {
+	// Remove from the bucket and wake the waiter.
+	b.waiters.Remove(w)
+	w.C <- struct{}{}
+
+	// NOTE: The above channel write establishes a write barrier according
+	// to the memory model, so nothing may be ordered around it. Since
+	// we've dequeued w and will never touch it again, we can safely
+	// store nil to w.bucket here and allow the WaitComplete() to
+	// short-circuit grabbing the bucket lock. If they somehow miss the
+	// store, we are still holding the lock, so we can know that they won't
+	// dequeue w, assume it's free and have the below operation
+	// afterwards.
+	w.bucket.Store(nil)
+}
+
+// requeueLocked takes n waiters from the bucket and moves them to naddr on the
+// bucket "to".
+//
+// Preconditions: b and to must be locked.
+func (b *bucket) requeueLocked(to *bucket, key, nkey *Key, n int) int {
+	done := 0
+	for w := b.waiters.Front(); done < n && w != nil; {
+		if !w.key.matches(key) {
+			// Not matching.
+			w = w.Next()
+			continue
+		}
+
+		requeued := w
+		w = w.Next() // Next iteration.
+		b.waiters.Remove(requeued)
+		requeued.key.release()
+		requeued.key = nkey.clone()
+		to.waiters.PushBack(requeued)
+		requeued.bucket.Store(to)
+		done++
+	}
+	return done
+}
+
+const (
+	// bucketCount is the number of buckets per Manager. By having many of
+	// these we reduce contention when concurrent yet unrelated calls are made.
+	bucketCount     = 1 << bucketCountBits
+	bucketCountBits = 10
+)
+
+// getKey returns a Key representing address addr in c.
+func getKey(t Target, addr usermem.Addr, private bool) (Key, error) {
+	// Ensure the address is aligned.
+	// It must be a DWORD boundary.
+	if addr&0x3 != 0 {
+		return Key{}, syserror.EINVAL
+	}
+	if private {
+		return Key{Kind: KindPrivate, Offset: uint64(addr)}, nil
+	}
+	return t.GetSharedKey(addr)
+}
+
+// bucketIndexForAddr returns the index into Manager.buckets for addr.
+func bucketIndexForAddr(addr usermem.Addr) uintptr {
+	// - The bottom 2 bits of addr must be 0, per getKey.
+	//
+	// - On amd64, the top 16 bits of addr (bits 48-63) must be equal to bit 47
+	// for a canonical address, and (on all existing platforms) bit 47 must be
+	// 0 for an application address.
+	//
+	// Thus 19 bits of addr are "useless" for hashing, leaving only 45 "useful"
+	// bits. We choose one of the simplest possible hash functions that at
+	// least uses all 45 useful bits in the output, given that bucketCountBits
+	// == 10. This hash function also has the property that it will usually map
+	// adjacent addresses to adjacent buckets, slightly improving memory
+	// locality when an application synchronization structure uses multiple
+	// nearby futexes.
+	//
+	// Note that despite the large number of arithmetic operations in the
+	// function, many components can be computed in parallel, such that the
+	// critical path is 1 bit shift + 3 additions (2 in h1, then h1 + h2). This
+	// is also why h1 and h2 are grouped separately; for "(addr >> 2) + ... +
+	// (addr >> 42)" without any additional grouping, the compiler puts all 4
+	// additions in the critical path.
+	h1 := uintptr(addr>>2) + uintptr(addr>>12) + uintptr(addr>>22)
+	h2 := uintptr(addr>>32) + uintptr(addr>>42)
+	return (h1 + h2) % bucketCount
+}
+
+// Manager holds futex state for a single virtual address space.
+//
+// +stateify savable
+type Manager struct {
+	// privateBuckets holds buckets for KindPrivate and KindSharedPrivate
+	// futexes.
+	privateBuckets [bucketCount]bucket `state:"zerovalue"`
+
+	// sharedBucket is the bucket for KindSharedMappable futexes. sharedBucket
+	// may be shared by multiple Managers. The sharedBucket pointer is
+	// immutable.
+	sharedBucket *bucket
+}
+
+// NewManager returns an initialized futex manager.
+func NewManager() *Manager {
+	return &Manager{
+		sharedBucket: &bucket{},
+	}
+}
+
+// Fork returns a new Manager. Shared futex clients using the returned Manager
+// may interoperate with those using m.
+func (m *Manager) Fork() *Manager {
+	return &Manager{
+		sharedBucket: m.sharedBucket,
+	}
+}
+
+// lockBucket returns a locked bucket for the given key.
+func (m *Manager) lockBucket(k *Key) *bucket {
+	var b *bucket
+	if k.Kind == KindSharedMappable {
+		b = m.sharedBucket
+	} else {
+		b = &m.privateBuckets[bucketIndexForAddr(k.addr())]
+	}
+	b.mu.Lock()
+	return b
+}
+
+// lockBuckets returns locked buckets for the given keys.
+func (m *Manager) lockBuckets(k1, k2 *Key) (*bucket, *bucket) {
+	// Buckets must be consistently ordered to avoid circular lock
+	// dependencies. We order buckets in m.privateBuckets by index (lowest
+	// index first), and all buckets in m.privateBuckets precede
+	// m.sharedBucket.
+
+	// Handle the common case first:
+	if k1.Kind != KindSharedMappable && k2.Kind != KindSharedMappable {
+		i1 := bucketIndexForAddr(k1.addr())
+		i2 := bucketIndexForAddr(k2.addr())
+		b1 := &m.privateBuckets[i1]
+		b2 := &m.privateBuckets[i2]
+		switch {
+		case i1 < i2:
+			b1.mu.Lock()
+			b2.mu.Lock()
+		case i2 < i1:
+			b2.mu.Lock()
+			b1.mu.Lock()
+		default:
+			b1.mu.Lock()
+		}
+		return b1, b2
+	}
+
+	// At least one of b1 or b2 should be m.sharedBucket.
+	b1 := m.sharedBucket
+	b2 := m.sharedBucket
+	if k1.Kind != KindSharedMappable {
+		b1 = m.lockBucket(k1)
+	} else if k2.Kind != KindSharedMappable {
+		b2 = m.lockBucket(k2)
+	}
+	m.sharedBucket.mu.Lock()
+	return b1, b2
+}
+
+// Wake wakes up to n waiters matching the bitmask on the given addr.
+// The number of waiters woken is returned.
+func (m *Manager) Wake(t Target, addr usermem.Addr, private bool, bitmask uint32, n int) (int, error) {
+	// This function is very hot; avoid defer.
+	k, err := getKey(t, addr, private)
+	if err != nil {
+		return 0, err
+	}
+
+	b := m.lockBucket(&k)
+	r := b.wakeLocked(&k, bitmask, n)
+
+	b.mu.Unlock()
+	k.release()
+	return r, nil
+}
+
+func (m *Manager) doRequeue(t Target, addr, naddr usermem.Addr, private bool, checkval bool, val uint32, nwake int, nreq int) (int, error) {
+	k1, err := getKey(t, addr, private)
+	if err != nil {
+		return 0, err
+	}
+	defer k1.release()
+	k2, err := getKey(t, naddr, private)
+	if err != nil {
+		return 0, err
+	}
+	defer k2.release()
+
+	b1, b2 := m.lockBuckets(&k1, &k2)
+	defer b1.mu.Unlock()
+	if b2 != b1 {
+		defer b2.mu.Unlock()
+	}
+
+	if checkval {
+		if err := check(t, addr, val); err != nil {
+			return 0, err
+		}
+	}
+
+	// Wake the number required.
+	done := b1.wakeLocked(&k1, ^uint32(0), nwake)
+
+	// Requeue the number required.
+	b1.requeueLocked(b2, &k1, &k2, nreq)
+
+	return done, nil
+}
+
+// Requeue wakes up to nwake waiters on the given addr, and unconditionally
+// requeues up to nreq waiters on naddr.
+func (m *Manager) Requeue(t Target, addr, naddr usermem.Addr, private bool, nwake int, nreq int) (int, error) {
+	return m.doRequeue(t, addr, naddr, private, false, 0, nwake, nreq)
+}
+
+// RequeueCmp atomically checks that the addr contains val (via the Target),
+// wakes up to nwake waiters on addr and then unconditionally requeues nreq
+// waiters on naddr.
+func (m *Manager) RequeueCmp(t Target, addr, naddr usermem.Addr, private bool, val uint32, nwake int, nreq int) (int, error) {
+	return m.doRequeue(t, addr, naddr, private, true, val, nwake, nreq)
+}
+
+// WakeOp atomically applies op to the memory address addr2, wakes up to nwake1
+// waiters unconditionally from addr1, and, based on the original value at addr2
+// and a comparison encoded in op, wakes up to nwake2 waiters from addr2.
+// It returns the total number of waiters woken.
+func (m *Manager) WakeOp(t Target, addr1, addr2 usermem.Addr, private bool, nwake1 int, nwake2 int, op uint32) (int, error) {
+	k1, err := getKey(t, addr1, private)
+	if err != nil {
+		return 0, err
+	}
+	defer k1.release()
+	k2, err := getKey(t, addr2, private)
+	if err != nil {
+		return 0, err
+	}
+	defer k2.release()
+
+	b1, b2 := m.lockBuckets(&k1, &k2)
+	defer b1.mu.Unlock()
+	if b2 != b1 {
+		defer b2.mu.Unlock()
+	}
+
+	done := 0
+	cond, err := atomicOp(t, addr2, op)
+	if err != nil {
+		return 0, err
+	}
+
+	// Wake up up to nwake1 entries from the first bucket.
+	done = b1.wakeLocked(&k1, ^uint32(0), nwake1)
+
+	// Wake up up to nwake2 entries from the second bucket if the
+	// operation yielded true.
+	if cond {
+		done += b2.wakeLocked(&k2, ^uint32(0), nwake2)
+	}
+
+	return done, nil
+}
+
+// WaitPrepare atomically checks that addr contains val (via the Checker), then
+// enqueues w to be woken by a send to w.C. If WaitPrepare returns nil, the
+// Waiter must be subsequently removed by calling WaitComplete, whether or not
+// a wakeup is received on w.C.
+func (m *Manager) WaitPrepare(w *Waiter, t Target, addr usermem.Addr, private bool, val uint32, bitmask uint32) error {
+	k, err := getKey(t, addr, private)
+	if err != nil {
+		return err
+	}
+	// Ownership of k is transferred to w below.
+
+	// Prepare the Waiter before taking the bucket lock.
+	select {
+	case <-w.C:
+	default:
+	}
+	w.key = k
+	w.bitmask = bitmask
+
+	b := m.lockBucket(&k)
+	// This function is very hot; avoid defer.
+
+	// Perform our atomic check.
+	if err := check(t, addr, val); err != nil {
+		b.mu.Unlock()
+		w.key.release()
+		return err
+	}
+
+	// Add the waiter to the bucket.
+	b.waiters.PushBack(w)
+	w.bucket.Store(b)
+
+	b.mu.Unlock()
+	return nil
+}
+
+// WaitComplete must be called when a Waiter previously added by WaitPrepare is
+// no longer eligible to be woken.
+func (m *Manager) WaitComplete(w *Waiter) {
+	// Remove w from the bucket it's in.
+	for {
+		b := w.bucket.Load()
+
+		// If b is nil, the waiter isn't in any bucket anymore. This can't be
+		// racy because the waiter can't be concurrently re-queued in another
+		// bucket.
+		if b == nil {
+			break
+		}
+
+		// Take the bucket lock. Note that without holding the bucket lock, the
+		// waiter is not guaranteed to stay in that bucket, so after we take
+		// the bucket lock, we must ensure that the bucket hasn't changed: if
+		// it happens to have changed, we release the old bucket lock and try
+		// again with the new bucket; if it hasn't changed, we know it won't
+		// change now because we hold the lock.
+		b.mu.Lock()
+		if b != w.bucket.Load() {
+			b.mu.Unlock()
+			continue
+		}
+
+		// Remove waiter from bucket.
+		b.waiters.Remove(w)
+		w.bucket.Store(nil)
+		b.mu.Unlock()
+		break
+	}
+
+	// Release references held by the waiter.
+	w.key.release()
+}
+
+// LockPI attempts to lock the futex following the Priority-inheritance futex
+// rules. The lock is acquired only when 'addr' points to 0. The TID of the
+// calling task is set to 'addr' to indicate the futex is owned. It returns true
+// if the futex was successfully acquired.
+//
+// FUTEX_OWNER_DIED is only set by the Linux when robust lists are in use (see
+// exit_robust_list()). Given we don't support robust lists, although handled
+// below, it's never set.
+func (m *Manager) LockPI(w *Waiter, t Target, addr usermem.Addr, tid uint32, private, try bool) (bool, error) {
+	k, err := getKey(t, addr, private)
+	if err != nil {
+		return false, err
+	}
+	// Ownership of k is transferred to w below.
+
+	// Prepare the Waiter before taking the bucket lock.
+	select {
+	case <-w.C:
+	default:
+	}
+	w.key = k
+	w.tid = tid
+
+	b := m.lockBucket(&k)
+	// Hot function: avoid defers.
+
+	success, err := m.lockPILocked(w, t, addr, tid, b, try)
+	if err != nil {
+		w.key.release()
+		b.mu.Unlock()
+		return false, err
+	}
+	if success || try {
+		// Release waiter if it's not going to be a wait.
+		w.key.release()
+	}
+	b.mu.Unlock()
+	return success, nil
+}
+
+func (m *Manager) lockPILocked(w *Waiter, t Target, addr usermem.Addr, tid uint32, b *bucket, try bool) (bool, error) {
+	for {
+		cur, err := t.LoadUint32(addr)
+		if err != nil {
+			return false, err
+		}
+		if (cur & linux.FUTEX_TID_MASK) == tid {
+			return false, syserror.EDEADLK
+		}
+
+		if (cur & linux.FUTEX_TID_MASK) == 0 {
+			// No owner and no waiters, try to acquire the futex.
+
+			// Set TID and preserve owner died status.
+			val := tid
+			val |= cur & linux.FUTEX_OWNER_DIED
+			prev, err := t.CompareAndSwapUint32(addr, cur, val)
+			if err != nil {
+				return false, err
+			}
+			if prev != cur {
+				// CAS failed, retry...
+				// Linux reacquires the bucket lock on retries, which will re-lookup the
+				// mapping at the futex address. However, retrying while holding the
+				// lock is more efficient and reduces the chance of another conflict.
+				continue
+			}
+			// Futex acquired.
+			return true, nil
+		}
+
+		// Futex is already owned, prepare to wait.
+
+		if try {
+			// Caller doesn't want to wait.
+			return false, nil
+		}
+
+		// Set waiters bit if not set yet.
+		if cur&linux.FUTEX_WAITERS == 0 {
+			prev, err := t.CompareAndSwapUint32(addr, cur, cur|linux.FUTEX_WAITERS)
+			if err != nil {
+				return false, err
+			}
+			if prev != cur {
+				// CAS failed, retry...
+				continue
+			}
+		}
+
+		// Add the waiter to the bucket.
+		b.waiters.PushBack(w)
+		w.bucket.Store(b)
+		return false, nil
+	}
+}
+
+// UnlockPI unlock the futex following the Priority-inheritance futex
+// rules. The address provided must contain the caller's TID. If there are
+// waiters, TID of the next waiter (FIFO) is set to the given address, and the
+// waiter woken up. If there are no waiters, 0 is set to the address.
+func (m *Manager) UnlockPI(t Target, addr usermem.Addr, tid uint32, private bool) error {
+	k, err := getKey(t, addr, private)
+	if err != nil {
+		return err
+	}
+	b := m.lockBucket(&k)
+
+	err = m.unlockPILocked(t, addr, tid, b)
+
+	k.release()
+	b.mu.Unlock()
+	return err
+}
+
+func (m *Manager) unlockPILocked(t Target, addr usermem.Addr, tid uint32, b *bucket) error {
+	cur, err := t.LoadUint32(addr)
+	if err != nil {
+		return err
+	}
+
+	if (cur & linux.FUTEX_TID_MASK) != tid {
+		return syserror.EPERM
+	}
+
+	if b.waiters.Empty() {
+		// It's safe to set 0 because there are no waiters, no new owner, and the
+		// executing task is the current owner (no owner died bit).
+		prev, err := t.CompareAndSwapUint32(addr, cur, 0)
+		if err != nil {
+			return err
+		}
+		if prev != cur {
+			// Let user mode handle CAS races. This is different than lock, which
+			// retries when CAS fails.
+			return syserror.EAGAIN
+		}
+		return nil
+	}
+
+	next := b.waiters.Front()
+
+	// Set next owner's TID, waiters if there are any. Resets owner died bit, if
+	// set, because the executing task takes over as the owner.
+	val := next.tid
+	if next.Next() != nil {
+		val |= linux.FUTEX_WAITERS
+	}
+
+	prev, err := t.CompareAndSwapUint32(addr, cur, val)
+	if err != nil {
+		return err
+	}
+	if prev != cur {
+		return syserror.EINVAL
+	}
+
+	b.wakeWaiterLocked(next)
+	return nil
+}