Diffstat (limited to 'pkg/sentry/kernel/futex/futex.go')
-rw-r--r--   pkg/sentry/kernel/futex/futex.go   382
1 file changed, 257 insertions(+), 125 deletions(-)
diff --git a/pkg/sentry/kernel/futex/futex.go b/pkg/sentry/kernel/futex/futex.go
index 4a1f2a0ef..54b1982a0 100644
--- a/pkg/sentry/kernel/futex/futex.go
+++ b/pkg/sentry/kernel/futex/futex.go
@@ -19,11 +19,78 @@ package futex
 
 import (
     "sync"
-    "sync/atomic"
 
+    "gvisor.googlesource.com/gvisor/pkg/sentry/memmap"
     "gvisor.googlesource.com/gvisor/pkg/syserror"
 )
 
+// KeyKind indicates the type of a Key.
+type KeyKind int
+
+const (
+    // KindPrivate indicates a private futex (a futex syscall with the
+    // FUTEX_PRIVATE_FLAG set).
+    KindPrivate KeyKind = iota
+
+    // KindSharedPrivate indicates a shared futex on a private memory mapping.
+    // Although KindPrivate and KindSharedPrivate futexes both use memory
+    // addresses to identify futexes, they do not interoperate (in Linux, the
+    // two are distinguished by the FUT_OFF_MMSHARED flag, which is used in key
+    // comparison).
+    KindSharedPrivate
+
+    // KindSharedMappable indicates a shared futex on a memory mapping other
+    // than a private anonymous memory mapping.
+    KindSharedMappable
+)
+
+// Key represents something that a futex waiter may wait on.
+type Key struct {
+    // Kind is the type of the Key.
+    Kind KeyKind
+
+    // Mappable is the memory-mapped object that is represented by the Key.
+    // Mappable is always nil if Kind is not KindSharedMappable, and may be nil
+    // even if it is.
+    Mappable memmap.Mappable
+
+    // MappingIdentity is the MappingIdentity associated with Mappable.
+    // MappingIdentity is always nil if Mappable is nil, and may be nil even if
+    // it isn't.
+    MappingIdentity memmap.MappingIdentity
+
+    // If Kind is KindPrivate or KindSharedPrivate, Offset is the represented
+    // memory address. Otherwise, Offset is the represented offset into
+    // Mappable.
+    Offset uint64
+}
+
+func (k *Key) release() {
+    if k.MappingIdentity != nil {
+        k.MappingIdentity.DecRef()
+    }
+    k.Mappable = nil
+    k.MappingIdentity = nil
+}
+
+func (k *Key) clone() Key {
+    if k.MappingIdentity != nil {
+        k.MappingIdentity.IncRef()
+    }
+    return *k
+}
+
+// Preconditions: k.Kind == KindPrivate or KindSharedPrivate.
+func (k *Key) addr() uintptr {
+    return uintptr(k.Offset)
+}
+
+// matches returns true if a wakeup on k2 should wake a waiter waiting on k.
+func (k *Key) matches(k2 *Key) bool {
+    // k.MappingIdentity is ignored; it's only used for reference counting.
+    return k.Kind == k2.Kind && k.Mappable == k2.Mappable && k.Offset == k2.Offset
+}
+
 // Checker abstracts memory accesses. This is useful because the "addresses"
 // used in this package may not be real addresses (they could be indices of an
 // array, for example), or they could be mapped via some special mechanism.
@@ -41,6 +108,14 @@ type Checker interface {
     // Note that op is an opaque operation whose behaviour is defined
     // outside of the futex manager.
     Op(addr uintptr, op uint32) (bool, error)
+
+    // GetSharedKey returns a Key with kind KindSharedPrivate or
+    // KindSharedMappable corresponding to the memory mapped at address addr.
+    //
+    // If GetSharedKey returns a Key with a non-nil MappingIdentity, a
+    // reference is held on the MappingIdentity, which must be dropped by the
+    // caller when the Key is no longer in use.
+    GetSharedKey(addr uintptr) (Key, error)
 }
 
 // Waiter is the struct which gets enqueued into buckets for wake up routines
@@ -53,11 +128,11 @@ type Waiter struct {
     // synchronization applies).
     //
     // - A Waiter is enqueued in a bucket by calling WaitPrepare(). After this,
-    // waiterEntry, complete, and addr are protected by the bucket.mu ("bucket
-    // lock") of the containing bucket, and bitmask is immutable. complete and
-    // addr are additionally mutated using atomic memory operations, ensuring
-    // that they can be read using atomic memory operations without holding the
-    // bucket lock.
+    // waiterEntry, bucket, and key are protected by the bucket.mu ("bucket
+    // lock") of the containing bucket, and bitmask is immutable. Note that
+    // since bucket is mutated using atomic memory operations, bucket.Load()
+    // may be called without holding the bucket lock, although it may change
+    // racily. See WaitComplete().
     //
     // - A Waiter is only guaranteed to be no longer queued after calling
    // WaitComplete().
@@ -65,15 +140,15 @@ type Waiter struct {
     // waiterEntry links Waiter into bucket.waiters.
     waiterEntry
 
-    // complete is 1 if the Waiter was removed from its bucket by a wakeup and
-    // 0 otherwise.
-    complete int32
+    // bucket is the bucket this waiter is queued in. If bucket is nil, the
+    // waiter is not waiting and is not in any bucket.
+    bucket AtomicPtrBucket
 
     // C is sent to when the Waiter is woken.
     C chan struct{}
 
-    // addr is the address being waited on.
-    addr uintptr
+    // key is what this waiter is waiting on.
+    key Key
 
     // The bitmask we're waiting on.
     // This is used in the case of a FUTEX_WAKE_BITSET.
@@ -87,7 +162,14 @@ func NewWaiter() *Waiter {
     }
 }
 
+// woken returns true if w has been woken since the last call to WaitPrepare.
+func (w *Waiter) woken() bool {
+    return len(w.C) != 0
+}
+
 // bucket holds a list of waiters for a given address hash.
+//
+// +stateify savable
 type bucket struct {
     // mu protects waiters and contained Waiter state. See comment in Waiter.
     mu sync.Mutex `state:"nosave"`
@@ -99,10 +181,10 @@ type bucket struct {
 // bucket and returns the number of waiters woken.
 //
 // Preconditions: b.mu must be locked.
-func (b *bucket) wakeLocked(addr uintptr, bitmask uint32, n int) int {
+func (b *bucket) wakeLocked(key *Key, bitmask uint32, n int) int {
     done := 0
     for w := b.waiters.Front(); done < n && w != nil; {
-        if w.addr != addr || w.bitmask&bitmask == 0 {
+        if !w.key.matches(key) || w.bitmask&bitmask == 0 {
             // Not matching.
             w = w.Next()
             continue
@@ -114,15 +196,15 @@ func (b *bucket) wakeLocked(addr uintptr, bitmask uint32, n int) int {
         b.waiters.Remove(woke)
         woke.C <- struct{}{}
 
-        // NOTE: The above channel write establishes a write barrier
-        // according to the memory model, so nothing may be ordered
-        // around it. Since we've dequeued w and will never touch it
-        // again, we can safely store 1 to w.complete here and allow
-        // the WaitComplete() to short-circuit grabbing the bucket
-        // lock. If they somehow miss the w.complete, we are still
-        // holding the lock, so we can know that they won't dequeue w,
-        // assume it's free and have the below operation afterwards.
-        atomic.StoreInt32(&woke.complete, 1)
+        // NOTE: The above channel write establishes a write barrier according
+        // to the memory model, so nothing may be ordered around it. Since
+        // we've dequeued woke and will never touch it again, we can safely
+        // store nil to woke.bucket here and allow the WaitComplete() to
+        // short-circuit grabbing the bucket lock. If they somehow miss the
+        // store, we are still holding the lock, so we can know that they won't
+        // dequeue woke, assume it's free and have the below operation
+        // afterwards.
+        woke.bucket.Store(nil)
         done++
     }
     return done
@@ -132,10 +214,10 @@ func (b *bucket) wakeLocked(addr uintptr, bitmask uint32, n int) int {
 // bucket "to".
 //
 // Preconditions: b and to must be locked.
-func (b *bucket) requeueLocked(to *bucket, addr, naddr uintptr, n int) int {
+func (b *bucket) requeueLocked(to *bucket, key, nkey *Key, n int) int {
     done := 0
     for w := b.waiters.Front(); done < n && w != nil; {
-        if w.addr != addr {
+        if !w.key.matches(key) {
             // Not matching.
             w = w.Next()
             continue
@@ -144,8 +226,10 @@ func (b *bucket) requeueLocked(to *bucket, addr, naddr uintptr, n int) int {
         requeued := w
         w = w.Next() // Next iteration.
         b.waiters.Remove(requeued)
-        atomic.StoreUintptr(&requeued.addr, naddr)
+        requeued.key.release()
+        requeued.key = nkey.clone()
         to.waiters.PushBack(requeued)
+        requeued.bucket.Store(to)
         done++
     }
     return done
@@ -158,19 +242,22 @@ const (
     bucketCountBits = 10
 )
 
-func checkAddr(addr uintptr) error {
+// getKey returns a Key representing address addr in c.
+func getKey(c Checker, addr uintptr, private bool) (Key, error) {
     // Ensure the address is aligned.
     // It must be a DWORD boundary.
     if addr&0x3 != 0 {
-        return syserror.EINVAL
+        return Key{}, syserror.EINVAL
     }
-
-    return nil
+    if private {
+        return Key{Kind: KindPrivate, Offset: uint64(addr)}, nil
+    }
+    return c.GetSharedKey(addr)
 }
 
 // bucketIndexForAddr returns the index into Manager.buckets for addr.
 func bucketIndexForAddr(addr uintptr) uintptr {
-    // - The bottom 2 bits of addr must be 0, per checkAddr.
+    // - The bottom 2 bits of addr must be 0, per getKey.
     //
     // - On amd64, the top 16 bits of addr (bits 48-63) must be equal to bit 47
     // for a canonical address, and (on all existing platforms) bit 47 must be
@@ -199,171 +286,216 @@ func bucketIndexForAddr(addr uintptr) uintptr {
 //
 // +stateify savable
 type Manager struct {
-    buckets [bucketCount]bucket `state:"zerovalue"`
+    // privateBuckets holds buckets for KindPrivate and KindSharedPrivate
+    // futexes.
+    privateBuckets [bucketCount]bucket `state:"zerovalue"`
+
+    // sharedBucket is the bucket for KindSharedMappable futexes. sharedBucket
+    // may be shared by multiple Managers. The sharedBucket pointer is
+    // immutable.
+    sharedBucket *bucket
 }
 
 // NewManager returns an initialized futex manager.
-// N.B. we use virtual address to tag futexes, so it only works for private
-// (within a single process) futex.
 func NewManager() *Manager {
-    return &Manager{}
+    return &Manager{
+        sharedBucket: &bucket{},
+    }
 }
 
-// lockBucket returns a locked bucket for the given addr.
-//
-// Preconditions: checkAddr(addr) == nil.
-func (m *Manager) lockBucket(addr uintptr) *bucket {
-    b := &m.buckets[bucketIndexForAddr(addr)]
+// Fork returns a new Manager. Shared futex clients using the returned Manager
+// may interoperate with those using m.
+func (m *Manager) Fork() *Manager {
+    return &Manager{
+        sharedBucket: m.sharedBucket,
+    }
+}
+
+// lockBucket returns a locked bucket for the given key.
+func (m *Manager) lockBucket(k *Key) *bucket {
+    var b *bucket
+    if k.Kind == KindSharedMappable {
+        b = m.sharedBucket
+    } else {
+        b = &m.privateBuckets[bucketIndexForAddr(k.addr())]
+    }
     b.mu.Lock()
     return b
 }
 
-// lockBuckets returns locked buckets for the given addrs.
-//
-// Preconditions: checkAddr(addr1) == checkAddr(addr2) == nil.
-func (m *Manager) lockBuckets(addr1 uintptr, addr2 uintptr) (*bucket, *bucket) {
-    i1 := bucketIndexForAddr(addr1)
-    i2 := bucketIndexForAddr(addr2)
-    b1 := &m.buckets[i1]
-    b2 := &m.buckets[i2]
-
-    // Ensure that buckets are locked in a consistent order (lowest index
-    // first) to avoid circular locking.
-    switch {
-    case i1 < i2:
-        b1.mu.Lock()
-        b2.mu.Lock()
-    case i2 < i1:
-        b2.mu.Lock()
-        b1.mu.Lock()
-    default:
-        b1.mu.Lock()
+// lockBuckets returns locked buckets for the given keys.
+func (m *Manager) lockBuckets(k1, k2 *Key) (*bucket, *bucket) {
+    // Buckets must be consistently ordered to avoid circular lock
+    // dependencies. We order buckets in m.privateBuckets by index (lowest
+    // index first), and all buckets in m.privateBuckets precede
+    // m.sharedBucket.
+
+    // Handle the common case first:
+    if k1.Kind != KindSharedMappable && k2.Kind != KindSharedMappable {
+        i1 := bucketIndexForAddr(k1.addr())
+        i2 := bucketIndexForAddr(k2.addr())
+        b1 := &m.privateBuckets[i1]
+        b2 := &m.privateBuckets[i2]
+        switch {
+        case i1 < i2:
+            b1.mu.Lock()
+            b2.mu.Lock()
+        case i2 < i1:
+            b2.mu.Lock()
+            b1.mu.Lock()
+        default:
+            b1.mu.Lock()
+        }
+        return b1, b2
     }
 
+    // At least one of b1 or b2 should be m.sharedBucket.
+    b1 := m.sharedBucket
+    b2 := m.sharedBucket
+    if k1.Kind != KindSharedMappable {
+        b1 = m.lockBucket(k1)
+    } else if k2.Kind != KindSharedMappable {
+        b2 = m.lockBucket(k2)
+    }
+    m.sharedBucket.mu.Lock()
     return b1, b2
 }
 
 // Wake wakes up to n waiters matching the bitmask on the given addr.
 // The number of waiters woken is returned.
-func (m *Manager) Wake(addr uintptr, bitmask uint32, n int) (int, error) {
-    if err := checkAddr(addr); err != nil {
+func (m *Manager) Wake(c Checker, addr uintptr, private bool, bitmask uint32, n int) (int, error) {
+    // This function is very hot; avoid defer.
+    k, err := getKey(c, addr, private)
+    if err != nil {
         return 0, err
     }
-    b := m.lockBucket(addr)
-    // This function is very hot; avoid defer.
-    r := b.wakeLocked(addr, bitmask, n)
+    b := m.lockBucket(&k)
+    r := b.wakeLocked(&k, bitmask, n)
+    b.mu.Unlock()
+    k.release()
     return r, nil
 }
 
-func (m *Manager) doRequeue(c Checker, addr uintptr, val uint32, naddr uintptr, nwake int, nreq int) (int, error) {
-    if err := checkAddr(addr); err != nil {
+func (m *Manager) doRequeue(c Checker, addr, naddr uintptr, private bool, checkval bool, val uint32, nwake int, nreq int) (int, error) {
+    k1, err := getKey(c, addr, private)
+    if err != nil {
         return 0, err
     }
-    if err := checkAddr(naddr); err != nil {
+    defer k1.release()
+    k2, err := getKey(c, naddr, private)
+    if err != nil {
         return 0, err
     }
+    defer k2.release()
 
-    b1, b2 := m.lockBuckets(addr, naddr)
+    b1, b2 := m.lockBuckets(&k1, &k2)
     defer b1.mu.Unlock()
     if b2 != b1 {
         defer b2.mu.Unlock()
     }
 
-    // Check our value.
-    // This only applied for RequeueCmp().
-    if c != nil {
+    if checkval {
         if err := c.Check(addr, val); err != nil {
             return 0, err
         }
     }
 
     // Wake the number required.
-    done := b1.wakeLocked(addr, ^uint32(0), nwake)
+    done := b1.wakeLocked(&k1, ^uint32(0), nwake)
 
     // Requeue the number required.
-    b1.requeueLocked(b2, addr, naddr, nreq)
+    b1.requeueLocked(b2, &k1, &k2, nreq)
 
     return done, nil
 }
 
 // Requeue wakes up to nwake waiters on the given addr, and unconditionally
 // requeues up to nreq waiters on naddr.
-func (m *Manager) Requeue(addr uintptr, naddr uintptr, nwake int, nreq int) (int, error) {
-    return m.doRequeue(nil, addr, 0, naddr, nwake, nreq)
+func (m *Manager) Requeue(c Checker, addr, naddr uintptr, private bool, nwake int, nreq int) (int, error) {
+    return m.doRequeue(c, addr, naddr, private, false, 0, nwake, nreq)
 }
 
 // RequeueCmp atomically checks that the addr contains val (via the Checker),
 // wakes up to nwake waiters on addr and then unconditionally requeues nreq
 // waiters on naddr.
-func (m *Manager) RequeueCmp(c Checker, addr uintptr, val uint32, naddr uintptr, nwake int, nreq int) (int, error) {
-    return m.doRequeue(c, addr, val, naddr, nwake, nreq)
+func (m *Manager) RequeueCmp(c Checker, addr, naddr uintptr, private bool, val uint32, nwake int, nreq int) (int, error) {
+    return m.doRequeue(c, addr, naddr, private, true, val, nwake, nreq)
}
 
 // WakeOp atomically applies op to the memory address addr2, wakes up to nwake1
 // waiters unconditionally from addr1, and, based on the original value at addr2
 // and a comparison encoded in op, wakes up to nwake2 waiters from addr2.
 // It returns the total number of waiters woken.
-func (m *Manager) WakeOp(c Checker, addr1 uintptr, addr2 uintptr, nwake1 int, nwake2 int, op uint32) (int, error) {
-    if err := checkAddr(addr1); err != nil {
+func (m *Manager) WakeOp(c Checker, addr1, addr2 uintptr, private bool, nwake1 int, nwake2 int, op uint32) (int, error) {
+    k1, err := getKey(c, addr1, private)
+    if err != nil {
         return 0, err
     }
-    if err := checkAddr(addr2); err != nil {
+    defer k1.release()
+    k2, err := getKey(c, addr2, private)
+    if err != nil {
         return 0, err
     }
+    defer k2.release()
 
-    b1, b2 := m.lockBuckets(addr1, addr2)
+    b1, b2 := m.lockBuckets(&k1, &k2)
+    defer b1.mu.Unlock()
+    if b2 != b1 {
+        defer b2.mu.Unlock()
+    }
 
     done := 0
     cond, err := c.Op(addr2, op)
-    if err == nil {
-        // Wake up up to nwake1 entries from the first bucket.
-        done = b1.wakeLocked(addr1, ^uint32(0), nwake1)
-
-        // Wake up up to nwake2 entries from the second bucket if the
-        // operation yielded true.
-        if cond {
-            done += b2.wakeLocked(addr2, ^uint32(0), nwake2)
-        }
+    if err != nil {
+        return 0, err
     }
 
-    b1.mu.Unlock()
-    if b2 != b1 {
-        b2.mu.Unlock()
+    // Wake up up to nwake1 entries from the first bucket.
+    done = b1.wakeLocked(&k1, ^uint32(0), nwake1)
+
+    // Wake up up to nwake2 entries from the second bucket if the
+    // operation yielded true.
+    if cond {
+        done += b2.wakeLocked(&k2, ^uint32(0), nwake2)
     }
-    return done, err
+
+    return done, nil
 }
 
 // WaitPrepare atomically checks that addr contains val (via the Checker), then
 // enqueues w to be woken by a send to w.C. If WaitPrepare returns nil, the
 // Waiter must be subsequently removed by calling WaitComplete, whether or not
 // a wakeup is received on w.C.
-func (m *Manager) WaitPrepare(w *Waiter, c Checker, addr uintptr, val uint32, bitmask uint32) error {
-    if err := checkAddr(addr); err != nil {
+func (m *Manager) WaitPrepare(w *Waiter, c Checker, addr uintptr, private bool, val uint32, bitmask uint32) error {
+    k, err := getKey(c, addr, private)
+    if err != nil {
         return err
     }
+    // Ownership of k is transferred to w below.
 
     // Prepare the Waiter before taking the bucket lock.
-    w.complete = 0
     select {
     case <-w.C:
     default:
     }
-    w.addr = addr
+    w.key = k
     w.bitmask = bitmask
 
-    b := m.lockBucket(addr)
+    b := m.lockBucket(&k)
     // This function is very hot; avoid defer.
 
     // Perform our atomic check.
     if err := c.Check(addr, val); err != nil {
         b.mu.Unlock()
+        w.key.release()
         return err
     }
 
     // Add the waiter to the bucket.
     b.waiters.PushBack(w)
+    w.bucket.Store(b)
     b.mu.Unlock()
 
     return nil
@@ -372,36 +504,36 @@ func (m *Manager) WaitPrepare(w *Waiter, c Checker, addr uintptr, val uint32, bitmask uint32) error {
 // WaitComplete must be called when a Waiter previously added by WaitPrepare is
 // no longer eligible to be woken.
 func (m *Manager) WaitComplete(w *Waiter) {
-    // Can we short-circuit acquiring the lock?
-    // This is the happy path where a notification
-    // was received and we don't need to dequeue this
-    // waiter from any list (or take any locks).
-    if atomic.LoadInt32(&w.complete) != 0 {
-        return
-    }
-
-    // Take the bucket lock. Note that without holding the bucket lock, the
-    // waiter is not guaranteed to stay in that bucket, so after we take the
-    // bucket lock, we must ensure that the bucket hasn't changed: if it
-    // happens to have changed, we release the old bucket lock and try again
-    // with the new bucket; if it hasn't changed, we know it won't change now
-    // because we hold the lock.
-    var b *bucket
+    // Remove w from the bucket it's in.
     for {
-        addr := atomic.LoadUintptr(&w.addr)
-        b = m.lockBucket(addr)
-        // We still have to use an atomic load here, because if w was racily
-        // requeued then w.addr is not protected by b.mu.
-        if addr == atomic.LoadUintptr(&w.addr) {
+        b := w.bucket.Load()
+
+        // If b is nil, the waiter isn't in any bucket anymore. This can't be
+        // racy because the waiter can't be concurrently re-queued in another
+        // bucket.
+        if b == nil {
             break
         }
-        b.mu.Unlock()
-    }
 
-    // Remove waiter from the bucket. w.complete can only be stored with b.mu
-    // locked, so this load doesn't need to use sync/atomic.
-    if w.complete == 0 {
+        // Take the bucket lock. Note that without holding the bucket lock, the
+        // waiter is not guaranteed to stay in that bucket, so after we take
+        // the bucket lock, we must ensure that the bucket hasn't changed: if
+        // it happens to have changed, we release the old bucket lock and try
+        // again with the new bucket; if it hasn't changed, we know it won't
+        // change now because we hold the lock.
+        b.mu.Lock()
+        if b != w.bucket.Load() {
+            b.mu.Unlock()
+            continue
+        }
+
+        // Remove w from b.
         b.waiters.Remove(w)
+        w.bucket.Store(nil)
+        b.mu.Unlock()
+        break
     }
-    b.mu.Unlock()
+
+    // Release references held by the waiter.
+    w.key.release()
 }
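
To make the new interface concrete, here is a minimal usage sketch, not part of this change: a toy Checker backed by a plain slice of 32-bit words, driving one private wait/wake round trip. The sliceChecker type and its error text are illustrative assumptions; the Manager, Waiter, and Checker signatures are the ones introduced by the diff above.

package main

import (
    "errors"
    "fmt"
    "sync/atomic"

    "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/futex"
)

// sliceChecker is a hypothetical Checker whose "addresses" are byte offsets
// into a slice of 32-bit words.
type sliceChecker struct {
    mem []uint32
}

// Check returns nil iff the word at addr still holds val.
func (c *sliceChecker) Check(addr uintptr, val uint32) error {
    if atomic.LoadUint32(&c.mem[addr/4]) != val {
        return errors.New("value changed")
    }
    return nil
}

// Op is unused in this sketch; a real Checker decodes and applies op here.
func (c *sliceChecker) Op(addr uintptr, op uint32) (bool, error) {
    return false, nil
}

// GetSharedKey keys shared futexes by offset alone. A nil Mappable is legal
// per the Key comment above; two keys match if Kind, Mappable, and Offset
// all agree.
func (c *sliceChecker) GetSharedKey(addr uintptr) (futex.Key, error) {
    return futex.Key{Kind: futex.KindSharedMappable, Offset: uint64(addr)}, nil
}

func main() {
    m := futex.NewManager()
    c := &sliceChecker{mem: make([]uint32, 16)}
    w := futex.NewWaiter()

    const addr = 8 // byte offset of mem[2]; futex addresses must be 4-byte aligned

    // Queue the waiter. This succeeds because mem[2] still holds the
    // expected value 0.
    if err := m.WaitPrepare(w, c, addr, true /* private */, 0, ^uint32(0)); err != nil {
        panic(err)
    }

    // Wake up to one waiter on the same address. w.C is buffered (see
    // woken() above), so the waking side never blocks.
    n, err := m.Wake(c, addr, true /* private */, ^uint32(0), 1)
    fmt.Println("woke", n, "waiter(s), err:", err)

    <-w.C             // consume the wakeup
    m.WaitComplete(w) // required after every successful WaitPrepare
}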
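The Fork path can be sketched the same way. Because a forked Manager shares its parent's sharedBucket, a shared (private == false) wake issued through one Manager can reach a waiter queued through the other. The fragment below continues the hypothetical sliceChecker example, whose GetSharedKey produces KindSharedMappable keys with a nil Mappable, distinguished only by offset.

// sharedWakeAcrossFork demonstrates Manager.Fork: parent and child share one
// sharedBucket, so KindSharedMappable keys from GetSharedKey hash to the same
// waiter queue in both Managers.
func sharedWakeAcrossFork() {
    c := &sliceChecker{mem: make([]uint32, 16)}
    parent := futex.NewManager()
    child := parent.Fork()

    w := futex.NewWaiter()
    // private == false routes through GetSharedKey and the shared bucket.
    if err := parent.WaitPrepare(w, c, 8, false /* private */, 0, ^uint32(0)); err != nil {
        panic(err)
    }
    // The wake through the child finds the waiter queued via the parent.
    if _, err := child.Wake(c, 8, false /* private */, ^uint32(0), 1); err != nil {
        panic(err)
    }
    <-w.C
    parent.WaitComplete(w)
}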