3 files changed, 102 insertions, 69 deletions
diff --git a/pkg/tcpip/stack/conntrack.go b/pkg/tcpip/stack/conntrack.go
index b7cb54b1d..2145a8496 100644
--- a/pkg/tcpip/stack/conntrack.go
+++ b/pkg/tcpip/stack/conntrack.go
@@ -117,15 +117,18 @@ type conn struct {
 	// update the state of tcb. It is immutable.
 	tcbHook Hook
 
-	// mu protects all mutable state.
 	mu sync.Mutex `state:"nosave"`
 	// tcb is TCB control block. It is used to keep track of states
-	// of tcp connection and is protected by mu.
+	// of tcp connection.
+	//
+	// +checklocks:mu
 	tcb tcpconntrack.TCB
 	// lastUsed is the last time the connection saw a relevant packet, and
-	// is updated by each packet on the connection. It is protected by mu.
+	// is updated by each packet on the connection.
 	//
 	// TODO(gvisor.dev/issue/5939): do not use the ambient clock.
+	//
+	// +checklocks:mu
 	lastUsed time.Time `state:".(unixTime)"`
 }
 
@@ -159,7 +162,8 @@ func (cn *conn) timedOut(now time.Time) bool {
 
 // update the connection tracking state.
 //
-// Precondition: cn.mu must be held.
+// TODO(https://gvisor.dev/issue/6590): annotate r/w locking requirements.
+// +checklocks:cn.mu
 func (cn *conn) updateLocked(pkt *PacketBuffer, hook Hook) {
 	if pkt.TransportProtocolNumber != header.TCPProtocolNumber {
 		return
@@ -200,18 +204,18 @@ type ConnTrack struct {
 	// It is immutable.
 	seed uint32
 
+	mu sync.RWMutex `state:"nosave"`
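-	// mu protects the buckets slice, but not buckets' contents. Only take
-	// the write lock if you are modifying the slice or saving for S/R.
+	// buckets is guarded by mu, which protects the slice but not the buckets'
+	// contents. Only write-lock mu when modifying the slice or saving for S/R.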
-	mu sync.RWMutex `state:"nosave"`
-
-	// buckets is protected by mu.
+	//
+	// +checklocks:mu
 	buckets []bucket
 }
 
 // +stateify savable
 type bucket struct {
-	// mu protects tuples.
-	mu     sync.Mutex `state:"nosave"`
+	mu sync.Mutex `state:"nosave"` // Guards tuples (enforced by the +checklocks annotation below).
+	// +checklocks:mu
 	tuples tupleList
 }
 
@@ -270,19 +274,20 @@ func (ct *ConnTrack) connFor(pkt *PacketBuffer) (*conn, direction) {
 }
 
 func (ct *ConnTrack) connForTID(tid tupleID) (*conn, direction) {
-	bucket := ct.bucket(tid)
+	bktID := ct.bucket(tid)
 	now := time.Now()
 
 	ct.mu.RLock()
 	defer ct.mu.RUnlock()
-	ct.buckets[bucket].mu.Lock()
-	defer ct.buckets[bucket].mu.Unlock()
+	bkt := &ct.buckets[bktID]
+	bkt.mu.Lock()
+	defer bkt.mu.Unlock()
 
 	// Iterate over the tuples in a bucket, cleaning up any unused
 	// connections we find.
-	for other := ct.buckets[bucket].tuples.Front(); other != nil; other = other.Next() {
+	for other := bkt.tuples.Front(); other != nil; other = other.Next() {
 		// Clean up any timed-out connections we happen to find.
-		if ct.reapTupleLocked(other, bucket, now) {
+		if ct.reapTupleLocked(other, bktID, bkt, now) {
 			// The tuple expired.
 			continue
 		}
@@ -344,27 +349,46 @@ func (ct *ConnTrack) insertSNATConn(pkt *PacketBuffer, hook Hook, port uint16, a
 
 // insertConn inserts conn into the appropriate table bucket.
 func (ct *ConnTrack) insertConn(conn *conn) {
-	// Lock the buckets in the correct order.
-	tupleBucket := ct.bucket(conn.original.tupleID)
-	replyBucket := ct.bucket(conn.reply.tupleID)
+	tupleBktID := ct.bucket(conn.original.tupleID)
+	replyBktID := ct.bucket(conn.reply.tupleID)
+
 	ct.mu.RLock()
 	defer ct.mu.RUnlock()
-	if tupleBucket < replyBucket {
-		ct.buckets[tupleBucket].mu.Lock()
-		ct.buckets[replyBucket].mu.Lock()
-	} else if tupleBucket > replyBucket {
-		ct.buckets[replyBucket].mu.Lock()
-		ct.buckets[tupleBucket].mu.Lock()
-	} else {
+
+	tupleBkt := &ct.buckets[tupleBktID]
+	if tupleBktID == replyBktID {
 		// Both tuples are in the same bucket.
-		ct.buckets[tupleBucket].mu.Lock()
+		tupleBkt.mu.Lock()
+		defer tupleBkt.mu.Unlock()
+		insertConn(tupleBkt, tupleBkt, conn) // One lock covers both directions, so the same bucket is passed twice.
+		return
 	}
 
+	// Lock the two buckets in ascending index order, whichever direction each one holds.
+	replyBkt := &ct.buckets[replyBktID]
+	if tupleBktID < replyBktID {
+		tupleBkt.mu.Lock()
+		defer tupleBkt.mu.Unlock()
+		replyBkt.mu.Lock()
+		defer replyBkt.mu.Unlock()
+	} else {
+		replyBkt.mu.Lock()
+		defer replyBkt.mu.Unlock()
+		tupleBkt.mu.Lock()
+		defer tupleBkt.mu.Unlock()
+	}
+	insertConn(tupleBkt, replyBkt, conn) // Both bucket locks are held here; the deferred unlocks run on return.
+}
+
+// TODO(https://gvisor.dev/issue/6590): annotate r/w locking requirements.
+// +checklocks:tupleBkt.mu
+// +checklocks:replyBkt.mu
+func insertConn(tupleBkt *bucket, replyBkt *bucket, conn *conn) {
 	// Now that we hold the locks, ensure the tuple hasn't been inserted by
 	// another thread.
 	// TODO(gvisor.dev/issue/5773): Should check conn.reply.tupleID, too?
 	alreadyInserted := false
-	for other := ct.buckets[tupleBucket].tuples.Front(); other != nil; other = other.Next() {
+	for other := tupleBkt.tuples.Front(); other != nil; other = other.Next() {
 		if other.tupleID == conn.original.tupleID {
 			alreadyInserted = true
 			break
@@ -373,14 +397,8 @@ func (ct *ConnTrack) insertConn(conn *conn) {
 
 	if !alreadyInserted {
 		// Add the tuple to the map.
-		ct.buckets[tupleBucket].tuples.PushFront(&conn.original)
-		ct.buckets[replyBucket].tuples.PushFront(&conn.reply)
-	}
-
-	// Unlocking can happen in any order.
-	ct.buckets[tupleBucket].mu.Unlock()
-	if tupleBucket != replyBucket {
-		ct.buckets[replyBucket].mu.Unlock() // +checklocksforce
+		tupleBkt.tuples.PushFront(&conn.original)
+		replyBkt.tuples.PushFront(&conn.reply)
 	}
 }
 
@@ -529,8 +547,10 @@ func (ct *ConnTrack) maybeInsertNoop(pkt *PacketBuffer, hook Hook) {
 		return
 	}
 	conn := newConn(tid, tid.reply(), manipNone, hook)
-	conn.updateLocked(pkt, hook)
 	ct.insertConn(conn)
+	conn.mu.Lock()
+	defer conn.mu.Unlock()
+	conn.updateLocked(pkt, hook)
 }
 
 // bucket gets the conntrack bucket for a tupleID.
@@ -582,14 +602,15 @@ func (ct *ConnTrack) reapUnused(start int, prevInterval time.Duration) (int, tim
 	defer ct.mu.RUnlock()
 	for i := 0; i < len(ct.buckets)/fractionPerReaping; i++ {
 		idx = (i + start) % len(ct.buckets)
-		ct.buckets[idx].mu.Lock()
-		for tuple := ct.buckets[idx].tuples.Front(); tuple != nil; tuple = tuple.Next() {
+		bkt := &ct.buckets[idx]
+		bkt.mu.Lock()
+		for tuple := bkt.tuples.Front(); tuple != nil; tuple = tuple.Next() {
 			checked++
-			if ct.reapTupleLocked(tuple, idx, now) {
+			if ct.reapTupleLocked(tuple, idx, bkt, now) {
 				expired++
 			}
 		}
-		ct.buckets[idx].mu.Unlock()
+		bkt.mu.Unlock()
 	}
 	// We already checked buckets[idx].
 	idx++
@@ -614,41 +635,45 @@ func (ct *ConnTrack) reapUnused(start int, prevInterval time.Duration) (int, tim
 // reapTupleLocked tries to remove tuple and its reply from the table. It
 // returns whether the tuple's connection has timed out.
 //
-// Preconditions:
-// * ct.mu is locked for reading.
-// * bucket is locked.
-func (ct *ConnTrack) reapTupleLocked(tuple *tuple, bucket int, now time.Time) bool {
+// Precondition: ct.mu is read locked and bkt.mu is locked; bkt is &ct.buckets[bktID].
+// TODO(https://gvisor.dev/issue/6590): annotate r/w locking requirements.
+// +checklocks:ct.mu
+// +checklocks:bkt.mu
+func (ct *ConnTrack) reapTupleLocked(tuple *tuple, bktID int, bkt *bucket, now time.Time) bool {
 	if !tuple.conn.timedOut(now) {
 		return false
 	}
 
 	// To maintain lock order, we can only reap these tuples if the reply
 	// appears later in the table.
-	replyBucket := ct.bucket(tuple.reply())
-	if bucket > replyBucket {
+	replyBktID := ct.bucket(tuple.reply())
+	if bktID > replyBktID { // Reported as timed out, but nothing is removed on this path.
 		return true
 	}
 
 	// Don't re-lock if both tuples are in the same bucket.
-	differentBuckets := bucket != replyBucket
-	if differentBuckets {
-		ct.buckets[replyBucket].mu.Lock()
+	if bktID != replyBktID {
+		replyBkt := &ct.buckets[replyBktID]
+		replyBkt.mu.Lock()
+		removeConnFromBucket(replyBkt, tuple)
+		replyBkt.mu.Unlock()
+	} else {
+		removeConnFromBucket(bkt, tuple)
 	}
 
 	// We have the buckets locked and can remove both tuples.
+	bkt.tuples.Remove(tuple) // The counterpart tuple was already removed above; only bkt.mu (held by the caller) is still locked.
+	return true
+}
+
+// TODO(https://gvisor.dev/issue/6590): annotate r/w locking requirements.
+// +checklocks:b.mu
+func removeConnFromBucket(b *bucket, tuple *tuple) {
 	if tuple.direction == dirOriginal {
-		ct.buckets[replyBucket].tuples.Remove(&tuple.conn.reply)
+		b.tuples.Remove(&tuple.conn.reply) // Only tuple's counterpart is unlinked from b, never tuple itself: here, the reply.
 	} else {
-		ct.buckets[replyBucket].tuples.Remove(&tuple.conn.original)
+		b.tuples.Remove(&tuple.conn.original) // tuple is the reply direction, so unlink the connection's original tuple from b.
 	}
-	ct.buckets[bucket].tuples.Remove(tuple)
-
-	// Don't re-unlock if both tuples are in the same bucket.
-	if differentBuckets {
-		ct.buckets[replyBucket].mu.Unlock() // +checklocksforce
-	}
-
-	return true
 }
 
 func (ct *ConnTrack) originalDst(epID TransportEndpointID, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber) (tcpip.Address, uint16, tcpip.Error) {
diff --git a/pkg/tcpip/stack/iptables_state.go b/pkg/tcpip/stack/iptables_state.go
index 529e02a07..3d3c39c20 100644
--- a/pkg/tcpip/stack/iptables_state.go
+++ b/pkg/tcpip/stack/iptables_state.go
@@ -26,11 +26,15 @@ type unixTime struct {
 
 // saveLastUsed is invoked by stateify.
 func (cn *conn) saveLastUsed() unixTime {
+	cn.mu.Lock() // lastUsed is annotated +checklocks:mu, so it is read under the lock.
+	defer cn.mu.Unlock()
 	return unixTime{cn.lastUsed.Unix(), cn.lastUsed.UnixNano()}
 }
 
 // loadLastUsed is invoked by stateify.
 func (cn *conn) loadLastUsed(unix unixTime) {
+	cn.mu.Lock() // lastUsed is annotated +checklocks:mu, so it is written under the lock.
+	defer cn.mu.Unlock()
-	cn.lastUsed = time.Unix(unix.second, unix.nano)
+	cn.lastUsed = time.Unix(0, unix.nano) // saveLastUsed stores the full UnixNano in nano; adding unix.second as well would count the seconds twice.
 }
 
diff --git a/pkg/tcpip/stack/iptables_types.go b/pkg/tcpip/stack/iptables_types.go
index 976194124..50f73f173 100644
--- a/pkg/tcpip/stack/iptables_types.go
+++ b/pkg/tcpip/stack/iptables_types.go
@@ -81,17 +81,6 @@ const (
 //
 // +stateify savable
 type IPTables struct {
-	// mu protects v4Tables, v6Tables, and modified.
-	mu sync.RWMutex
-	// v4Tables and v6tables map tableIDs to tables. They hold builtin
-	// tables only, not user tables. mu must be locked for accessing.
-	v4Tables [NumTables]Table
-	v6Tables [NumTables]Table
-	// modified is whether tables have been modified at least once. It is
-	// used to elide the iptables performance overhead for workloads that
-	// don't utilize iptables.
-	modified bool
-
 	// priorities maps each hook to a list of table names. The order of the
 	// list is the order in which each table should be visited for that
 	// hook. It is immutable.
@@ -101,6 +90,21 @@ type IPTables struct {
 
 	// reaperDone can be signaled to stop the reaper goroutine.
 	reaperDone chan struct{}
+
+	mu sync.RWMutex
+	// v4Tables and v6tables map tableIDs to tables. They hold builtin
+	// tables only, not user tables.
+	//
+	// +checklocks:mu
+	v4Tables [NumTables]Table
+	// +checklocks:mu
+	v6Tables [NumTables]Table
+	// modified is whether tables have been modified at least once. It is
+	// used to elide the iptables performance overhead for workloads that
+	// don't utilize iptables.
+	//
+	// +checklocks:mu
+	modified bool
 }
 
 // VisitTargets traverses all the targets of all tables and replaces each with