author | Adin Scannell <ascannell@google.com> | 2021-07-01 15:05:28 -0700 |
---|---|---|
committer | gVisor bot <gvisor-bot@google.com> | 2021-07-01 15:07:56 -0700 |
commit | 16b751b6c610ec2c5a913cb8a818e9239ee7da71 (patch) | |
tree | 5596ea010c6afbbe79d1196197cd4bfc5d517e79 /pkg | |
parent | 570ca571805d6939c4c24b6a88660eefaf558ae7 (diff) |
Mix checklocks and atomic analyzers.
This change makes the checklocks analyzer considerably more powerful, adding:
* The ability to traverse complex structures, e.g. to have multiple nested
fields as part of the annotation.
* The ability to resolve simple anonymous functions and closures, and perform
lock analysis across these invocations. This does not apply to closures that
are passed elsewhere, since it is not possible to know the context in which
they might be invoked.
* The ability to annotate return values in addition to receivers and other
parameters, with the same complex structures noted above (see the annotation
sketch after this list).
* Ignoring locking semantics for "fresh" objects, i.e. objects that are
allocated in the local frame (typically a new-style function).
* Sanity checking of locking state across block transitions and returns, to
ensure that no unexpected locks are held.
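For illustration, a minimal sketch of the annotation forms this enables; the
type and method names here are hypothetical, invented only for the example,
while the real annotations added by this change are visible in the diff below.

    package main

    import "sync"

    // guarded is a hypothetical type used only to illustrate the annotations.
    type guarded struct {
        mu    sync.Mutex
        child *guarded
        data  int
    }

    // setData requires g.child.mu to be held by the caller; the annotation
    // traverses a nested field of the receiver.
    // +checklocks:g.child.mu
    func (g *guarded) setData(v int) {
        g.child.data = v
    }

    // lockChild returns with g.child.mu held on behalf of the caller.
    // +checklocksacquire:g.child.mu
    func (g *guarded) lockChild() {
        g.child.mu.Lock()
    }

    // unlockChild is the inverse of lockChild.
    // +checklocksrelease:g.child.mu
    func (g *guarded) unlockChild() {
        g.child.mu.Unlock()
    }

    // lockLarger locks and returns whichever object holds more data; the
    // annotation names the named return value rather than a parameter.
    // +checklocksacquire:r.mu
    func lockLarger(a, b *guarded) (r *guarded) {
        r = a
        if b.data > a.data {
            r = b
        }
        r.mu.Lock()
        return r
    }

    func main() {
        g := &guarded{child: &guarded{}}
        g.lockChild()
        g.setData(42)
        g.unlockChild()
    }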
Note that initially, most of these findings are excluded by a comprehensive
nogo.yaml. The findings that are included are fundamental lock violations.
The changes here should be relatively low risk: minor refactorings that either
add the necessary annotations or simplify the code structure (in general,
removing closures in favor of methods, as sketched below) so that the analyzer
can easily track the lock state.
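As a concrete (if simplified) example of that refactoring pattern, modeled on
the Dirent.lockDirectory change in the diff below: a helper that returns an
unlock closure is replaced by a pair of annotated methods, since the analyzer
cannot follow lock state through a returned closure but can verify annotated
calls at each call site. The dirent type and the "old" helper are hypothetical
stand-ins; the real change also covers renameMu and dirMu.

    package main

    import "sync"

    // dirent is a hypothetical stand-in for fs.Dirent.
    type dirent struct {
        mu sync.Mutex
    }

    // lockDirectoryOld shows the old pattern: the unlock escapes into a
    // returned closure, which the analyzer cannot track.
    func (d *dirent) lockDirectoryOld() func() {
        d.mu.Lock()
        return func() { d.mu.Unlock() }
    }

    // lockDirectory shows the new pattern: an annotated method that leaves
    // d.mu held for the caller.
    // +checklocksacquire:d.mu
    func (d *dirent) lockDirectory() {
        d.mu.Lock()
    }

    // unlockDirectory is the inverse of lockDirectory.
    // +checklocksrelease:d.mu
    func (d *dirent) unlockDirectory() {
        d.mu.Unlock()
    }

    func main() {
        d := &dirent{}

        // Old style: callers defer the returned closure.
        unlock := d.lockDirectoryOld()
        unlock()

        // New style: callers pair the annotated methods directly.
        d.lockDirectory()
        defer d.unlockDirectory()
    }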
This change additionally includes two changes to nogo itself:
* Sanity checking of all types to ensure that the binary and ast-derived
types have a consistent objectpath, to prevent the bug above from occurring
silently (and causing much confusion). This also requires a trick in
order to ensure that serialized facts are consumable downstream. This can
be removed once https://go-review.googlesource.com/c/tools/+/331789 is merged.
* A minor refactoring to isolate the objdump settings into their own package.
This was originally used to implement the sanity check above, but that
information is now being passed another way. The minor refactor is preserved,
however, since it cleans up the code slightly and carries minimal risk.
PiperOrigin-RevId: 382613300
Diffstat (limited to 'pkg')
36 files changed, 458 insertions, 332 deletions
diff --git a/pkg/sentry/fs/dirent.go b/pkg/sentry/fs/dirent.go index 3a45e9041..8d7660e79 100644 --- a/pkg/sentry/fs/dirent.go +++ b/pkg/sentry/fs/dirent.go @@ -488,11 +488,11 @@ func (d *Dirent) walk(ctx context.Context, root *Dirent, name string, walkMayUnl // Slow path: load the InodeOperations into memory. Since this is a hot path and the lookup may be // expensive, if possible release the lock and re-acquire it. if walkMayUnlock { - d.mu.Unlock() + d.mu.Unlock() // +checklocksforce: results in an inconsistent block. } c, err := d.Inode.Lookup(ctx, name) if walkMayUnlock { - d.mu.Lock() + d.mu.Lock() // +checklocksforce: see above. } // No dice. if err != nil { @@ -594,21 +594,27 @@ func (d *Dirent) exists(ctx context.Context, root *Dirent, name string) bool { // lockDirectory should be called for any operation that changes this `d`s // children (creating or removing them). -func (d *Dirent) lockDirectory() func() { +// +checklocksacquire:d.dirMu +// +checklocksacquire:d.mu +func (d *Dirent) lockDirectory() { renameMu.RLock() d.dirMu.Lock() d.mu.Lock() - return func() { - d.mu.Unlock() - d.dirMu.Unlock() - renameMu.RUnlock() - } +} + +// unlockDirectory is the reverse of lockDirectory. +// +checklocksrelease:d.dirMu +// +checklocksrelease:d.mu +func (d *Dirent) unlockDirectory() { + d.mu.Unlock() + d.dirMu.Unlock() + renameMu.RUnlock() // +checklocksforce: see lockDirectory. } // Create creates a new regular file in this directory. func (d *Dirent) Create(ctx context.Context, root *Dirent, name string, flags FileFlags, perms FilePermissions) (*File, error) { - unlock := d.lockDirectory() - defer unlock() + d.lockDirectory() + defer d.unlockDirectory() // Does something already exist? if d.exists(ctx, root, name) { @@ -670,8 +676,8 @@ func (d *Dirent) finishCreate(ctx context.Context, child *Dirent, name string) { // genericCreate executes create if name does not exist. Removes a negative Dirent at name if // create succeeds. func (d *Dirent) genericCreate(ctx context.Context, root *Dirent, name string, create func() error) error { - unlock := d.lockDirectory() - defer unlock() + d.lockDirectory() + defer d.unlockDirectory() // Does something already exist? if d.exists(ctx, root, name) { @@ -1021,8 +1027,8 @@ func (d *Dirent) Remove(ctx context.Context, root *Dirent, name string, dirPath panic("Dirent.Remove: root must not be nil") } - unlock := d.lockDirectory() - defer unlock() + d.lockDirectory() + defer d.unlockDirectory() // Try to walk to the node. child, err := d.walk(ctx, root, name, false /* may unlock */) @@ -1082,8 +1088,8 @@ func (d *Dirent) RemoveDirectory(ctx context.Context, root *Dirent, name string) panic("Dirent.Remove: root must not be nil") } - unlock := d.lockDirectory() - defer unlock() + d.lockDirectory() + defer d.unlockDirectory() // Check for dots. if name == "." { @@ -1259,17 +1265,15 @@ func (d *Dirent) dropExtendedReference() { d.Inode.MountSource.fscache.Remove(d) } -// lockForRename takes locks on oldParent and newParent as required by Rename -// and returns a function that will unlock the locks taken. The returned -// function must be called even if a non-nil error is returned. -func lockForRename(oldParent *Dirent, oldName string, newParent *Dirent, newName string) (func(), error) { +// lockForRename takes locks on oldParent and newParent as required by Rename. +// On return, unlockForRename must always be called, even with an error. 
+// +checklocksacquire:oldParent.mu +// +checklocksacquire:newParent.mu +func lockForRename(oldParent *Dirent, oldName string, newParent *Dirent, newName string) error { renameMu.Lock() if oldParent == newParent { oldParent.mu.Lock() - return func() { - oldParent.mu.Unlock() - renameMu.Unlock() - }, nil + return nil // +checklocksforce: only one lock exists. } // Renaming between directories is a bit subtle: @@ -1297,11 +1301,7 @@ func lockForRename(oldParent *Dirent, oldName string, newParent *Dirent, newName // itself. err = unix.EINVAL } - return func() { - newParent.mu.Unlock() - oldParent.mu.Unlock() - renameMu.Unlock() - }, err + return err } child = p } @@ -1310,11 +1310,21 @@ func lockForRename(oldParent *Dirent, oldName string, newParent *Dirent, newName // have no relationship; in either case we can do this: newParent.mu.Lock() oldParent.mu.Lock() - return func() { + return nil +} + +// unlockForRename is the opposite of lockForRename. +// +checklocksrelease:oldParent.mu +// +checklocksrelease:newParent.mu +func unlockForRename(oldParent, newParent *Dirent) { + if oldParent == newParent { oldParent.mu.Unlock() - newParent.mu.Unlock() - renameMu.Unlock() - }, nil + renameMu.Unlock() // +checklocksforce: only one lock exists. + return + } + newParent.mu.Unlock() + oldParent.mu.Unlock() + renameMu.Unlock() // +checklocksforce: not tracked. } func (d *Dirent) checkSticky(ctx context.Context, victim *Dirent) error { @@ -1353,8 +1363,8 @@ func (d *Dirent) MayDelete(ctx context.Context, root *Dirent, name string) error return err } - unlock := d.lockDirectory() - defer unlock() + d.lockDirectory() + defer d.unlockDirectory() victim, err := d.walk(ctx, root, name, true /* may unlock */) if err != nil { @@ -1392,8 +1402,8 @@ func Rename(ctx context.Context, root *Dirent, oldParent *Dirent, oldName string } // Acquire global renameMu lock, and mu locks on oldParent/newParent. - unlock, err := lockForRename(oldParent, oldName, newParent, newName) - defer unlock() + err := lockForRename(oldParent, oldName, newParent, newName) + defer unlockForRename(oldParent, newParent) if err != nil { return err } diff --git a/pkg/sentry/fs/fs.go b/pkg/sentry/fs/fs.go index 44587bb37..a346c316b 100644 --- a/pkg/sentry/fs/fs.go +++ b/pkg/sentry/fs/fs.go @@ -80,23 +80,33 @@ func AsyncBarrier() { // Async executes a function asynchronously. // // Async must not be called recursively. +// +checklocksignore func Async(f func()) { workMu.RLock() - go func() { // S/R-SAFE: AsyncBarrier must be called. - defer workMu.RUnlock() // Ensure RUnlock in case of panic. - f() - }() + go asyncWork(f) // S/R-SAFE: AsyncBarrier must be called. +} + +// +checklocksignore +func asyncWork(f func()) { + // Ensure RUnlock in case of panic. + defer workMu.RUnlock() + f() } // AsyncWithContext is just like Async, except that it calls the asynchronous // function with the given context as argument. This function exists to avoid // needing to allocate an extra function on the heap in a hot path. +// +checklocksignore func AsyncWithContext(ctx context.Context, f func(context.Context)) { workMu.RLock() - go func() { // S/R-SAFE: AsyncBarrier must be called. - defer workMu.RUnlock() // Ensure RUnlock in case of panic. - f(ctx) - }() + go asyncWorkWithContext(ctx, f) +} + +// +checklocksignore +func asyncWorkWithContext(ctx context.Context, f func(context.Context)) { + // Ensure RUnlock in case of panic. 
+ defer workMu.RUnlock() + f(ctx) } // AsyncErrorBarrier waits for all outstanding asynchronous work to complete, or diff --git a/pkg/sentry/fs/gofer/inode_state.go b/pkg/sentry/fs/gofer/inode_state.go index e2af1d2ae..19f91f010 100644 --- a/pkg/sentry/fs/gofer/inode_state.go +++ b/pkg/sentry/fs/gofer/inode_state.go @@ -112,13 +112,6 @@ func (i *inodeFileState) loadLoading(_ struct{}) { // +checklocks:i.loading func (i *inodeFileState) afterLoad() { load := func() (err error) { - // See comment on i.loading(). - defer func() { - if err == nil { - i.loading.Unlock() - } - }() - // Manually restore the p9.File. name, ok := i.s.inodeMappings[i.sattr.InodeID] if !ok { @@ -167,6 +160,9 @@ func (i *inodeFileState) afterLoad() { i.savedUAttr = nil } + // See comment on i.loading(). This only unlocks on the + // non-error path. + i.loading.Unlock() // +checklocksforce: per comment. return nil } diff --git a/pkg/sentry/fs/gofer/path.go b/pkg/sentry/fs/gofer/path.go index aa2405f68..958f46bd6 100644 --- a/pkg/sentry/fs/gofer/path.go +++ b/pkg/sentry/fs/gofer/path.go @@ -47,7 +47,8 @@ func (i *inodeOperations) Lookup(ctx context.Context, dir *fs.Inode, name string return nil, linuxerr.ENAMETOOLONG } - cp := i.session().cachePolicy + s := i.session() + cp := s.cachePolicy if cp.cacheReaddir() { // Check to see if we have readdirCache that indicates the // child does not exist. Avoid holding readdirMu longer than @@ -78,7 +79,7 @@ func (i *inodeOperations) Lookup(ctx context.Context, dir *fs.Inode, name string return nil, err } - if i.session().overrides != nil { + if s.overrides != nil { // Check if file belongs to a internal named pipe. Note that it doesn't need // to check for sockets because it's done in newInodeOperations below. deviceKey := device.MultiDeviceKey{ @@ -86,13 +87,13 @@ func (i *inodeOperations) Lookup(ctx context.Context, dir *fs.Inode, name string SecondaryDevice: i.session().connID, Inode: qids[0].Path, } - unlock := i.session().overrides.lock() - if pipeInode := i.session().overrides.getPipe(deviceKey); pipeInode != nil { - unlock() + s.overrides.lock() + if pipeInode := s.overrides.getPipe(deviceKey); pipeInode != nil { + s.overrides.unlock() pipeInode.IncRef() return fs.NewDirent(ctx, pipeInode, name), nil } - unlock() + s.overrides.unlock() } // Construct the Inode operations. @@ -221,17 +222,20 @@ func (i *inodeOperations) CreateHardLink(ctx context.Context, inode *fs.Inode, t if err := i.fileState.file.link(ctx, &targetOpts.fileState.file, newName); err != nil { return err } - if i.session().cachePolicy.cacheUAttrs(inode) { + + s := i.session() + if s.cachePolicy.cacheUAttrs(inode) { // Increase link count. targetOpts.cachingInodeOps.IncLinks(ctx) } + i.touchModificationAndStatusChangeTime(ctx, inode) return nil } // CreateDirectory uses Create to create a directory named s under inodeOperations. 
-func (i *inodeOperations) CreateDirectory(ctx context.Context, dir *fs.Inode, s string, perm fs.FilePermissions) error { - if len(s) > maxFilenameLen { +func (i *inodeOperations) CreateDirectory(ctx context.Context, dir *fs.Inode, name string, perm fs.FilePermissions) error { + if len(name) > maxFilenameLen { return linuxerr.ENAMETOOLONG } @@ -247,16 +251,18 @@ func (i *inodeOperations) CreateDirectory(ctx context.Context, dir *fs.Inode, s perm.SetGID = true } - if _, err := i.fileState.file.mkdir(ctx, s, p9.FileMode(perm.LinuxMode()), p9.UID(owner.UID), p9.GID(owner.GID)); err != nil { + if _, err := i.fileState.file.mkdir(ctx, name, p9.FileMode(perm.LinuxMode()), p9.UID(owner.UID), p9.GID(owner.GID)); err != nil { return err } - if i.session().cachePolicy.cacheUAttrs(dir) { + + s := i.session() + if s.cachePolicy.cacheUAttrs(dir) { // Increase link count. // // N.B. This will update the modification time. i.cachingInodeOps.IncLinks(ctx) } - if i.session().cachePolicy.cacheReaddir() { + if s.cachePolicy.cacheReaddir() { // Invalidate readdir cache. i.markDirectoryDirty() } @@ -269,13 +275,14 @@ func (i *inodeOperations) Bind(ctx context.Context, dir *fs.Inode, name string, return nil, linuxerr.ENAMETOOLONG } - if i.session().overrides == nil { + s := i.session() + if s.overrides == nil { return nil, syserror.EOPNOTSUPP } // Stabilize the override map while creation is in progress. - unlock := i.session().overrides.lock() - defer unlock() + s.overrides.lock() + defer s.overrides.unlock() sattr, iops, err := i.createEndpointFile(ctx, dir, name, perm, p9.ModeSocket) if err != nil { @@ -284,7 +291,7 @@ func (i *inodeOperations) Bind(ctx context.Context, dir *fs.Inode, name string, // Construct the positive Dirent. childDir := fs.NewDirent(ctx, fs.NewInode(ctx, iops, dir.MountSource, sattr), name) - i.session().overrides.addBoundEndpoint(iops.fileState.key, childDir, ep) + s.overrides.addBoundEndpoint(iops.fileState.key, childDir, ep) return childDir, nil } @@ -298,8 +305,9 @@ func (i *inodeOperations) CreateFifo(ctx context.Context, dir *fs.Inode, name st mode := p9.FileMode(perm.LinuxMode()) | p9.ModeNamedPipe // N.B. FIFOs use major/minor numbers 0. + s := i.session() if _, err := i.fileState.file.mknod(ctx, name, mode, 0, 0, p9.UID(owner.UID), p9.GID(owner.GID)); err != nil { - if i.session().overrides == nil || !linuxerr.Equals(linuxerr.EPERM, err) { + if s.overrides == nil || !linuxerr.Equals(linuxerr.EPERM, err) { return err } // If gofer doesn't support mknod, check if we can create an internal fifo. @@ -311,13 +319,14 @@ func (i *inodeOperations) CreateFifo(ctx context.Context, dir *fs.Inode, name st } func (i *inodeOperations) createInternalFifo(ctx context.Context, dir *fs.Inode, name string, owner fs.FileOwner, perm fs.FilePermissions) error { - if i.session().overrides == nil { + s := i.session() + if s.overrides == nil { return linuxerr.EPERM } // Stabilize the override map while creation is in progress. - unlock := i.session().overrides.lock() - defer unlock() + s.overrides.lock() + defer s.overrides.unlock() sattr, fileOps, err := i.createEndpointFile(ctx, dir, name, perm, p9.ModeNamedPipe) if err != nil { @@ -336,7 +345,7 @@ func (i *inodeOperations) createInternalFifo(ctx context.Context, dir *fs.Inode, // Construct the positive Dirent. 
childDir := fs.NewDirent(ctx, fs.NewInode(ctx, iops, dir.MountSource, sattr), name) - i.session().overrides.addPipe(fileOps.fileState.key, childDir, inode) + s.overrides.addPipe(fileOps.fileState.key, childDir, inode) return nil } @@ -386,8 +395,9 @@ func (i *inodeOperations) Remove(ctx context.Context, dir *fs.Inode, name string return linuxerr.ENAMETOOLONG } + s := i.session() var key *device.MultiDeviceKey - if i.session().overrides != nil { + if s.overrides != nil { // Find out if file being deleted is a socket or pipe that needs to be // removed from endpoint map. if d, err := i.Lookup(ctx, dir, name); err == nil { @@ -402,8 +412,8 @@ func (i *inodeOperations) Remove(ctx context.Context, dir *fs.Inode, name string } // Stabilize the override map while deletion is in progress. - unlock := i.session().overrides.lock() - defer unlock() + s.overrides.lock() + defer s.overrides.unlock() } } } @@ -412,7 +422,7 @@ func (i *inodeOperations) Remove(ctx context.Context, dir *fs.Inode, name string return err } if key != nil { - i.session().overrides.remove(ctx, *key) + s.overrides.remove(ctx, *key) } i.touchModificationAndStatusChangeTime(ctx, dir) @@ -429,11 +439,13 @@ func (i *inodeOperations) RemoveDirectory(ctx context.Context, dir *fs.Inode, na if err := i.fileState.file.unlinkAt(ctx, name, 0x200); err != nil { return err } - if i.session().cachePolicy.cacheUAttrs(dir) { + + s := i.session() + if s.cachePolicy.cacheUAttrs(dir) { // Decrease link count and updates atime. i.cachingInodeOps.DecLinks(ctx) } - if i.session().cachePolicy.cacheReaddir() { + if s.cachePolicy.cacheReaddir() { // Invalidate readdir cache. i.markDirectoryDirty() } @@ -463,12 +475,13 @@ func (i *inodeOperations) Rename(ctx context.Context, inode *fs.Inode, oldParent } // Is the renamed entity a directory? Fix link counts. + s := i.session() if fs.IsDir(i.fileState.sattr) { // Update cached state. - if i.session().cachePolicy.cacheUAttrs(oldParent) { + if s.cachePolicy.cacheUAttrs(oldParent) { oldParentInodeOperations.cachingInodeOps.DecLinks(ctx) } - if i.session().cachePolicy.cacheUAttrs(newParent) { + if s.cachePolicy.cacheUAttrs(newParent) { // Only IncLinks if there is a new addition to // newParent. If this is replacement, then the total // count remains the same. @@ -477,7 +490,7 @@ func (i *inodeOperations) Rename(ctx context.Context, inode *fs.Inode, oldParent } } } - if i.session().cachePolicy.cacheReaddir() { + if s.cachePolicy.cacheReaddir() { // Mark old directory dirty. oldParentInodeOperations.markDirectoryDirty() if oldParent != newParent { @@ -487,17 +500,18 @@ func (i *inodeOperations) Rename(ctx context.Context, inode *fs.Inode, oldParent } // Rename always updates ctime. - if i.session().cachePolicy.cacheUAttrs(inode) { + if s.cachePolicy.cacheUAttrs(inode) { i.cachingInodeOps.TouchStatusChangeTime(ctx) } return nil } func (i *inodeOperations) touchModificationAndStatusChangeTime(ctx context.Context, inode *fs.Inode) { - if i.session().cachePolicy.cacheUAttrs(inode) { + s := i.session() + if s.cachePolicy.cacheUAttrs(inode) { i.cachingInodeOps.TouchModificationAndStatusChangeTime(ctx) } - if i.session().cachePolicy.cacheReaddir() { + if s.cachePolicy.cacheReaddir() { // Invalidate readdir cache. 
i.markDirectoryDirty() } diff --git a/pkg/sentry/fs/gofer/session.go b/pkg/sentry/fs/gofer/session.go index 7cf3522ff..b7debeecb 100644 --- a/pkg/sentry/fs/gofer/session.go +++ b/pkg/sentry/fs/gofer/session.go @@ -98,9 +98,14 @@ func (e *overrideMaps) remove(ctx context.Context, key device.MultiDeviceKey) { // lock blocks other addition and removal operations from happening while // the backing file is being created or deleted. Returns a function that unlocks // the endpoint map. -func (e *overrideMaps) lock() func() { +// +checklocksacquire:e.mu +func (e *overrideMaps) lock() { e.mu.Lock() - return func() { e.mu.Unlock() } +} + +// +checklocksrelease:e.mu +func (e *overrideMaps) unlock() { + e.mu.Unlock() } // getBoundEndpoint returns the bound endpoint mapped to the given key. @@ -366,8 +371,8 @@ func newOverrideMaps() *overrideMaps { // fillKeyMap populates key and dirent maps upon restore from saved pathmap. func (s *session) fillKeyMap(ctx context.Context) error { - unlock := s.overrides.lock() - defer unlock() + s.overrides.lock() + defer s.overrides.unlock() for ep, dirPath := range s.overrides.pathMap { _, file, err := s.attach.walk(ctx, splitAbsolutePath(dirPath)) @@ -394,8 +399,8 @@ func (s *session) fillKeyMap(ctx context.Context) error { // fillPathMap populates paths for overrides from dirents in direntMap // before save. func (s *session) fillPathMap(ctx context.Context) error { - unlock := s.overrides.lock() - defer unlock() + s.overrides.lock() + defer s.overrides.unlock() for _, endpoint := range s.overrides.keyMap { mountRoot := endpoint.dirent.MountRoot() diff --git a/pkg/sentry/fs/gofer/socket.go b/pkg/sentry/fs/gofer/socket.go index 8a1c69ac2..1fd8a0910 100644 --- a/pkg/sentry/fs/gofer/socket.go +++ b/pkg/sentry/fs/gofer/socket.go @@ -32,10 +32,11 @@ func (i *inodeOperations) BoundEndpoint(inode *fs.Inode, path string) transport. return nil } - if i.session().overrides != nil { - unlock := i.session().overrides.lock() - defer unlock() - ep := i.session().overrides.getBoundEndpoint(i.fileState.key) + s := i.session() + if s.overrides != nil { + s.overrides.lock() + defer s.overrides.unlock() + ep := s.overrides.getBoundEndpoint(i.fileState.key) if ep != nil { return ep } diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go index 237d17921..652e5fe77 100644 --- a/pkg/sentry/fsimpl/gofer/filesystem.go +++ b/pkg/sentry/fsimpl/gofer/filesystem.go @@ -147,6 +147,7 @@ func putDentrySlice(ds *[]*dentry) { // but dentry slices are allocated lazily, and it's much easier to say "defer // fs.renameMuRUnlockAndCheckCaching(&ds)" than "defer func() { // fs.renameMuRUnlockAndCheckCaching(ds) }()" to work around this. +// +checklocksrelease:fs.renameMu func (fs *filesystem) renameMuRUnlockAndCheckCaching(ctx context.Context, dsp **[]*dentry) { fs.renameMu.RUnlock() if *dsp == nil { @@ -159,6 +160,7 @@ func (fs *filesystem) renameMuRUnlockAndCheckCaching(ctx context.Context, dsp ** putDentrySlice(*dsp) } +// +checklocksrelease:fs.renameMu func (fs *filesystem) renameMuUnlockAndCheckCaching(ctx context.Context, ds **[]*dentry) { if *ds == nil { fs.renameMu.Unlock() @@ -540,7 +542,7 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b if child.syntheticChildren != 0 { // This is definitely not an empty directory, irrespective of // fs.opts.interop. - vfsObj.AbortDeleteDentry(&child.vfsd) + vfsObj.AbortDeleteDentry(&child.vfsd) // +checklocksforce: PrepareDeleteDentry called if child != nil. 
return linuxerr.ENOTEMPTY } // If InteropModeShared is in effect and the first call to @@ -550,12 +552,12 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b // still exist) would be a waste of time. if child.cachedMetadataAuthoritative() { if !child.isDir() { - vfsObj.AbortDeleteDentry(&child.vfsd) + vfsObj.AbortDeleteDentry(&child.vfsd) // +checklocksforce: see above. return syserror.ENOTDIR } for _, grandchild := range child.children { if grandchild != nil { - vfsObj.AbortDeleteDentry(&child.vfsd) + vfsObj.AbortDeleteDentry(&child.vfsd) // +checklocksforce: see above. return linuxerr.ENOTEMPTY } } @@ -565,12 +567,12 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b } else { // child must be a non-directory file. if child != nil && child.isDir() { - vfsObj.AbortDeleteDentry(&child.vfsd) + vfsObj.AbortDeleteDentry(&child.vfsd) // +checklocksforce: see above. return syserror.EISDIR } if rp.MustBeDir() { if child != nil { - vfsObj.AbortDeleteDentry(&child.vfsd) + vfsObj.AbortDeleteDentry(&child.vfsd) // +checklocksforce: see above. } return syserror.ENOTDIR } @@ -583,7 +585,7 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b err = parent.file.unlinkAt(ctx, name, flags) if err != nil { if child != nil { - vfsObj.AbortDeleteDentry(&child.vfsd) + vfsObj.AbortDeleteDentry(&child.vfsd) // +checklocksforce: see above. } return err } @@ -601,7 +603,7 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b } if child != nil { - vfsObj.CommitDeleteDentry(ctx, &child.vfsd) + vfsObj.CommitDeleteDentry(ctx, &child.vfsd) // +checklocksforce: see above. child.setDeleted() if child.isSynthetic() { parent.syntheticChildren-- diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go index fe4c2e0e1..2f85215d9 100644 --- a/pkg/sentry/fsimpl/gofer/gofer.go +++ b/pkg/sentry/fsimpl/gofer/gofer.go @@ -947,10 +947,10 @@ func (d *dentry) cachedMetadataAuthoritative() bool { // updateFromP9Attrs is called to update d's metadata after an update from the // remote filesystem. // Precondition: d.metadataMu must be locked. +// +checklocks:d.metadataMu func (d *dentry) updateFromP9AttrsLocked(mask p9.AttrMask, attr *p9.Attr) { if mask.Mode { if got, want := uint32(attr.Mode.FileType()), d.fileType(); got != want { - d.metadataMu.Unlock() panic(fmt.Sprintf("gofer.dentry file type changed from %#o to %#o", want, got)) } atomic.StoreUint32(&d.mode, uint32(attr.Mode)) @@ -989,6 +989,7 @@ func (d *dentry) updateFromP9AttrsLocked(mask p9.AttrMask, attr *p9.Attr) { // Preconditions: !d.isSynthetic(). // Preconditions: d.metadataMu is locked. +// +checklocks:d.metadataMu func (d *dentry) refreshSizeLocked(ctx context.Context) error { d.handleMu.RLock() @@ -1020,6 +1021,7 @@ func (d *dentry) updateFromGetattr(ctx context.Context) error { // Preconditions: // * !d.isSynthetic(). // * d.metadataMu is locked. +// +checklocks:d.metadataMu func (d *dentry) updateFromGetattrLocked(ctx context.Context) error { // Use d.readFile or d.writeFile, which represent 9P FIDs that have been // opened, in preference to d.file, which represents a 9P fid that has not. @@ -1044,7 +1046,8 @@ func (d *dentry) updateFromGetattrLocked(ctx context.Context) error { _, attrMask, attr, err := file.getAttr(ctx, dentryAttrMask()) if handleMuRLocked { - d.handleMu.RUnlock() // must be released before updateFromP9AttrsLocked() + // handleMu must be released before updateFromP9AttrsLocked(). 
+ d.handleMu.RUnlock() // +checklocksforce: complex case. } if err != nil { return err @@ -1470,7 +1473,7 @@ func (d *dentry) checkCachingLocked(ctx context.Context, renameMuWriteLocked boo if d.isDeleted() { d.watches.HandleDeletion(ctx) } - d.destroyLocked(ctx) + d.destroyLocked(ctx) // +checklocksforce: renameMu must be acquired at this point. return } // If d still has inotify watches and it is not deleted or invalidated, it @@ -1498,7 +1501,7 @@ func (d *dentry) checkCachingLocked(ctx context.Context, renameMuWriteLocked boo delete(d.parent.children, d.name) d.parent.dirMu.Unlock() } - d.destroyLocked(ctx) + d.destroyLocked(ctx) // +checklocksforce: see above. return } @@ -1527,7 +1530,7 @@ func (d *dentry) checkCachingLocked(ctx context.Context, renameMuWriteLocked boo d.fs.renameMu.Lock() defer d.fs.renameMu.Unlock() } - d.fs.evictCachedDentryLocked(ctx) + d.fs.evictCachedDentryLocked(ctx) // +checklocksforce: see above. } } @@ -1544,6 +1547,7 @@ func (d *dentry) removeFromCacheLocked() { // Precondition: fs.renameMu must be locked for writing; it may be temporarily // unlocked. +// +checklocks:fs.renameMu func (fs *filesystem) evictAllCachedDentriesLocked(ctx context.Context) { for fs.cachedDentriesLen != 0 { fs.evictCachedDentryLocked(ctx) @@ -1552,6 +1556,7 @@ func (fs *filesystem) evictAllCachedDentriesLocked(ctx context.Context) { // Preconditions: // * fs.renameMu must be locked for writing; it may be temporarily unlocked. +// +checklocks:fs.renameMu func (fs *filesystem) evictCachedDentryLocked(ctx context.Context) { fs.cacheMu.Lock() victim := fs.cachedDentries.Back() @@ -1588,7 +1593,7 @@ func (fs *filesystem) evictCachedDentryLocked(ctx context.Context) { // will try to acquire fs.renameMu (which we have already acquired). Hence, // fs.renameMu will synchronize the destroy attempts. victim.cachingMu.Unlock() - victim.destroyLocked(ctx) + victim.destroyLocked(ctx) // +checklocksforce: owned as precondition, victim.fs == fs. } // destroyLocked destroys the dentry. @@ -1598,6 +1603,7 @@ func (fs *filesystem) evictCachedDentryLocked(ctx context.Context) { // * d.refs == 0. // * d.parent.children[d.name] != d, i.e. d is not reachable by path traversal // from its former parent dentry. +// +checklocks:d.fs.renameMu func (d *dentry) destroyLocked(ctx context.Context) { switch atomic.LoadInt64(&d.refs) { case 0: diff --git a/pkg/sentry/fsimpl/gofer/revalidate.go b/pkg/sentry/fsimpl/gofer/revalidate.go index 8f81f0822..226790a11 100644 --- a/pkg/sentry/fsimpl/gofer/revalidate.go +++ b/pkg/sentry/fsimpl/gofer/revalidate.go @@ -247,16 +247,16 @@ func (fs *filesystem) revalidateHelper(ctx context.Context, vfsObj *vfs.VirtualF if found && !d.isSynthetic() { // First dentry is where the search is starting, just update attributes // since it cannot be replaced. - d.updateFromP9AttrsLocked(stats[i].Valid, &stats[i].Attr) + d.updateFromP9AttrsLocked(stats[i].Valid, &stats[i].Attr) // +checklocksforce: acquired by lockAllMetadata. } - d.metadataMu.Unlock() + d.metadataMu.Unlock() // +checklocksforce: see above. continue } // Note that synthetic dentries will always fails the comparison check // below. if !found || d.qidPath != stats[i].QID.Path { - d.metadataMu.Unlock() + d.metadataMu.Unlock() // +checklocksforce: see above. if !found && d.isSynthetic() { // We have a synthetic file, and no remote file has arisen to replace // it. @@ -298,7 +298,7 @@ func (fs *filesystem) revalidateHelper(ctx context.Context, vfsObj *vfs.VirtualF } // The file at this path hasn't changed. 
Just update cached metadata. - d.updateFromP9AttrsLocked(stats[i].Valid, &stats[i].Attr) + d.updateFromP9AttrsLocked(stats[i].Valid, &stats[i].Attr) // +checklocksforce: see above. d.metadataMu.Unlock() } @@ -354,6 +354,7 @@ func (r *revalidateState) add(name string, d *dentry) { r.dentries = append(r.dentries, d) } +// +checklocksignore func (r *revalidateState) lockAllMetadata() { for _, d := range r.dentries { d.metadataMu.Lock() @@ -372,6 +373,7 @@ func (r *revalidateState) popFront() *dentry { // reset releases all metadata locks and resets all fields to allow this // instance to be reused. +// +checklocksignore func (r *revalidateState) reset() { if r.locked { // Unlock any remaining dentries. diff --git a/pkg/sentry/fsimpl/gofer/symlink.go b/pkg/sentry/fsimpl/gofer/symlink.go index 2ec819f86..dbd834c67 100644 --- a/pkg/sentry/fsimpl/gofer/symlink.go +++ b/pkg/sentry/fsimpl/gofer/symlink.go @@ -41,7 +41,7 @@ func (d *dentry) readlink(ctx context.Context, mnt *vfs.Mount) (string, error) { d.haveTarget = true d.target = target } - d.dataMu.Unlock() + d.dataMu.Unlock() // +checklocksforce: guaranteed locked from above. } return target, err } diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go index 38c2b6df1..20d2526ad 100644 --- a/pkg/sentry/fsimpl/kernfs/filesystem.go +++ b/pkg/sentry/fsimpl/kernfs/filesystem.go @@ -752,7 +752,7 @@ func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa fs.deferDecRef(replaced) replaceVFSD = replaced.VFSDentry() } - virtfs.CommitRenameReplaceDentry(ctx, srcVFSD, replaceVFSD) + virtfs.CommitRenameReplaceDentry(ctx, srcVFSD, replaceVFSD) // +checklocksforce: to may be nil, that's okay. return nil } @@ -788,7 +788,7 @@ func (fs *Filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error defer mntns.DecRef(ctx) vfsd := d.VFSDentry() if err := virtfs.PrepareDeleteDentry(mntns, vfsd); err != nil { - return err + return err // +checklocksforce: vfsd is not locked. } if err := parentDentry.inode.RmDir(ctx, d.name, d.inode); err != nil { diff --git a/pkg/sentry/fsimpl/overlay/filesystem.go b/pkg/sentry/fsimpl/overlay/filesystem.go index 41207211a..77f9affc1 100644 --- a/pkg/sentry/fsimpl/overlay/filesystem.go +++ b/pkg/sentry/fsimpl/overlay/filesystem.go @@ -87,7 +87,7 @@ func putDentrySlice(ds *[]*dentry) { // fs.renameMuRUnlockAndCheckDrop(&ds)" than "defer func() { // fs.renameMuRUnlockAndCheckDrop(ds) }()" to work around this. // -// +checklocks:fs.renameMu +// +checklocksrelease:fs.renameMu func (fs *filesystem) renameMuRUnlockAndCheckDrop(ctx context.Context, dsp **[]*dentry) { fs.renameMu.RUnlock() if *dsp == nil { @@ -113,7 +113,7 @@ func (fs *filesystem) renameMuRUnlockAndCheckDrop(ctx context.Context, dsp **[]* putDentrySlice(*dsp) } -// +checklocks:fs.renameMu +// +checklocksrelease:fs.renameMu func (fs *filesystem) renameMuUnlockAndCheckDrop(ctx context.Context, ds **[]*dentry) { if *ds == nil { fs.renameMu.Unlock() diff --git a/pkg/sentry/fsimpl/verity/filesystem.go b/pkg/sentry/fsimpl/verity/filesystem.go index e4bfbd3c9..358a66072 100644 --- a/pkg/sentry/fsimpl/verity/filesystem.go +++ b/pkg/sentry/fsimpl/verity/filesystem.go @@ -75,6 +75,7 @@ func putDentrySlice(ds *[]*dentry) { // but dentry slices are allocated lazily, and it's much easier to say "defer // fs.renameMuRUnlockAndCheckDrop(&ds)" than "defer func() { // fs.renameMuRUnlockAndCheckDrop(ds) }()" to work around this. 
+// +checklocksrelease:fs.renameMu func (fs *filesystem) renameMuRUnlockAndCheckDrop(ctx context.Context, ds **[]*dentry) { fs.renameMu.RUnlock() if *ds == nil { @@ -90,6 +91,7 @@ func (fs *filesystem) renameMuRUnlockAndCheckDrop(ctx context.Context, ds **[]*d putDentrySlice(*ds) } +// +checklocksrelease:fs.renameMu func (fs *filesystem) renameMuUnlockAndCheckDrop(ctx context.Context, ds **[]*dentry) { if *ds == nil { fs.renameMu.Unlock() diff --git a/pkg/sentry/kernel/futex/futex.go b/pkg/sentry/kernel/futex/futex.go index 6377abb94..f5c364c96 100644 --- a/pkg/sentry/kernel/futex/futex.go +++ b/pkg/sentry/kernel/futex/futex.go @@ -398,8 +398,8 @@ func (m *Manager) Fork() *Manager { } // lockBucket returns a locked bucket for the given key. -func (m *Manager) lockBucket(k *Key) *bucket { - var b *bucket +// +checklocksacquire:b.mu +func (m *Manager) lockBucket(k *Key) (b *bucket) { if k.Kind == KindSharedMappable { b = m.sharedBucket } else { @@ -410,7 +410,9 @@ func (m *Manager) lockBucket(k *Key) *bucket { } // lockBuckets returns locked buckets for the given keys. -func (m *Manager) lockBuckets(k1, k2 *Key) (*bucket, *bucket) { +// +checklocksacquire:b1.mu +// +checklocksacquire:b2.mu +func (m *Manager) lockBuckets(k1, k2 *Key) (b1 *bucket, b2 *bucket) { // Buckets must be consistently ordered to avoid circular lock // dependencies. We order buckets in m.privateBuckets by index (lowest // index first), and all buckets in m.privateBuckets precede @@ -420,8 +422,8 @@ func (m *Manager) lockBuckets(k1, k2 *Key) (*bucket, *bucket) { if k1.Kind != KindSharedMappable && k2.Kind != KindSharedMappable { i1 := bucketIndexForAddr(k1.addr()) i2 := bucketIndexForAddr(k2.addr()) - b1 := &m.privateBuckets[i1] - b2 := &m.privateBuckets[i2] + b1 = &m.privateBuckets[i1] + b2 = &m.privateBuckets[i2] switch { case i1 < i2: b1.mu.Lock() @@ -432,19 +434,30 @@ func (m *Manager) lockBuckets(k1, k2 *Key) (*bucket, *bucket) { default: b1.mu.Lock() } - return b1, b2 + return b1, b2 // +checklocksforce } // At least one of b1 or b2 should be m.sharedBucket. - b1 := m.sharedBucket - b2 := m.sharedBucket + b1 = m.sharedBucket + b2 = m.sharedBucket if k1.Kind != KindSharedMappable { b1 = m.lockBucket(k1) } else if k2.Kind != KindSharedMappable { b2 = m.lockBucket(k2) } m.sharedBucket.mu.Lock() - return b1, b2 + return b1, b2 // +checklocksforce +} + +// unlockBuckets unlocks two buckets. +// +checklocksrelease:b1.mu +// +checklocksrelease:b2.mu +func (m *Manager) unlockBuckets(b1, b2 *bucket) { + b1.mu.Unlock() + if b1 != b2 { + b2.mu.Unlock() + } + return // +checklocksforce } // Wake wakes up to n waiters matching the bitmask on the given addr. 
@@ -477,10 +490,7 @@ func (m *Manager) doRequeue(t Target, addr, naddr hostarch.Addr, private bool, c defer k2.release(t) b1, b2 := m.lockBuckets(&k1, &k2) - defer b1.mu.Unlock() - if b2 != b1 { - defer b2.mu.Unlock() - } + defer m.unlockBuckets(b1, b2) if checkval { if err := check(t, addr, val); err != nil { @@ -527,10 +537,7 @@ func (m *Manager) WakeOp(t Target, addr1, addr2 hostarch.Addr, private bool, nwa defer k2.release(t) b1, b2 := m.lockBuckets(&k1, &k2) - defer b1.mu.Unlock() - if b2 != b1 { - defer b2.mu.Unlock() - } + defer m.unlockBuckets(b1, b2) done := 0 cond, err := atomicOp(t, addr2, op) diff --git a/pkg/sentry/kernel/pipe/pipe_unsafe.go b/pkg/sentry/kernel/pipe/pipe_unsafe.go index dd60cba24..077c5d596 100644 --- a/pkg/sentry/kernel/pipe/pipe_unsafe.go +++ b/pkg/sentry/kernel/pipe/pipe_unsafe.go @@ -23,6 +23,8 @@ import ( // concurrent calls cannot deadlock. // // Preconditions: x != y. +// +checklocksacquire:x.mu +// +checklocksacquire:y.mu func lockTwoPipes(x, y *Pipe) { // Lock the two pipes in order of increasing address. if uintptr(unsafe.Pointer(x)) < uintptr(unsafe.Pointer(y)) { diff --git a/pkg/sentry/kernel/pipe/pipe_util.go b/pkg/sentry/kernel/pipe/pipe_util.go index 84f9f6234..c883a9014 100644 --- a/pkg/sentry/kernel/pipe/pipe_util.go +++ b/pkg/sentry/kernel/pipe/pipe_util.go @@ -157,6 +157,7 @@ func (p *Pipe) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArgume // // mu must be held by the caller. waitFor returns with mu held, but it will // drop mu before blocking for any reader/writers. +// +checklocks:mu func waitFor(mu *sync.Mutex, wakeupChan *chan struct{}, sleeper amutex.Sleeper) bool { // Ideally this function would simply use a condition variable. However, the // wait needs to be interruptible via 'sleeper', so we must sychronize via a diff --git a/pkg/sentry/kernel/ptrace.go b/pkg/sentry/kernel/ptrace.go index cdaee5d7f..161140980 100644 --- a/pkg/sentry/kernel/ptrace.go +++ b/pkg/sentry/kernel/ptrace.go @@ -652,6 +652,7 @@ func (t *Task) forgetTracerLocked() { // Preconditions: // * The signal mutex must be locked. // * The caller must be running on the task goroutine. +// +checklocks:t.tg.signalHandlers.mu func (t *Task) ptraceSignalLocked(info *linux.SignalInfo) bool { if linux.Signal(info.Signo) == linux.SIGKILL { return false diff --git a/pkg/sentry/kernel/sessions.go b/pkg/sentry/kernel/sessions.go index c0c1f1f13..ae21a55da 100644 --- a/pkg/sentry/kernel/sessions.go +++ b/pkg/sentry/kernel/sessions.go @@ -121,8 +121,9 @@ func (pg *ProcessGroup) Originator() *ThreadGroup { // IsOrphan returns true if this process group is an orphan. func (pg *ProcessGroup) IsOrphan() bool { - pg.originator.TaskSet().mu.RLock() - defer pg.originator.TaskSet().mu.RUnlock() + ts := pg.originator.TaskSet() + ts.mu.RLock() + defer ts.mu.RUnlock() return pg.ancestors == 0 } diff --git a/pkg/sentry/mm/syscalls.go b/pkg/sentry/mm/syscalls.go index 125fd855b..b51ec6aa7 100644 --- a/pkg/sentry/mm/syscalls.go +++ b/pkg/sentry/mm/syscalls.go @@ -204,6 +204,7 @@ func (mm *MemoryManager) populateVMA(ctx context.Context, vseg vmaIterator, ar h // * vseg.Range().IsSupersetOf(ar). // // Postconditions: mm.mappingMu will be unlocked. +// +checklocksrelease:mm.mappingMu func (mm *MemoryManager) populateVMAAndUnlock(ctx context.Context, vseg vmaIterator, ar hostarch.AddrRange, precommit bool) { // See populateVMA above for commentary. 
if !vseg.ValuePtr().effectivePerms.Any() { diff --git a/pkg/sentry/pgalloc/pgalloc.go b/pkg/sentry/pgalloc/pgalloc.go index f7d5a1800..0c8542485 100644 --- a/pkg/sentry/pgalloc/pgalloc.go +++ b/pkg/sentry/pgalloc/pgalloc.go @@ -945,7 +945,7 @@ func (f *MemoryFile) updateUsageLocked(currentUsage uint64, checkCommitted func( // NOTE(b/165896008): mincore (which is passed as checkCommitted) // by f.UpdateUsage() might take a really long time. So unlock f.mu // while checkCommitted runs. - f.mu.Unlock() + f.mu.Unlock() // +checklocksforce err := checkCommitted(s, buf) f.mu.Lock() if err != nil { diff --git a/pkg/sentry/time/calibrated_clock_test.go b/pkg/sentry/time/calibrated_clock_test.go index d6622bfe2..0a4b1f1bf 100644 --- a/pkg/sentry/time/calibrated_clock_test.go +++ b/pkg/sentry/time/calibrated_clock_test.go @@ -50,6 +50,7 @@ func TestConstantFrequency(t *testing.T) { if !c.ready { c.mu.RUnlock() t.Fatalf("clock not ready") + return // For checklocks consistency. } // A bit after the last sample. now, ok := c.params.ComputeTime(750000) diff --git a/pkg/sentry/vfs/dentry.go b/pkg/sentry/vfs/dentry.go index 242eb5ecb..cb92b6eee 100644 --- a/pkg/sentry/vfs/dentry.go +++ b/pkg/sentry/vfs/dentry.go @@ -196,11 +196,12 @@ func (d *Dentry) OnZeroWatches(ctx context.Context) { // PrepareDeleteDentry must be called before attempting to delete the file // represented by d. If PrepareDeleteDentry succeeds, the caller must call // AbortDeleteDentry or CommitDeleteDentry depending on the deletion's outcome. +// +checklocksacquire:d.mu func (vfs *VirtualFilesystem) PrepareDeleteDentry(mntns *MountNamespace, d *Dentry) error { vfs.mountMu.Lock() if mntns.mountpoints[d] != 0 { vfs.mountMu.Unlock() - return linuxerr.EBUSY + return linuxerr.EBUSY // +checklocksforce: inconsistent return. } d.mu.Lock() vfs.mountMu.Unlock() @@ -211,14 +212,14 @@ func (vfs *VirtualFilesystem) PrepareDeleteDentry(mntns *MountNamespace, d *Dent // AbortDeleteDentry must be called after PrepareDeleteDentry if the deletion // fails. -// +checklocks:d.mu +// +checklocksrelease:d.mu func (vfs *VirtualFilesystem) AbortDeleteDentry(d *Dentry) { d.mu.Unlock() } // CommitDeleteDentry must be called after PrepareDeleteDentry if the deletion // succeeds. -// +checklocks:d.mu +// +checklocksrelease:d.mu func (vfs *VirtualFilesystem) CommitDeleteDentry(ctx context.Context, d *Dentry) { d.dead = true d.mu.Unlock() @@ -249,16 +250,18 @@ func (vfs *VirtualFilesystem) InvalidateDentry(ctx context.Context, d *Dentry) { // Preconditions: // * If to is not nil, it must be a child Dentry from the same Filesystem. // * from != to. +// +checklocksacquire:from.mu +// +checklocksacquire:to.mu func (vfs *VirtualFilesystem) PrepareRenameDentry(mntns *MountNamespace, from, to *Dentry) error { vfs.mountMu.Lock() if mntns.mountpoints[from] != 0 { vfs.mountMu.Unlock() - return linuxerr.EBUSY + return linuxerr.EBUSY // +checklocksforce: no locks acquired. } if to != nil { if mntns.mountpoints[to] != 0 { vfs.mountMu.Unlock() - return linuxerr.EBUSY + return linuxerr.EBUSY // +checklocksforce: no locks acquired. } to.mu.Lock() } @@ -267,13 +270,13 @@ func (vfs *VirtualFilesystem) PrepareRenameDentry(mntns *MountNamespace, from, t // Return with from.mu and to.mu locked, which will be unlocked by // AbortRenameDentry, CommitRenameReplaceDentry, or // CommitRenameExchangeDentry. - return nil + return nil // +checklocksforce: to may not be acquired. } // AbortRenameDentry must be called after PrepareRenameDentry if the rename // fails. 
-// +checklocks:from.mu -// +checklocks:to.mu +// +checklocksrelease:from.mu +// +checklocksrelease:to.mu func (vfs *VirtualFilesystem) AbortRenameDentry(from, to *Dentry) { from.mu.Unlock() if to != nil { @@ -286,8 +289,8 @@ func (vfs *VirtualFilesystem) AbortRenameDentry(from, to *Dentry) { // that was replaced by from. // // Preconditions: PrepareRenameDentry was previously called on from and to. -// +checklocks:from.mu -// +checklocks:to.mu +// +checklocksrelease:from.mu +// +checklocksrelease:to.mu func (vfs *VirtualFilesystem) CommitRenameReplaceDentry(ctx context.Context, from, to *Dentry) { from.mu.Unlock() if to != nil { @@ -303,8 +306,8 @@ func (vfs *VirtualFilesystem) CommitRenameReplaceDentry(ctx context.Context, fro // from and to are exchanged by rename(RENAME_EXCHANGE). // // Preconditions: PrepareRenameDentry was previously called on from and to. -// +checklocks:from.mu -// +checklocks:to.mu +// +checklocksrelease:from.mu +// +checklocksrelease:to.mu func (vfs *VirtualFilesystem) CommitRenameExchangeDentry(from, to *Dentry) { from.mu.Unlock() to.mu.Unlock() diff --git a/pkg/sync/mutex_test.go b/pkg/sync/mutex_test.go index 4fb51a8ab..9e4e3f0b2 100644 --- a/pkg/sync/mutex_test.go +++ b/pkg/sync/mutex_test.go @@ -64,7 +64,7 @@ func TestTryLockUnlock(t *testing.T) { if !m.TryLock() { t.Fatal("failed to aquire lock") } - m.Unlock() + m.Unlock() // +checklocksforce if !m.TryLock() { t.Fatal("failed to aquire lock after unlock") } diff --git a/pkg/sync/mutex_unsafe.go b/pkg/sync/mutex_unsafe.go index 411a80a8a..b829765d9 100644 --- a/pkg/sync/mutex_unsafe.go +++ b/pkg/sync/mutex_unsafe.go @@ -32,6 +32,18 @@ func (m *CrossGoroutineMutex) state() *int32 { return &(*syncMutex)(unsafe.Pointer(&m.Mutex)).state } +// Lock locks the underlying Mutex. +// +checklocksignore +func (m *CrossGoroutineMutex) Lock() { + m.Mutex.Lock() +} + +// Unlock unlocks the underlying Mutex. +// +checklocksignore +func (m *CrossGoroutineMutex) Unlock() { + m.Mutex.Unlock() +} + const ( mutexUnlocked = 0 mutexLocked = 1 @@ -62,6 +74,7 @@ type Mutex struct { // Lock locks m. If the lock is already in use, the calling goroutine blocks // until the mutex is available. +// +checklocksignore func (m *Mutex) Lock() { noteLock(unsafe.Pointer(m)) m.m.Lock() @@ -80,6 +93,7 @@ func (m *Mutex) Unlock() { // TryLock tries to acquire the mutex. It returns true if it succeeds and false // otherwise. TryLock does not block. +// +checklocksignore func (m *Mutex) TryLock() bool { // Note lock first to enforce proper locking even if unsuccessful. noteLock(unsafe.Pointer(m)) diff --git a/pkg/sync/rwmutex_test.go b/pkg/sync/rwmutex_test.go index 5ca96d12b..56a88e712 100644 --- a/pkg/sync/rwmutex_test.go +++ b/pkg/sync/rwmutex_test.go @@ -172,7 +172,7 @@ func TestRWTryLockUnlock(t *testing.T) { if !rwm.TryLock() { t.Fatal("failed to aquire lock") } - rwm.Unlock() + rwm.Unlock() // +checklocksforce if !rwm.TryLock() { t.Fatal("failed to aquire lock after unlock") } diff --git a/pkg/sync/rwmutex_unsafe.go b/pkg/sync/rwmutex_unsafe.go index 892d3e641..7829b06db 100644 --- a/pkg/sync/rwmutex_unsafe.go +++ b/pkg/sync/rwmutex_unsafe.go @@ -37,6 +37,7 @@ const rwmutexMaxReaders = 1 << 30 // TryRLock locks rw for reading. It returns true if it succeeds and false // otherwise. It does not block. 
+// +checklocksignore func (rw *CrossGoroutineRWMutex) TryRLock() bool { if RaceEnabled { RaceDisable() @@ -65,6 +66,7 @@ func (rw *CrossGoroutineRWMutex) TryRLock() bool { // It should not be used for recursive read locking; a blocked Lock call // excludes new readers from acquiring the lock. See the documentation on the // RWMutex type. +// +checklocksignore func (rw *CrossGoroutineRWMutex) RLock() { if RaceEnabled { RaceDisable() @@ -83,6 +85,7 @@ func (rw *CrossGoroutineRWMutex) RLock() { // // Preconditions: // * rw is locked for reading. +// +checklocksignore func (rw *CrossGoroutineRWMutex) RUnlock() { if RaceEnabled { RaceReleaseMerge(unsafe.Pointer(&rw.writerSem)) @@ -134,6 +137,7 @@ func (rw *CrossGoroutineRWMutex) TryLock() bool { // Lock locks rw for writing. If the lock is already locked for reading or // writing, Lock blocks until the lock is available. +// +checklocksignore func (rw *CrossGoroutineRWMutex) Lock() { if RaceEnabled { RaceDisable() @@ -228,6 +232,7 @@ type RWMutex struct { // TryRLock locks rw for reading. It returns true if it succeeds and false // otherwise. It does not block. +// +checklocksignore func (rw *RWMutex) TryRLock() bool { // Note lock first to enforce proper locking even if unsuccessful. noteLock(unsafe.Pointer(rw)) @@ -243,6 +248,7 @@ func (rw *RWMutex) TryRLock() bool { // It should not be used for recursive read locking; a blocked Lock call // excludes new readers from acquiring the lock. See the documentation on the // RWMutex type. +// +checklocksignore func (rw *RWMutex) RLock() { noteLock(unsafe.Pointer(rw)) rw.m.RLock() @@ -261,6 +267,7 @@ func (rw *RWMutex) RUnlock() { // TryLock locks rw for writing. It returns true if it succeeds and false // otherwise. It does not block. +// +checklocksignore func (rw *RWMutex) TryLock() bool { // Note lock first to enforce proper locking even if unsuccessful. noteLock(unsafe.Pointer(rw)) @@ -273,6 +280,7 @@ func (rw *RWMutex) TryLock() bool { // Lock locks rw for writing. If the lock is already locked for reading or // writing, Lock blocks until the lock is available. +// +checklocksignore func (rw *RWMutex) Lock() { noteLock(unsafe.Pointer(rw)) rw.m.Lock() diff --git a/pkg/tcpip/network/internal/ip/generic_multicast_protocol_test.go b/pkg/tcpip/network/internal/ip/generic_multicast_protocol_test.go index 0b51563cd..1261ad414 100644 --- a/pkg/tcpip/network/internal/ip/generic_multicast_protocol_test.go +++ b/pkg/tcpip/network/internal/ip/generic_multicast_protocol_test.go @@ -126,7 +126,7 @@ func (m *mockMulticastGroupProtocol) sendQueuedReports() { // Precondition: m.mu must be read locked. func (m *mockMulticastGroupProtocol) Enabled() bool { if m.mu.TryLock() { - m.mu.Unlock() + m.mu.Unlock() // +checklocksforce: TryLock. m.t.Fatal("got write lock, expected to not take the lock; generic multicast protocol must take the read or write lock before calling Enabled") } @@ -138,11 +138,11 @@ func (m *mockMulticastGroupProtocol) Enabled() bool { // Precondition: m.mu must be locked. func (m *mockMulticastGroupProtocol) SendReport(groupAddress tcpip.Address) (bool, tcpip.Error) { if m.mu.TryLock() { - m.mu.Unlock() + m.mu.Unlock() // +checklocksforce: TryLock. m.t.Fatalf("got write lock, expected to not take the lock; generic multicast protocol must take the write lock before sending report for %s", groupAddress) } if m.mu.TryRLock() { - m.mu.RUnlock() + m.mu.RUnlock() // +checklocksforce: TryLock. 
m.t.Fatalf("got read lock, expected to not take the lock; generic multicast protocol must take the write lock before sending report for %s", groupAddress) } @@ -155,11 +155,11 @@ func (m *mockMulticastGroupProtocol) SendReport(groupAddress tcpip.Address) (boo // Precondition: m.mu must be locked. func (m *mockMulticastGroupProtocol) SendLeave(groupAddress tcpip.Address) tcpip.Error { if m.mu.TryLock() { - m.mu.Unlock() + m.mu.Unlock() // +checklocksforce: TryLock. m.t.Fatalf("got write lock, expected to not take the lock; generic multicast protocol must take the write lock before sending leave for %s", groupAddress) } if m.mu.TryRLock() { - m.mu.RUnlock() + m.mu.RUnlock() // +checklocksforce: TryLock. m.t.Fatalf("got read lock, expected to not take the lock; generic multicast protocol must take the write lock before sending leave for %s", groupAddress) } diff --git a/pkg/tcpip/stack/addressable_endpoint_state.go b/pkg/tcpip/stack/addressable_endpoint_state.go index ce9cebdaa..ae0bb4ace 100644 --- a/pkg/tcpip/stack/addressable_endpoint_state.go +++ b/pkg/tcpip/stack/addressable_endpoint_state.go @@ -249,7 +249,7 @@ func (a *AddressableEndpointState) addAndAcquireAddressLocked(addr tcpip.Address // or we are adding a new temporary or permanent address. // // The address MUST be write locked at this point. - defer addrState.mu.Unlock() + defer addrState.mu.Unlock() // +checklocksforce if permanent { if addrState.mu.kind.IsPermanent() { diff --git a/pkg/tcpip/stack/conntrack.go b/pkg/tcpip/stack/conntrack.go index 782e74b24..068dab7ce 100644 --- a/pkg/tcpip/stack/conntrack.go +++ b/pkg/tcpip/stack/conntrack.go @@ -363,7 +363,7 @@ func (ct *ConnTrack) insertConn(conn *conn) { // Unlocking can happen in any order. ct.buckets[tupleBucket].mu.Unlock() if tupleBucket != replyBucket { - ct.buckets[replyBucket].mu.Unlock() + ct.buckets[replyBucket].mu.Unlock() // +checklocksforce } } @@ -626,7 +626,7 @@ func (ct *ConnTrack) reapTupleLocked(tuple *tuple, bucket int, now time.Time) bo // Don't re-unlock if both tuples are in the same bucket. if differentBuckets { - ct.buckets[replyBucket].mu.Unlock() + ct.buckets[replyBucket].mu.Unlock() // +checklocksforce } return true diff --git a/pkg/tcpip/transport/icmp/endpoint.go b/pkg/tcpip/transport/icmp/endpoint.go index cb316d27a..f9a15efb2 100644 --- a/pkg/tcpip/transport/icmp/endpoint.go +++ b/pkg/tcpip/transport/icmp/endpoint.go @@ -213,6 +213,7 @@ func (e *endpoint) Read(dst io.Writer, opts tcpip.ReadOptions) (tcpip.ReadResult // reacquire the mutex in exclusive mode. // // Returns true for retry if preparation should be retried. +// +checklocks:e.mu func (e *endpoint) prepareForWrite(to *tcpip.FullAddress) (retry bool, err tcpip.Error) { switch e.state { case stateInitial: @@ -229,10 +230,8 @@ func (e *endpoint) prepareForWrite(to *tcpip.FullAddress) (retry bool, err tcpip } e.mu.RUnlock() - defer e.mu.RLock() - e.mu.Lock() - defer e.mu.Unlock() + defer e.mu.DowngradeLock() // The state changed when we released the shared locked and re-acquired // it in exclusive mode. Try again. diff --git a/pkg/tcpip/transport/tcp/accept.go b/pkg/tcpip/transport/tcp/accept.go index d807b13b7..aa413ad05 100644 --- a/pkg/tcpip/transport/tcp/accept.go +++ b/pkg/tcpip/transport/tcp/accept.go @@ -330,7 +330,9 @@ func (l *listenContext) performHandshake(s *segment, opts *header.TCPSynOptions, } ep := h.ep - if err := h.complete(); err != nil { + // N.B. the endpoint is generated above by startHandshake, and will be + // returned locked. This first call is forced. 
+ if err := h.complete(); err != nil { // +checklocksforce ep.stack.Stats().TCP.FailedConnectionAttempts.Increment() ep.stats.FailedConnectionAttempts.Increment() l.cleanupFailedHandshake(h) @@ -364,6 +366,7 @@ func (l *listenContext) closeAllPendingEndpoints() { } // Precondition: h.ep.mu must be held. +// +checklocks:h.ep.mu func (l *listenContext) cleanupFailedHandshake(h *handshake) { e := h.ep e.mu.Unlock() @@ -504,7 +507,9 @@ func (e *endpoint) handleSynSegment(ctx *listenContext, s *segment, opts *header } go func() { - if err := h.complete(); err != nil { + // Note that startHandshake returns a locked endpoint. The + // force call here just makes it so. + if err := h.complete(); err != nil { // +checklocksforce e.stack.Stats().TCP.FailedConnectionAttempts.Increment() e.stats.FailedConnectionAttempts.Increment() ctx.cleanupFailedHandshake(h) diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go index e39d1623d..93ed161f9 100644 --- a/pkg/tcpip/transport/tcp/connect.go +++ b/pkg/tcpip/transport/tcp/connect.go @@ -511,6 +511,7 @@ func (h *handshake) start() { } // complete completes the TCP 3-way handshake initiated by h.start(). +// +checklocks:h.ep.mu func (h *handshake) complete() tcpip.Error { // Set up the wakers. var s sleep.Sleeper @@ -1283,42 +1284,45 @@ func (e *endpoint) disableKeepaliveTimer() { e.keepalive.Unlock() } -// protocolMainLoop is the main loop of the TCP protocol. It runs in its own -// goroutine and is responsible for sending segments and handling received -// segments. -func (e *endpoint) protocolMainLoop(handshake bool, wakerInitDone chan<- struct{}) tcpip.Error { - e.mu.Lock() - var closeTimer tcpip.Timer - var closeWaker sleep.Waker - - epilogue := func() { - // e.mu is expected to be hold upon entering this section. - if e.snd != nil { - e.snd.resendTimer.cleanup() - e.snd.probeTimer.cleanup() - e.snd.reorderTimer.cleanup() - } +// protocolMainLoopDone is called at the end of protocolMainLoop. +// +checklocksrelease:e.mu +func (e *endpoint) protocolMainLoopDone(closeTimer tcpip.Timer, closeWaker *sleep.Waker) { + if e.snd != nil { + e.snd.resendTimer.cleanup() + e.snd.probeTimer.cleanup() + e.snd.reorderTimer.cleanup() + } - if closeTimer != nil { - closeTimer.Stop() - } + if closeTimer != nil { + closeTimer.Stop() + } - e.completeWorkerLocked() + e.completeWorkerLocked() - if e.drainDone != nil { - close(e.drainDone) - } + if e.drainDone != nil { + close(e.drainDone) + } - e.mu.Unlock() + e.mu.Unlock() - e.drainClosingSegmentQueue() + e.drainClosingSegmentQueue() - // When the protocol loop exits we should wake up our waiters. - e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents) - } + // When the protocol loop exits we should wake up our waiters. + e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents) +} +// protocolMainLoop is the main loop of the TCP protocol. It runs in its own +// goroutine and is responsible for sending segments and handling received +// segments. 
+func (e *endpoint) protocolMainLoop(handshake bool, wakerInitDone chan<- struct{}) tcpip.Error { + var ( + closeTimer tcpip.Timer + closeWaker sleep.Waker + ) + + e.mu.Lock() if handshake { - if err := e.h.complete(); err != nil { + if err := e.h.complete(); err != nil { // +checklocksforce e.lastErrorMu.Lock() e.lastError = err e.lastErrorMu.Unlock() @@ -1327,8 +1331,7 @@ func (e *endpoint) protocolMainLoop(handshake bool, wakerInitDone chan<- struct{ e.hardError = err e.workerCleanup = true - // Lock released below. - epilogue() + e.protocolMainLoopDone(closeTimer, &closeWaker) return err } } @@ -1472,7 +1475,7 @@ func (e *endpoint) protocolMainLoop(handshake bool, wakerInitDone chan<- struct{ // Only block the worker if the endpoint // is not in closed state or error state. close(e.drainDone) - e.mu.Unlock() + e.mu.Unlock() // +checklocksforce <-e.undrain e.mu.Lock() } @@ -1533,8 +1536,6 @@ func (e *endpoint) protocolMainLoop(handshake bool, wakerInitDone chan<- struct{ if err != nil { e.resetConnectionLocked(err) } - // Lock released below. - epilogue() } loop: @@ -1558,6 +1559,7 @@ loop: // just want to terminate the loop and cleanup the // endpoint. cleanupOnError(nil) + e.protocolMainLoopDone(closeTimer, &closeWaker) return nil case StateTimeWait: fallthrough @@ -1566,6 +1568,7 @@ loop: default: if err := funcs[v].f(); err != nil { cleanupOnError(err) + e.protocolMainLoopDone(closeTimer, &closeWaker) return nil } } @@ -1589,13 +1592,13 @@ loop: // Handle any StateError transition from StateTimeWait. if e.EndpointState() == StateError { cleanupOnError(nil) + e.protocolMainLoopDone(closeTimer, &closeWaker) return nil } e.transitionToStateCloseLocked() - // Lock released below. - epilogue() + e.protocolMainLoopDone(closeTimer, &closeWaker) // A new SYN was received during TIME_WAIT and we need to abort // the timewait and redirect the segment to the listener queue @@ -1665,6 +1668,7 @@ func (e *endpoint) handleTimeWaitSegments() (extendTimeWait bool, reuseTW func() // should be executed after releasing the endpoint registrations. This is // done in cases where a new SYN is received during TIME_WAIT that carries // a sequence number larger than one see on the connection. +// +checklocks:e.mu func (e *endpoint) doTimeWait() (twReuse func()) { // Trigger a 2 * MSL time wait state. During this period // we will drop all incoming segments. diff --git a/pkg/tcpip/transport/tcp/dispatcher.go b/pkg/tcpip/transport/tcp/dispatcher.go index dff7cb89c..7d110516b 100644 --- a/pkg/tcpip/transport/tcp/dispatcher.go +++ b/pkg/tcpip/transport/tcp/dispatcher.go @@ -127,7 +127,7 @@ func (p *processor) start(wg *sync.WaitGroup) { case !ep.segmentQueue.empty(): p.epQ.enqueue(ep) } - ep.mu.Unlock() + ep.mu.Unlock() // +checklocksforce } else { ep.newSegmentWaker.Assert() } diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go index 4acddc959..1ed4ba419 100644 --- a/pkg/tcpip/transport/tcp/endpoint.go +++ b/pkg/tcpip/transport/tcp/endpoint.go @@ -664,6 +664,7 @@ func calculateAdvertisedMSS(userMSS uint16, r *stack.Route) uint16 { // The assumption behind spinning here being that background packet processing // should not be holding the lock for long and spinning reduces latency as we // avoid an expensive sleep/wakeup of of the syscall goroutine). 
+// +checklocksacquire:e.mu func (e *endpoint) LockUser() { for { // Try first if the sock is locked then check if it's owned @@ -683,7 +684,7 @@ func (e *endpoint) LockUser() { continue } atomic.StoreUint32(&e.ownedByUser, 1) - return + return // +checklocksforce } } @@ -700,7 +701,7 @@ func (e *endpoint) LockUser() { // protocol goroutine altogether. // // Precondition: e.LockUser() must have been called before calling e.UnlockUser() -// +checklocks:e.mu +// +checklocksrelease:e.mu func (e *endpoint) UnlockUser() { // Lock segment queue before checking so that we avoid a race where // segments can be queued between the time we check if queue is empty @@ -736,12 +737,13 @@ func (e *endpoint) UnlockUser() { } // StopWork halts packet processing. Only to be used in tests. +// +checklocksacquire:e.mu func (e *endpoint) StopWork() { e.mu.Lock() } // ResumeWork resumes packet processing. Only to be used in tests. -// +checklocks:e.mu +// +checklocksrelease:e.mu func (e *endpoint) ResumeWork() { e.mu.Unlock() } @@ -1480,86 +1482,95 @@ func (e *endpoint) isEndpointWritableLocked() (int, tcpip.Error) { return avail, nil } -// Write writes data to the endpoint's peer. -func (e *endpoint) Write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, tcpip.Error) { - // Linux completely ignores any address passed to sendto(2) for TCP sockets - // (without the MSG_FASTOPEN flag). Corking is unimplemented, so opts.More - // and opts.EndOfRecord are also ignored. +// readFromPayloader reads a slice from the Payloader. +// +checklocks:e.mu +// +checklocks:e.sndQueueInfo.sndQueueMu +func (e *endpoint) readFromPayloader(p tcpip.Payloader, opts tcpip.WriteOptions, avail int) ([]byte, tcpip.Error) { + // We can release locks while copying data. + // + // This is not possible if atomic is set, because we can't allow the + // available buffer space to be consumed by some other caller while we + // are copying data in. + if !opts.Atomic { + e.sndQueueInfo.sndQueueMu.Unlock() + defer e.sndQueueInfo.sndQueueMu.Lock() - e.LockUser() - defer e.UnlockUser() + e.UnlockUser() + defer e.LockUser() + } - nextSeg, n, err := func() (*segment, int, tcpip.Error) { - e.sndQueueInfo.sndQueueMu.Lock() - defer e.sndQueueInfo.sndQueueMu.Unlock() + // Fetch data. + if l := p.Len(); l < avail { + avail = l + } + if avail == 0 { + return nil, nil + } + v := make([]byte, avail) + n, err := p.Read(v) + if err != nil && err != io.EOF { + return nil, &tcpip.ErrBadBuffer{} + } + return v[:n], nil +} + +// queueSegment reads data from the payloader and returns a segment to be sent. +// +checklocks:e.mu +func (e *endpoint) queueSegment(p tcpip.Payloader, opts tcpip.WriteOptions) (*segment, int, tcpip.Error) { + e.sndQueueInfo.sndQueueMu.Lock() + defer e.sndQueueInfo.sndQueueMu.Unlock() + + avail, err := e.isEndpointWritableLocked() + if err != nil { + e.stats.WriteErrors.WriteClosed.Increment() + return nil, 0, err + } + v, err := e.readFromPayloader(p, opts, avail) + if err != nil { + return nil, 0, err + } + if !opts.Atomic { + // Since we released locks in between it's possible that the + // endpoint transitioned to a CLOSED/ERROR states so make + // sure endpoint is still writable before trying to write. avail, err := e.isEndpointWritableLocked() if err != nil { e.stats.WriteErrors.WriteClosed.Increment() return nil, 0, err } - v, err := func() ([]byte, tcpip.Error) { - // We can release locks while copying data. 
- // - // This is not possible if atomic is set, because we can't allow the - // available buffer space to be consumed by some other caller while we - // are copying data in. - if !opts.Atomic { - e.sndQueueInfo.sndQueueMu.Unlock() - defer e.sndQueueInfo.sndQueueMu.Lock() - - e.UnlockUser() - defer e.LockUser() - } - - // Fetch data. - if l := p.Len(); l < avail { - avail = l - } - if avail == 0 { - return nil, nil - } - v := make([]byte, avail) - n, err := p.Read(v) - if err != nil && err != io.EOF { - return nil, &tcpip.ErrBadBuffer{} - } - return v[:n], nil - }() - if len(v) == 0 || err != nil { - return nil, 0, err + // Discard any excess data copied in due to avail being reduced due + // to a simultaneous write call to the socket. + if avail < len(v) { + v = v[:avail] } + } - if !opts.Atomic { - // Since we released locks in between it's possible that the - // endpoint transitioned to a CLOSED/ERROR states so make - // sure endpoint is still writable before trying to write. - avail, err := e.isEndpointWritableLocked() - if err != nil { - e.stats.WriteErrors.WriteClosed.Increment() - return nil, 0, err - } + // Add data to the send queue. + s := newOutgoingSegment(e.TransportEndpointInfo.ID, e.stack.Clock(), v) + e.sndQueueInfo.SndBufUsed += len(v) + e.snd.writeList.PushBack(s) - // Discard any excess data copied in due to avail being reduced due - // to a simultaneous write call to the socket. - if avail < len(v) { - v = v[:avail] - } - } + return s, len(v), nil +} - // Add data to the send queue. - s := newOutgoingSegment(e.TransportEndpointInfo.ID, e.stack.Clock(), v) - e.sndQueueInfo.SndBufUsed += len(v) - e.snd.writeList.PushBack(s) +// Write writes data to the endpoint's peer. +func (e *endpoint) Write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, tcpip.Error) { + // Linux completely ignores any address passed to sendto(2) for TCP sockets + // (without the MSG_FASTOPEN flag). Corking is unimplemented, so opts.More + // and opts.EndOfRecord are also ignored. + + e.LockUser() + defer e.UnlockUser() - return s, len(v), nil - }() // Return if either we didn't queue anything or if an error occurred while // attempting to queue data. + nextSeg, n, err := e.queueSegment(p, opts) if n == 0 || err != nil { return 0, err } + e.sendData(nextSeg) return int64(n), nil } @@ -2504,6 +2515,7 @@ func (e *endpoint) listen(backlog int) tcpip.Error { // startAcceptedLoop sets up required state and starts a goroutine with the // main loop for accepted connections. +// +checklocksrelease:e.mu func (e *endpoint) startAcceptedLoop() { e.workerRunning = true e.mu.Unlock() diff --git a/pkg/tcpip/transport/tcp/forwarder.go b/pkg/tcpip/transport/tcp/forwarder.go index 65c86823a..2e709ed78 100644 --- a/pkg/tcpip/transport/tcp/forwarder.go +++ b/pkg/tcpip/transport/tcp/forwarder.go @@ -164,8 +164,9 @@ func (r *ForwarderRequest) CreateEndpoint(queue *waiter.Queue) (tcpip.Endpoint, return nil, err } - // Start the protocol goroutine. - ep.startAcceptedLoop() + // Start the protocol goroutine. Note that the endpoint is returned + // from performHandshake locked. + ep.startAcceptedLoop() // +checklocksforce return ep, nil } diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go index def9d7186..82a3f2287 100644 --- a/pkg/tcpip/transport/udp/endpoint.go +++ b/pkg/tcpip/transport/udp/endpoint.go @@ -364,6 +364,7 @@ func (e *endpoint) Read(dst io.Writer, opts tcpip.ReadOptions) (tcpip.ReadResult // reacquire the mutex in exclusive mode. 
// // Returns true for retry if preparation should be retried. +// +checklocks:e.mu func (e *endpoint) prepareForWrite(to *tcpip.FullAddress) (retry bool, err tcpip.Error) { switch e.EndpointState() { case StateInitial: @@ -380,10 +381,8 @@ func (e *endpoint) prepareForWrite(to *tcpip.FullAddress) (retry bool, err tcpip } e.mu.RUnlock() - defer e.mu.RLock() - e.mu.Lock() - defer e.mu.Unlock() + defer e.mu.DowngradeLock() // The state changed when we released the shared locked and re-acquired // it in exclusive mode. Try again. @@ -449,37 +448,20 @@ func (e *endpoint) Write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, tcp return n, err } -func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, tcpip.Error) { - if err := e.LastError(); err != nil { - return 0, err - } - - // MSG_MORE is unimplemented. (This also means that MSG_EOR is a no-op.) - if opts.More { - return 0, &tcpip.ErrInvalidOptionValue{} - } - - to := opts.To - +func (e *endpoint) buildUDPPacketInfo(p tcpip.Payloader, opts tcpip.WriteOptions) (udpPacketInfo, tcpip.Error) { e.mu.RLock() - lockReleased := false - defer func() { - if lockReleased { - return - } - e.mu.RUnlock() - }() + defer e.mu.RUnlock() // If we've shutdown with SHUT_WR we are in an invalid state for sending. if e.shutdownFlags&tcpip.ShutdownWrite != 0 { - return 0, &tcpip.ErrClosedForSend{} + return udpPacketInfo{}, &tcpip.ErrClosedForSend{} } // Prepare for write. for { - retry, err := e.prepareForWrite(to) + retry, err := e.prepareForWrite(opts.To) if err != nil { - return 0, err + return udpPacketInfo{}, err } if !retry { @@ -489,34 +471,34 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, tcp route := e.route dstPort := e.dstPort - if to != nil { + if opts.To != nil { // Reject destination address if it goes through a different // NIC than the endpoint was bound to. - nicID := to.NIC + nicID := opts.To.NIC if nicID == 0 { nicID = tcpip.NICID(e.ops.GetBindToDevice()) } if e.BindNICID != 0 { if nicID != 0 && nicID != e.BindNICID { - return 0, &tcpip.ErrNoRoute{} + return udpPacketInfo{}, &tcpip.ErrNoRoute{} } nicID = e.BindNICID } - if to.Port == 0 { + if opts.To.Port == 0 { // Port 0 is an invalid port to send to. - return 0, &tcpip.ErrInvalidEndpointState{} + return udpPacketInfo{}, &tcpip.ErrInvalidEndpointState{} } - dst, netProto, err := e.checkV4MappedLocked(*to) + dst, netProto, err := e.checkV4MappedLocked(*opts.To) if err != nil { - return 0, err + return udpPacketInfo{}, err } r, _, err := e.connectRoute(nicID, dst, netProto) if err != nil { - return 0, err + return udpPacketInfo{}, err } defer r.Release() @@ -525,12 +507,12 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, tcp } if !e.ops.GetBroadcast() && route.IsOutboundBroadcast() { - return 0, &tcpip.ErrBroadcastDisabled{} + return udpPacketInfo{}, &tcpip.ErrBroadcastDisabled{} } v := make([]byte, p.Len()) if _, err := io.ReadFull(p, v); err != nil { - return 0, &tcpip.ErrBadBuffer{} + return udpPacketInfo{}, &tcpip.ErrBadBuffer{} } if len(v) > header.UDPMaximumPacketSize { // Payload can't possibly fit in a packet. 
@@ -548,24 +530,39 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, tcp v, ) } - return 0, &tcpip.ErrMessageTooLong{} + return udpPacketInfo{}, &tcpip.ErrMessageTooLong{} } ttl := e.ttl useDefaultTTL := ttl == 0 - if header.IsV4MulticastAddress(route.RemoteAddress()) || header.IsV6MulticastAddress(route.RemoteAddress()) { ttl = e.multicastTTL // Multicast allows a 0 TTL. useDefaultTTL = false } - localPort := e.ID.LocalPort - sendTOS := e.sendTOS - owner := e.owner - noChecksum := e.SocketOptions().GetNoChecksum() - lockReleased = true - e.mu.RUnlock() + return udpPacketInfo{ + route: route, + data: buffer.View(v), + localPort: e.ID.LocalPort, + remotePort: dstPort, + ttl: ttl, + useDefaultTTL: useDefaultTTL, + tos: e.sendTOS, + owner: e.owner, + noChecksum: e.SocketOptions().GetNoChecksum(), + }, nil +} + +func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, tcpip.Error) { + if err := e.LastError(); err != nil { + return 0, err + } + + // MSG_MORE is unimplemented. (This also means that MSG_EOR is a no-op.) + if opts.More { + return 0, &tcpip.ErrInvalidOptionValue{} + } // Do not hold lock when sending as loopback is synchronous and if the UDP // datagram ends up generating an ICMP response then it can result in a @@ -577,10 +574,15 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, tcp // // See: https://golang.org/pkg/sync/#RWMutex for details on why recursive read // locking is prohibited. - if err := sendUDP(route, buffer.View(v).ToVectorisedView(), localPort, dstPort, ttl, useDefaultTTL, sendTOS, owner, noChecksum); err != nil { + u, err := e.buildUDPPacketInfo(p, opts) + if err != nil { return 0, err } - return int64(len(v)), nil + n, err := u.send() + if err != nil { + return 0, err + } + return int64(n), nil } // OnReuseAddressSet implements tcpip.SocketOptionsHandler. @@ -817,14 +819,30 @@ func (e *endpoint) GetSockOpt(opt tcpip.GettableSocketOption) tcpip.Error { return nil } -// sendUDP sends a UDP segment via the provided network endpoint and under the -// provided identity. -func sendUDP(r *stack.Route, data buffer.VectorisedView, localPort, remotePort uint16, ttl uint8, useDefaultTTL bool, tos uint8, owner tcpip.PacketOwner, noChecksum bool) tcpip.Error { +// udpPacketInfo contains all information required to send a UDP packet. +// +// This should be used as a value-only type, which exists in order to simplify +// return value syntax. It should not be exported or extended. +type udpPacketInfo struct { + route *stack.Route + data buffer.View + localPort uint16 + remotePort uint16 + ttl uint8 + useDefaultTTL bool + tos uint8 + owner tcpip.PacketOwner + noChecksum bool +} + +// send sends the given packet. +func (u *udpPacketInfo) send() (int, tcpip.Error) { + vv := u.data.ToVectorisedView() pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ - ReserveHeaderBytes: header.UDPMinimumSize + int(r.MaxHeaderLength()), - Data: data, + ReserveHeaderBytes: header.UDPMinimumSize + int(u.route.MaxHeaderLength()), + Data: vv, }) - pkt.Owner = owner + pkt.Owner = u.owner // Initialize the UDP header. 
udp := header.UDP(pkt.TransportHeader().Push(header.UDPMinimumSize)) @@ -832,8 +850,8 @@ func sendUDP(r *stack.Route, data buffer.VectorisedView, localPort, remotePort u length := uint16(pkt.Size()) udp.Encode(&header.UDPFields{ - SrcPort: localPort, - DstPort: remotePort, + SrcPort: u.localPort, + DstPort: u.remotePort, Length: length, }) @@ -841,30 +859,30 @@ func sendUDP(r *stack.Route, data buffer.VectorisedView, localPort, remotePort u // On IPv4, UDP checksum is optional, and a zero value indicates the // transmitter skipped the checksum generation (RFC768). // On IPv6, UDP checksum is not optional (RFC2460 Section 8.1). - if r.RequiresTXTransportChecksum() && - (!noChecksum || r.NetProto() == header.IPv6ProtocolNumber) { - xsum := r.PseudoHeaderChecksum(ProtocolNumber, length) - for _, v := range data.Views() { + if u.route.RequiresTXTransportChecksum() && + (!u.noChecksum || u.route.NetProto() == header.IPv6ProtocolNumber) { + xsum := u.route.PseudoHeaderChecksum(ProtocolNumber, length) + for _, v := range vv.Views() { xsum = header.Checksum(v, xsum) } udp.SetChecksum(^udp.CalculateChecksum(xsum)) } - if useDefaultTTL { - ttl = r.DefaultTTL() + if u.useDefaultTTL { + u.ttl = u.route.DefaultTTL() } - if err := r.WritePacket(stack.NetworkHeaderParams{ + if err := u.route.WritePacket(stack.NetworkHeaderParams{ Protocol: ProtocolNumber, - TTL: ttl, - TOS: tos, + TTL: u.ttl, + TOS: u.tos, }, pkt); err != nil { - r.Stats().UDP.PacketSendErrors.Increment() - return err + u.route.Stats().UDP.PacketSendErrors.Increment() + return 0, err } // Track count of packets sent. - r.Stats().UDP.PacketsSent.Increment() - return nil + u.route.Stats().UDP.PacketsSent.Increment() + return len(u.data), nil } // checkV4MappedLocked determines the effective network protocol and converts |
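The udpPacketInfo type and its send method above implement the pattern the surrounding comments call for: capture everything the datagram needs while the endpoint lock is held, then transmit with no lock held, because a synchronous loopback send can generate an ICMP response that re-enters the endpoint and would otherwise attempt to re-acquire the same RWMutex. A rough sketch of that shape, using the standard library and hypothetical names rather than the tcpip stack:

package example

import (
	"net"
	"sync"
)

// conn is a hypothetical endpoint guarded by an RWMutex.
type conn struct {
	mu   sync.RWMutex
	dst  *net.UDPAddr
	sock *net.UDPConn
}

// packetInfo is a value-only snapshot of what one send needs.
type packetInfo struct {
	dst  *net.UDPAddr
	sock *net.UDPConn
	data []byte
}

// prepare builds the snapshot while holding the lock.
func (c *conn) prepare(b []byte) packetInfo {
	c.mu.RLock()
	defer c.mu.RUnlock()
	return packetInfo{dst: c.dst, sock: c.sock, data: append([]byte(nil), b...)}
}

// send transmits with no lock held, so a synchronous delivery path that
// calls back into the endpoint cannot deadlock on c.mu.
func (p packetInfo) send() (int, error) {
	return p.sock.WriteToUDP(p.data, p.dst)
}

func (c *conn) write(b []byte) (int, error) {
	p := c.prepare(b) // lock held only inside prepare.
	return p.send()   // no lock held during I/O.
}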