Diffstat (limited to 'pkg')
32 files changed, 450 insertions, 325 deletions
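Most of the hunks below follow one pattern: closure-returning lock helpers are split into explicit lock/unlock pairs so the checklocks analyzer can verify callers, with +checklocksforce and +checklocksignore covering paths it cannot follow. The following is a minimal, hedged sketch of that annotated pair, not code from this change; the type and field names are hypothetical and it uses the standard library sync package rather than gVisor's pkg/sync wrappers.

package main

import "sync"

// overrides is a stand-in for types such as overrideMaps or Dirent that
// gain explicit lock/unlock methods in this change.
type overrides struct {
	mu sync.Mutex
	n  int
}

// lock acquires o.mu; the annotation tells checklocks that callers hold
// the mutex once this returns.
// +checklocksacquire:o.mu
func (o *overrides) lock() {
	o.mu.Lock()
}

// unlock releases o.mu.
// +checklocksrelease:o.mu
func (o *overrides) unlock() {
	o.mu.Unlock()
}

// bump requires o.mu to be held; checklocks verifies every caller.
// +checklocks:o.mu
func (o *overrides) bump() {
	o.n++
}

func main() {
	o := &overrides{}
	o.lock()
	defer o.unlock()
	o.bump()
}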
diff --git a/pkg/sentry/fs/dirent.go b/pkg/sentry/fs/dirent.go index 3a45e9041..8d7660e79 100644 --- a/pkg/sentry/fs/dirent.go +++ b/pkg/sentry/fs/dirent.go @@ -488,11 +488,11 @@ func (d *Dirent) walk(ctx context.Context, root *Dirent, name string, walkMayUnl // Slow path: load the InodeOperations into memory. Since this is a hot path and the lookup may be // expensive, if possible release the lock and re-acquire it. if walkMayUnlock { - d.mu.Unlock() + d.mu.Unlock() // +checklocksforce: results in an inconsistent block. } c, err := d.Inode.Lookup(ctx, name) if walkMayUnlock { - d.mu.Lock() + d.mu.Lock() // +checklocksforce: see above. } // No dice. if err != nil { @@ -594,21 +594,27 @@ func (d *Dirent) exists(ctx context.Context, root *Dirent, name string) bool { // lockDirectory should be called for any operation that changes this `d`s // children (creating or removing them). -func (d *Dirent) lockDirectory() func() { +// +checklocksacquire:d.dirMu +// +checklocksacquire:d.mu +func (d *Dirent) lockDirectory() { renameMu.RLock() d.dirMu.Lock() d.mu.Lock() - return func() { - d.mu.Unlock() - d.dirMu.Unlock() - renameMu.RUnlock() - } +} + +// unlockDirectory is the reverse of lockDirectory. +// +checklocksrelease:d.dirMu +// +checklocksrelease:d.mu +func (d *Dirent) unlockDirectory() { + d.mu.Unlock() + d.dirMu.Unlock() + renameMu.RUnlock() // +checklocksforce: see lockDirectory. } // Create creates a new regular file in this directory. func (d *Dirent) Create(ctx context.Context, root *Dirent, name string, flags FileFlags, perms FilePermissions) (*File, error) { - unlock := d.lockDirectory() - defer unlock() + d.lockDirectory() + defer d.unlockDirectory() // Does something already exist? if d.exists(ctx, root, name) { @@ -670,8 +676,8 @@ func (d *Dirent) finishCreate(ctx context.Context, child *Dirent, name string) { // genericCreate executes create if name does not exist. Removes a negative Dirent at name if // create succeeds. func (d *Dirent) genericCreate(ctx context.Context, root *Dirent, name string, create func() error) error { - unlock := d.lockDirectory() - defer unlock() + d.lockDirectory() + defer d.unlockDirectory() // Does something already exist? if d.exists(ctx, root, name) { @@ -1021,8 +1027,8 @@ func (d *Dirent) Remove(ctx context.Context, root *Dirent, name string, dirPath panic("Dirent.Remove: root must not be nil") } - unlock := d.lockDirectory() - defer unlock() + d.lockDirectory() + defer d.unlockDirectory() // Try to walk to the node. child, err := d.walk(ctx, root, name, false /* may unlock */) @@ -1082,8 +1088,8 @@ func (d *Dirent) RemoveDirectory(ctx context.Context, root *Dirent, name string) panic("Dirent.Remove: root must not be nil") } - unlock := d.lockDirectory() - defer unlock() + d.lockDirectory() + defer d.unlockDirectory() // Check for dots. if name == "." { @@ -1259,17 +1265,15 @@ func (d *Dirent) dropExtendedReference() { d.Inode.MountSource.fscache.Remove(d) } -// lockForRename takes locks on oldParent and newParent as required by Rename -// and returns a function that will unlock the locks taken. The returned -// function must be called even if a non-nil error is returned. -func lockForRename(oldParent *Dirent, oldName string, newParent *Dirent, newName string) (func(), error) { +// lockForRename takes locks on oldParent and newParent as required by Rename. +// On return, unlockForRename must always be called, even with an error. 
+// +checklocksacquire:oldParent.mu +// +checklocksacquire:newParent.mu +func lockForRename(oldParent *Dirent, oldName string, newParent *Dirent, newName string) error { renameMu.Lock() if oldParent == newParent { oldParent.mu.Lock() - return func() { - oldParent.mu.Unlock() - renameMu.Unlock() - }, nil + return nil // +checklocksforce: only one lock exists. } // Renaming between directories is a bit subtle: @@ -1297,11 +1301,7 @@ func lockForRename(oldParent *Dirent, oldName string, newParent *Dirent, newName // itself. err = unix.EINVAL } - return func() { - newParent.mu.Unlock() - oldParent.mu.Unlock() - renameMu.Unlock() - }, err + return err } child = p } @@ -1310,11 +1310,21 @@ func lockForRename(oldParent *Dirent, oldName string, newParent *Dirent, newName // have no relationship; in either case we can do this: newParent.mu.Lock() oldParent.mu.Lock() - return func() { + return nil +} + +// unlockForRename is the opposite of lockForRename. +// +checklocksrelease:oldParent.mu +// +checklocksrelease:newParent.mu +func unlockForRename(oldParent, newParent *Dirent) { + if oldParent == newParent { oldParent.mu.Unlock() - newParent.mu.Unlock() - renameMu.Unlock() - }, nil + renameMu.Unlock() // +checklocksforce: only one lock exists. + return + } + newParent.mu.Unlock() + oldParent.mu.Unlock() + renameMu.Unlock() // +checklocksforce: not tracked. } func (d *Dirent) checkSticky(ctx context.Context, victim *Dirent) error { @@ -1353,8 +1363,8 @@ func (d *Dirent) MayDelete(ctx context.Context, root *Dirent, name string) error return err } - unlock := d.lockDirectory() - defer unlock() + d.lockDirectory() + defer d.unlockDirectory() victim, err := d.walk(ctx, root, name, true /* may unlock */) if err != nil { @@ -1392,8 +1402,8 @@ func Rename(ctx context.Context, root *Dirent, oldParent *Dirent, oldName string } // Acquire global renameMu lock, and mu locks on oldParent/newParent. - unlock, err := lockForRename(oldParent, oldName, newParent, newName) - defer unlock() + err := lockForRename(oldParent, oldName, newParent, newName) + defer unlockForRename(oldParent, newParent) if err != nil { return err } diff --git a/pkg/sentry/fs/fs.go b/pkg/sentry/fs/fs.go index 44587bb37..a346c316b 100644 --- a/pkg/sentry/fs/fs.go +++ b/pkg/sentry/fs/fs.go @@ -80,23 +80,33 @@ func AsyncBarrier() { // Async executes a function asynchronously. // // Async must not be called recursively. +// +checklocksignore func Async(f func()) { workMu.RLock() - go func() { // S/R-SAFE: AsyncBarrier must be called. - defer workMu.RUnlock() // Ensure RUnlock in case of panic. - f() - }() + go asyncWork(f) // S/R-SAFE: AsyncBarrier must be called. +} + +// +checklocksignore +func asyncWork(f func()) { + // Ensure RUnlock in case of panic. + defer workMu.RUnlock() + f() } // AsyncWithContext is just like Async, except that it calls the asynchronous // function with the given context as argument. This function exists to avoid // needing to allocate an extra function on the heap in a hot path. +// +checklocksignore func AsyncWithContext(ctx context.Context, f func(context.Context)) { workMu.RLock() - go func() { // S/R-SAFE: AsyncBarrier must be called. - defer workMu.RUnlock() // Ensure RUnlock in case of panic. - f(ctx) - }() + go asyncWorkWithContext(ctx, f) +} + +// +checklocksignore +func asyncWorkWithContext(ctx context.Context, f func(context.Context)) { + // Ensure RUnlock in case of panic. 
+ defer workMu.RUnlock() + f(ctx) } // AsyncErrorBarrier waits for all outstanding asynchronous work to complete, or diff --git a/pkg/sentry/fs/gofer/inode_state.go b/pkg/sentry/fs/gofer/inode_state.go index e2af1d2ae..19f91f010 100644 --- a/pkg/sentry/fs/gofer/inode_state.go +++ b/pkg/sentry/fs/gofer/inode_state.go @@ -112,13 +112,6 @@ func (i *inodeFileState) loadLoading(_ struct{}) { // +checklocks:i.loading func (i *inodeFileState) afterLoad() { load := func() (err error) { - // See comment on i.loading(). - defer func() { - if err == nil { - i.loading.Unlock() - } - }() - // Manually restore the p9.File. name, ok := i.s.inodeMappings[i.sattr.InodeID] if !ok { @@ -167,6 +160,9 @@ func (i *inodeFileState) afterLoad() { i.savedUAttr = nil } + // See comment on i.loading(). This only unlocks on the + // non-error path. + i.loading.Unlock() // +checklocksforce: per comment. return nil } diff --git a/pkg/sentry/fs/gofer/path.go b/pkg/sentry/fs/gofer/path.go index aa2405f68..958f46bd6 100644 --- a/pkg/sentry/fs/gofer/path.go +++ b/pkg/sentry/fs/gofer/path.go @@ -47,7 +47,8 @@ func (i *inodeOperations) Lookup(ctx context.Context, dir *fs.Inode, name string return nil, linuxerr.ENAMETOOLONG } - cp := i.session().cachePolicy + s := i.session() + cp := s.cachePolicy if cp.cacheReaddir() { // Check to see if we have readdirCache that indicates the // child does not exist. Avoid holding readdirMu longer than @@ -78,7 +79,7 @@ func (i *inodeOperations) Lookup(ctx context.Context, dir *fs.Inode, name string return nil, err } - if i.session().overrides != nil { + if s.overrides != nil { // Check if file belongs to a internal named pipe. Note that it doesn't need // to check for sockets because it's done in newInodeOperations below. deviceKey := device.MultiDeviceKey{ @@ -86,13 +87,13 @@ func (i *inodeOperations) Lookup(ctx context.Context, dir *fs.Inode, name string SecondaryDevice: i.session().connID, Inode: qids[0].Path, } - unlock := i.session().overrides.lock() - if pipeInode := i.session().overrides.getPipe(deviceKey); pipeInode != nil { - unlock() + s.overrides.lock() + if pipeInode := s.overrides.getPipe(deviceKey); pipeInode != nil { + s.overrides.unlock() pipeInode.IncRef() return fs.NewDirent(ctx, pipeInode, name), nil } - unlock() + s.overrides.unlock() } // Construct the Inode operations. @@ -221,17 +222,20 @@ func (i *inodeOperations) CreateHardLink(ctx context.Context, inode *fs.Inode, t if err := i.fileState.file.link(ctx, &targetOpts.fileState.file, newName); err != nil { return err } - if i.session().cachePolicy.cacheUAttrs(inode) { + + s := i.session() + if s.cachePolicy.cacheUAttrs(inode) { // Increase link count. targetOpts.cachingInodeOps.IncLinks(ctx) } + i.touchModificationAndStatusChangeTime(ctx, inode) return nil } // CreateDirectory uses Create to create a directory named s under inodeOperations. 
-func (i *inodeOperations) CreateDirectory(ctx context.Context, dir *fs.Inode, s string, perm fs.FilePermissions) error { - if len(s) > maxFilenameLen { +func (i *inodeOperations) CreateDirectory(ctx context.Context, dir *fs.Inode, name string, perm fs.FilePermissions) error { + if len(name) > maxFilenameLen { return linuxerr.ENAMETOOLONG } @@ -247,16 +251,18 @@ func (i *inodeOperations) CreateDirectory(ctx context.Context, dir *fs.Inode, s perm.SetGID = true } - if _, err := i.fileState.file.mkdir(ctx, s, p9.FileMode(perm.LinuxMode()), p9.UID(owner.UID), p9.GID(owner.GID)); err != nil { + if _, err := i.fileState.file.mkdir(ctx, name, p9.FileMode(perm.LinuxMode()), p9.UID(owner.UID), p9.GID(owner.GID)); err != nil { return err } - if i.session().cachePolicy.cacheUAttrs(dir) { + + s := i.session() + if s.cachePolicy.cacheUAttrs(dir) { // Increase link count. // // N.B. This will update the modification time. i.cachingInodeOps.IncLinks(ctx) } - if i.session().cachePolicy.cacheReaddir() { + if s.cachePolicy.cacheReaddir() { // Invalidate readdir cache. i.markDirectoryDirty() } @@ -269,13 +275,14 @@ func (i *inodeOperations) Bind(ctx context.Context, dir *fs.Inode, name string, return nil, linuxerr.ENAMETOOLONG } - if i.session().overrides == nil { + s := i.session() + if s.overrides == nil { return nil, syserror.EOPNOTSUPP } // Stabilize the override map while creation is in progress. - unlock := i.session().overrides.lock() - defer unlock() + s.overrides.lock() + defer s.overrides.unlock() sattr, iops, err := i.createEndpointFile(ctx, dir, name, perm, p9.ModeSocket) if err != nil { @@ -284,7 +291,7 @@ func (i *inodeOperations) Bind(ctx context.Context, dir *fs.Inode, name string, // Construct the positive Dirent. childDir := fs.NewDirent(ctx, fs.NewInode(ctx, iops, dir.MountSource, sattr), name) - i.session().overrides.addBoundEndpoint(iops.fileState.key, childDir, ep) + s.overrides.addBoundEndpoint(iops.fileState.key, childDir, ep) return childDir, nil } @@ -298,8 +305,9 @@ func (i *inodeOperations) CreateFifo(ctx context.Context, dir *fs.Inode, name st mode := p9.FileMode(perm.LinuxMode()) | p9.ModeNamedPipe // N.B. FIFOs use major/minor numbers 0. + s := i.session() if _, err := i.fileState.file.mknod(ctx, name, mode, 0, 0, p9.UID(owner.UID), p9.GID(owner.GID)); err != nil { - if i.session().overrides == nil || !linuxerr.Equals(linuxerr.EPERM, err) { + if s.overrides == nil || !linuxerr.Equals(linuxerr.EPERM, err) { return err } // If gofer doesn't support mknod, check if we can create an internal fifo. @@ -311,13 +319,14 @@ func (i *inodeOperations) CreateFifo(ctx context.Context, dir *fs.Inode, name st } func (i *inodeOperations) createInternalFifo(ctx context.Context, dir *fs.Inode, name string, owner fs.FileOwner, perm fs.FilePermissions) error { - if i.session().overrides == nil { + s := i.session() + if s.overrides == nil { return linuxerr.EPERM } // Stabilize the override map while creation is in progress. - unlock := i.session().overrides.lock() - defer unlock() + s.overrides.lock() + defer s.overrides.unlock() sattr, fileOps, err := i.createEndpointFile(ctx, dir, name, perm, p9.ModeNamedPipe) if err != nil { @@ -336,7 +345,7 @@ func (i *inodeOperations) createInternalFifo(ctx context.Context, dir *fs.Inode, // Construct the positive Dirent. 
childDir := fs.NewDirent(ctx, fs.NewInode(ctx, iops, dir.MountSource, sattr), name) - i.session().overrides.addPipe(fileOps.fileState.key, childDir, inode) + s.overrides.addPipe(fileOps.fileState.key, childDir, inode) return nil } @@ -386,8 +395,9 @@ func (i *inodeOperations) Remove(ctx context.Context, dir *fs.Inode, name string return linuxerr.ENAMETOOLONG } + s := i.session() var key *device.MultiDeviceKey - if i.session().overrides != nil { + if s.overrides != nil { // Find out if file being deleted is a socket or pipe that needs to be // removed from endpoint map. if d, err := i.Lookup(ctx, dir, name); err == nil { @@ -402,8 +412,8 @@ func (i *inodeOperations) Remove(ctx context.Context, dir *fs.Inode, name string } // Stabilize the override map while deletion is in progress. - unlock := i.session().overrides.lock() - defer unlock() + s.overrides.lock() + defer s.overrides.unlock() } } } @@ -412,7 +422,7 @@ func (i *inodeOperations) Remove(ctx context.Context, dir *fs.Inode, name string return err } if key != nil { - i.session().overrides.remove(ctx, *key) + s.overrides.remove(ctx, *key) } i.touchModificationAndStatusChangeTime(ctx, dir) @@ -429,11 +439,13 @@ func (i *inodeOperations) RemoveDirectory(ctx context.Context, dir *fs.Inode, na if err := i.fileState.file.unlinkAt(ctx, name, 0x200); err != nil { return err } - if i.session().cachePolicy.cacheUAttrs(dir) { + + s := i.session() + if s.cachePolicy.cacheUAttrs(dir) { // Decrease link count and updates atime. i.cachingInodeOps.DecLinks(ctx) } - if i.session().cachePolicy.cacheReaddir() { + if s.cachePolicy.cacheReaddir() { // Invalidate readdir cache. i.markDirectoryDirty() } @@ -463,12 +475,13 @@ func (i *inodeOperations) Rename(ctx context.Context, inode *fs.Inode, oldParent } // Is the renamed entity a directory? Fix link counts. + s := i.session() if fs.IsDir(i.fileState.sattr) { // Update cached state. - if i.session().cachePolicy.cacheUAttrs(oldParent) { + if s.cachePolicy.cacheUAttrs(oldParent) { oldParentInodeOperations.cachingInodeOps.DecLinks(ctx) } - if i.session().cachePolicy.cacheUAttrs(newParent) { + if s.cachePolicy.cacheUAttrs(newParent) { // Only IncLinks if there is a new addition to // newParent. If this is replacement, then the total // count remains the same. @@ -477,7 +490,7 @@ func (i *inodeOperations) Rename(ctx context.Context, inode *fs.Inode, oldParent } } } - if i.session().cachePolicy.cacheReaddir() { + if s.cachePolicy.cacheReaddir() { // Mark old directory dirty. oldParentInodeOperations.markDirectoryDirty() if oldParent != newParent { @@ -487,17 +500,18 @@ func (i *inodeOperations) Rename(ctx context.Context, inode *fs.Inode, oldParent } // Rename always updates ctime. - if i.session().cachePolicy.cacheUAttrs(inode) { + if s.cachePolicy.cacheUAttrs(inode) { i.cachingInodeOps.TouchStatusChangeTime(ctx) } return nil } func (i *inodeOperations) touchModificationAndStatusChangeTime(ctx context.Context, inode *fs.Inode) { - if i.session().cachePolicy.cacheUAttrs(inode) { + s := i.session() + if s.cachePolicy.cacheUAttrs(inode) { i.cachingInodeOps.TouchModificationAndStatusChangeTime(ctx) } - if i.session().cachePolicy.cacheReaddir() { + if s.cachePolicy.cacheReaddir() { // Invalidate readdir cache. 
i.markDirectoryDirty() } diff --git a/pkg/sentry/fs/gofer/session.go b/pkg/sentry/fs/gofer/session.go index 7cf3522ff..b7debeecb 100644 --- a/pkg/sentry/fs/gofer/session.go +++ b/pkg/sentry/fs/gofer/session.go @@ -98,9 +98,14 @@ func (e *overrideMaps) remove(ctx context.Context, key device.MultiDeviceKey) { // lock blocks other addition and removal operations from happening while // the backing file is being created or deleted. Returns a function that unlocks // the endpoint map. -func (e *overrideMaps) lock() func() { +// +checklocksacquire:e.mu +func (e *overrideMaps) lock() { e.mu.Lock() - return func() { e.mu.Unlock() } +} + +// +checklocksrelease:e.mu +func (e *overrideMaps) unlock() { + e.mu.Unlock() } // getBoundEndpoint returns the bound endpoint mapped to the given key. @@ -366,8 +371,8 @@ func newOverrideMaps() *overrideMaps { // fillKeyMap populates key and dirent maps upon restore from saved pathmap. func (s *session) fillKeyMap(ctx context.Context) error { - unlock := s.overrides.lock() - defer unlock() + s.overrides.lock() + defer s.overrides.unlock() for ep, dirPath := range s.overrides.pathMap { _, file, err := s.attach.walk(ctx, splitAbsolutePath(dirPath)) @@ -394,8 +399,8 @@ func (s *session) fillKeyMap(ctx context.Context) error { // fillPathMap populates paths for overrides from dirents in direntMap // before save. func (s *session) fillPathMap(ctx context.Context) error { - unlock := s.overrides.lock() - defer unlock() + s.overrides.lock() + defer s.overrides.unlock() for _, endpoint := range s.overrides.keyMap { mountRoot := endpoint.dirent.MountRoot() diff --git a/pkg/sentry/fs/gofer/socket.go b/pkg/sentry/fs/gofer/socket.go index 8a1c69ac2..1fd8a0910 100644 --- a/pkg/sentry/fs/gofer/socket.go +++ b/pkg/sentry/fs/gofer/socket.go @@ -32,10 +32,11 @@ func (i *inodeOperations) BoundEndpoint(inode *fs.Inode, path string) transport. return nil } - if i.session().overrides != nil { - unlock := i.session().overrides.lock() - defer unlock() - ep := i.session().overrides.getBoundEndpoint(i.fileState.key) + s := i.session() + if s.overrides != nil { + s.overrides.lock() + defer s.overrides.unlock() + ep := s.overrides.getBoundEndpoint(i.fileState.key) if ep != nil { return ep } diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go index 237d17921..652e5fe77 100644 --- a/pkg/sentry/fsimpl/gofer/filesystem.go +++ b/pkg/sentry/fsimpl/gofer/filesystem.go @@ -147,6 +147,7 @@ func putDentrySlice(ds *[]*dentry) { // but dentry slices are allocated lazily, and it's much easier to say "defer // fs.renameMuRUnlockAndCheckCaching(&ds)" than "defer func() { // fs.renameMuRUnlockAndCheckCaching(ds) }()" to work around this. +// +checklocksrelease:fs.renameMu func (fs *filesystem) renameMuRUnlockAndCheckCaching(ctx context.Context, dsp **[]*dentry) { fs.renameMu.RUnlock() if *dsp == nil { @@ -159,6 +160,7 @@ func (fs *filesystem) renameMuRUnlockAndCheckCaching(ctx context.Context, dsp ** putDentrySlice(*dsp) } +// +checklocksrelease:fs.renameMu func (fs *filesystem) renameMuUnlockAndCheckCaching(ctx context.Context, ds **[]*dentry) { if *ds == nil { fs.renameMu.Unlock() @@ -540,7 +542,7 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b if child.syntheticChildren != 0 { // This is definitely not an empty directory, irrespective of // fs.opts.interop. - vfsObj.AbortDeleteDentry(&child.vfsd) + vfsObj.AbortDeleteDentry(&child.vfsd) // +checklocksforce: PrepareDeleteDentry called if child != nil. 
return linuxerr.ENOTEMPTY } // If InteropModeShared is in effect and the first call to @@ -550,12 +552,12 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b // still exist) would be a waste of time. if child.cachedMetadataAuthoritative() { if !child.isDir() { - vfsObj.AbortDeleteDentry(&child.vfsd) + vfsObj.AbortDeleteDentry(&child.vfsd) // +checklocksforce: see above. return syserror.ENOTDIR } for _, grandchild := range child.children { if grandchild != nil { - vfsObj.AbortDeleteDentry(&child.vfsd) + vfsObj.AbortDeleteDentry(&child.vfsd) // +checklocksforce: see above. return linuxerr.ENOTEMPTY } } @@ -565,12 +567,12 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b } else { // child must be a non-directory file. if child != nil && child.isDir() { - vfsObj.AbortDeleteDentry(&child.vfsd) + vfsObj.AbortDeleteDentry(&child.vfsd) // +checklocksforce: see above. return syserror.EISDIR } if rp.MustBeDir() { if child != nil { - vfsObj.AbortDeleteDentry(&child.vfsd) + vfsObj.AbortDeleteDentry(&child.vfsd) // +checklocksforce: see above. } return syserror.ENOTDIR } @@ -583,7 +585,7 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b err = parent.file.unlinkAt(ctx, name, flags) if err != nil { if child != nil { - vfsObj.AbortDeleteDentry(&child.vfsd) + vfsObj.AbortDeleteDentry(&child.vfsd) // +checklocksforce: see above. } return err } @@ -601,7 +603,7 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b } if child != nil { - vfsObj.CommitDeleteDentry(ctx, &child.vfsd) + vfsObj.CommitDeleteDentry(ctx, &child.vfsd) // +checklocksforce: see above. child.setDeleted() if child.isSynthetic() { parent.syntheticChildren-- diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go index fe4c2e0e1..2f85215d9 100644 --- a/pkg/sentry/fsimpl/gofer/gofer.go +++ b/pkg/sentry/fsimpl/gofer/gofer.go @@ -947,10 +947,10 @@ func (d *dentry) cachedMetadataAuthoritative() bool { // updateFromP9Attrs is called to update d's metadata after an update from the // remote filesystem. // Precondition: d.metadataMu must be locked. +// +checklocks:d.metadataMu func (d *dentry) updateFromP9AttrsLocked(mask p9.AttrMask, attr *p9.Attr) { if mask.Mode { if got, want := uint32(attr.Mode.FileType()), d.fileType(); got != want { - d.metadataMu.Unlock() panic(fmt.Sprintf("gofer.dentry file type changed from %#o to %#o", want, got)) } atomic.StoreUint32(&d.mode, uint32(attr.Mode)) @@ -989,6 +989,7 @@ func (d *dentry) updateFromP9AttrsLocked(mask p9.AttrMask, attr *p9.Attr) { // Preconditions: !d.isSynthetic(). // Preconditions: d.metadataMu is locked. +// +checklocks:d.metadataMu func (d *dentry) refreshSizeLocked(ctx context.Context) error { d.handleMu.RLock() @@ -1020,6 +1021,7 @@ func (d *dentry) updateFromGetattr(ctx context.Context) error { // Preconditions: // * !d.isSynthetic(). // * d.metadataMu is locked. +// +checklocks:d.metadataMu func (d *dentry) updateFromGetattrLocked(ctx context.Context) error { // Use d.readFile or d.writeFile, which represent 9P FIDs that have been // opened, in preference to d.file, which represents a 9P fid that has not. @@ -1044,7 +1046,8 @@ func (d *dentry) updateFromGetattrLocked(ctx context.Context) error { _, attrMask, attr, err := file.getAttr(ctx, dentryAttrMask()) if handleMuRLocked { - d.handleMu.RUnlock() // must be released before updateFromP9AttrsLocked() + // handleMu must be released before updateFromP9AttrsLocked(). 
+ d.handleMu.RUnlock() // +checklocksforce: complex case. } if err != nil { return err @@ -1470,7 +1473,7 @@ func (d *dentry) checkCachingLocked(ctx context.Context, renameMuWriteLocked boo if d.isDeleted() { d.watches.HandleDeletion(ctx) } - d.destroyLocked(ctx) + d.destroyLocked(ctx) // +checklocksforce: renameMu must be acquired at this point. return } // If d still has inotify watches and it is not deleted or invalidated, it @@ -1498,7 +1501,7 @@ func (d *dentry) checkCachingLocked(ctx context.Context, renameMuWriteLocked boo delete(d.parent.children, d.name) d.parent.dirMu.Unlock() } - d.destroyLocked(ctx) + d.destroyLocked(ctx) // +checklocksforce: see above. return } @@ -1527,7 +1530,7 @@ func (d *dentry) checkCachingLocked(ctx context.Context, renameMuWriteLocked boo d.fs.renameMu.Lock() defer d.fs.renameMu.Unlock() } - d.fs.evictCachedDentryLocked(ctx) + d.fs.evictCachedDentryLocked(ctx) // +checklocksforce: see above. } } @@ -1544,6 +1547,7 @@ func (d *dentry) removeFromCacheLocked() { // Precondition: fs.renameMu must be locked for writing; it may be temporarily // unlocked. +// +checklocks:fs.renameMu func (fs *filesystem) evictAllCachedDentriesLocked(ctx context.Context) { for fs.cachedDentriesLen != 0 { fs.evictCachedDentryLocked(ctx) @@ -1552,6 +1556,7 @@ func (fs *filesystem) evictAllCachedDentriesLocked(ctx context.Context) { // Preconditions: // * fs.renameMu must be locked for writing; it may be temporarily unlocked. +// +checklocks:fs.renameMu func (fs *filesystem) evictCachedDentryLocked(ctx context.Context) { fs.cacheMu.Lock() victim := fs.cachedDentries.Back() @@ -1588,7 +1593,7 @@ func (fs *filesystem) evictCachedDentryLocked(ctx context.Context) { // will try to acquire fs.renameMu (which we have already acquired). Hence, // fs.renameMu will synchronize the destroy attempts. victim.cachingMu.Unlock() - victim.destroyLocked(ctx) + victim.destroyLocked(ctx) // +checklocksforce: owned as precondition, victim.fs == fs. } // destroyLocked destroys the dentry. @@ -1598,6 +1603,7 @@ func (fs *filesystem) evictCachedDentryLocked(ctx context.Context) { // * d.refs == 0. // * d.parent.children[d.name] != d, i.e. d is not reachable by path traversal // from its former parent dentry. +// +checklocks:d.fs.renameMu func (d *dentry) destroyLocked(ctx context.Context) { switch atomic.LoadInt64(&d.refs) { case 0: diff --git a/pkg/sentry/fsimpl/gofer/revalidate.go b/pkg/sentry/fsimpl/gofer/revalidate.go index 8f81f0822..226790a11 100644 --- a/pkg/sentry/fsimpl/gofer/revalidate.go +++ b/pkg/sentry/fsimpl/gofer/revalidate.go @@ -247,16 +247,16 @@ func (fs *filesystem) revalidateHelper(ctx context.Context, vfsObj *vfs.VirtualF if found && !d.isSynthetic() { // First dentry is where the search is starting, just update attributes // since it cannot be replaced. - d.updateFromP9AttrsLocked(stats[i].Valid, &stats[i].Attr) + d.updateFromP9AttrsLocked(stats[i].Valid, &stats[i].Attr) // +checklocksforce: acquired by lockAllMetadata. } - d.metadataMu.Unlock() + d.metadataMu.Unlock() // +checklocksforce: see above. continue } // Note that synthetic dentries will always fails the comparison check // below. if !found || d.qidPath != stats[i].QID.Path { - d.metadataMu.Unlock() + d.metadataMu.Unlock() // +checklocksforce: see above. if !found && d.isSynthetic() { // We have a synthetic file, and no remote file has arisen to replace // it. @@ -298,7 +298,7 @@ func (fs *filesystem) revalidateHelper(ctx context.Context, vfsObj *vfs.VirtualF } // The file at this path hasn't changed. 
Just update cached metadata. - d.updateFromP9AttrsLocked(stats[i].Valid, &stats[i].Attr) + d.updateFromP9AttrsLocked(stats[i].Valid, &stats[i].Attr) // +checklocksforce: see above. d.metadataMu.Unlock() } @@ -354,6 +354,7 @@ func (r *revalidateState) add(name string, d *dentry) { r.dentries = append(r.dentries, d) } +// +checklocksignore func (r *revalidateState) lockAllMetadata() { for _, d := range r.dentries { d.metadataMu.Lock() @@ -372,6 +373,7 @@ func (r *revalidateState) popFront() *dentry { // reset releases all metadata locks and resets all fields to allow this // instance to be reused. +// +checklocksignore func (r *revalidateState) reset() { if r.locked { // Unlock any remaining dentries. diff --git a/pkg/sentry/fsimpl/gofer/symlink.go b/pkg/sentry/fsimpl/gofer/symlink.go index 2ec819f86..dbd834c67 100644 --- a/pkg/sentry/fsimpl/gofer/symlink.go +++ b/pkg/sentry/fsimpl/gofer/symlink.go @@ -41,7 +41,7 @@ func (d *dentry) readlink(ctx context.Context, mnt *vfs.Mount) (string, error) { d.haveTarget = true d.target = target } - d.dataMu.Unlock() + d.dataMu.Unlock() // +checklocksforce: guaranteed locked from above. } return target, err } diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go index 38c2b6df1..20d2526ad 100644 --- a/pkg/sentry/fsimpl/kernfs/filesystem.go +++ b/pkg/sentry/fsimpl/kernfs/filesystem.go @@ -752,7 +752,7 @@ func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa fs.deferDecRef(replaced) replaceVFSD = replaced.VFSDentry() } - virtfs.CommitRenameReplaceDentry(ctx, srcVFSD, replaceVFSD) + virtfs.CommitRenameReplaceDentry(ctx, srcVFSD, replaceVFSD) // +checklocksforce: to may be nil, that's okay. return nil } @@ -788,7 +788,7 @@ func (fs *Filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error defer mntns.DecRef(ctx) vfsd := d.VFSDentry() if err := virtfs.PrepareDeleteDentry(mntns, vfsd); err != nil { - return err + return err // +checklocksforce: vfsd is not locked. } if err := parentDentry.inode.RmDir(ctx, d.name, d.inode); err != nil { diff --git a/pkg/sentry/fsimpl/overlay/filesystem.go b/pkg/sentry/fsimpl/overlay/filesystem.go index 41207211a..77f9affc1 100644 --- a/pkg/sentry/fsimpl/overlay/filesystem.go +++ b/pkg/sentry/fsimpl/overlay/filesystem.go @@ -87,7 +87,7 @@ func putDentrySlice(ds *[]*dentry) { // fs.renameMuRUnlockAndCheckDrop(&ds)" than "defer func() { // fs.renameMuRUnlockAndCheckDrop(ds) }()" to work around this. // -// +checklocks:fs.renameMu +// +checklocksrelease:fs.renameMu func (fs *filesystem) renameMuRUnlockAndCheckDrop(ctx context.Context, dsp **[]*dentry) { fs.renameMu.RUnlock() if *dsp == nil { @@ -113,7 +113,7 @@ func (fs *filesystem) renameMuRUnlockAndCheckDrop(ctx context.Context, dsp **[]* putDentrySlice(*dsp) } -// +checklocks:fs.renameMu +// +checklocksrelease:fs.renameMu func (fs *filesystem) renameMuUnlockAndCheckDrop(ctx context.Context, ds **[]*dentry) { if *ds == nil { fs.renameMu.Unlock() diff --git a/pkg/sentry/fsimpl/verity/filesystem.go b/pkg/sentry/fsimpl/verity/filesystem.go index e4bfbd3c9..358a66072 100644 --- a/pkg/sentry/fsimpl/verity/filesystem.go +++ b/pkg/sentry/fsimpl/verity/filesystem.go @@ -75,6 +75,7 @@ func putDentrySlice(ds *[]*dentry) { // but dentry slices are allocated lazily, and it's much easier to say "defer // fs.renameMuRUnlockAndCheckDrop(&ds)" than "defer func() { // fs.renameMuRUnlockAndCheckDrop(ds) }()" to work around this. 
+// +checklocksrelease:fs.renameMu func (fs *filesystem) renameMuRUnlockAndCheckDrop(ctx context.Context, ds **[]*dentry) { fs.renameMu.RUnlock() if *ds == nil { @@ -90,6 +91,7 @@ func (fs *filesystem) renameMuRUnlockAndCheckDrop(ctx context.Context, ds **[]*d putDentrySlice(*ds) } +// +checklocksrelease:fs.renameMu func (fs *filesystem) renameMuUnlockAndCheckDrop(ctx context.Context, ds **[]*dentry) { if *ds == nil { fs.renameMu.Unlock() diff --git a/pkg/sentry/kernel/futex/futex.go b/pkg/sentry/kernel/futex/futex.go index 6377abb94..f5c364c96 100644 --- a/pkg/sentry/kernel/futex/futex.go +++ b/pkg/sentry/kernel/futex/futex.go @@ -398,8 +398,8 @@ func (m *Manager) Fork() *Manager { } // lockBucket returns a locked bucket for the given key. -func (m *Manager) lockBucket(k *Key) *bucket { - var b *bucket +// +checklocksacquire:b.mu +func (m *Manager) lockBucket(k *Key) (b *bucket) { if k.Kind == KindSharedMappable { b = m.sharedBucket } else { @@ -410,7 +410,9 @@ func (m *Manager) lockBucket(k *Key) *bucket { } // lockBuckets returns locked buckets for the given keys. -func (m *Manager) lockBuckets(k1, k2 *Key) (*bucket, *bucket) { +// +checklocksacquire:b1.mu +// +checklocksacquire:b2.mu +func (m *Manager) lockBuckets(k1, k2 *Key) (b1 *bucket, b2 *bucket) { // Buckets must be consistently ordered to avoid circular lock // dependencies. We order buckets in m.privateBuckets by index (lowest // index first), and all buckets in m.privateBuckets precede @@ -420,8 +422,8 @@ func (m *Manager) lockBuckets(k1, k2 *Key) (*bucket, *bucket) { if k1.Kind != KindSharedMappable && k2.Kind != KindSharedMappable { i1 := bucketIndexForAddr(k1.addr()) i2 := bucketIndexForAddr(k2.addr()) - b1 := &m.privateBuckets[i1] - b2 := &m.privateBuckets[i2] + b1 = &m.privateBuckets[i1] + b2 = &m.privateBuckets[i2] switch { case i1 < i2: b1.mu.Lock() @@ -432,19 +434,30 @@ func (m *Manager) lockBuckets(k1, k2 *Key) (*bucket, *bucket) { default: b1.mu.Lock() } - return b1, b2 + return b1, b2 // +checklocksforce } // At least one of b1 or b2 should be m.sharedBucket. - b1 := m.sharedBucket - b2 := m.sharedBucket + b1 = m.sharedBucket + b2 = m.sharedBucket if k1.Kind != KindSharedMappable { b1 = m.lockBucket(k1) } else if k2.Kind != KindSharedMappable { b2 = m.lockBucket(k2) } m.sharedBucket.mu.Lock() - return b1, b2 + return b1, b2 // +checklocksforce +} + +// unlockBuckets unlocks two buckets. +// +checklocksrelease:b1.mu +// +checklocksrelease:b2.mu +func (m *Manager) unlockBuckets(b1, b2 *bucket) { + b1.mu.Unlock() + if b1 != b2 { + b2.mu.Unlock() + } + return // +checklocksforce } // Wake wakes up to n waiters matching the bitmask on the given addr. 
@@ -477,10 +490,7 @@ func (m *Manager) doRequeue(t Target, addr, naddr hostarch.Addr, private bool, c defer k2.release(t) b1, b2 := m.lockBuckets(&k1, &k2) - defer b1.mu.Unlock() - if b2 != b1 { - defer b2.mu.Unlock() - } + defer m.unlockBuckets(b1, b2) if checkval { if err := check(t, addr, val); err != nil { @@ -527,10 +537,7 @@ func (m *Manager) WakeOp(t Target, addr1, addr2 hostarch.Addr, private bool, nwa defer k2.release(t) b1, b2 := m.lockBuckets(&k1, &k2) - defer b1.mu.Unlock() - if b2 != b1 { - defer b2.mu.Unlock() - } + defer m.unlockBuckets(b1, b2) done := 0 cond, err := atomicOp(t, addr2, op) diff --git a/pkg/sentry/kernel/pipe/pipe_unsafe.go b/pkg/sentry/kernel/pipe/pipe_unsafe.go index dd60cba24..077c5d596 100644 --- a/pkg/sentry/kernel/pipe/pipe_unsafe.go +++ b/pkg/sentry/kernel/pipe/pipe_unsafe.go @@ -23,6 +23,8 @@ import ( // concurrent calls cannot deadlock. // // Preconditions: x != y. +// +checklocksacquire:x.mu +// +checklocksacquire:y.mu func lockTwoPipes(x, y *Pipe) { // Lock the two pipes in order of increasing address. if uintptr(unsafe.Pointer(x)) < uintptr(unsafe.Pointer(y)) { diff --git a/pkg/sentry/kernel/pipe/pipe_util.go b/pkg/sentry/kernel/pipe/pipe_util.go index 84f9f6234..c883a9014 100644 --- a/pkg/sentry/kernel/pipe/pipe_util.go +++ b/pkg/sentry/kernel/pipe/pipe_util.go @@ -157,6 +157,7 @@ func (p *Pipe) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArgume // // mu must be held by the caller. waitFor returns with mu held, but it will // drop mu before blocking for any reader/writers. +// +checklocks:mu func waitFor(mu *sync.Mutex, wakeupChan *chan struct{}, sleeper amutex.Sleeper) bool { // Ideally this function would simply use a condition variable. However, the // wait needs to be interruptible via 'sleeper', so we must sychronize via a diff --git a/pkg/sentry/kernel/ptrace.go b/pkg/sentry/kernel/ptrace.go index cdaee5d7f..161140980 100644 --- a/pkg/sentry/kernel/ptrace.go +++ b/pkg/sentry/kernel/ptrace.go @@ -652,6 +652,7 @@ func (t *Task) forgetTracerLocked() { // Preconditions: // * The signal mutex must be locked. // * The caller must be running on the task goroutine. +// +checklocks:t.tg.signalHandlers.mu func (t *Task) ptraceSignalLocked(info *linux.SignalInfo) bool { if linux.Signal(info.Signo) == linux.SIGKILL { return false diff --git a/pkg/sentry/kernel/sessions.go b/pkg/sentry/kernel/sessions.go index c0c1f1f13..ae21a55da 100644 --- a/pkg/sentry/kernel/sessions.go +++ b/pkg/sentry/kernel/sessions.go @@ -121,8 +121,9 @@ func (pg *ProcessGroup) Originator() *ThreadGroup { // IsOrphan returns true if this process group is an orphan. func (pg *ProcessGroup) IsOrphan() bool { - pg.originator.TaskSet().mu.RLock() - defer pg.originator.TaskSet().mu.RUnlock() + ts := pg.originator.TaskSet() + ts.mu.RLock() + defer ts.mu.RUnlock() return pg.ancestors == 0 } diff --git a/pkg/sentry/mm/syscalls.go b/pkg/sentry/mm/syscalls.go index 125fd855b..b51ec6aa7 100644 --- a/pkg/sentry/mm/syscalls.go +++ b/pkg/sentry/mm/syscalls.go @@ -204,6 +204,7 @@ func (mm *MemoryManager) populateVMA(ctx context.Context, vseg vmaIterator, ar h // * vseg.Range().IsSupersetOf(ar). // // Postconditions: mm.mappingMu will be unlocked. +// +checklocksrelease:mm.mappingMu func (mm *MemoryManager) populateVMAAndUnlock(ctx context.Context, vseg vmaIterator, ar hostarch.AddrRange, precommit bool) { // See populateVMA above for commentary. 
if !vseg.ValuePtr().effectivePerms.Any() { diff --git a/pkg/sentry/pgalloc/pgalloc.go b/pkg/sentry/pgalloc/pgalloc.go index f7d5a1800..0c8542485 100644 --- a/pkg/sentry/pgalloc/pgalloc.go +++ b/pkg/sentry/pgalloc/pgalloc.go @@ -945,7 +945,7 @@ func (f *MemoryFile) updateUsageLocked(currentUsage uint64, checkCommitted func( // NOTE(b/165896008): mincore (which is passed as checkCommitted) // by f.UpdateUsage() might take a really long time. So unlock f.mu // while checkCommitted runs. - f.mu.Unlock() + f.mu.Unlock() // +checklocksforce err := checkCommitted(s, buf) f.mu.Lock() if err != nil { diff --git a/pkg/sentry/vfs/dentry.go b/pkg/sentry/vfs/dentry.go index 242eb5ecb..cb92b6eee 100644 --- a/pkg/sentry/vfs/dentry.go +++ b/pkg/sentry/vfs/dentry.go @@ -196,11 +196,12 @@ func (d *Dentry) OnZeroWatches(ctx context.Context) { // PrepareDeleteDentry must be called before attempting to delete the file // represented by d. If PrepareDeleteDentry succeeds, the caller must call // AbortDeleteDentry or CommitDeleteDentry depending on the deletion's outcome. +// +checklocksacquire:d.mu func (vfs *VirtualFilesystem) PrepareDeleteDentry(mntns *MountNamespace, d *Dentry) error { vfs.mountMu.Lock() if mntns.mountpoints[d] != 0 { vfs.mountMu.Unlock() - return linuxerr.EBUSY + return linuxerr.EBUSY // +checklocksforce: inconsistent return. } d.mu.Lock() vfs.mountMu.Unlock() @@ -211,14 +212,14 @@ func (vfs *VirtualFilesystem) PrepareDeleteDentry(mntns *MountNamespace, d *Dent // AbortDeleteDentry must be called after PrepareDeleteDentry if the deletion // fails. -// +checklocks:d.mu +// +checklocksrelease:d.mu func (vfs *VirtualFilesystem) AbortDeleteDentry(d *Dentry) { d.mu.Unlock() } // CommitDeleteDentry must be called after PrepareDeleteDentry if the deletion // succeeds. -// +checklocks:d.mu +// +checklocksrelease:d.mu func (vfs *VirtualFilesystem) CommitDeleteDentry(ctx context.Context, d *Dentry) { d.dead = true d.mu.Unlock() @@ -249,16 +250,18 @@ func (vfs *VirtualFilesystem) InvalidateDentry(ctx context.Context, d *Dentry) { // Preconditions: // * If to is not nil, it must be a child Dentry from the same Filesystem. // * from != to. +// +checklocksacquire:from.mu +// +checklocksacquire:to.mu func (vfs *VirtualFilesystem) PrepareRenameDentry(mntns *MountNamespace, from, to *Dentry) error { vfs.mountMu.Lock() if mntns.mountpoints[from] != 0 { vfs.mountMu.Unlock() - return linuxerr.EBUSY + return linuxerr.EBUSY // +checklocksforce: no locks acquired. } if to != nil { if mntns.mountpoints[to] != 0 { vfs.mountMu.Unlock() - return linuxerr.EBUSY + return linuxerr.EBUSY // +checklocksforce: no locks acquired. } to.mu.Lock() } @@ -267,13 +270,13 @@ func (vfs *VirtualFilesystem) PrepareRenameDentry(mntns *MountNamespace, from, t // Return with from.mu and to.mu locked, which will be unlocked by // AbortRenameDentry, CommitRenameReplaceDentry, or // CommitRenameExchangeDentry. - return nil + return nil // +checklocksforce: to may not be acquired. } // AbortRenameDentry must be called after PrepareRenameDentry if the rename // fails. -// +checklocks:from.mu -// +checklocks:to.mu +// +checklocksrelease:from.mu +// +checklocksrelease:to.mu func (vfs *VirtualFilesystem) AbortRenameDentry(from, to *Dentry) { from.mu.Unlock() if to != nil { @@ -286,8 +289,8 @@ func (vfs *VirtualFilesystem) AbortRenameDentry(from, to *Dentry) { // that was replaced by from. // // Preconditions: PrepareRenameDentry was previously called on from and to. 
-// +checklocks:from.mu -// +checklocks:to.mu +// +checklocksrelease:from.mu +// +checklocksrelease:to.mu func (vfs *VirtualFilesystem) CommitRenameReplaceDentry(ctx context.Context, from, to *Dentry) { from.mu.Unlock() if to != nil { @@ -303,8 +306,8 @@ func (vfs *VirtualFilesystem) CommitRenameReplaceDentry(ctx context.Context, fro // from and to are exchanged by rename(RENAME_EXCHANGE). // // Preconditions: PrepareRenameDentry was previously called on from and to. -// +checklocks:from.mu -// +checklocks:to.mu +// +checklocksrelease:from.mu +// +checklocksrelease:to.mu func (vfs *VirtualFilesystem) CommitRenameExchangeDentry(from, to *Dentry) { from.mu.Unlock() to.mu.Unlock() diff --git a/pkg/sync/mutex_unsafe.go b/pkg/sync/mutex_unsafe.go index 411a80a8a..b829765d9 100644 --- a/pkg/sync/mutex_unsafe.go +++ b/pkg/sync/mutex_unsafe.go @@ -32,6 +32,18 @@ func (m *CrossGoroutineMutex) state() *int32 { return &(*syncMutex)(unsafe.Pointer(&m.Mutex)).state } +// Lock locks the underlying Mutex. +// +checklocksignore +func (m *CrossGoroutineMutex) Lock() { + m.Mutex.Lock() +} + +// Unlock unlocks the underlying Mutex. +// +checklocksignore +func (m *CrossGoroutineMutex) Unlock() { + m.Mutex.Unlock() +} + const ( mutexUnlocked = 0 mutexLocked = 1 @@ -62,6 +74,7 @@ type Mutex struct { // Lock locks m. If the lock is already in use, the calling goroutine blocks // until the mutex is available. +// +checklocksignore func (m *Mutex) Lock() { noteLock(unsafe.Pointer(m)) m.m.Lock() @@ -80,6 +93,7 @@ func (m *Mutex) Unlock() { // TryLock tries to acquire the mutex. It returns true if it succeeds and false // otherwise. TryLock does not block. +// +checklocksignore func (m *Mutex) TryLock() bool { // Note lock first to enforce proper locking even if unsuccessful. noteLock(unsafe.Pointer(m)) diff --git a/pkg/sync/rwmutex_unsafe.go b/pkg/sync/rwmutex_unsafe.go index 892d3e641..7829b06db 100644 --- a/pkg/sync/rwmutex_unsafe.go +++ b/pkg/sync/rwmutex_unsafe.go @@ -37,6 +37,7 @@ const rwmutexMaxReaders = 1 << 30 // TryRLock locks rw for reading. It returns true if it succeeds and false // otherwise. It does not block. +// +checklocksignore func (rw *CrossGoroutineRWMutex) TryRLock() bool { if RaceEnabled { RaceDisable() @@ -65,6 +66,7 @@ func (rw *CrossGoroutineRWMutex) TryRLock() bool { // It should not be used for recursive read locking; a blocked Lock call // excludes new readers from acquiring the lock. See the documentation on the // RWMutex type. +// +checklocksignore func (rw *CrossGoroutineRWMutex) RLock() { if RaceEnabled { RaceDisable() @@ -83,6 +85,7 @@ func (rw *CrossGoroutineRWMutex) RLock() { // // Preconditions: // * rw is locked for reading. +// +checklocksignore func (rw *CrossGoroutineRWMutex) RUnlock() { if RaceEnabled { RaceReleaseMerge(unsafe.Pointer(&rw.writerSem)) @@ -134,6 +137,7 @@ func (rw *CrossGoroutineRWMutex) TryLock() bool { // Lock locks rw for writing. If the lock is already locked for reading or // writing, Lock blocks until the lock is available. +// +checklocksignore func (rw *CrossGoroutineRWMutex) Lock() { if RaceEnabled { RaceDisable() @@ -228,6 +232,7 @@ type RWMutex struct { // TryRLock locks rw for reading. It returns true if it succeeds and false // otherwise. It does not block. +// +checklocksignore func (rw *RWMutex) TryRLock() bool { // Note lock first to enforce proper locking even if unsuccessful. 
noteLock(unsafe.Pointer(rw)) @@ -243,6 +248,7 @@ func (rw *RWMutex) TryRLock() bool { // It should not be used for recursive read locking; a blocked Lock call // excludes new readers from acquiring the lock. See the documentation on the // RWMutex type. +// +checklocksignore func (rw *RWMutex) RLock() { noteLock(unsafe.Pointer(rw)) rw.m.RLock() @@ -261,6 +267,7 @@ func (rw *RWMutex) RUnlock() { // TryLock locks rw for writing. It returns true if it succeeds and false // otherwise. It does not block. +// +checklocksignore func (rw *RWMutex) TryLock() bool { // Note lock first to enforce proper locking even if unsuccessful. noteLock(unsafe.Pointer(rw)) @@ -273,6 +280,7 @@ func (rw *RWMutex) TryLock() bool { // Lock locks rw for writing. If the lock is already locked for reading or // writing, Lock blocks until the lock is available. +// +checklocksignore func (rw *RWMutex) Lock() { noteLock(unsafe.Pointer(rw)) rw.m.Lock() diff --git a/pkg/tcpip/stack/addressable_endpoint_state.go b/pkg/tcpip/stack/addressable_endpoint_state.go index ce9cebdaa..ae0bb4ace 100644 --- a/pkg/tcpip/stack/addressable_endpoint_state.go +++ b/pkg/tcpip/stack/addressable_endpoint_state.go @@ -249,7 +249,7 @@ func (a *AddressableEndpointState) addAndAcquireAddressLocked(addr tcpip.Address // or we are adding a new temporary or permanent address. // // The address MUST be write locked at this point. - defer addrState.mu.Unlock() + defer addrState.mu.Unlock() // +checklocksforce if permanent { if addrState.mu.kind.IsPermanent() { diff --git a/pkg/tcpip/stack/conntrack.go b/pkg/tcpip/stack/conntrack.go index 782e74b24..068dab7ce 100644 --- a/pkg/tcpip/stack/conntrack.go +++ b/pkg/tcpip/stack/conntrack.go @@ -363,7 +363,7 @@ func (ct *ConnTrack) insertConn(conn *conn) { // Unlocking can happen in any order. ct.buckets[tupleBucket].mu.Unlock() if tupleBucket != replyBucket { - ct.buckets[replyBucket].mu.Unlock() + ct.buckets[replyBucket].mu.Unlock() // +checklocksforce } } @@ -626,7 +626,7 @@ func (ct *ConnTrack) reapTupleLocked(tuple *tuple, bucket int, now time.Time) bo // Don't re-unlock if both tuples are in the same bucket. if differentBuckets { - ct.buckets[replyBucket].mu.Unlock() + ct.buckets[replyBucket].mu.Unlock() // +checklocksforce } return true diff --git a/pkg/tcpip/transport/icmp/endpoint.go b/pkg/tcpip/transport/icmp/endpoint.go index cb316d27a..f9a15efb2 100644 --- a/pkg/tcpip/transport/icmp/endpoint.go +++ b/pkg/tcpip/transport/icmp/endpoint.go @@ -213,6 +213,7 @@ func (e *endpoint) Read(dst io.Writer, opts tcpip.ReadOptions) (tcpip.ReadResult // reacquire the mutex in exclusive mode. // // Returns true for retry if preparation should be retried. +// +checklocks:e.mu func (e *endpoint) prepareForWrite(to *tcpip.FullAddress) (retry bool, err tcpip.Error) { switch e.state { case stateInitial: @@ -229,10 +230,8 @@ func (e *endpoint) prepareForWrite(to *tcpip.FullAddress) (retry bool, err tcpip } e.mu.RUnlock() - defer e.mu.RLock() - e.mu.Lock() - defer e.mu.Unlock() + defer e.mu.DowngradeLock() // The state changed when we released the shared locked and re-acquired // it in exclusive mode. Try again. diff --git a/pkg/tcpip/transport/tcp/accept.go b/pkg/tcpip/transport/tcp/accept.go index d807b13b7..aa413ad05 100644 --- a/pkg/tcpip/transport/tcp/accept.go +++ b/pkg/tcpip/transport/tcp/accept.go @@ -330,7 +330,9 @@ func (l *listenContext) performHandshake(s *segment, opts *header.TCPSynOptions, } ep := h.ep - if err := h.complete(); err != nil { + // N.B. 
the endpoint is generated above by startHandshake, and will be + // returned locked. This first call is forced. + if err := h.complete(); err != nil { // +checklocksforce ep.stack.Stats().TCP.FailedConnectionAttempts.Increment() ep.stats.FailedConnectionAttempts.Increment() l.cleanupFailedHandshake(h) @@ -364,6 +366,7 @@ func (l *listenContext) closeAllPendingEndpoints() { } // Precondition: h.ep.mu must be held. +// +checklocks:h.ep.mu func (l *listenContext) cleanupFailedHandshake(h *handshake) { e := h.ep e.mu.Unlock() @@ -504,7 +507,9 @@ func (e *endpoint) handleSynSegment(ctx *listenContext, s *segment, opts *header } go func() { - if err := h.complete(); err != nil { + // Note that startHandshake returns a locked endpoint. The + // force call here just makes it so. + if err := h.complete(); err != nil { // +checklocksforce e.stack.Stats().TCP.FailedConnectionAttempts.Increment() e.stats.FailedConnectionAttempts.Increment() ctx.cleanupFailedHandshake(h) diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go index e39d1623d..93ed161f9 100644 --- a/pkg/tcpip/transport/tcp/connect.go +++ b/pkg/tcpip/transport/tcp/connect.go @@ -511,6 +511,7 @@ func (h *handshake) start() { } // complete completes the TCP 3-way handshake initiated by h.start(). +// +checklocks:h.ep.mu func (h *handshake) complete() tcpip.Error { // Set up the wakers. var s sleep.Sleeper @@ -1283,42 +1284,45 @@ func (e *endpoint) disableKeepaliveTimer() { e.keepalive.Unlock() } -// protocolMainLoop is the main loop of the TCP protocol. It runs in its own -// goroutine and is responsible for sending segments and handling received -// segments. -func (e *endpoint) protocolMainLoop(handshake bool, wakerInitDone chan<- struct{}) tcpip.Error { - e.mu.Lock() - var closeTimer tcpip.Timer - var closeWaker sleep.Waker - - epilogue := func() { - // e.mu is expected to be hold upon entering this section. - if e.snd != nil { - e.snd.resendTimer.cleanup() - e.snd.probeTimer.cleanup() - e.snd.reorderTimer.cleanup() - } +// protocolMainLoopDone is called at the end of protocolMainLoop. +// +checklocksrelease:e.mu +func (e *endpoint) protocolMainLoopDone(closeTimer tcpip.Timer, closeWaker *sleep.Waker) { + if e.snd != nil { + e.snd.resendTimer.cleanup() + e.snd.probeTimer.cleanup() + e.snd.reorderTimer.cleanup() + } - if closeTimer != nil { - closeTimer.Stop() - } + if closeTimer != nil { + closeTimer.Stop() + } - e.completeWorkerLocked() + e.completeWorkerLocked() - if e.drainDone != nil { - close(e.drainDone) - } + if e.drainDone != nil { + close(e.drainDone) + } - e.mu.Unlock() + e.mu.Unlock() - e.drainClosingSegmentQueue() + e.drainClosingSegmentQueue() - // When the protocol loop exits we should wake up our waiters. - e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents) - } + // When the protocol loop exits we should wake up our waiters. + e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents) +} +// protocolMainLoop is the main loop of the TCP protocol. It runs in its own +// goroutine and is responsible for sending segments and handling received +// segments. 
+func (e *endpoint) protocolMainLoop(handshake bool, wakerInitDone chan<- struct{}) tcpip.Error { + var ( + closeTimer tcpip.Timer + closeWaker sleep.Waker + ) + + e.mu.Lock() if handshake { - if err := e.h.complete(); err != nil { + if err := e.h.complete(); err != nil { // +checklocksforce e.lastErrorMu.Lock() e.lastError = err e.lastErrorMu.Unlock() @@ -1327,8 +1331,7 @@ func (e *endpoint) protocolMainLoop(handshake bool, wakerInitDone chan<- struct{ e.hardError = err e.workerCleanup = true - // Lock released below. - epilogue() + e.protocolMainLoopDone(closeTimer, &closeWaker) return err } } @@ -1472,7 +1475,7 @@ func (e *endpoint) protocolMainLoop(handshake bool, wakerInitDone chan<- struct{ // Only block the worker if the endpoint // is not in closed state or error state. close(e.drainDone) - e.mu.Unlock() + e.mu.Unlock() // +checklocksforce <-e.undrain e.mu.Lock() } @@ -1533,8 +1536,6 @@ func (e *endpoint) protocolMainLoop(handshake bool, wakerInitDone chan<- struct{ if err != nil { e.resetConnectionLocked(err) } - // Lock released below. - epilogue() } loop: @@ -1558,6 +1559,7 @@ loop: // just want to terminate the loop and cleanup the // endpoint. cleanupOnError(nil) + e.protocolMainLoopDone(closeTimer, &closeWaker) return nil case StateTimeWait: fallthrough @@ -1566,6 +1568,7 @@ loop: default: if err := funcs[v].f(); err != nil { cleanupOnError(err) + e.protocolMainLoopDone(closeTimer, &closeWaker) return nil } } @@ -1589,13 +1592,13 @@ loop: // Handle any StateError transition from StateTimeWait. if e.EndpointState() == StateError { cleanupOnError(nil) + e.protocolMainLoopDone(closeTimer, &closeWaker) return nil } e.transitionToStateCloseLocked() - // Lock released below. - epilogue() + e.protocolMainLoopDone(closeTimer, &closeWaker) // A new SYN was received during TIME_WAIT and we need to abort // the timewait and redirect the segment to the listener queue @@ -1665,6 +1668,7 @@ func (e *endpoint) handleTimeWaitSegments() (extendTimeWait bool, reuseTW func() // should be executed after releasing the endpoint registrations. This is // done in cases where a new SYN is received during TIME_WAIT that carries // a sequence number larger than one see on the connection. +// +checklocks:e.mu func (e *endpoint) doTimeWait() (twReuse func()) { // Trigger a 2 * MSL time wait state. During this period // we will drop all incoming segments. diff --git a/pkg/tcpip/transport/tcp/dispatcher.go b/pkg/tcpip/transport/tcp/dispatcher.go index dff7cb89c..7d110516b 100644 --- a/pkg/tcpip/transport/tcp/dispatcher.go +++ b/pkg/tcpip/transport/tcp/dispatcher.go @@ -127,7 +127,7 @@ func (p *processor) start(wg *sync.WaitGroup) { case !ep.segmentQueue.empty(): p.epQ.enqueue(ep) } - ep.mu.Unlock() + ep.mu.Unlock() // +checklocksforce } else { ep.newSegmentWaker.Assert() } diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go index 4acddc959..1ed4ba419 100644 --- a/pkg/tcpip/transport/tcp/endpoint.go +++ b/pkg/tcpip/transport/tcp/endpoint.go @@ -664,6 +664,7 @@ func calculateAdvertisedMSS(userMSS uint16, r *stack.Route) uint16 { // The assumption behind spinning here being that background packet processing // should not be holding the lock for long and spinning reduces latency as we // avoid an expensive sleep/wakeup of of the syscall goroutine). 
+// +checklocksacquire:e.mu func (e *endpoint) LockUser() { for { // Try first if the sock is locked then check if it's owned @@ -683,7 +684,7 @@ func (e *endpoint) LockUser() { continue } atomic.StoreUint32(&e.ownedByUser, 1) - return + return // +checklocksforce } } @@ -700,7 +701,7 @@ func (e *endpoint) LockUser() { // protocol goroutine altogether. // // Precondition: e.LockUser() must have been called before calling e.UnlockUser() -// +checklocks:e.mu +// +checklocksrelease:e.mu func (e *endpoint) UnlockUser() { // Lock segment queue before checking so that we avoid a race where // segments can be queued between the time we check if queue is empty @@ -736,12 +737,13 @@ func (e *endpoint) UnlockUser() { } // StopWork halts packet processing. Only to be used in tests. +// +checklocksacquire:e.mu func (e *endpoint) StopWork() { e.mu.Lock() } // ResumeWork resumes packet processing. Only to be used in tests. -// +checklocks:e.mu +// +checklocksrelease:e.mu func (e *endpoint) ResumeWork() { e.mu.Unlock() } @@ -1480,86 +1482,95 @@ func (e *endpoint) isEndpointWritableLocked() (int, tcpip.Error) { return avail, nil } -// Write writes data to the endpoint's peer. -func (e *endpoint) Write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, tcpip.Error) { - // Linux completely ignores any address passed to sendto(2) for TCP sockets - // (without the MSG_FASTOPEN flag). Corking is unimplemented, so opts.More - // and opts.EndOfRecord are also ignored. +// readFromPayloader reads a slice from the Payloader. +// +checklocks:e.mu +// +checklocks:e.sndQueueInfo.sndQueueMu +func (e *endpoint) readFromPayloader(p tcpip.Payloader, opts tcpip.WriteOptions, avail int) ([]byte, tcpip.Error) { + // We can release locks while copying data. + // + // This is not possible if atomic is set, because we can't allow the + // available buffer space to be consumed by some other caller while we + // are copying data in. + if !opts.Atomic { + e.sndQueueInfo.sndQueueMu.Unlock() + defer e.sndQueueInfo.sndQueueMu.Lock() - e.LockUser() - defer e.UnlockUser() + e.UnlockUser() + defer e.LockUser() + } - nextSeg, n, err := func() (*segment, int, tcpip.Error) { - e.sndQueueInfo.sndQueueMu.Lock() - defer e.sndQueueInfo.sndQueueMu.Unlock() + // Fetch data. + if l := p.Len(); l < avail { + avail = l + } + if avail == 0 { + return nil, nil + } + v := make([]byte, avail) + n, err := p.Read(v) + if err != nil && err != io.EOF { + return nil, &tcpip.ErrBadBuffer{} + } + return v[:n], nil +} + +// queueSegment reads data from the payloader and returns a segment to be sent. +// +checklocks:e.mu +func (e *endpoint) queueSegment(p tcpip.Payloader, opts tcpip.WriteOptions) (*segment, int, tcpip.Error) { + e.sndQueueInfo.sndQueueMu.Lock() + defer e.sndQueueInfo.sndQueueMu.Unlock() + + avail, err := e.isEndpointWritableLocked() + if err != nil { + e.stats.WriteErrors.WriteClosed.Increment() + return nil, 0, err + } + v, err := e.readFromPayloader(p, opts, avail) + if err != nil { + return nil, 0, err + } + if !opts.Atomic { + // Since we released locks in between it's possible that the + // endpoint transitioned to a CLOSED/ERROR states so make + // sure endpoint is still writable before trying to write. avail, err := e.isEndpointWritableLocked() if err != nil { e.stats.WriteErrors.WriteClosed.Increment() return nil, 0, err } - v, err := func() ([]byte, tcpip.Error) { - // We can release locks while copying data. 
- // - // This is not possible if atomic is set, because we can't allow the - // available buffer space to be consumed by some other caller while we - // are copying data in. - if !opts.Atomic { - e.sndQueueInfo.sndQueueMu.Unlock() - defer e.sndQueueInfo.sndQueueMu.Lock() - - e.UnlockUser() - defer e.LockUser() - } - - // Fetch data. - if l := p.Len(); l < avail { - avail = l - } - if avail == 0 { - return nil, nil - } - v := make([]byte, avail) - n, err := p.Read(v) - if err != nil && err != io.EOF { - return nil, &tcpip.ErrBadBuffer{} - } - return v[:n], nil - }() - if len(v) == 0 || err != nil { - return nil, 0, err + // Discard any excess data copied in due to avail being reduced due + // to a simultaneous write call to the socket. + if avail < len(v) { + v = v[:avail] } + } - if !opts.Atomic { - // Since we released locks in between it's possible that the - // endpoint transitioned to a CLOSED/ERROR states so make - // sure endpoint is still writable before trying to write. - avail, err := e.isEndpointWritableLocked() - if err != nil { - e.stats.WriteErrors.WriteClosed.Increment() - return nil, 0, err - } + // Add data to the send queue. + s := newOutgoingSegment(e.TransportEndpointInfo.ID, e.stack.Clock(), v) + e.sndQueueInfo.SndBufUsed += len(v) + e.snd.writeList.PushBack(s) - // Discard any excess data copied in due to avail being reduced due - // to a simultaneous write call to the socket. - if avail < len(v) { - v = v[:avail] - } - } + return s, len(v), nil +} - // Add data to the send queue. - s := newOutgoingSegment(e.TransportEndpointInfo.ID, e.stack.Clock(), v) - e.sndQueueInfo.SndBufUsed += len(v) - e.snd.writeList.PushBack(s) +// Write writes data to the endpoint's peer. +func (e *endpoint) Write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, tcpip.Error) { + // Linux completely ignores any address passed to sendto(2) for TCP sockets + // (without the MSG_FASTOPEN flag). Corking is unimplemented, so opts.More + // and opts.EndOfRecord are also ignored. + + e.LockUser() + defer e.UnlockUser() - return s, len(v), nil - }() // Return if either we didn't queue anything or if an error occurred while // attempting to queue data. + nextSeg, n, err := e.queueSegment(p, opts) if n == 0 || err != nil { return 0, err } + e.sendData(nextSeg) return int64(n), nil } @@ -2504,6 +2515,7 @@ func (e *endpoint) listen(backlog int) tcpip.Error { // startAcceptedLoop sets up required state and starts a goroutine with the // main loop for accepted connections. +// +checklocksrelease:e.mu func (e *endpoint) startAcceptedLoop() { e.workerRunning = true e.mu.Unlock() diff --git a/pkg/tcpip/transport/tcp/forwarder.go b/pkg/tcpip/transport/tcp/forwarder.go index 65c86823a..2e709ed78 100644 --- a/pkg/tcpip/transport/tcp/forwarder.go +++ b/pkg/tcpip/transport/tcp/forwarder.go @@ -164,8 +164,9 @@ func (r *ForwarderRequest) CreateEndpoint(queue *waiter.Queue) (tcpip.Endpoint, return nil, err } - // Start the protocol goroutine. - ep.startAcceptedLoop() + // Start the protocol goroutine. Note that the endpoint is returned + // from performHandshake locked. + ep.startAcceptedLoop() // +checklocksforce return ep, nil } diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go index def9d7186..82a3f2287 100644 --- a/pkg/tcpip/transport/udp/endpoint.go +++ b/pkg/tcpip/transport/udp/endpoint.go @@ -364,6 +364,7 @@ func (e *endpoint) Read(dst io.Writer, opts tcpip.ReadOptions) (tcpip.ReadResult // reacquire the mutex in exclusive mode. 
diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go
index def9d7186..82a3f2287 100644
--- a/pkg/tcpip/transport/udp/endpoint.go
+++ b/pkg/tcpip/transport/udp/endpoint.go
@@ -364,6 +364,7 @@ func (e *endpoint) Read(dst io.Writer, opts tcpip.ReadOptions) (tcpip.ReadResult
 // reacquire the mutex in exclusive mode.
 //
 // Returns true for retry if preparation should be retried.
+// +checklocks:e.mu
 func (e *endpoint) prepareForWrite(to *tcpip.FullAddress) (retry bool, err tcpip.Error) {
     switch e.EndpointState() {
     case StateInitial:
@@ -380,10 +381,8 @@ func (e *endpoint) prepareForWrite(to *tcpip.FullAddress) (retry bool, err tcpip
     }
 
     e.mu.RUnlock()
-    defer e.mu.RLock()
-
     e.mu.Lock()
-    defer e.mu.Unlock()
+    defer e.mu.DowngradeLock()
 
     // The state changed when we released the shared locked and re-acquired
     // it in exclusive mode. Try again.
@@ -449,37 +448,20 @@ func (e *endpoint) Write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, tcp
     return n, err
 }
 
-func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, tcpip.Error) {
-    if err := e.LastError(); err != nil {
-        return 0, err
-    }
-
-    // MSG_MORE is unimplemented. (This also means that MSG_EOR is a no-op.)
-    if opts.More {
-        return 0, &tcpip.ErrInvalidOptionValue{}
-    }
-
-    to := opts.To
-
+func (e *endpoint) buildUDPPacketInfo(p tcpip.Payloader, opts tcpip.WriteOptions) (udpPacketInfo, tcpip.Error) {
     e.mu.RLock()
-    lockReleased := false
-    defer func() {
-        if lockReleased {
-            return
-        }
-        e.mu.RUnlock()
-    }()
+    defer e.mu.RUnlock()
 
     // If we've shutdown with SHUT_WR we are in an invalid state for sending.
     if e.shutdownFlags&tcpip.ShutdownWrite != 0 {
-        return 0, &tcpip.ErrClosedForSend{}
+        return udpPacketInfo{}, &tcpip.ErrClosedForSend{}
     }
 
     // Prepare for write.
     for {
-        retry, err := e.prepareForWrite(to)
+        retry, err := e.prepareForWrite(opts.To)
         if err != nil {
-            return 0, err
+            return udpPacketInfo{}, err
         }
 
         if !retry {
@@ -489,34 +471,34 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, tcp
 
     route := e.route
     dstPort := e.dstPort
-    if to != nil {
+    if opts.To != nil {
         // Reject destination address if it goes through a different
         // NIC than the endpoint was bound to.
-        nicID := to.NIC
+        nicID := opts.To.NIC
         if nicID == 0 {
             nicID = tcpip.NICID(e.ops.GetBindToDevice())
         }
 
         if e.BindNICID != 0 {
             if nicID != 0 && nicID != e.BindNICID {
-                return 0, &tcpip.ErrNoRoute{}
+                return udpPacketInfo{}, &tcpip.ErrNoRoute{}
             }
 
             nicID = e.BindNICID
         }
 
-        if to.Port == 0 {
+        if opts.To.Port == 0 {
             // Port 0 is an invalid port to send to.
-            return 0, &tcpip.ErrInvalidEndpointState{}
+            return udpPacketInfo{}, &tcpip.ErrInvalidEndpointState{}
         }
 
-        dst, netProto, err := e.checkV4MappedLocked(*to)
+        dst, netProto, err := e.checkV4MappedLocked(*opts.To)
         if err != nil {
-            return 0, err
+            return udpPacketInfo{}, err
         }
 
         r, _, err := e.connectRoute(nicID, dst, netProto)
         if err != nil {
-            return 0, err
+            return udpPacketInfo{}, err
         }
         defer r.Release()
@@ -525,12 +507,12 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, tcp
     }
 
     if !e.ops.GetBroadcast() && route.IsOutboundBroadcast() {
-        return 0, &tcpip.ErrBroadcastDisabled{}
+        return udpPacketInfo{}, &tcpip.ErrBroadcastDisabled{}
     }
 
     v := make([]byte, p.Len())
     if _, err := io.ReadFull(p, v); err != nil {
-        return 0, &tcpip.ErrBadBuffer{}
+        return udpPacketInfo{}, &tcpip.ErrBadBuffer{}
     }
     if len(v) > header.UDPMaximumPacketSize {
         // Payload can't possibly fit in a packet.
@@ -548,24 +530,39 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, tcp
                 v,
             )
         }
-        return 0, &tcpip.ErrMessageTooLong{}
+        return udpPacketInfo{}, &tcpip.ErrMessageTooLong{}
     }
 
     ttl := e.ttl
     useDefaultTTL := ttl == 0
-
     if header.IsV4MulticastAddress(route.RemoteAddress()) || header.IsV6MulticastAddress(route.RemoteAddress()) {
         ttl = e.multicastTTL
         // Multicast allows a 0 TTL.
         useDefaultTTL = false
     }
 
-    localPort := e.ID.LocalPort
-    sendTOS := e.sendTOS
-    owner := e.owner
-    noChecksum := e.SocketOptions().GetNoChecksum()
-    lockReleased = true
-    e.mu.RUnlock()
+    return udpPacketInfo{
+        route:         route,
+        data:          buffer.View(v),
+        localPort:     e.ID.LocalPort,
+        remotePort:    dstPort,
+        ttl:           ttl,
+        useDefaultTTL: useDefaultTTL,
+        tos:           e.sendTOS,
+        owner:         e.owner,
+        noChecksum:    e.SocketOptions().GetNoChecksum(),
+    }, nil
+}
+
+func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, tcpip.Error) {
+    if err := e.LastError(); err != nil {
+        return 0, err
+    }
+
+    // MSG_MORE is unimplemented. (This also means that MSG_EOR is a no-op.)
+    if opts.More {
+        return 0, &tcpip.ErrInvalidOptionValue{}
+    }
 
     // Do not hold lock when sending as loopback is synchronous and if the UDP
     // datagram ends up generating an ICMP response then it can result in a
@@ -577,10 +574,15 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, tcp
     //
     // See: https://golang.org/pkg/sync/#RWMutex for details on why recursive read
     // locking is prohibited.
-    if err := sendUDP(route, buffer.View(v).ToVectorisedView(), localPort, dstPort, ttl, useDefaultTTL, sendTOS, owner, noChecksum); err != nil {
+    u, err := e.buildUDPPacketInfo(p, opts)
+    if err != nil {
         return 0, err
     }
-    return int64(len(v)), nil
+    n, err := u.send()
+    if err != nil {
+        return 0, err
+    }
+    return int64(n), nil
 }
 
 // OnReuseAddressSet implements tcpip.SocketOptionsHandler.
@@ -817,14 +819,30 @@ func (e *endpoint) GetSockOpt(opt tcpip.GettableSocketOption) tcpip.Error {
     return nil
 }
 
-// sendUDP sends a UDP segment via the provided network endpoint and under the
-// provided identity.
-func sendUDP(r *stack.Route, data buffer.VectorisedView, localPort, remotePort uint16, ttl uint8, useDefaultTTL bool, tos uint8, owner tcpip.PacketOwner, noChecksum bool) tcpip.Error {
+// udpPacketInfo contains all information required to send a UDP packet.
+//
+// This should be used as a value-only type, which exists in order to simplify
+// return value syntax. It should not be exported or extended.
+type udpPacketInfo struct {
+    route         *stack.Route
+    data          buffer.View
+    localPort     uint16
+    remotePort    uint16
+    ttl           uint8
+    useDefaultTTL bool
+    tos           uint8
+    owner         tcpip.PacketOwner
+    noChecksum    bool
+}
+
+// send sends the given packet.
+func (u *udpPacketInfo) send() (int, tcpip.Error) {
+    vv := u.data.ToVectorisedView()
     pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{
-        ReserveHeaderBytes: header.UDPMinimumSize + int(r.MaxHeaderLength()),
-        Data:               data,
+        ReserveHeaderBytes: header.UDPMinimumSize + int(u.route.MaxHeaderLength()),
+        Data:               vv,
     })
-    pkt.Owner = owner
+    pkt.Owner = u.owner
 
     // Initialize the UDP header.
     udp := header.UDP(pkt.TransportHeader().Push(header.UDPMinimumSize))
@@ -832,8 +850,8 @@ func sendUDP(r *stack.Route, data buffer.VectorisedView, localPort, remotePort u
     length := uint16(pkt.Size())
     udp.Encode(&header.UDPFields{
-        SrcPort: localPort,
-        DstPort: remotePort,
+        SrcPort: u.localPort,
+        DstPort: u.remotePort,
         Length:  length,
     })
 
@@ -841,30 +859,30 @@ func sendUDP(r *stack.Route, data buffer.VectorisedView, localPort, remotePort u
     // On IPv4, UDP checksum is optional, and a zero value indicates the
     // transmitter skipped the checksum generation (RFC768).
     // On IPv6, UDP checksum is not optional (RFC2460 Section 8.1).
-    if r.RequiresTXTransportChecksum() &&
-        (!noChecksum || r.NetProto() == header.IPv6ProtocolNumber) {
-        xsum := r.PseudoHeaderChecksum(ProtocolNumber, length)
-        for _, v := range data.Views() {
+    if u.route.RequiresTXTransportChecksum() &&
+        (!u.noChecksum || u.route.NetProto() == header.IPv6ProtocolNumber) {
+        xsum := u.route.PseudoHeaderChecksum(ProtocolNumber, length)
+        for _, v := range vv.Views() {
             xsum = header.Checksum(v, xsum)
         }
         udp.SetChecksum(^udp.CalculateChecksum(xsum))
     }
 
-    if useDefaultTTL {
-        ttl = r.DefaultTTL()
+    if u.useDefaultTTL {
+        u.ttl = u.route.DefaultTTL()
     }
 
-    if err := r.WritePacket(stack.NetworkHeaderParams{
+    if err := u.route.WritePacket(stack.NetworkHeaderParams{
         Protocol: ProtocolNumber,
-        TTL:      ttl,
-        TOS:      tos,
+        TTL:      u.ttl,
+        TOS:      u.tos,
     }, pkt); err != nil {
-        r.Stats().UDP.PacketSendErrors.Increment()
-        return err
+        u.route.Stats().UDP.PacketSendErrors.Increment()
+        return 0, err
     }
 
     // Track count of packets sent.
-    r.Stats().UDP.PacketsSent.Increment()
-    return nil
+    u.route.Stats().UDP.PacketsSent.Increment()
+    return len(u.data), nil
 }
 
 // checkV4MappedLocked determines the effective network protocol and converts
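The udpPacketInfo type introduced above captures, while e.mu is still held, everything sendUDP previously took as parameters, so the transmit itself runs after the read lock is dropped and the synchronous loopback/ICMP path cannot re-enter the locked endpoint. Below is a rough sketch of the same shape using only the standard library; the endpoint and packetInfo types are invented for illustration (this is not gVisor's API), and conn is assumed to be an unconnected socket from net.ListenUDP.

package example

import (
	"net"
	"sync"
)

// endpoint is a hypothetical stand-in for a UDP endpoint guarded by an RWMutex.
type endpoint struct {
	mu   sync.RWMutex
	conn *net.UDPConn
	dst  *net.UDPAddr
}

// packetInfo is an immutable snapshot of everything send needs.
type packetInfo struct {
	conn *net.UDPConn
	dst  *net.UDPAddr
	data []byte
}

// buildPacketInfo copies the payload and endpoint state under the read lock.
func (e *endpoint) buildPacketInfo(payload []byte) packetInfo {
	e.mu.RLock()
	defer e.mu.RUnlock()
	data := append([]byte(nil), payload...) // copy while the lock is held
	return packetInfo{conn: e.conn, dst: e.dst, data: data}
}

// send transmits the snapshot; no endpoint lock is held here.
func (p packetInfo) send() (int, error) {
	return p.conn.WriteToUDP(p.data, p.dst)
}

// Write mirrors the split above: build under the lock, send outside it.
func (e *endpoint) Write(payload []byte) (int, error) {
	p := e.buildPacketInfo(payload)
	return p.send()
}

Because packetInfo only holds copies or owned references, nothing send touches can race with writers that take e.mu after buildPacketInfo returns.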