From 9bd01642377eef9d91f9243091b6da28f37d164a Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Wed, 2 Sep 2020 11:35:18 -0700 Subject: Improve sync.SeqCount performance. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Make sync.SeqCountEpoch not a struct. This allows sync.SeqCount.BeginRead() to be inlined. - Mark sync.SeqAtomicLoad nosplit to mitigate the Go compiler's refusal to inline it. (Best I could get was "cost 92 exceeds budget 80".) - Use runtime-guided spinning in SeqCount.BeginRead(). Benchmarks: name old time/op new time/op delta pkg:pkg/sync/sync goos:linux goarch:amd64 SeqCountWriteUncontended-12 8.24ns ± 0% 11.40ns ± 0% +38.35% (p=0.000 n=10+10) SeqCountReadUncontended-12 0.33ns ± 0% 0.14ns ± 3% -57.77% (p=0.000 n=7+8) pkg:pkg/sync/seqatomictest/seqatomic goos:linux goarch:amd64 SeqAtomicLoadIntUncontended-12 0.64ns ± 1% 0.41ns ± 1% -36.40% (p=0.000 n=10+8) SeqAtomicTryLoadIntUncontended-12 0.18ns ± 4% 0.18ns ± 1% ~ (p=0.206 n=10+8) AtomicValueLoadIntUncontended-12 0.27ns ± 3% 0.27ns ± 0% -1.77% (p=0.000 n=10+8) (atomic.Value.Load is, of course, inlined. We would expect an uncontended inline SeqAtomicLoad to perform identically to SeqAtomicTryLoad.) The "regression" in BenchmarkSeqCountWriteUncontended, despite this CL changing nothing in that path, is attributed to microarchitectural subtlety; the benchmark loop is unchanged except for its address: Before this CL: :0 0x4e62d1 48ffc2 INCQ DX :0 0x4e62d4 48399110010000 CMPQ DX, 0x110(CX) :0 0x4e62db 7e26 JLE 0x4e6303 :0 0x4e62dd 90 NOPL :0 0x4e62de bb01000000 MOVL $0x1, BX :0 0x4e62e3 f00fc118 LOCK XADDL BX, 0(AX) :0 0x4e62e7 ffc3 INCL BX :0 0x4e62e9 0fbae300 BTL $0x0, BX :0 0x4e62ed 733a JAE 0x4e6329 :0 0x4e62ef 90 NOPL :0 0x4e62f0 bb01000000 MOVL $0x1, BX :0 0x4e62f5 f00fc118 LOCK XADDL BX, 0(AX) :0 0x4e62f9 ffc3 INCL BX :0 0x4e62fb 0fbae300 BTL $0x0, BX :0 0x4e62ff 73d0 JAE 0x4e62d1 After this CL: :0 0x4e6361 48ffc2 INCQ DX :0 0x4e6364 48399110010000 CMPQ DX, 0x110(CX) :0 0x4e636b 7e26 JLE 0x4e6393 :0 0x4e636d 90 NOPL :0 0x4e636e bb01000000 MOVL $0x1, BX :0 0x4e6373 f00fc118 LOCK XADDL BX, 0(AX) :0 0x4e6377 ffc3 INCL BX :0 0x4e6379 0fbae300 BTL $0x0, BX :0 0x4e637d 733a JAE 0x4e63b9 :0 0x4e637f 90 NOPL :0 0x4e6380 bb01000000 MOVL $0x1, BX :0 0x4e6385 f00fc118 LOCK XADDL BX, 0(AX) :0 0x4e6389 ffc3 INCL BX :0 0x4e638b 0fbae300 BTL $0x0, BX :0 0x4e638f 73d0 JAE 0x4e6361 PiperOrigin-RevId: 329754148 --- pkg/sync/seqatomic_unsafe.go | 40 +++++++++++++++++----------------------- 1 file changed, 17 insertions(+), 23 deletions(-) (limited to 'pkg/sync/seqatomic_unsafe.go') diff --git a/pkg/sync/seqatomic_unsafe.go b/pkg/sync/seqatomic_unsafe.go index eda6fb131..2184cb5ab 100644 --- a/pkg/sync/seqatomic_unsafe.go +++ b/pkg/sync/seqatomic_unsafe.go @@ -25,41 +25,35 @@ import ( type Value struct{} // SeqAtomicLoad returns a copy of *ptr, ensuring that the read does not race -// with any writer critical sections in sc. -func SeqAtomicLoad(sc *sync.SeqCount, ptr *Value) Value { - // This function doesn't use SeqAtomicTryLoad because doing so is - // measurably, significantly (~20%) slower; Go is awful at inlining. - var val Value +// with any writer critical sections in seq. +// +//go:nosplit +func SeqAtomicLoad(seq *sync.SeqCount, ptr *Value) Value { for { - epoch := sc.BeginRead() - if sync.RaceEnabled { - // runtime.RaceDisable() doesn't actually stop the race detector, - // so it can't help us here. Instead, call runtime.memmove - // directly, which is not instrumented by the race detector. - sync.Memmove(unsafe.Pointer(&val), unsafe.Pointer(ptr), unsafe.Sizeof(val)) - } else { - // This is ~40% faster for short reads than going through memmove. - val = *ptr - } - if sc.ReadOk(epoch) { - break + if val, ok := SeqAtomicTryLoad(seq, seq.BeginRead(), ptr); ok { + return val } } - return val } // SeqAtomicTryLoad returns a copy of *ptr while in a reader critical section -// in sc initiated by a call to sc.BeginRead() that returned epoch. If the read -// would race with a writer critical section, SeqAtomicTryLoad returns +// in seq initiated by a call to seq.BeginRead() that returned epoch. If the +// read would race with a writer critical section, SeqAtomicTryLoad returns // (unspecified, false). -func SeqAtomicTryLoad(sc *sync.SeqCount, epoch sync.SeqCountEpoch, ptr *Value) (Value, bool) { - var val Value +// +//go:nosplit +func SeqAtomicTryLoad(seq *sync.SeqCount, epoch sync.SeqCountEpoch, ptr *Value) (val Value, ok bool) { if sync.RaceEnabled { + // runtime.RaceDisable() doesn't actually stop the race detector, so it + // can't help us here. Instead, call runtime.memmove directly, which is + // not instrumented by the race detector. sync.Memmove(unsafe.Pointer(&val), unsafe.Pointer(ptr), unsafe.Sizeof(val)) } else { + // This is ~40% faster for short reads than going through memmove. val = *ptr } - return val, sc.ReadOk(epoch) + ok = seq.ReadOk(epoch) + return } func init() { -- cgit v1.2.3