From 9bd01642377eef9d91f9243091b6da28f37d164a Mon Sep 17 00:00:00 2001
From: Jamie Liu
Date: Wed, 2 Sep 2020 11:35:18 -0700
Subject: Improve sync.SeqCount performance.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Make sync.SeqCountEpoch not a struct. This allows
  sync.SeqCount.BeginRead() to be inlined.

- Mark sync.SeqAtomicLoad nosplit to mitigate the Go compiler's refusal
  to inline it. (Best I could get was "cost 92 exceeds budget 80".)

- Use runtime-guided spinning in SeqCount.BeginRead().

Benchmarks:

name                               old time/op  new time/op   delta
pkg:pkg/sync/sync goos:linux goarch:amd64
SeqCountWriteUncontended-12        8.24ns ± 0%  11.40ns ± 0%  +38.35%  (p=0.000 n=10+10)
SeqCountReadUncontended-12         0.33ns ± 0%   0.14ns ± 3%  -57.77%  (p=0.000 n=7+8)
pkg:pkg/sync/seqatomictest/seqatomic goos:linux goarch:amd64
SeqAtomicLoadIntUncontended-12     0.64ns ± 1%   0.41ns ± 1%  -36.40%  (p=0.000 n=10+8)
SeqAtomicTryLoadIntUncontended-12  0.18ns ± 4%   0.18ns ± 1%     ~     (p=0.206 n=10+8)
AtomicValueLoadIntUncontended-12   0.27ns ± 3%   0.27ns ± 0%   -1.77%  (p=0.000 n=10+8)

(atomic.Value.Load is, of course, inlined. We would expect an uncontended
inline SeqAtomicLoad to perform identically to SeqAtomicTryLoad.)

The "regression" in BenchmarkSeqCountWriteUncontended, despite this CL
changing nothing in that path, appears to be a microarchitectural (likely
code-alignment) effect; the disassembled benchmark loop is unchanged except
for its address:

Before this CL:

  :0    0x4e62d1    48ffc2            INCQ DX
  :0    0x4e62d4    48399110010000    CMPQ DX, 0x110(CX)
  :0    0x4e62db    7e26              JLE 0x4e6303
  :0    0x4e62dd    90                NOPL
  :0    0x4e62de    bb01000000        MOVL $0x1, BX
  :0    0x4e62e3    f00fc118          LOCK XADDL BX, 0(AX)
  :0    0x4e62e7    ffc3              INCL BX
  :0    0x4e62e9    0fbae300          BTL $0x0, BX
  :0    0x4e62ed    733a              JAE 0x4e6329
  :0    0x4e62ef    90                NOPL
  :0    0x4e62f0    bb01000000        MOVL $0x1, BX
  :0    0x4e62f5    f00fc118          LOCK XADDL BX, 0(AX)
  :0    0x4e62f9    ffc3              INCL BX
  :0    0x4e62fb    0fbae300          BTL $0x0, BX
  :0    0x4e62ff    73d0              JAE 0x4e62d1

After this CL:

  :0    0x4e6361    48ffc2            INCQ DX
  :0    0x4e6364    48399110010000    CMPQ DX, 0x110(CX)
  :0    0x4e636b    7e26              JLE 0x4e6393
  :0    0x4e636d    90                NOPL
  :0    0x4e636e    bb01000000        MOVL $0x1, BX
  :0    0x4e6373    f00fc118          LOCK XADDL BX, 0(AX)
  :0    0x4e6377    ffc3              INCL BX
  :0    0x4e6379    0fbae300          BTL $0x0, BX
  :0    0x4e637d    733a              JAE 0x4e63b9
  :0    0x4e637f    90                NOPL
  :0    0x4e6380    bb01000000        MOVL $0x1, BX
  :0    0x4e6385    f00fc118          LOCK XADDL BX, 0(AX)
  :0    0x4e6389    ffc3              INCL BX
  :0    0x4e638b    0fbae300          BTL $0x0, BX
  :0    0x4e638f    73d0              JAE 0x4e6361

PiperOrigin-RevId: 329754148
---
 pkg/sync/BUILD               |  1 +
 pkg/sync/seqatomic_unsafe.go | 40 +++++++++++++++++-----------------------
 pkg/sync/seqcount.go         | 30 ++++++++++++++++++----------
 pkg/sync/spin_unsafe.go      | 24 ++++++++++++++++++++++++
 4 files changed, 62 insertions(+), 33 deletions(-)
 create mode 100644 pkg/sync/spin_unsafe.go

diff --git a/pkg/sync/BUILD b/pkg/sync/BUILD
index 4d47207f7..68535c3b1 100644
--- a/pkg/sync/BUILD
+++ b/pkg/sync/BUILD
@@ -38,6 +38,7 @@ go_library(
         "race_unsafe.go",
         "rwmutex_unsafe.go",
         "seqcount.go",
+        "spin_unsafe.go",
         "sync.go",
     ],
     marshal = False,
diff --git a/pkg/sync/seqatomic_unsafe.go b/pkg/sync/seqatomic_unsafe.go
index eda6fb131..2184cb5ab 100644
--- a/pkg/sync/seqatomic_unsafe.go
+++ b/pkg/sync/seqatomic_unsafe.go
@@ -25,41 +25,35 @@ import (
 type Value struct{}
 
 // SeqAtomicLoad returns a copy of *ptr, ensuring that the read does not race
-// with any writer critical sections in sc.
-func SeqAtomicLoad(sc *sync.SeqCount, ptr *Value) Value {
-	// This function doesn't use SeqAtomicTryLoad because doing so is
-	// measurably, significantly (~20%) slower; Go is awful at inlining.
-	var val Value
+// with any writer critical sections in seq.
+//
+//go:nosplit
+func SeqAtomicLoad(seq *sync.SeqCount, ptr *Value) Value {
 	for {
-		epoch := sc.BeginRead()
-		if sync.RaceEnabled {
-			// runtime.RaceDisable() doesn't actually stop the race detector,
-			// so it can't help us here. Instead, call runtime.memmove
-			// directly, which is not instrumented by the race detector.
-			sync.Memmove(unsafe.Pointer(&val), unsafe.Pointer(ptr), unsafe.Sizeof(val))
-		} else {
-			// This is ~40% faster for short reads than going through memmove.
-			val = *ptr
-		}
-		if sc.ReadOk(epoch) {
-			break
+		if val, ok := SeqAtomicTryLoad(seq, seq.BeginRead(), ptr); ok {
+			return val
 		}
 	}
-	return val
 }
 
 // SeqAtomicTryLoad returns a copy of *ptr while in a reader critical section
-// in sc initiated by a call to sc.BeginRead() that returned epoch. If the read
-// would race with a writer critical section, SeqAtomicTryLoad returns
+// in seq initiated by a call to seq.BeginRead() that returned epoch. If the
+// read would race with a writer critical section, SeqAtomicTryLoad returns
 // (unspecified, false).
-func SeqAtomicTryLoad(sc *sync.SeqCount, epoch sync.SeqCountEpoch, ptr *Value) (Value, bool) {
-	var val Value
+//
+//go:nosplit
+func SeqAtomicTryLoad(seq *sync.SeqCount, epoch sync.SeqCountEpoch, ptr *Value) (val Value, ok bool) {
 	if sync.RaceEnabled {
+		// runtime.RaceDisable() doesn't actually stop the race detector, so it
+		// can't help us here. Instead, call runtime.memmove directly, which is
+		// not instrumented by the race detector.
 		sync.Memmove(unsafe.Pointer(&val), unsafe.Pointer(ptr), unsafe.Sizeof(val))
 	} else {
+		// This is ~40% faster for short reads than going through memmove.
 		val = *ptr
 	}
-	return val, sc.ReadOk(epoch)
+	ok = seq.ReadOk(epoch)
+	return
 }
 
 func init() {
diff --git a/pkg/sync/seqcount.go b/pkg/sync/seqcount.go
index a1e895352..2c5d3df99 100644
--- a/pkg/sync/seqcount.go
+++ b/pkg/sync/seqcount.go
@@ -8,7 +8,6 @@ package sync
 import (
 	"fmt"
 	"reflect"
-	"runtime"
 	"sync/atomic"
 )
 
@@ -43,9 +42,7 @@ type SeqCount struct {
 }
 
 // SeqCountEpoch tracks writer critical sections in a SeqCount.
-type SeqCountEpoch struct {
-	val uint32
-}
+type SeqCountEpoch uint32
 
 // We assume that:
 //
@@ -83,12 +80,25 @@ type SeqCountEpoch struct {
 // using this pattern. Most users of SeqCount will need to use the
 // SeqAtomicLoad function template in seqatomic.go.
 func (s *SeqCount) BeginRead() SeqCountEpoch {
-	epoch := atomic.LoadUint32(&s.epoch)
-	for epoch&1 != 0 {
-		runtime.Gosched()
-		epoch = atomic.LoadUint32(&s.epoch)
+	if epoch := atomic.LoadUint32(&s.epoch); epoch&1 == 0 {
+		return SeqCountEpoch(epoch)
+	}
+	return s.beginReadSlow()
+}
+
+func (s *SeqCount) beginReadSlow() SeqCountEpoch {
+	i := 0
+	for {
+		if canSpin(i) {
+			i++
+			doSpin()
+		} else {
+			goyield()
+		}
+		if epoch := atomic.LoadUint32(&s.epoch); epoch&1 == 0 {
+			return SeqCountEpoch(epoch)
+		}
 	}
-	return SeqCountEpoch{epoch}
 }
 
 // ReadOk returns true if the reader critical section initiated by a previous
@@ -99,7 +109,7 @@ func (s *SeqCount) BeginRead() SeqCountEpoch {
 // Reader critical sections do not need to be explicitly terminated; the last
 // call to ReadOk is implicitly the end of the reader critical section.
 func (s *SeqCount) ReadOk(epoch SeqCountEpoch) bool {
-	return atomic.LoadUint32(&s.epoch) == epoch.val
+	return atomic.LoadUint32(&s.epoch) == uint32(epoch)
 }
 
 // BeginWrite indicates the beginning of a writer critical section.
diff --git a/pkg/sync/spin_unsafe.go b/pkg/sync/spin_unsafe.go
new file mode 100644
index 000000000..f721449e3
--- /dev/null
+++ b/pkg/sync/spin_unsafe.go
@@ -0,0 +1,24 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build go1.13
+// +build !go1.16
+
+// Check go:linkname function signatures when updating Go version.
+
+package sync
+
+import (
+	_ "unsafe" // for go:linkname
+)
+
+//go:linkname canSpin sync.runtime_canSpin
+func canSpin(i int) bool
+
+//go:linkname doSpin sync.runtime_doSpin
+func doSpin()
+
+//go:linkname goyield runtime.goyield
+func goyield()
-- 
cgit v1.2.3
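
For context on the primitive this CL tunes, the sketch below shows the usual
SeqCount read/write pattern that the SeqAtomicLoad template generalizes. It is
a minimal illustration and not part of the patch: the example type, field, and
function names are invented, the import path is assumed to be
gvisor.dev/gvisor/pkg/sync, the writer API is assumed to be the
BeginWrite/EndWrite pair referenced in seqcount.go, and the race-detector
handling (sync.Memmove) that the real template needs is deliberately omitted.

// Illustrative sketch only; not part of this patch.
package seqcountexample

import (
	"gvisor.dev/gvisor/pkg/sync"
)

// point is a small plain-data value guarded by a SeqCount. The type and
// field names here are hypothetical.
type point struct {
	x, y int64
}

type state struct {
	seq sync.SeqCount
	pt  point
}

// setPoint is the writer side: BeginWrite/EndWrite bracket the update,
// holding the epoch at an odd value while the write is in flight so that
// concurrent readers know to retry.
func (s *state) setPoint(p point) {
	s.seq.BeginWrite()
	s.pt = p
	s.seq.EndWrite()
}

// getPoint is the reader side, mirroring the SeqAtomicLoad pattern in the
// diff: optimistically copy the value, then use ReadOk to detect an
// overlapping writer critical section and retry if one occurred. The real
// template additionally routes the copy through sync.Memmove when the race
// detector is enabled; that detail is omitted here.
func (s *state) getPoint() point {
	for {
		epoch := s.seq.BeginRead()
		p := s.pt
		if s.seq.ReadOk(epoch) {
			return p
		}
	}
}

The read side is where this CL helps: with SeqCountEpoch reduced to a uint32
and BeginRead split into an inlinable fast path plus beginReadSlow, a retry
loop like getPoint compiles without a function call on the uncontended path,
which is what the SeqCountReadUncontended and SeqAtomicLoadIntUncontended
improvements above measure.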