diff options
author | Bhasker Hariharan <bhaskerh@google.com> | 2020-01-26 18:32:52 -0800 |
---|---|---|
committer | gVisor bot <gvisor-bot@google.com> | 2020-01-26 18:35:01 -0800 |
commit | 68514d4ba3f7c06a89a8d0cd79327ede62dae65b (patch) | |
tree | 947cb7427ce775b85e9e3be396eb52c6426b919c | |
parent | 18a7e1309decb9bc09879e337adbc00f81d420c5 (diff) |
Unroll checksum computation loop.
Checksum computation is one of the most expensive bits of
packet processing. Manual unrolling of the loop provides
significant improvement in checksum speed.
Updates #1656
BenchmarkChecksum/checksum_64-12 49834124 23.6 ns/op
BenchmarkChecksum/checksum_128-12 27111997 44.1 ns/op
BenchmarkChecksum/checksum_256-12 11416683 91.5 ns/op
BenchmarkChecksum/checksum_512-12 6375298 174 ns/op
BenchmarkChecksum/checksum_1024-12 3403852 338 ns/op
BenchmarkChecksum/checksum_1500-12 2343576 493 ns/op
BenchmarkChecksum/checksum_2048-12 1730521 656 ns/op
BenchmarkChecksum/checksum_4096-12 920469 1327 ns/op
BenchmarkChecksum/checksum_8192-12 445885 2637 ns/op
BenchmarkChecksum/checksum_16384-12 226342 5268 ns/op
BenchmarkChecksum/checksum_32767-12 114210 10503 ns/op
BenchmarkChecksum/checksum_32768-12 99138 10610 ns/op
BenchmarkChecksum/checksum_65535-12 53438 21158 ns/op
BenchmarkChecksum/checksum_65536-12 52993 21067 ns/op
BenchmarkUnrolledChecksum/checksum_64-12 61035639 19.1 ns/op
BenchmarkUnrolledChecksum/checksum_128-12 36067015 33.6 ns/op
BenchmarkUnrolledChecksum/checksum_256-12 19731220 60.4 ns/op
BenchmarkUnrolledChecksum/checksum_512-12 9091291 116 ns/op
BenchmarkUnrolledChecksum/checksum_1024-12 4976406 226 ns/op
BenchmarkUnrolledChecksum/checksum_1500-12 3685224 328 ns/op
BenchmarkUnrolledChecksum/checksum_2048-12 2579108 447 ns/op
BenchmarkUnrolledChecksum/checksum_4096-12 1350475 887 ns/op
BenchmarkUnrolledChecksum/checksum_8192-12 658248 1780 ns/op
BenchmarkUnrolledChecksum/checksum_16384-12 335869 3534 ns/op
BenchmarkUnrolledChecksum/checksum_32767-12 168650 7095 ns/op
BenchmarkUnrolledChecksum/checksum_32768-12 168075 7098 ns/op
BenchmarkUnrolledChecksum/checksum_65535-12 75085 14277 ns/op
BenchmarkUnrolledChecksum/checksum_65536-12 75921 14127 ns/op
PiperOrigin-RevId: 291643290
-rw-r--r-- | pkg/tcpip/header/checksum.go | 124 | ||||
-rw-r--r-- | pkg/tcpip/header/checksum_test.go | 62 |
2 files changed, 186 insertions, 0 deletions
diff --git a/pkg/tcpip/header/checksum.go b/pkg/tcpip/header/checksum.go index 9749c7f4d..ce57b581a 100644 --- a/pkg/tcpip/header/checksum.go +++ b/pkg/tcpip/header/checksum.go @@ -45,6 +45,121 @@ func calculateChecksum(buf []byte, odd bool, initial uint32) (uint16, bool) { return ChecksumCombine(uint16(v), uint16(v>>16)), odd } +func unrolledCalculateChecksum(buf []byte, odd bool, initial uint32) (uint16, bool) { + v := initial + + if odd { + v += uint32(buf[0]) + buf = buf[1:] + } + + l := len(buf) + odd = l&1 != 0 + if odd { + l-- + v += uint32(buf[l]) << 8 + } + for (l - 64) >= 0 { + i := 0 + v += (uint32(buf[i]) << 8) + uint32(buf[i+1]) + v += (uint32(buf[i+2]) << 8) + uint32(buf[i+3]) + v += (uint32(buf[i+4]) << 8) + uint32(buf[i+5]) + v += (uint32(buf[i+6]) << 8) + uint32(buf[i+7]) + v += (uint32(buf[i+8]) << 8) + uint32(buf[i+9]) + v += (uint32(buf[i+10]) << 8) + uint32(buf[i+11]) + v += (uint32(buf[i+12]) << 8) + uint32(buf[i+13]) + v += (uint32(buf[i+14]) << 8) + uint32(buf[i+15]) + i += 16 + v += (uint32(buf[i]) << 8) + uint32(buf[i+1]) + v += (uint32(buf[i+2]) << 8) + uint32(buf[i+3]) + v += (uint32(buf[i+4]) << 8) + uint32(buf[i+5]) + v += (uint32(buf[i+6]) << 8) + uint32(buf[i+7]) + v += (uint32(buf[i+8]) << 8) + uint32(buf[i+9]) + v += (uint32(buf[i+10]) << 8) + uint32(buf[i+11]) + v += (uint32(buf[i+12]) << 8) + uint32(buf[i+13]) + v += (uint32(buf[i+14]) << 8) + uint32(buf[i+15]) + i += 16 + v += (uint32(buf[i]) << 8) + uint32(buf[i+1]) + v += (uint32(buf[i+2]) << 8) + uint32(buf[i+3]) + v += (uint32(buf[i+4]) << 8) + uint32(buf[i+5]) + v += (uint32(buf[i+6]) << 8) + uint32(buf[i+7]) + v += (uint32(buf[i+8]) << 8) + uint32(buf[i+9]) + v += (uint32(buf[i+10]) << 8) + uint32(buf[i+11]) + v += (uint32(buf[i+12]) << 8) + uint32(buf[i+13]) + v += (uint32(buf[i+14]) << 8) + uint32(buf[i+15]) + i += 16 + v += (uint32(buf[i]) << 8) + uint32(buf[i+1]) + v += (uint32(buf[i+2]) << 8) + uint32(buf[i+3]) + v += (uint32(buf[i+4]) << 8) + uint32(buf[i+5]) + v += (uint32(buf[i+6]) << 8) + uint32(buf[i+7]) + v += (uint32(buf[i+8]) << 8) + uint32(buf[i+9]) + v += (uint32(buf[i+10]) << 8) + uint32(buf[i+11]) + v += (uint32(buf[i+12]) << 8) + uint32(buf[i+13]) + v += (uint32(buf[i+14]) << 8) + uint32(buf[i+15]) + buf = buf[64:] + l = l - 64 + } + if (l - 32) >= 0 { + i := 0 + v += (uint32(buf[i]) << 8) + uint32(buf[i+1]) + v += (uint32(buf[i+2]) << 8) + uint32(buf[i+3]) + v += (uint32(buf[i+4]) << 8) + uint32(buf[i+5]) + v += (uint32(buf[i+6]) << 8) + uint32(buf[i+7]) + v += (uint32(buf[i+8]) << 8) + uint32(buf[i+9]) + v += (uint32(buf[i+10]) << 8) + uint32(buf[i+11]) + v += (uint32(buf[i+12]) << 8) + uint32(buf[i+13]) + v += (uint32(buf[i+14]) << 8) + uint32(buf[i+15]) + i += 16 + v += (uint32(buf[i]) << 8) + uint32(buf[i+1]) + v += (uint32(buf[i+2]) << 8) + uint32(buf[i+3]) + v += (uint32(buf[i+4]) << 8) + uint32(buf[i+5]) + v += (uint32(buf[i+6]) << 8) + uint32(buf[i+7]) + v += (uint32(buf[i+8]) << 8) + uint32(buf[i+9]) + v += (uint32(buf[i+10]) << 8) + uint32(buf[i+11]) + v += (uint32(buf[i+12]) << 8) + uint32(buf[i+13]) + v += (uint32(buf[i+14]) << 8) + uint32(buf[i+15]) + buf = buf[32:] + l = l - 32 + } + if (l - 16) >= 0 { + i := 0 + v += (uint32(buf[i]) << 8) + uint32(buf[i+1]) + v += (uint32(buf[i+2]) << 8) + uint32(buf[i+3]) + v += (uint32(buf[i+4]) << 8) + uint32(buf[i+5]) + v += (uint32(buf[i+6]) << 8) + uint32(buf[i+7]) + v += (uint32(buf[i+8]) << 8) + uint32(buf[i+9]) + v += (uint32(buf[i+10]) << 8) + uint32(buf[i+11]) + v += (uint32(buf[i+12]) << 8) + uint32(buf[i+13]) + v += (uint32(buf[i+14]) << 8) + uint32(buf[i+15]) + buf = buf[16:] + l = l - 16 + } + if (l - 8) >= 0 { + i := 0 + v += (uint32(buf[i]) << 8) + uint32(buf[i+1]) + v += (uint32(buf[i+2]) << 8) + uint32(buf[i+3]) + v += (uint32(buf[i+4]) << 8) + uint32(buf[i+5]) + v += (uint32(buf[i+6]) << 8) + uint32(buf[i+7]) + buf = buf[8:] + l = l - 8 + } + if (l - 4) >= 0 { + i := 0 + v += (uint32(buf[i]) << 8) + uint32(buf[i+1]) + v += (uint32(buf[i+2]) << 8) + uint32(buf[i+3]) + buf = buf[4:] + l = l - 4 + } + + // At this point since l was even before we started unrolling + // there can be only two bytes left to add. + if l != 0 { + v += (uint32(buf[0]) << 8) + uint32(buf[1]) + } + + return ChecksumCombine(uint16(v), uint16(v>>16)), odd +} + // Checksum calculates the checksum (as defined in RFC 1071) of the bytes in the // given byte array. // @@ -54,6 +169,15 @@ func Checksum(buf []byte, initial uint16) uint16 { return s } +// UnrolledChecksum calculates the checksum (as defined in RFC 1071) of the +// bytes in the given byte array. +// +// The initial checksum must have been computed on an even number of bytes. +func UnrolledChecksum(buf []byte, initial uint16) uint16 { + s, _ := unrolledCalculateChecksum(buf, false, uint32(initial)) + return s +} + // ChecksumVV calculates the checksum (as defined in RFC 1071) of the bytes in // the given VectorizedView. // diff --git a/pkg/tcpip/header/checksum_test.go b/pkg/tcpip/header/checksum_test.go index 86b466c1c..2fbd16a65 100644 --- a/pkg/tcpip/header/checksum_test.go +++ b/pkg/tcpip/header/checksum_test.go @@ -17,6 +17,8 @@ package header_test import ( + "fmt" + "math/rand" "testing" "gvisor.dev/gvisor/pkg/tcpip/buffer" @@ -107,3 +109,63 @@ func TestChecksumVVWithOffset(t *testing.T) { }) } } + +func TestChecksum(t *testing.T) { + var bufSizes = []int{0, 1, 2, 3, 4, 7, 8, 15, 16, 31, 32, 63, 64, 127, 128, 255, 256, 257, 1023, 1024} + type testCase struct { + buf []byte + initial uint16 + csumOrig uint16 + csumNew uint16 + } + testCases := make([]testCase, 100000) + // Ensure same buffer generation for test consistency. + rnd := rand.New(rand.NewSource(42)) + for i := range testCases { + testCases[i].buf = make([]byte, bufSizes[i%len(bufSizes)]) + testCases[i].initial = uint16(rnd.Intn(65536)) + rnd.Read(testCases[i].buf) + } + + for i := range testCases { + testCases[i].csumOrig = header.Checksum(testCases[i].buf, testCases[i].initial) + testCases[i].csumNew = header.UnrolledChecksum(testCases[i].buf, testCases[i].initial) + if got, want := testCases[i].csumNew, testCases[i].csumOrig; got != want { + t.Fatalf("new checksum for (buf = %x, initial = %d) does not match old got: %d, want: %d", testCases[i].buf, testCases[i].initial, got, want) + } + } +} + +func BenchmarkChecksum(b *testing.B) { + var bufSizes = []int{64, 128, 256, 512, 1024, 1500, 2048, 4096, 8192, 16384, 32767, 32768, 65535, 65536} + + checkSumImpls := []struct { + fn func([]byte, uint16) uint16 + name string + }{ + {header.Checksum, fmt.Sprintf("checksum")}, + {header.UnrolledChecksum, fmt.Sprintf("unrolled_checksum")}, + } + + for _, csumImpl := range checkSumImpls { + // Ensure same buffer generation for test consistency. + rnd := rand.New(rand.NewSource(42)) + for _, bufSz := range bufSizes { + b.Run(fmt.Sprintf("%s_%d", csumImpl.name, bufSz), func(b *testing.B) { + tc := struct { + buf []byte + initial uint16 + csum uint16 + }{ + buf: make([]byte, bufSz), + initial: uint16(rnd.Intn(65536)), + } + rnd.Read(tc.buf) + b.ResetTimer() + for i := 0; i < b.N; i++ { + tc.csum = csumImpl.fn(tc.buf, tc.initial) + } + }) + } + } +} |