summaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
author Samuel Neves <sneves@dei.uc.pt> 2017-11-23 16:08:46 +0000
committer Jason A. Donenfeld <Jason@zx2c4.com> 2017-11-26 22:08:56 +0100
commit 61691761a268090871eb3c201384c57e407d6105 (patch)
tree f80a0f552d15eac44e83d0e8b4bb7291ce956450
parent ba3c2e815f00e723bddebdf1f5df5adccbbb9b1c (diff)
blake2s: tweak avx512 code
This is not as ideal as using zmm registers, but using zmm causes downclocking. It is also not as fast single-threaded as the version using gathers, but it is faster when multithreaded, which is what WireGuard is doing.

Signed-off-by: Samuel Neves <sneves@dei.uc.pt>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
-rw-r--r-- src/crypto/blake2s-x86_64.S 111
1 file changed, 47 insertions, 64 deletions
diff --git a/src/crypto/blake2s-x86_64.S b/src/crypto/blake2s-x86_64.S
index 294750e..d1e0c03 100644
--- a/src/crypto/blake2s-x86_64.S
+++ b/src/crypto/blake2s-x86_64.S
@@ -5,7 +5,7 @@
#include <linux/linkage.h>
-.section .rodata.cst32.BLAKECONST, "aM", @progbits, 32
+.section .rodata.cst32.BLAKE2S_IV, "aM", @progbits, 32
.align 32
IV: .octa 0xA54FF53A3C6EF372BB67AE856A09E667
.octa 0x5BE0CD191F83D9AB9B05688C510E527F
@@ -16,38 +16,19 @@ ROT16: .octa 0x0D0C0F0E09080B0A0504070601000302
.align 16
ROR328: .octa 0x0C0F0E0D080B0A090407060500030201
#ifdef CONFIG_AS_AVX512
-.section .rodata.cst64.BLAKESIGMA, "aM", @progbits, 640
+.section .rodata.cst64.BLAKE2S_SIGMA, "aM", @progbits, 640
.align 64
SIGMA:
- .long 0, 2, 4, 6, 1, 3, 5, 7
- .long 8, 10, 12, 14, 9, 11, 13, 15
-
- .long 14, 4, 9, 13, 10, 8, 15, 6
- .long 1, 0, 11, 5, 12, 2, 7, 3
-
- .long 11, 12, 5, 15, 8, 0, 2, 13
- .long 10, 3, 7, 9, 14, 6, 1, 4
-
- .long 7, 3, 13, 11, 9, 1, 12, 14
- .long 2, 5, 4, 15, 6, 10, 0, 8
-
- .long 9, 5, 2, 10, 0, 7, 4, 15
- .long 14, 11, 6, 3, 1, 12, 8, 13
-
- .long 2, 6, 0, 8, 12, 10, 11, 3
- .long 4, 7, 15, 1, 13, 5, 14, 9
-
- .long 12, 1, 14, 4, 5, 15, 13, 10
- .long 0, 6, 9, 8, 7, 3, 2, 11
-
- .long 13, 7, 12, 3, 11, 14, 1, 9
- .long 5, 15, 8, 2, 0, 4, 6, 10
-
- .long 6, 14, 11, 0, 15, 9, 3, 8
- .long 12, 13, 1, 10, 2, 7, 4, 5
-
- .long 10, 8, 7, 1, 2, 4, 6, 5
- .long 15, 9, 3, 13, 11, 14, 12, 0
+.long 0, 2, 4, 6, 1, 3, 5, 7, 8, 10, 12, 14, 9, 11, 13, 15
+.long 11, 2, 12, 14, 9, 8, 15, 3, 4, 0, 13, 6, 10, 1, 7, 5
+.long 10, 12, 11, 6, 5, 9, 13, 3, 4, 15, 14, 2, 0, 7, 8, 1
+.long 10, 9, 7, 0, 11, 14, 1, 12, 6, 2, 15, 3, 13, 8, 5, 4
+.long 4, 9, 8, 13, 14, 0, 10, 11, 7, 3, 12, 1, 5, 6, 15, 2
+.long 2, 10, 4, 14, 13, 3, 9, 11, 6, 5, 7, 12, 15, 1, 8, 0
+.long 4, 11, 14, 8, 13, 10, 12, 5, 2, 1, 15, 3, 9, 7, 0, 6
+.long 6, 12, 0, 13, 15, 2, 1, 10, 4, 5, 11, 14, 8, 3, 9, 7
+.long 14, 5, 4, 12, 9, 7, 3, 10, 2, 0, 6, 15, 11, 1, 13, 8
+.long 11, 7, 13, 10, 12, 14, 0, 15, 4, 5, 6, 9, 2, 1, 8, 3
#endif /* CONFIG_AS_AVX512 */
.text
@@ -625,32 +606,40 @@ ENDPROC(blake2s_compress_avx)
ENTRY(blake2s_compress_avx512)
vmovdqu (%rdi),%xmm0
vmovdqu 0x10(%rdi),%xmm1
- vmovdqu 0x20(%rdi),%xmm15
- vmovq %rcx,%xmm13
- jmp .Lblake2s_compress_avx512_mainloop
+ vmovdqu 0x20(%rdi),%xmm4
+ vmovq %rcx,%xmm5
+ vmovdqa IV(%rip),%xmm14
+ vmovdqa IV+16(%rip),%xmm15
+ jmp .Lblake2s_compress_avx512_mainloop
.align 32
.Lblake2s_compress_avx512_mainloop:
- vpaddq %xmm13,%xmm15,%xmm15
- vmovdqa IV(%rip),%xmm2
- vpxor IV+16(%rip),%xmm15,%xmm3
- lea SIGMA(%rip),%rax
- movl $10,%ecx
+ vmovdqa %xmm0,%xmm10
+ vmovdqa %xmm1,%xmm11
+ vpaddq %xmm5,%xmm4,%xmm4
+ vmovdqa %xmm14,%xmm2
+ vpxor %xmm15,%xmm4,%xmm3
+ vmovdqu (%rsi),%ymm6
+ vmovdqu 0x20(%rsi),%ymm7
+ addq $0x40,%rsi
+ leaq SIGMA(%rip),%rax
+ movb $0xa,%cl
.Lblake2s_compress_avx512_roundloop:
- add $0x40,%rax
- vmovdqa -0x40(%rax),%xmm7
- vpcmpeqd %xmm14,%xmm14,%xmm14
- vpgatherdd %xmm14,(%rsi,%xmm7,4),%xmm6
- vpaddd %xmm6,%xmm0,%xmm0
+ addq $0x40,%rax
+ vmovdqa -0x40(%rax),%ymm8
+ vmovdqa -0x20(%rax),%ymm9
+ vpermi2d %ymm7,%ymm6,%ymm8
+ vpermi2d %ymm7,%ymm6,%ymm9
+ vmovdqa %ymm8,%ymm6
+ vmovdqa %ymm9,%ymm7
+ vpaddd %xmm8,%xmm0,%xmm0
vpaddd %xmm1,%xmm0,%xmm0
vpxor %xmm0,%xmm3,%xmm3
vprord $0x10,%xmm3,%xmm3
vpaddd %xmm3,%xmm2,%xmm2
vpxor %xmm2,%xmm1,%xmm1
vprord $0xc,%xmm1,%xmm1
- vmovdqa -0x30(%rax),%xmm7
- vpcmpeqd %xmm14,%xmm14,%xmm14
- vpgatherdd %xmm14,(%rsi,%xmm7,4),%xmm6
- vpaddd %xmm6,%xmm0,%xmm0
+ vextracti128 $0x1,%ymm8,%xmm8
+ vpaddd %xmm8,%xmm0,%xmm0
vpaddd %xmm1,%xmm0,%xmm0
vpxor %xmm0,%xmm3,%xmm3
vprord $0x8,%xmm3,%xmm3
@@ -660,20 +649,15 @@ ENTRY(blake2s_compress_avx512)
vpshufd $0x39,%xmm1,%xmm1
vpshufd $0x4e,%xmm2,%xmm2
vpshufd $0x93,%xmm3,%xmm3
- vmovdqa -0x20(%rax),%xmm7
- vpcmpeqd %xmm14,%xmm14,%xmm14
- vpgatherdd %xmm14,(%rsi,%xmm7,4),%xmm6
- vpaddd %xmm6,%xmm0,%xmm0
+ vpaddd %xmm9,%xmm0,%xmm0
vpaddd %xmm1,%xmm0,%xmm0
vpxor %xmm0,%xmm3,%xmm3
vprord $0x10,%xmm3,%xmm3
vpaddd %xmm3,%xmm2,%xmm2
vpxor %xmm2,%xmm1,%xmm1
vprord $0xc,%xmm1,%xmm1
- vmovdqa -0x10(%rax),%xmm7
- vpcmpeqd %xmm14,%xmm14,%xmm14
- vpgatherdd %xmm14,(%rsi,%xmm7,4),%xmm6
- vpaddd %xmm6,%xmm0,%xmm0
+ vextracti128 $0x1,%ymm9,%xmm9
+ vpaddd %xmm9,%xmm0,%xmm0
vpaddd %xmm1,%xmm0,%xmm0
vpxor %xmm0,%xmm3,%xmm3
vprord $0x8,%xmm3,%xmm3
@@ -683,19 +667,18 @@ ENTRY(blake2s_compress_avx512)
vpshufd $0x93,%xmm1,%xmm1
vpshufd $0x4e,%xmm2,%xmm2
vpshufd $0x39,%xmm3,%xmm3
- decl %ecx
+ decb %cl
jne .Lblake2s_compress_avx512_roundloop
- add $0x40,%rsi
- vpxor (%rdi),%xmm0,%xmm0
- vpxor 0x10(%rdi),%xmm1,%xmm1
+ vpxor %xmm10,%xmm0,%xmm0
+ vpxor %xmm11,%xmm1,%xmm1
vpxor %xmm2,%xmm0,%xmm0
vpxor %xmm3,%xmm1,%xmm1
+ decq %rdx
+ jne .Lblake2s_compress_avx512_mainloop
vmovdqu %xmm0,(%rdi)
vmovdqu %xmm1,0x10(%rdi)
- dec %rdx
- jne .Lblake2s_compress_avx512_mainloop
- vmovdqu %xmm15,0x20(%rdi)
- vzeroupper
+ vmovdqu %xmm4,0x20(%rdi)
+ vzeroupper
retq
ENDPROC(blake2s_compress_avx512)
#endif /* CONFIG_AS_AVX512 */