author     Samuel Neves <sneves@dei.uc.pt>          2017-11-22 20:57:37 +0000
committer  Jason A. Donenfeld <Jason@zx2c4.com>     2017-11-22 22:55:50 +0100
commit     4edb9051d9da9236df766a30516eacd03730592d (patch)
tree       8b388aaecfa44d7c6e10df6666ebe1b9a1b540e4
parent     b1affbd12ed27bf0fb3c1500a38857b80dca94be (diff)
blake2s: AVX512F+VL implementation
Signed-off-by: Samuel Neves <sneves@dei.uc.pt>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
-rw-r--r--  src/crypto/blake2s-x86_64.S  113
-rw-r--r--  src/crypto/blake2s.c          19
2 files changed, 132 insertions, 0 deletions
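
For orientation before the diff: BLAKE2s mixes its state with a quarter-round G made of two add-xor-rotate half-steps, with rotation counts 16, 12, 8 and 7. Below is a minimal scalar sketch, not part of the commit (ror32 and blake2s_g are illustrative names); the AVX512 path added here runs this on all four columns, then all four diagonals, in the lanes of one xmm register, with vprord performing each rotate in a single instruction.

#include <stdint.h>

static inline uint32_t ror32(uint32_t w, unsigned int c)
{
        return (w >> c) | (w << (32 - c));
}

/* One BLAKE2s quarter-round on a single column or diagonal; the
 * assembly in this commit runs four of these in parallel. */
static void blake2s_g(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d,
                      uint32_t m0, uint32_t m1)
{
        *a += *b + m0;
        *d = ror32(*d ^ *a, 16);
        *c += *d;
        *b = ror32(*b ^ *c, 12);
        *a += *b + m1;
        *d = ror32(*d ^ *a, 8);
        *c += *d;
        *b = ror32(*b ^ *c, 7);
}
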
diff --git a/src/crypto/blake2s-x86_64.S b/src/crypto/blake2s-x86_64.S
index e86afd3..294750e 100644
--- a/src/crypto/blake2s-x86_64.S
+++ b/src/crypto/blake2s-x86_64.S
@@ -15,6 +15,40 @@ ROT16: .octa 0x0D0C0F0E09080B0A0504070601000302
.section .rodata.cst16.ROR328, "aM", @progbits, 16
.align 16
ROR328: .octa 0x0C0F0E0D080B0A090407060500030201
+#ifdef CONFIG_AS_AVX512
+.section .rodata.cst64.BLAKESIGMA, "aM", @progbits, 640
+.align 64
+SIGMA:
+ .long 0, 2, 4, 6, 1, 3, 5, 7
+ .long 8, 10, 12, 14, 9, 11, 13, 15
+
+ .long 14, 4, 9, 13, 10, 8, 15, 6
+ .long 1, 0, 11, 5, 12, 2, 7, 3
+
+ .long 11, 12, 5, 15, 8, 0, 2, 13
+ .long 10, 3, 7, 9, 14, 6, 1, 4
+
+ .long 7, 3, 13, 11, 9, 1, 12, 14
+ .long 2, 5, 4, 15, 6, 10, 0, 8
+
+ .long 9, 5, 2, 10, 0, 7, 4, 15
+ .long 14, 11, 6, 3, 1, 12, 8, 13
+
+ .long 2, 6, 0, 8, 12, 10, 11, 3
+ .long 4, 7, 15, 1, 13, 5, 14, 9
+
+ .long 12, 1, 14, 4, 5, 15, 13, 10
+ .long 0, 6, 9, 8, 7, 3, 2, 11
+
+ .long 13, 7, 12, 3, 11, 14, 1, 9
+ .long 5, 15, 8, 2, 0, 4, 6, 10
+
+ .long 6, 14, 11, 0, 15, 9, 3, 8
+ .long 12, 13, 1, 10, 2, 7, 4, 5
+
+ .long 10, 8, 7, 1, 2, 4, 6, 5
+ .long 15, 9, 3, 13, 11, 14, 12, 0
+#endif /* CONFIG_AS_AVX512 */
.text
#ifdef CONFIG_AS_AVX
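
The SIGMA table added above is not the standard BLAKE2s word schedule laid out verbatim: each round's sixteen indices are regrouped so that the four message words consumed by one vectorized G half-step occupy consecutive slots, letting a single vpgatherdd fetch all four lanes at once. A sketch of the relationship, not part of the commit (blake2s_sigma is the standard schedule; build_gather_sigma is an illustrative helper):

#include <stdint.h>

static const uint8_t blake2s_sigma[10][16] = {
        {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
        { 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 },
        { 11,  8, 12,  0,  5,  2, 15, 13, 10, 14,  3,  6,  7,  1,  9,  4 },
        {  7,  9,  3,  1, 13, 12, 11, 14,  2,  6,  5, 10,  4,  0, 15,  8 },
        {  9,  0,  5,  7,  2,  4, 10, 15, 14,  1, 11, 12,  6,  8,  3, 13 },
        {  2, 12,  6, 10,  0, 11,  8,  3,  4, 13,  7,  5, 15, 14,  1,  9 },
        { 12,  5,  1, 15, 14, 13,  4, 10,  0,  7,  6,  3,  9,  2,  8, 11 },
        { 13, 11,  7, 14, 12,  1,  3,  9,  5,  0, 15,  4,  8,  6,  2, 10 },
        {  6, 15, 14,  9, 11,  3,  0,  8, 12,  2, 13,  7,  1,  4, 10,  5 },
        { 10,  2,  8,  4,  7,  6,  1,  5, 15, 11,  9, 14,  3, 12, 13,  0 },
};

/* Regroup each round: indices at even positions (fed to the first
 * addition of each G) come first, then the odd ones, separately for
 * the column half (pairs 0-3) and the diagonal half (pairs 4-7). */
static void build_gather_sigma(uint32_t out[10][16])
{
        int r, i;

        for (r = 0; r < 10; ++r)
                for (i = 0; i < 8; ++i) {
                        out[r][(i / 4) * 8 + (i % 4)]     = blake2s_sigma[r][2 * i];
                        out[r][(i / 4) * 8 + 4 + (i % 4)] = blake2s_sigma[r][2 * i + 1];
                }
}
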
@@ -586,3 +620,82 @@ ENTRY(blake2s_compress_avx)
ret
ENDPROC(blake2s_compress_avx)
#endif /* CONFIG_AS_AVX */
+
+#ifdef CONFIG_AS_AVX512
+ENTRY(blake2s_compress_avx512)
+ vmovdqu (%rdi),%xmm0
+ vmovdqu 0x10(%rdi),%xmm1
+ vmovdqu 0x20(%rdi),%xmm15
+ vmovq %rcx,%xmm13
+ jmp .Lblake2s_compress_avx512_mainloop
+.align 32
+.Lblake2s_compress_avx512_mainloop:
+ vpaddq %xmm13,%xmm15,%xmm15
+ vmovdqa IV(%rip),%xmm2
+ vpxor IV+16(%rip),%xmm15,%xmm3
+ lea SIGMA(%rip),%rax
+ movl $10,%ecx
+.Lblake2s_compress_avx512_roundloop:
+ add $0x40,%rax
+ vmovdqa -0x40(%rax),%xmm7
+ vpcmpeqd %xmm14,%xmm14,%xmm14
+ vpgatherdd %xmm14,(%rsi,%xmm7,4),%xmm6
+ vpaddd %xmm6,%xmm0,%xmm0
+ vpaddd %xmm1,%xmm0,%xmm0
+ vpxor %xmm0,%xmm3,%xmm3
+ vprord $0x10,%xmm3,%xmm3
+ vpaddd %xmm3,%xmm2,%xmm2
+ vpxor %xmm2,%xmm1,%xmm1
+ vprord $0xc,%xmm1,%xmm1
+ vmovdqa -0x30(%rax),%xmm7
+ vpcmpeqd %xmm14,%xmm14,%xmm14
+ vpgatherdd %xmm14,(%rsi,%xmm7,4),%xmm6
+ vpaddd %xmm6,%xmm0,%xmm0
+ vpaddd %xmm1,%xmm0,%xmm0
+ vpxor %xmm0,%xmm3,%xmm3
+ vprord $0x8,%xmm3,%xmm3
+ vpaddd %xmm3,%xmm2,%xmm2
+ vpxor %xmm2,%xmm1,%xmm1
+ vprord $0x7,%xmm1,%xmm1
+ vpshufd $0x39,%xmm1,%xmm1
+ vpshufd $0x4e,%xmm2,%xmm2
+ vpshufd $0x93,%xmm3,%xmm3
+ vmovdqa -0x20(%rax),%xmm7
+ vpcmpeqd %xmm14,%xmm14,%xmm14
+ vpgatherdd %xmm14,(%rsi,%xmm7,4),%xmm6
+ vpaddd %xmm6,%xmm0,%xmm0
+ vpaddd %xmm1,%xmm0,%xmm0
+ vpxor %xmm0,%xmm3,%xmm3
+ vprord $0x10,%xmm3,%xmm3
+ vpaddd %xmm3,%xmm2,%xmm2
+ vpxor %xmm2,%xmm1,%xmm1
+ vprord $0xc,%xmm1,%xmm1
+ vmovdqa -0x10(%rax),%xmm7
+ vpcmpeqd %xmm14,%xmm14,%xmm14
+ vpgatherdd %xmm14,(%rsi,%xmm7,4),%xmm6
+ vpaddd %xmm6,%xmm0,%xmm0
+ vpaddd %xmm1,%xmm0,%xmm0
+ vpxor %xmm0,%xmm3,%xmm3
+ vprord $0x8,%xmm3,%xmm3
+ vpaddd %xmm3,%xmm2,%xmm2
+ vpxor %xmm2,%xmm1,%xmm1
+ vprord $0x7,%xmm1,%xmm1
+ vpshufd $0x93,%xmm1,%xmm1
+ vpshufd $0x4e,%xmm2,%xmm2
+ vpshufd $0x39,%xmm3,%xmm3
+ decl %ecx
+ jne .Lblake2s_compress_avx512_roundloop
+ add $0x40,%rsi
+ vpxor (%rdi),%xmm0,%xmm0
+ vpxor 0x10(%rdi),%xmm1,%xmm1
+ vpxor %xmm2,%xmm0,%xmm0
+ vpxor %xmm3,%xmm1,%xmm1
+ vmovdqu %xmm0,(%rdi)
+ vmovdqu %xmm1,0x10(%rdi)
+ dec %rdx
+ jne .Lblake2s_compress_avx512_mainloop
+ vmovdqu %xmm15,0x20(%rdi)
+ vzeroupper
+ retq
+ENDPROC(blake2s_compress_avx512)
+#endif /* CONFIG_AS_AVX512 */
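
Seen from C, each half-step of the round loop above pairs one gather with the add/xor/rotate ladder. A rough intrinsics sketch, not part of the commit (the function name is invented; compile with AVX2 and AVX512VL enabled), showing the first half-step with rotations 16 and 12; the second half-step is identical with 8 and 7, followed by the vpshufd diagonalization:

#include <immintrin.h>
#include <stdint.h>

/* One vectorized half-step across four columns: vpgatherdd fetches
 * the four message words named by one quarter of a SIGMA row, then
 * vprord (_mm_ror_epi32, AVX512F+VL) rotates without the shift/or
 * pair the plain AVX path needs. */
static inline void blake2s_round_half(__m128i *a, __m128i *b, __m128i *c,
                                      __m128i *d, const uint32_t *block,
                                      const int32_t *sigma_part)
{
        __m128i idx = _mm_loadu_si128((const __m128i *)sigma_part);
        __m128i m   = _mm_i32gather_epi32((const int *)block, idx, 4);

        *a = _mm_add_epi32(_mm_add_epi32(*a, m), *b);
        *d = _mm_ror_epi32(_mm_xor_si128(*d, *a), 16);
        *c = _mm_add_epi32(*c, *d);
        *b = _mm_ror_epi32(_mm_xor_si128(*b, *c), 12);
}
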
diff --git a/src/crypto/blake2s.c b/src/crypto/blake2s.c
index 91f154f..3e13277 100644
--- a/src/crypto/blake2s.c
+++ b/src/crypto/blake2s.c
@@ -114,11 +114,20 @@ void blake2s_init_key(struct blake2s_state *state, const size_t outlen, const vo
#include <asm/fpu/api.h>
#include <asm/simd.h>
static bool blake2s_use_avx __read_mostly;
+static bool blake2s_use_avx512 __read_mostly;
void __init blake2s_fpu_init(void)
{
blake2s_use_avx = boot_cpu_has(X86_FEATURE_AVX) && cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL);
+#ifndef COMPAT_CANNOT_USE_AVX512
+ blake2s_use_avx512 = boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_AVX512F) && boot_cpu_has(X86_FEATURE_AVX512VL) && cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM | XFEATURE_MASK_ZMM_Hi256, NULL);
+#endif
}
+#ifdef CONFIG_AS_AVX
asmlinkage void blake2s_compress_avx(struct blake2s_state *state, const u8 *block, size_t nblocks, u32 inc);
+#endif
+#ifdef CONFIG_AS_AVX512
+asmlinkage void blake2s_compress_avx512(struct blake2s_state *state, const u8 *block, size_t nblocks, u32 inc);
+#endif
#else
void __init blake2s_fpu_init(void) { }
#endif
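
The runtime predicate added above mirrors what the instructions require: AVX2 for the xmm form of vpgatherdd, AVX512F plus AVX512VL for the EVEX-encoded vprord on 128-bit registers, and cpu_has_xfeatures() to confirm the kernel context-switches the relevant vector state. A hypothetical userspace analogue, not part of the commit (function name invented; note it has no direct equivalent of the kernel's OS-enablement check):

/* __builtin_cpu_supports() stands in for boot_cpu_has() here; the
 * kernel's additional cpu_has_xfeatures() xsave-state check is not
 * reproduced by this sketch. */
static int have_blake2s_avx512(void)
{
        return __builtin_cpu_supports("avx")     &&
               __builtin_cpu_supports("avx2")    &&
               __builtin_cpu_supports("avx512f") &&
               __builtin_cpu_supports("avx512vl");
}
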
@@ -134,6 +143,15 @@ static inline void blake2s_compress(struct blake2s_state *state, const u8 *block
#endif
#ifdef CONFIG_X86_64
+#ifdef CONFIG_AS_AVX512
+ if (blake2s_use_avx512 && irq_fpu_usable()) {
+ kernel_fpu_begin();
+ blake2s_compress_avx512(state, block, nblocks, inc);
+ kernel_fpu_end();
+ return;
+ }
+#endif
+#ifdef CONFIG_AS_AVX
if (blake2s_use_avx && irq_fpu_usable()) {
kernel_fpu_begin();
blake2s_compress_avx(state, block, nblocks, inc);
@@ -141,6 +159,7 @@ static inline void blake2s_compress(struct blake2s_state *state, const u8 *block
return;
}
#endif
+#endif
while (nblocks > 0) {
blake2s_increment_counter(state, inc);