From 4edb9051d9da9236df766a30516eacd03730592d Mon Sep 17 00:00:00 2001
From: Samuel Neves
Date: Wed, 22 Nov 2017 20:57:37 +0000
Subject: blake2s: AVX512F+VL implementation

Signed-off-by: Samuel Neves
Signed-off-by: Jason A. Donenfeld
---
 src/crypto/blake2s-x86_64.S | 136 ++++++++++++++++++++++++++++++++++++++++++++
 src/crypto/blake2s.c        |  20 ++++++
 2 files changed, 156 insertions(+)

(limited to 'src/crypto')

diff --git a/src/crypto/blake2s-x86_64.S b/src/crypto/blake2s-x86_64.S
index e86afd3..294750e 100644
--- a/src/crypto/blake2s-x86_64.S
+++ b/src/crypto/blake2s-x86_64.S
@@ -15,6 +15,45 @@ ROT16: .octa 0x0D0C0F0E09080B0A0504070601000302
 .section .rodata.cst16.ROR328, "aM", @progbits, 16
 .align 16
 ROR328:	.octa 0x0C0F0E0D080B0A090407060500030201
+#ifdef CONFIG_AS_AVX512
+/*
+ * Message word indices for the ten rounds, in the order the round loop
+ * gathers them: every round is 64 bytes, i.e. four groups of four 32-bit
+ * indices, one group per vpgatherdd.
+ */
+.section .rodata.cst64.BLAKESIGMA, "aM", @progbits, 640
+.align 64
+SIGMA:
+	.long 0, 2, 4, 6, 1, 3, 5, 7
+	.long 8, 10, 12, 14, 9, 11, 13, 15
+
+	.long 14, 4, 9, 13, 10, 8, 15, 6
+	.long 1, 0, 11, 5, 12, 2, 7, 3
+
+	.long 11, 12, 5, 15, 8, 0, 2, 13
+	.long 10, 3, 7, 9, 14, 6, 1, 4
+
+	.long 7, 3, 13, 11, 9, 1, 12, 14
+	.long 2, 5, 4, 15, 6, 10, 0, 8
+
+	.long 9, 5, 2, 10, 0, 7, 4, 15
+	.long 14, 11, 6, 3, 1, 12, 8, 13
+
+	.long 2, 6, 0, 8, 12, 10, 11, 3
+	.long 4, 7, 15, 1, 13, 5, 14, 9
+
+	.long 12, 1, 14, 4, 5, 15, 13, 10
+	.long 0, 6, 9, 8, 7, 3, 2, 11
+
+	.long 13, 7, 12, 3, 11, 14, 1, 9
+	.long 5, 15, 8, 2, 0, 4, 6, 10
+
+	.long 6, 14, 11, 0, 15, 9, 3, 8
+	.long 12, 13, 1, 10, 2, 7, 4, 5
+
+	.long 10, 8, 7, 1, 2, 4, 6, 5
+	.long 15, 9, 3, 13, 11, 14, 12, 0
+#endif /* CONFIG_AS_AVX512 */
 
 .text
 #ifdef CONFIG_AS_AVX
@@ -586,3 +625,100 @@ ENTRY(blake2s_compress_avx)
 	ret
 ENDPROC(blake2s_compress_avx)
 #endif /* CONFIG_AS_AVX */
+
+#ifdef CONFIG_AS_AVX512
+/*
+ * void blake2s_compress_avx512(struct blake2s_state *state, const u8 *block,
+ *				 size_t nblocks, u32 inc)
+ *
+ * Arguments arrive as %rdi = state, %rsi = block, %rdx = nblocks, %ecx = inc.
+ * Compresses nblocks consecutive 64-byte blocks starting at block. The 32
+ * bytes at state+0x00 are the chaining value kept in xmm0/xmm1; the 16 bytes
+ * at state+0x20 are kept in xmm15 and have inc added to their low 64 bits
+ * before each block. nblocks must be non-zero, because the block counter is
+ * only tested after a block has been processed.
+ */
+ENTRY(blake2s_compress_avx512)
+	vmovdqu		(%rdi),%xmm0
+	vmovdqu		0x10(%rdi),%xmm1
+	vmovdqu		0x20(%rdi),%xmm15
+	/* inc is a u32: take only %ecx, zero-extended by vmovd. */
+	vmovd		%ecx,%xmm13
+	jmp		.Lblake2s_compress_avx512_mainloop
+.align 32
+.Lblake2s_compress_avx512_mainloop:
+	vpaddq		%xmm13,%xmm15,%xmm15
+	vmovdqa		IV(%rip),%xmm2
+	vpxor		IV+16(%rip),%xmm15,%xmm3
+	lea		SIGMA(%rip),%rax
+	movl		$10,%ecx
+.Lblake2s_compress_avx512_roundloop:
+	add		$0x40,%rax
+	/* Column step. vpgatherdd clears its mask, so it is rebuilt every time. */
+	vmovdqa		-0x40(%rax),%xmm7
+	vpcmpeqd	%xmm14,%xmm14,%xmm14
+	vpgatherdd	%xmm14,(%rsi,%xmm7,4),%xmm6
+	vpaddd		%xmm6,%xmm0,%xmm0
+	vpaddd		%xmm1,%xmm0,%xmm0
+	vpxor		%xmm0,%xmm3,%xmm3
+	vprord		$0x10,%xmm3,%xmm3
+	vpaddd		%xmm3,%xmm2,%xmm2
+	vpxor		%xmm2,%xmm1,%xmm1
+	vprord		$0xc,%xmm1,%xmm1
+	vmovdqa		-0x30(%rax),%xmm7
+	vpcmpeqd	%xmm14,%xmm14,%xmm14
+	vpgatherdd	%xmm14,(%rsi,%xmm7,4),%xmm6
+	vpaddd		%xmm6,%xmm0,%xmm0
+	vpaddd		%xmm1,%xmm0,%xmm0
+	vpxor		%xmm0,%xmm3,%xmm3
+	vprord		$0x8,%xmm3,%xmm3
+	vpaddd		%xmm3,%xmm2,%xmm2
+	vpxor		%xmm2,%xmm1,%xmm1
+	vprord		$0x7,%xmm1,%xmm1
+	/* Rotate rows 1-3 so that the diagonals become columns. */
+	vpshufd		$0x39,%xmm1,%xmm1
+	vpshufd		$0x4e,%xmm2,%xmm2
+	vpshufd		$0x93,%xmm3,%xmm3
+	/* Diagonal step. */
+	vmovdqa		-0x20(%rax),%xmm7
+	vpcmpeqd	%xmm14,%xmm14,%xmm14
+	vpgatherdd	%xmm14,(%rsi,%xmm7,4),%xmm6
+	vpaddd		%xmm6,%xmm0,%xmm0
+	vpaddd		%xmm1,%xmm0,%xmm0
+	vpxor		%xmm0,%xmm3,%xmm3
+	vprord		$0x10,%xmm3,%xmm3
+	vpaddd		%xmm3,%xmm2,%xmm2
+	vpxor		%xmm2,%xmm1,%xmm1
+	vprord		$0xc,%xmm1,%xmm1
+	vmovdqa		-0x10(%rax),%xmm7
+	vpcmpeqd	%xmm14,%xmm14,%xmm14
+	vpgatherdd	%xmm14,(%rsi,%xmm7,4),%xmm6
+	vpaddd		%xmm6,%xmm0,%xmm0
+	vpaddd		%xmm1,%xmm0,%xmm0
+	vpxor		%xmm0,%xmm3,%xmm3
+	vprord		$0x8,%xmm3,%xmm3
+	vpaddd		%xmm3,%xmm2,%xmm2
+	vpxor		%xmm2,%xmm1,%xmm1
+	vprord		$0x7,%xmm1,%xmm1
+	/* Undo the row rotation. */
+	vpshufd		$0x93,%xmm1,%xmm1
+	vpshufd		$0x4e,%xmm2,%xmm2
+	vpshufd		$0x39,%xmm3,%xmm3
+	decl		%ecx
+	jne		.Lblake2s_compress_avx512_roundloop
+	add		$0x40,%rsi
+	/* Feed-forward: xor both state halves into the chaining value. */
+	vpxor		(%rdi),%xmm0,%xmm0
+	vpxor		0x10(%rdi),%xmm1,%xmm1
+	vpxor		%xmm2,%xmm0,%xmm0
+	vpxor		%xmm3,%xmm1,%xmm1
+	vmovdqu		%xmm0,(%rdi)
+	vmovdqu		%xmm1,0x10(%rdi)
+	dec		%rdx
+	jne		.Lblake2s_compress_avx512_mainloop
+	/* Store the 16 bytes at state+0x20 back, advanced by nblocks * inc. */
+	vmovdqu		%xmm15,0x20(%rdi)
+	vzeroupper
+	retq
+ENDPROC(blake2s_compress_avx512)
+#endif /* CONFIG_AS_AVX512 */
diff --git a/src/crypto/blake2s.c b/src/crypto/blake2s.c
index 91f154f..3e13277 100644
--- a/src/crypto/blake2s.c
+++ b/src/crypto/blake2s.c
@@ -114,11 +114,21 @@ void blake2s_init_key(struct blake2s_state *state, const size_t outlen, const vo
 #include
 #include
 static bool blake2s_use_avx __read_mostly;
+static bool blake2s_use_avx512 __read_mostly;
 void __init blake2s_fpu_init(void)
 {
 	blake2s_use_avx = boot_cpu_has(X86_FEATURE_AVX) && cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL);
+#ifndef COMPAT_CANNOT_USE_AVX512
+	/* EVEX encodings fault unless the OS enabled opmask, ZMM_Hi256 and Hi16_ZMM state. */
+	blake2s_use_avx512 = boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_AVX512F) && boot_cpu_has(X86_FEATURE_AVX512VL) && cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM | XFEATURE_MASK_AVX512, NULL);
+#endif
 }
+#ifdef CONFIG_AS_AVX
 asmlinkage void blake2s_compress_avx(struct blake2s_state *state, const u8 *block, size_t nblocks, u32 inc);
+#endif
+#ifdef CONFIG_AS_AVX512
+asmlinkage void blake2s_compress_avx512(struct blake2s_state *state, const u8 *block, size_t nblocks, u32 inc);
+#endif
 #else
 void __init blake2s_fpu_init(void) { }
 #endif
@@ -134,12 +144,22 @@ static inline void blake2s_compress(struct blake2s_state *state, const u8 *block
 #endif
 
 #ifdef CONFIG_X86_64
+#ifdef CONFIG_AS_AVX512
+	if (blake2s_use_avx512 && irq_fpu_usable()) {
+		kernel_fpu_begin();
+		blake2s_compress_avx512(state, block, nblocks, inc);
+		kernel_fpu_end();
+		return;
+	}
+#endif
+#ifdef CONFIG_AS_AVX
 	if (blake2s_use_avx && irq_fpu_usable()) {
 		kernel_fpu_begin();
 		blake2s_compress_avx(state, block, nblocks, inc);
 		kernel_fpu_end();
 		return;
 	}
+#endif
 #endif
 
 	while (nblocks > 0) {
-- 
cgit v1.2.3