summaryrefslogtreecommitdiffhomepage
path: root/src/crypto/zinc/blake2s
diff options
context:
space:
mode:
Diffstat (limited to 'src/crypto/zinc/blake2s')
-rw-r--r--src/crypto/zinc/blake2s/blake2s-x86_64-glue.h64
-rw-r--r--src/crypto/zinc/blake2s/blake2s-x86_64.S685
-rw-r--r--src/crypto/zinc/blake2s/blake2s.c274
3 files changed, 1023 insertions, 0 deletions
diff --git a/src/crypto/zinc/blake2s/blake2s-x86_64-glue.h b/src/crypto/zinc/blake2s/blake2s-x86_64-glue.h
new file mode 100644
index 0000000..9dadaa3
--- /dev/null
+++ b/src/crypto/zinc/blake2s/blake2s-x86_64-glue.h
@@ -0,0 +1,64 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#include <zinc/blake2s.h>
+#include <asm/cpufeature.h>
+#include <asm/processor.h>
+#include <asm/fpu/api.h>
+#include <asm/simd.h>
+
+#ifdef CONFIG_AS_AVX
+asmlinkage void blake2s_compress_avx(struct blake2s_state *state,
+ const u8 *block, const size_t nblocks,
+ const u32 inc);
+#endif
+#ifdef CONFIG_AS_AVX512
+asmlinkage void blake2s_compress_avx512(struct blake2s_state *state,
+ const u8 *block, const size_t nblocks,
+ const u32 inc);
+#endif
+
+static bool blake2s_use_avx __ro_after_init;
+static bool blake2s_use_avx512 __ro_after_init;
+
+void __init blake2s_fpu_init(void)
+{
+ blake2s_use_avx =
+ boot_cpu_has(X86_FEATURE_AVX) &&
+ cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL);
+#ifndef COMPAT_CANNOT_USE_AVX512
+ blake2s_use_avx512 =
+ boot_cpu_has(X86_FEATURE_AVX) &&
+ boot_cpu_has(X86_FEATURE_AVX2) &&
+ boot_cpu_has(X86_FEATURE_AVX512F) &&
+ boot_cpu_has(X86_FEATURE_AVX512VL) &&
+ cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM |
+ XFEATURE_MASK_AVX512, NULL);
+#endif
+}
+
+static inline bool blake2s_arch(struct blake2s_state *state, const u8 *block,
+ size_t nblocks, const u32 inc)
+{
+#ifdef CONFIG_AS_AVX512
+ if (blake2s_use_avx512 && irq_fpu_usable()) {
+ kernel_fpu_begin();
+ blake2s_compress_avx512(state, block, nblocks, inc);
+ kernel_fpu_end();
+ return true;
+ }
+#endif
+#ifdef CONFIG_AS_AVX
+ if (blake2s_use_avx && irq_fpu_usable()) {
+ kernel_fpu_begin();
+ blake2s_compress_avx(state, block, nblocks, inc);
+ kernel_fpu_end();
+ return true;
+ }
+#endif
+ return false;
+}
+
+#define HAVE_BLAKE2S_ARCH_IMPLEMENTATION
diff --git a/src/crypto/zinc/blake2s/blake2s-x86_64.S b/src/crypto/zinc/blake2s/blake2s-x86_64.S
new file mode 100644
index 0000000..ee683e4
--- /dev/null
+++ b/src/crypto/zinc/blake2s/blake2s-x86_64.S
@@ -0,0 +1,685 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ * Copyright (C) 2017 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
+ */
+
+#include <linux/linkage.h>
+
+.section .rodata.cst32.BLAKE2S_IV, "aM", @progbits, 32
+.align 32
+IV: .octa 0xA54FF53A3C6EF372BB67AE856A09E667
+ .octa 0x5BE0CD191F83D9AB9B05688C510E527F
+.section .rodata.cst16.ROT16, "aM", @progbits, 16
+.align 16
+ROT16: .octa 0x0D0C0F0E09080B0A0504070601000302
+.section .rodata.cst16.ROR328, "aM", @progbits, 16
+.align 16
+ROR328: .octa 0x0C0F0E0D080B0A090407060500030201
+#ifdef CONFIG_AS_AVX512
+.section .rodata.cst64.BLAKE2S_SIGMA, "aM", @progbits, 640
+.align 64
+SIGMA:
+.long 0, 2, 4, 6, 1, 3, 5, 7, 8, 10, 12, 14, 9, 11, 13, 15
+.long 11, 2, 12, 14, 9, 8, 15, 3, 4, 0, 13, 6, 10, 1, 7, 5
+.long 10, 12, 11, 6, 5, 9, 13, 3, 4, 15, 14, 2, 0, 7, 8, 1
+.long 10, 9, 7, 0, 11, 14, 1, 12, 6, 2, 15, 3, 13, 8, 5, 4
+.long 4, 9, 8, 13, 14, 0, 10, 11, 7, 3, 12, 1, 5, 6, 15, 2
+.long 2, 10, 4, 14, 13, 3, 9, 11, 6, 5, 7, 12, 15, 1, 8, 0
+.long 4, 11, 14, 8, 13, 10, 12, 5, 2, 1, 15, 3, 9, 7, 0, 6
+.long 6, 12, 0, 13, 15, 2, 1, 10, 4, 5, 11, 14, 8, 3, 9, 7
+.long 14, 5, 4, 12, 9, 7, 3, 10, 2, 0, 6, 15, 11, 1, 13, 8
+.long 11, 7, 13, 10, 12, 14, 0, 15, 4, 5, 6, 9, 2, 1, 8, 3
+#endif /* CONFIG_AS_AVX512 */
+
+.text
+#ifdef CONFIG_AS_AVX
+ENTRY(blake2s_compress_avx)
+ movl %ecx, %ecx
+ testq %rdx, %rdx
+ je .Lendofloop
+ .align 32
+.Lbeginofloop:
+ addq %rcx, 32(%rdi)
+ vmovdqu IV+16(%rip), %xmm1
+ vmovdqu (%rsi), %xmm4
+ vpxor 32(%rdi), %xmm1, %xmm1
+ vmovdqu 16(%rsi), %xmm3
+ vshufps $136, %xmm3, %xmm4, %xmm6
+ vmovdqa ROT16(%rip), %xmm7
+ vpaddd (%rdi), %xmm6, %xmm6
+ vpaddd 16(%rdi), %xmm6, %xmm6
+ vpxor %xmm6, %xmm1, %xmm1
+ vmovdqu IV(%rip), %xmm8
+ vpshufb %xmm7, %xmm1, %xmm1
+ vmovdqu 48(%rsi), %xmm5
+ vpaddd %xmm1, %xmm8, %xmm8
+ vpxor 16(%rdi), %xmm8, %xmm9
+ vmovdqu 32(%rsi), %xmm2
+ vpblendw $12, %xmm3, %xmm5, %xmm13
+ vshufps $221, %xmm5, %xmm2, %xmm12
+ vpunpckhqdq %xmm2, %xmm4, %xmm14
+ vpslld $20, %xmm9, %xmm0
+ vpsrld $12, %xmm9, %xmm9
+ vpxor %xmm0, %xmm9, %xmm0
+ vshufps $221, %xmm3, %xmm4, %xmm9
+ vpaddd %xmm9, %xmm6, %xmm9
+ vpaddd %xmm0, %xmm9, %xmm9
+ vpxor %xmm9, %xmm1, %xmm1
+ vmovdqa ROR328(%rip), %xmm6
+ vpshufb %xmm6, %xmm1, %xmm1
+ vpaddd %xmm1, %xmm8, %xmm8
+ vpxor %xmm8, %xmm0, %xmm0
+ vpshufd $147, %xmm1, %xmm1
+ vpshufd $78, %xmm8, %xmm8
+ vpslld $25, %xmm0, %xmm10
+ vpsrld $7, %xmm0, %xmm0
+ vpxor %xmm10, %xmm0, %xmm0
+ vshufps $136, %xmm5, %xmm2, %xmm10
+ vpshufd $57, %xmm0, %xmm0
+ vpaddd %xmm10, %xmm9, %xmm9
+ vpaddd %xmm0, %xmm9, %xmm9
+ vpxor %xmm9, %xmm1, %xmm1
+ vpaddd %xmm12, %xmm9, %xmm9
+ vpblendw $12, %xmm2, %xmm3, %xmm12
+ vpshufb %xmm7, %xmm1, %xmm1
+ vpaddd %xmm1, %xmm8, %xmm8
+ vpxor %xmm8, %xmm0, %xmm10
+ vpslld $20, %xmm10, %xmm0
+ vpsrld $12, %xmm10, %xmm10
+ vpxor %xmm0, %xmm10, %xmm0
+ vpaddd %xmm0, %xmm9, %xmm9
+ vpxor %xmm9, %xmm1, %xmm1
+ vpshufb %xmm6, %xmm1, %xmm1
+ vpaddd %xmm1, %xmm8, %xmm8
+ vpxor %xmm8, %xmm0, %xmm0
+ vpshufd $57, %xmm1, %xmm1
+ vpshufd $78, %xmm8, %xmm8
+ vpslld $25, %xmm0, %xmm10
+ vpsrld $7, %xmm0, %xmm0
+ vpxor %xmm10, %xmm0, %xmm0
+ vpslldq $4, %xmm5, %xmm10
+ vpblendw $240, %xmm10, %xmm12, %xmm12
+ vpshufd $147, %xmm0, %xmm0
+ vpshufd $147, %xmm12, %xmm12
+ vpaddd %xmm9, %xmm12, %xmm12
+ vpaddd %xmm0, %xmm12, %xmm12
+ vpxor %xmm12, %xmm1, %xmm1
+ vpshufb %xmm7, %xmm1, %xmm1
+ vpaddd %xmm1, %xmm8, %xmm8
+ vpxor %xmm8, %xmm0, %xmm11
+ vpslld $20, %xmm11, %xmm9
+ vpsrld $12, %xmm11, %xmm11
+ vpxor %xmm9, %xmm11, %xmm0
+ vpshufd $8, %xmm2, %xmm9
+ vpblendw $192, %xmm5, %xmm3, %xmm11
+ vpblendw $240, %xmm11, %xmm9, %xmm9
+ vpshufd $177, %xmm9, %xmm9
+ vpaddd %xmm12, %xmm9, %xmm9
+ vpaddd %xmm0, %xmm9, %xmm11
+ vpxor %xmm11, %xmm1, %xmm1
+ vpshufb %xmm6, %xmm1, %xmm1
+ vpaddd %xmm1, %xmm8, %xmm8
+ vpxor %xmm8, %xmm0, %xmm9
+ vpshufd $147, %xmm1, %xmm1
+ vpshufd $78, %xmm8, %xmm8
+ vpslld $25, %xmm9, %xmm0
+ vpsrld $7, %xmm9, %xmm9
+ vpxor %xmm0, %xmm9, %xmm0
+ vpslldq $4, %xmm3, %xmm9
+ vpblendw $48, %xmm9, %xmm2, %xmm9
+ vpblendw $240, %xmm9, %xmm4, %xmm9
+ vpshufd $57, %xmm0, %xmm0
+ vpshufd $177, %xmm9, %xmm9
+ vpaddd %xmm11, %xmm9, %xmm9
+ vpaddd %xmm0, %xmm9, %xmm9
+ vpxor %xmm9, %xmm1, %xmm1
+ vpshufb %xmm7, %xmm1, %xmm1
+ vpaddd %xmm1, %xmm8, %xmm11
+ vpxor %xmm11, %xmm0, %xmm0
+ vpslld $20, %xmm0, %xmm8
+ vpsrld $12, %xmm0, %xmm0
+ vpxor %xmm8, %xmm0, %xmm0
+ vpunpckhdq %xmm3, %xmm4, %xmm8
+ vpblendw $12, %xmm10, %xmm8, %xmm12
+ vpshufd $177, %xmm12, %xmm12
+ vpaddd %xmm9, %xmm12, %xmm9
+ vpaddd %xmm0, %xmm9, %xmm9
+ vpxor %xmm9, %xmm1, %xmm1
+ vpshufb %xmm6, %xmm1, %xmm1
+ vpaddd %xmm1, %xmm11, %xmm11
+ vpxor %xmm11, %xmm0, %xmm0
+ vpshufd $57, %xmm1, %xmm1
+ vpshufd $78, %xmm11, %xmm11
+ vpslld $25, %xmm0, %xmm12
+ vpsrld $7, %xmm0, %xmm0
+ vpxor %xmm12, %xmm0, %xmm0
+ vpunpckhdq %xmm5, %xmm2, %xmm12
+ vpshufd $147, %xmm0, %xmm0
+ vpblendw $15, %xmm13, %xmm12, %xmm12
+ vpslldq $8, %xmm5, %xmm13
+ vpshufd $210, %xmm12, %xmm12
+ vpaddd %xmm9, %xmm12, %xmm9
+ vpaddd %xmm0, %xmm9, %xmm9
+ vpxor %xmm9, %xmm1, %xmm1
+ vpshufb %xmm7, %xmm1, %xmm1
+ vpaddd %xmm1, %xmm11, %xmm11
+ vpxor %xmm11, %xmm0, %xmm0
+ vpslld $20, %xmm0, %xmm12
+ vpsrld $12, %xmm0, %xmm0
+ vpxor %xmm12, %xmm0, %xmm0
+ vpunpckldq %xmm4, %xmm2, %xmm12
+ vpblendw $240, %xmm4, %xmm12, %xmm12
+ vpblendw $192, %xmm13, %xmm12, %xmm12
+ vpsrldq $12, %xmm3, %xmm13
+ vpaddd %xmm12, %xmm9, %xmm9
+ vpaddd %xmm0, %xmm9, %xmm9
+ vpxor %xmm9, %xmm1, %xmm1
+ vpshufb %xmm6, %xmm1, %xmm1
+ vpaddd %xmm1, %xmm11, %xmm11
+ vpxor %xmm11, %xmm0, %xmm0
+ vpshufd $147, %xmm1, %xmm1
+ vpshufd $78, %xmm11, %xmm11
+ vpslld $25, %xmm0, %xmm12
+ vpsrld $7, %xmm0, %xmm0
+ vpxor %xmm12, %xmm0, %xmm0
+ vpblendw $60, %xmm2, %xmm4, %xmm12
+ vpblendw $3, %xmm13, %xmm12, %xmm12
+ vpshufd $57, %xmm0, %xmm0
+ vpshufd $78, %xmm12, %xmm12
+ vpaddd %xmm9, %xmm12, %xmm9
+ vpaddd %xmm0, %xmm9, %xmm9
+ vpxor %xmm9, %xmm1, %xmm1
+ vpshufb %xmm7, %xmm1, %xmm1
+ vpaddd %xmm1, %xmm11, %xmm11
+ vpxor %xmm11, %xmm0, %xmm12
+ vpslld $20, %xmm12, %xmm13
+ vpsrld $12, %xmm12, %xmm0
+ vpblendw $51, %xmm3, %xmm4, %xmm12
+ vpxor %xmm13, %xmm0, %xmm0
+ vpblendw $192, %xmm10, %xmm12, %xmm10
+ vpslldq $8, %xmm2, %xmm12
+ vpshufd $27, %xmm10, %xmm10
+ vpaddd %xmm9, %xmm10, %xmm9
+ vpaddd %xmm0, %xmm9, %xmm9
+ vpxor %xmm9, %xmm1, %xmm1
+ vpshufb %xmm6, %xmm1, %xmm1
+ vpaddd %xmm1, %xmm11, %xmm11
+ vpxor %xmm11, %xmm0, %xmm0
+ vpshufd $57, %xmm1, %xmm1
+ vpshufd $78, %xmm11, %xmm11
+ vpslld $25, %xmm0, %xmm10
+ vpsrld $7, %xmm0, %xmm0
+ vpxor %xmm10, %xmm0, %xmm0
+ vpunpckhdq %xmm2, %xmm8, %xmm10
+ vpshufd $147, %xmm0, %xmm0
+ vpblendw $12, %xmm5, %xmm10, %xmm10
+ vpshufd $210, %xmm10, %xmm10
+ vpaddd %xmm9, %xmm10, %xmm9
+ vpaddd %xmm0, %xmm9, %xmm9
+ vpxor %xmm9, %xmm1, %xmm1
+ vpshufb %xmm7, %xmm1, %xmm1
+ vpaddd %xmm1, %xmm11, %xmm11
+ vpxor %xmm11, %xmm0, %xmm10
+ vpslld $20, %xmm10, %xmm0
+ vpsrld $12, %xmm10, %xmm10
+ vpxor %xmm0, %xmm10, %xmm0
+ vpblendw $12, %xmm4, %xmm5, %xmm10
+ vpblendw $192, %xmm12, %xmm10, %xmm10
+ vpunpckldq %xmm2, %xmm4, %xmm12
+ vpshufd $135, %xmm10, %xmm10
+ vpaddd %xmm9, %xmm10, %xmm9
+ vpaddd %xmm0, %xmm9, %xmm9
+ vpxor %xmm9, %xmm1, %xmm1
+ vpshufb %xmm6, %xmm1, %xmm1
+ vpaddd %xmm1, %xmm11, %xmm13
+ vpxor %xmm13, %xmm0, %xmm0
+ vpshufd $147, %xmm1, %xmm1
+ vpshufd $78, %xmm13, %xmm13
+ vpslld $25, %xmm0, %xmm10
+ vpsrld $7, %xmm0, %xmm0
+ vpxor %xmm10, %xmm0, %xmm0
+ vpblendw $15, %xmm3, %xmm4, %xmm10
+ vpblendw $192, %xmm5, %xmm10, %xmm10
+ vpshufd $57, %xmm0, %xmm0
+ vpshufd $198, %xmm10, %xmm10
+ vpaddd %xmm9, %xmm10, %xmm10
+ vpaddd %xmm0, %xmm10, %xmm10
+ vpxor %xmm10, %xmm1, %xmm1
+ vpshufb %xmm7, %xmm1, %xmm1
+ vpaddd %xmm1, %xmm13, %xmm13
+ vpxor %xmm13, %xmm0, %xmm9
+ vpslld $20, %xmm9, %xmm0
+ vpsrld $12, %xmm9, %xmm9
+ vpxor %xmm0, %xmm9, %xmm0
+ vpunpckhdq %xmm2, %xmm3, %xmm9
+ vpunpcklqdq %xmm12, %xmm9, %xmm15
+ vpunpcklqdq %xmm12, %xmm8, %xmm12
+ vpblendw $15, %xmm5, %xmm8, %xmm8
+ vpaddd %xmm15, %xmm10, %xmm15
+ vpaddd %xmm0, %xmm15, %xmm15
+ vpxor %xmm15, %xmm1, %xmm1
+ vpshufd $141, %xmm8, %xmm8
+ vpshufb %xmm6, %xmm1, %xmm1
+ vpaddd %xmm1, %xmm13, %xmm13
+ vpxor %xmm13, %xmm0, %xmm0
+ vpshufd $57, %xmm1, %xmm1
+ vpshufd $78, %xmm13, %xmm13
+ vpslld $25, %xmm0, %xmm10
+ vpsrld $7, %xmm0, %xmm0
+ vpxor %xmm10, %xmm0, %xmm0
+ vpunpcklqdq %xmm2, %xmm3, %xmm10
+ vpshufd $147, %xmm0, %xmm0
+ vpblendw $51, %xmm14, %xmm10, %xmm14
+ vpshufd $135, %xmm14, %xmm14
+ vpaddd %xmm15, %xmm14, %xmm14
+ vpaddd %xmm0, %xmm14, %xmm14
+ vpxor %xmm14, %xmm1, %xmm1
+ vpunpcklqdq %xmm3, %xmm4, %xmm15
+ vpshufb %xmm7, %xmm1, %xmm1
+ vpaddd %xmm1, %xmm13, %xmm13
+ vpxor %xmm13, %xmm0, %xmm0
+ vpslld $20, %xmm0, %xmm11
+ vpsrld $12, %xmm0, %xmm0
+ vpxor %xmm11, %xmm0, %xmm0
+ vpunpckhqdq %xmm5, %xmm3, %xmm11
+ vpblendw $51, %xmm15, %xmm11, %xmm11
+ vpunpckhqdq %xmm3, %xmm5, %xmm15
+ vpaddd %xmm11, %xmm14, %xmm11
+ vpaddd %xmm0, %xmm11, %xmm11
+ vpxor %xmm11, %xmm1, %xmm1
+ vpshufb %xmm6, %xmm1, %xmm1
+ vpaddd %xmm1, %xmm13, %xmm13
+ vpxor %xmm13, %xmm0, %xmm0
+ vpshufd $147, %xmm1, %xmm1
+ vpshufd $78, %xmm13, %xmm13
+ vpslld $25, %xmm0, %xmm14
+ vpsrld $7, %xmm0, %xmm0
+ vpxor %xmm14, %xmm0, %xmm14
+ vpunpckhqdq %xmm4, %xmm2, %xmm0
+ vpshufd $57, %xmm14, %xmm14
+ vpblendw $51, %xmm15, %xmm0, %xmm15
+ vpaddd %xmm15, %xmm11, %xmm15
+ vpaddd %xmm14, %xmm15, %xmm15
+ vpxor %xmm15, %xmm1, %xmm1
+ vpshufb %xmm7, %xmm1, %xmm1
+ vpaddd %xmm1, %xmm13, %xmm13
+ vpxor %xmm13, %xmm14, %xmm14
+ vpslld $20, %xmm14, %xmm11
+ vpsrld $12, %xmm14, %xmm14
+ vpxor %xmm11, %xmm14, %xmm14
+ vpblendw $3, %xmm2, %xmm4, %xmm11
+ vpslldq $8, %xmm11, %xmm0
+ vpblendw $15, %xmm5, %xmm0, %xmm0
+ vpshufd $99, %xmm0, %xmm0
+ vpaddd %xmm15, %xmm0, %xmm15
+ vpaddd %xmm14, %xmm15, %xmm15
+ vpxor %xmm15, %xmm1, %xmm0
+ vpaddd %xmm12, %xmm15, %xmm15
+ vpshufb %xmm6, %xmm0, %xmm0
+ vpaddd %xmm0, %xmm13, %xmm13
+ vpxor %xmm13, %xmm14, %xmm14
+ vpshufd $57, %xmm0, %xmm0
+ vpshufd $78, %xmm13, %xmm13
+ vpslld $25, %xmm14, %xmm1
+ vpsrld $7, %xmm14, %xmm14
+ vpxor %xmm1, %xmm14, %xmm14
+ vpblendw $3, %xmm5, %xmm4, %xmm1
+ vpshufd $147, %xmm14, %xmm14
+ vpaddd %xmm14, %xmm15, %xmm15
+ vpxor %xmm15, %xmm0, %xmm0
+ vpshufb %xmm7, %xmm0, %xmm0
+ vpaddd %xmm0, %xmm13, %xmm13
+ vpxor %xmm13, %xmm14, %xmm14
+ vpslld $20, %xmm14, %xmm12
+ vpsrld $12, %xmm14, %xmm14
+ vpxor %xmm12, %xmm14, %xmm14
+ vpsrldq $4, %xmm2, %xmm12
+ vpblendw $60, %xmm12, %xmm1, %xmm1
+ vpaddd %xmm1, %xmm15, %xmm15
+ vpaddd %xmm14, %xmm15, %xmm15
+ vpxor %xmm15, %xmm0, %xmm0
+ vpblendw $12, %xmm4, %xmm3, %xmm1
+ vpshufb %xmm6, %xmm0, %xmm0
+ vpaddd %xmm0, %xmm13, %xmm13
+ vpxor %xmm13, %xmm14, %xmm14
+ vpshufd $147, %xmm0, %xmm0
+ vpshufd $78, %xmm13, %xmm13
+ vpslld $25, %xmm14, %xmm12
+ vpsrld $7, %xmm14, %xmm14
+ vpxor %xmm12, %xmm14, %xmm14
+ vpsrldq $4, %xmm5, %xmm12
+ vpblendw $48, %xmm12, %xmm1, %xmm1
+ vpshufd $33, %xmm5, %xmm12
+ vpshufd $57, %xmm14, %xmm14
+ vpshufd $108, %xmm1, %xmm1
+ vpblendw $51, %xmm12, %xmm10, %xmm12
+ vpaddd %xmm15, %xmm1, %xmm15
+ vpaddd %xmm14, %xmm15, %xmm15
+ vpxor %xmm15, %xmm0, %xmm0
+ vpaddd %xmm12, %xmm15, %xmm15
+ vpshufb %xmm7, %xmm0, %xmm0
+ vpaddd %xmm0, %xmm13, %xmm1
+ vpxor %xmm1, %xmm14, %xmm14
+ vpslld $20, %xmm14, %xmm13
+ vpsrld $12, %xmm14, %xmm14
+ vpxor %xmm13, %xmm14, %xmm14
+ vpslldq $12, %xmm3, %xmm13
+ vpaddd %xmm14, %xmm15, %xmm15
+ vpxor %xmm15, %xmm0, %xmm0
+ vpshufb %xmm6, %xmm0, %xmm0
+ vpaddd %xmm0, %xmm1, %xmm1
+ vpxor %xmm1, %xmm14, %xmm14
+ vpshufd $57, %xmm0, %xmm0
+ vpshufd $78, %xmm1, %xmm1
+ vpslld $25, %xmm14, %xmm12
+ vpsrld $7, %xmm14, %xmm14
+ vpxor %xmm12, %xmm14, %xmm14
+ vpblendw $51, %xmm5, %xmm4, %xmm12
+ vpshufd $147, %xmm14, %xmm14
+ vpblendw $192, %xmm13, %xmm12, %xmm12
+ vpaddd %xmm12, %xmm15, %xmm15
+ vpaddd %xmm14, %xmm15, %xmm15
+ vpxor %xmm15, %xmm0, %xmm0
+ vpsrldq $4, %xmm3, %xmm12
+ vpshufb %xmm7, %xmm0, %xmm0
+ vpaddd %xmm0, %xmm1, %xmm1
+ vpxor %xmm1, %xmm14, %xmm14
+ vpslld $20, %xmm14, %xmm13
+ vpsrld $12, %xmm14, %xmm14
+ vpxor %xmm13, %xmm14, %xmm14
+ vpblendw $48, %xmm2, %xmm5, %xmm13
+ vpblendw $3, %xmm12, %xmm13, %xmm13
+ vpshufd $156, %xmm13, %xmm13
+ vpaddd %xmm15, %xmm13, %xmm15
+ vpaddd %xmm14, %xmm15, %xmm15
+ vpxor %xmm15, %xmm0, %xmm0
+ vpshufb %xmm6, %xmm0, %xmm0
+ vpaddd %xmm0, %xmm1, %xmm1
+ vpxor %xmm1, %xmm14, %xmm14
+ vpshufd $147, %xmm0, %xmm0
+ vpshufd $78, %xmm1, %xmm1
+ vpslld $25, %xmm14, %xmm13
+ vpsrld $7, %xmm14, %xmm14
+ vpxor %xmm13, %xmm14, %xmm14
+ vpunpcklqdq %xmm2, %xmm4, %xmm13
+ vpshufd $57, %xmm14, %xmm14
+ vpblendw $12, %xmm12, %xmm13, %xmm12
+ vpshufd $180, %xmm12, %xmm12
+ vpaddd %xmm15, %xmm12, %xmm15
+ vpaddd %xmm14, %xmm15, %xmm15
+ vpxor %xmm15, %xmm0, %xmm0
+ vpshufb %xmm7, %xmm0, %xmm0
+ vpaddd %xmm0, %xmm1, %xmm1
+ vpxor %xmm1, %xmm14, %xmm14
+ vpslld $20, %xmm14, %xmm12
+ vpsrld $12, %xmm14, %xmm14
+ vpxor %xmm12, %xmm14, %xmm14
+ vpunpckhqdq %xmm9, %xmm4, %xmm12
+ vpshufd $198, %xmm12, %xmm12
+ vpaddd %xmm15, %xmm12, %xmm15
+ vpaddd %xmm14, %xmm15, %xmm15
+ vpxor %xmm15, %xmm0, %xmm0
+ vpaddd %xmm15, %xmm8, %xmm15
+ vpshufb %xmm6, %xmm0, %xmm0
+ vpaddd %xmm0, %xmm1, %xmm1
+ vpxor %xmm1, %xmm14, %xmm14
+ vpshufd $57, %xmm0, %xmm0
+ vpshufd $78, %xmm1, %xmm1
+ vpslld $25, %xmm14, %xmm12
+ vpsrld $7, %xmm14, %xmm14
+ vpxor %xmm12, %xmm14, %xmm14
+ vpsrldq $4, %xmm4, %xmm12
+ vpshufd $147, %xmm14, %xmm14
+ vpaddd %xmm14, %xmm15, %xmm15
+ vpxor %xmm15, %xmm0, %xmm0
+ vpshufb %xmm7, %xmm0, %xmm0
+ vpaddd %xmm0, %xmm1, %xmm1
+ vpxor %xmm1, %xmm14, %xmm14
+ vpslld $20, %xmm14, %xmm8
+ vpsrld $12, %xmm14, %xmm14
+ vpxor %xmm14, %xmm8, %xmm14
+ vpblendw $48, %xmm5, %xmm2, %xmm8
+ vpblendw $3, %xmm12, %xmm8, %xmm8
+ vpunpckhqdq %xmm5, %xmm4, %xmm12
+ vpshufd $75, %xmm8, %xmm8
+ vpblendw $60, %xmm10, %xmm12, %xmm10
+ vpaddd %xmm15, %xmm8, %xmm15
+ vpaddd %xmm14, %xmm15, %xmm15
+ vpxor %xmm0, %xmm15, %xmm0
+ vpshufd $45, %xmm10, %xmm10
+ vpshufb %xmm6, %xmm0, %xmm0
+ vpaddd %xmm15, %xmm10, %xmm15
+ vpaddd %xmm0, %xmm1, %xmm1
+ vpxor %xmm1, %xmm14, %xmm14
+ vpshufd $147, %xmm0, %xmm0
+ vpshufd $78, %xmm1, %xmm1
+ vpslld $25, %xmm14, %xmm8
+ vpsrld $7, %xmm14, %xmm14
+ vpxor %xmm14, %xmm8, %xmm8
+ vpshufd $57, %xmm8, %xmm8
+ vpaddd %xmm8, %xmm15, %xmm15
+ vpxor %xmm0, %xmm15, %xmm0
+ vpshufb %xmm7, %xmm0, %xmm0
+ vpaddd %xmm0, %xmm1, %xmm1
+ vpxor %xmm8, %xmm1, %xmm8
+ vpslld $20, %xmm8, %xmm10
+ vpsrld $12, %xmm8, %xmm8
+ vpxor %xmm8, %xmm10, %xmm10
+ vpunpckldq %xmm3, %xmm4, %xmm8
+ vpunpcklqdq %xmm9, %xmm8, %xmm9
+ vpaddd %xmm9, %xmm15, %xmm9
+ vpaddd %xmm10, %xmm9, %xmm9
+ vpxor %xmm0, %xmm9, %xmm8
+ vpshufb %xmm6, %xmm8, %xmm8
+ vpaddd %xmm8, %xmm1, %xmm1
+ vpxor %xmm1, %xmm10, %xmm10
+ vpshufd $57, %xmm8, %xmm8
+ vpshufd $78, %xmm1, %xmm1
+ vpslld $25, %xmm10, %xmm12
+ vpsrld $7, %xmm10, %xmm10
+ vpxor %xmm10, %xmm12, %xmm10
+ vpblendw $48, %xmm4, %xmm3, %xmm12
+ vpshufd $147, %xmm10, %xmm0
+ vpunpckhdq %xmm5, %xmm3, %xmm10
+ vpshufd $78, %xmm12, %xmm12
+ vpunpcklqdq %xmm4, %xmm10, %xmm10
+ vpblendw $192, %xmm2, %xmm10, %xmm10
+ vpshufhw $78, %xmm10, %xmm10
+ vpaddd %xmm10, %xmm9, %xmm10
+ vpaddd %xmm0, %xmm10, %xmm10
+ vpxor %xmm8, %xmm10, %xmm8
+ vpshufb %xmm7, %xmm8, %xmm8
+ vpaddd %xmm8, %xmm1, %xmm1
+ vpxor %xmm0, %xmm1, %xmm9
+ vpslld $20, %xmm9, %xmm0
+ vpsrld $12, %xmm9, %xmm9
+ vpxor %xmm9, %xmm0, %xmm0
+ vpunpckhdq %xmm5, %xmm4, %xmm9
+ vpblendw $240, %xmm9, %xmm2, %xmm13
+ vpshufd $39, %xmm13, %xmm13
+ vpaddd %xmm10, %xmm13, %xmm10
+ vpaddd %xmm0, %xmm10, %xmm10
+ vpxor %xmm8, %xmm10, %xmm8
+ vpblendw $12, %xmm4, %xmm2, %xmm13
+ vpshufb %xmm6, %xmm8, %xmm8
+ vpslldq $4, %xmm13, %xmm13
+ vpblendw $15, %xmm5, %xmm13, %xmm13
+ vpaddd %xmm8, %xmm1, %xmm1
+ vpxor %xmm1, %xmm0, %xmm0
+ vpaddd %xmm13, %xmm10, %xmm13
+ vpshufd $147, %xmm8, %xmm8
+ vpshufd $78, %xmm1, %xmm1
+ vpslld $25, %xmm0, %xmm14
+ vpsrld $7, %xmm0, %xmm0
+ vpxor %xmm0, %xmm14, %xmm14
+ vpshufd $57, %xmm14, %xmm14
+ vpaddd %xmm14, %xmm13, %xmm13
+ vpxor %xmm8, %xmm13, %xmm8
+ vpaddd %xmm13, %xmm12, %xmm12
+ vpshufb %xmm7, %xmm8, %xmm8
+ vpaddd %xmm8, %xmm1, %xmm1
+ vpxor %xmm14, %xmm1, %xmm14
+ vpslld $20, %xmm14, %xmm10
+ vpsrld $12, %xmm14, %xmm14
+ vpxor %xmm14, %xmm10, %xmm10
+ vpaddd %xmm10, %xmm12, %xmm12
+ vpxor %xmm8, %xmm12, %xmm8
+ vpshufb %xmm6, %xmm8, %xmm8
+ vpaddd %xmm8, %xmm1, %xmm1
+ vpxor %xmm1, %xmm10, %xmm0
+ vpshufd $57, %xmm8, %xmm8
+ vpshufd $78, %xmm1, %xmm1
+ vpslld $25, %xmm0, %xmm10
+ vpsrld $7, %xmm0, %xmm0
+ vpxor %xmm0, %xmm10, %xmm10
+ vpblendw $48, %xmm2, %xmm3, %xmm0
+ vpblendw $15, %xmm11, %xmm0, %xmm0
+ vpshufd $147, %xmm10, %xmm10
+ vpshufd $114, %xmm0, %xmm0
+ vpaddd %xmm12, %xmm0, %xmm0
+ vpaddd %xmm10, %xmm0, %xmm0
+ vpxor %xmm8, %xmm0, %xmm8
+ vpshufb %xmm7, %xmm8, %xmm8
+ vpaddd %xmm8, %xmm1, %xmm1
+ vpxor %xmm10, %xmm1, %xmm10
+ vpslld $20, %xmm10, %xmm11
+ vpsrld $12, %xmm10, %xmm10
+ vpxor %xmm10, %xmm11, %xmm10
+ vpslldq $4, %xmm4, %xmm11
+ vpblendw $192, %xmm11, %xmm3, %xmm3
+ vpunpckldq %xmm5, %xmm4, %xmm4
+ vpshufd $99, %xmm3, %xmm3
+ vpaddd %xmm0, %xmm3, %xmm3
+ vpaddd %xmm10, %xmm3, %xmm3
+ vpxor %xmm8, %xmm3, %xmm11
+ vpunpckldq %xmm5, %xmm2, %xmm0
+ vpblendw $192, %xmm2, %xmm5, %xmm2
+ vpshufb %xmm6, %xmm11, %xmm11
+ vpunpckhqdq %xmm0, %xmm9, %xmm0
+ vpblendw $15, %xmm4, %xmm2, %xmm4
+ vpaddd %xmm11, %xmm1, %xmm1
+ vpxor %xmm1, %xmm10, %xmm10
+ vpshufd $147, %xmm11, %xmm11
+ vpshufd $201, %xmm0, %xmm0
+ vpslld $25, %xmm10, %xmm8
+ vpsrld $7, %xmm10, %xmm10
+ vpxor %xmm10, %xmm8, %xmm10
+ vpshufd $78, %xmm1, %xmm1
+ vpaddd %xmm3, %xmm0, %xmm0
+ vpshufd $27, %xmm4, %xmm4
+ vpshufd $57, %xmm10, %xmm10
+ vpaddd %xmm10, %xmm0, %xmm0
+ vpxor %xmm11, %xmm0, %xmm11
+ vpaddd %xmm0, %xmm4, %xmm0
+ vpshufb %xmm7, %xmm11, %xmm7
+ vpaddd %xmm7, %xmm1, %xmm1
+ vpxor %xmm10, %xmm1, %xmm10
+ vpslld $20, %xmm10, %xmm8
+ vpsrld $12, %xmm10, %xmm10
+ vpxor %xmm10, %xmm8, %xmm8
+ vpaddd %xmm8, %xmm0, %xmm0
+ vpxor %xmm7, %xmm0, %xmm7
+ vpshufb %xmm6, %xmm7, %xmm6
+ vpaddd %xmm6, %xmm1, %xmm1
+ vpxor %xmm1, %xmm8, %xmm8
+ vpshufd $78, %xmm1, %xmm1
+ vpshufd $57, %xmm6, %xmm6
+ vpslld $25, %xmm8, %xmm2
+ vpsrld $7, %xmm8, %xmm8
+ vpxor %xmm8, %xmm2, %xmm8
+ vpxor (%rdi), %xmm1, %xmm1
+ vpshufd $147, %xmm8, %xmm8
+ vpxor %xmm0, %xmm1, %xmm0
+ vmovups %xmm0, (%rdi)
+ vpxor 16(%rdi), %xmm8, %xmm0
+ vpxor %xmm6, %xmm0, %xmm6
+ vmovups %xmm6, 16(%rdi)
+ addq $64, %rsi
+ decq %rdx
+ jnz .Lbeginofloop
+.Lendofloop:
+ ret
+ENDPROC(blake2s_compress_avx)
+#endif /* CONFIG_AS_AVX */
+
+#ifdef CONFIG_AS_AVX512
+ENTRY(blake2s_compress_avx512)
+ vmovdqu (%rdi),%xmm0
+ vmovdqu 0x10(%rdi),%xmm1
+ vmovdqu 0x20(%rdi),%xmm4
+ vmovq %rcx,%xmm5
+ vmovdqa IV(%rip),%xmm14
+ vmovdqa IV+16(%rip),%xmm15
+ jmp .Lblake2s_compress_avx512_mainloop
+.align 32
+.Lblake2s_compress_avx512_mainloop:
+ vmovdqa %xmm0,%xmm10
+ vmovdqa %xmm1,%xmm11
+ vpaddq %xmm5,%xmm4,%xmm4
+ vmovdqa %xmm14,%xmm2
+ vpxor %xmm15,%xmm4,%xmm3
+ vmovdqu (%rsi),%ymm6
+ vmovdqu 0x20(%rsi),%ymm7
+ addq $0x40,%rsi
+ leaq SIGMA(%rip),%rax
+ movb $0xa,%cl
+.Lblake2s_compress_avx512_roundloop:
+ addq $0x40,%rax
+ vmovdqa -0x40(%rax),%ymm8
+ vmovdqa -0x20(%rax),%ymm9
+ vpermi2d %ymm7,%ymm6,%ymm8
+ vpermi2d %ymm7,%ymm6,%ymm9
+ vmovdqa %ymm8,%ymm6
+ vmovdqa %ymm9,%ymm7
+ vpaddd %xmm8,%xmm0,%xmm0
+ vpaddd %xmm1,%xmm0,%xmm0
+ vpxor %xmm0,%xmm3,%xmm3
+ vprord $0x10,%xmm3,%xmm3
+ vpaddd %xmm3,%xmm2,%xmm2
+ vpxor %xmm2,%xmm1,%xmm1
+ vprord $0xc,%xmm1,%xmm1
+ vextracti128 $0x1,%ymm8,%xmm8
+ vpaddd %xmm8,%xmm0,%xmm0
+ vpaddd %xmm1,%xmm0,%xmm0
+ vpxor %xmm0,%xmm3,%xmm3
+ vprord $0x8,%xmm3,%xmm3
+ vpaddd %xmm3,%xmm2,%xmm2
+ vpxor %xmm2,%xmm1,%xmm1
+ vprord $0x7,%xmm1,%xmm1
+ vpshufd $0x39,%xmm1,%xmm1
+ vpshufd $0x4e,%xmm2,%xmm2
+ vpshufd $0x93,%xmm3,%xmm3
+ vpaddd %xmm9,%xmm0,%xmm0
+ vpaddd %xmm1,%xmm0,%xmm0
+ vpxor %xmm0,%xmm3,%xmm3
+ vprord $0x10,%xmm3,%xmm3
+ vpaddd %xmm3,%xmm2,%xmm2
+ vpxor %xmm2,%xmm1,%xmm1
+ vprord $0xc,%xmm1,%xmm1
+ vextracti128 $0x1,%ymm9,%xmm9
+ vpaddd %xmm9,%xmm0,%xmm0
+ vpaddd %xmm1,%xmm0,%xmm0
+ vpxor %xmm0,%xmm3,%xmm3
+ vprord $0x8,%xmm3,%xmm3
+ vpaddd %xmm3,%xmm2,%xmm2
+ vpxor %xmm2,%xmm1,%xmm1
+ vprord $0x7,%xmm1,%xmm1
+ vpshufd $0x93,%xmm1,%xmm1
+ vpshufd $0x4e,%xmm2,%xmm2
+ vpshufd $0x39,%xmm3,%xmm3
+ decb %cl
+ jne .Lblake2s_compress_avx512_roundloop
+ vpxor %xmm10,%xmm0,%xmm0
+ vpxor %xmm11,%xmm1,%xmm1
+ vpxor %xmm2,%xmm0,%xmm0
+ vpxor %xmm3,%xmm1,%xmm1
+ decq %rdx
+ jne .Lblake2s_compress_avx512_mainloop
+ vmovdqu %xmm0,(%rdi)
+ vmovdqu %xmm1,0x10(%rdi)
+ vmovdqu %xmm4,0x20(%rdi)
+ vzeroupper
+ retq
+ENDPROC(blake2s_compress_avx512)
+#endif /* CONFIG_AS_AVX512 */
diff --git a/src/crypto/zinc/blake2s/blake2s.c b/src/crypto/zinc/blake2s/blake2s.c
new file mode 100644
index 0000000..76232a3
--- /dev/null
+++ b/src/crypto/zinc/blake2s/blake2s.c
@@ -0,0 +1,274 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright (C) 2012 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ *
+ * This is an implementation of the BLAKE2s hash and PRF functions.
+ *
+ * Information: https://blake2.net/
+ *
+ */
+
+#include <zinc/blake2s.h>
+
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/bug.h>
+#include <asm/unaligned.h>
+
+typedef union {
+ struct {
+ u8 digest_length;
+ u8 key_length;
+ u8 fanout;
+ u8 depth;
+ u32 leaf_length;
+ u32 node_offset;
+ u16 xof_length;
+ u8 node_depth;
+ u8 inner_length;
+ u8 salt[8];
+ u8 personal[8];
+ };
+ __le32 words[8];
+} __packed blake2s_param;
+
+static const u32 blake2s_iv[8] = {
+ 0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL, 0xA54FF53AUL,
+ 0x510E527FUL, 0x9B05688CUL, 0x1F83D9ABUL, 0x5BE0CD19UL
+};
+
+static const u8 blake2s_sigma[10][16] = {
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+ { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
+ { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
+ { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
+ { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
+ { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
+ { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
+ { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
+ { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
+ { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 },
+};
+
+static inline void blake2s_set_lastblock(struct blake2s_state *state)
+{
+ if (state->last_node)
+ state->f[1] = -1;
+ state->f[0] = -1;
+}
+
+static inline void blake2s_increment_counter(struct blake2s_state *state,
+ const u32 inc)
+{
+ state->t[0] += inc;
+ state->t[1] += (state->t[0] < inc);
+}
+
+static inline void blake2s_init_param(struct blake2s_state *state,
+ const blake2s_param *param)
+{
+ int i;
+
+ memset(state, 0, sizeof(struct blake2s_state));
+ for (i = 0; i < 8; ++i)
+ state->h[i] = blake2s_iv[i] ^ le32_to_cpu(param->words[i]);
+}
+
+void blake2s_init(struct blake2s_state *state, const size_t outlen)
+{
+ blake2s_param param __aligned(__alignof__(u32)) = {
+ .digest_length = outlen,
+ .fanout = 1,
+ .depth = 1
+ };
+
+#ifdef DEBUG
+ BUG_ON(!outlen || outlen > BLAKE2S_OUTBYTES);
+#endif
+ blake2s_init_param(state, &param);
+}
+EXPORT_SYMBOL(blake2s_init);
+
+void blake2s_init_key(struct blake2s_state *state, const size_t outlen,
+ const void *key, const size_t keylen)
+{
+ blake2s_param param = { .digest_length = outlen,
+ .key_length = keylen,
+ .fanout = 1,
+ .depth = 1 };
+ u8 block[BLAKE2S_BLOCKBYTES] = { 0 };
+
+#ifdef DEBUG
+ BUG_ON(!outlen || outlen > BLAKE2S_OUTBYTES || !key || !keylen ||
+ keylen > BLAKE2S_KEYBYTES);
+#endif
+ blake2s_init_param(state, &param);
+ memcpy(block, key, keylen);
+ blake2s_update(state, block, BLAKE2S_BLOCKBYTES);
+ memzero_explicit(block, BLAKE2S_BLOCKBYTES);
+}
+EXPORT_SYMBOL(blake2s_init_key);
+
+#ifndef HAVE_BLAKE2S_ARCH_IMPLEMENTATION
+void __init blake2s_fpu_init(void)
+{
+}
+static inline bool blake2s_arch(struct blake2s_state *state, const u8 *block,
+ const size_t nblocks, const u32 inc)
+{
+ return false;
+}
+#endif
+
+static inline void blake2s_compress(struct blake2s_state *state,
+ const u8 *block, size_t nblocks,
+ const u32 inc)
+{
+ u32 m[16];
+ u32 v[16];
+ int i;
+
+#ifdef DEBUG
+ BUG_ON(nblocks > 1 && inc != BLAKE2S_BLOCKBYTES);
+#endif
+
+ if (blake2s_arch(state, block, nblocks, inc))
+ return;
+
+ while (nblocks > 0) {
+ blake2s_increment_counter(state, inc);
+
+#ifdef __LITTLE_ENDIAN
+ memcpy(m, block, BLAKE2S_BLOCKBYTES);
+#else
+ for (i = 0; i < 16; ++i)
+ m[i] = get_unaligned_le32(block + i * sizeof(m[i]));
+#endif
+ memcpy(v, state->h, 32);
+ v[ 8] = blake2s_iv[0];
+ v[ 9] = blake2s_iv[1];
+ v[10] = blake2s_iv[2];
+ v[11] = blake2s_iv[3];
+ v[12] = blake2s_iv[4] ^ state->t[0];
+ v[13] = blake2s_iv[5] ^ state->t[1];
+ v[14] = blake2s_iv[6] ^ state->f[0];
+ v[15] = blake2s_iv[7] ^ state->f[1];
+
+#define G(r, i, a, b, c, d) do { \
+ a += b + m[blake2s_sigma[r][2 * i + 0]]; \
+ d = ror32(d ^ a, 16); \
+ c += d; \
+ b = ror32(b ^ c, 12); \
+ a += b + m[blake2s_sigma[r][2 * i + 1]]; \
+ d = ror32(d ^ a, 8); \
+ c += d; \
+ b = ror32(b ^ c, 7); \
+} while (0)
+
+#define ROUND(r) do { \
+ G(r, 0, v[0], v[ 4], v[ 8], v[12]); \
+ G(r, 1, v[1], v[ 5], v[ 9], v[13]); \
+ G(r, 2, v[2], v[ 6], v[10], v[14]); \
+ G(r, 3, v[3], v[ 7], v[11], v[15]); \
+ G(r, 4, v[0], v[ 5], v[10], v[15]); \
+ G(r, 5, v[1], v[ 6], v[11], v[12]); \
+ G(r, 6, v[2], v[ 7], v[ 8], v[13]); \
+ G(r, 7, v[3], v[ 4], v[ 9], v[14]); \
+} while (0)
+ ROUND(0);
+ ROUND(1);
+ ROUND(2);
+ ROUND(3);
+ ROUND(4);
+ ROUND(5);
+ ROUND(6);
+ ROUND(7);
+ ROUND(8);
+ ROUND(9);
+
+#undef G
+#undef ROUND
+
+ for (i = 0; i < 8; ++i)
+ state->h[i] ^= v[i] ^ v[i + 8];
+
+ block += BLAKE2S_BLOCKBYTES;
+ --nblocks;
+ }
+}
+
+void blake2s_update(struct blake2s_state *state, const u8 *in, size_t inlen)
+{
+ const size_t fill = BLAKE2S_BLOCKBYTES - state->buflen;
+
+ if (unlikely(!inlen))
+ return;
+ if (inlen > fill) {
+ memcpy(state->buf + state->buflen, in, fill);
+ blake2s_compress(state, state->buf, 1, BLAKE2S_BLOCKBYTES);
+ state->buflen = 0;
+ in += fill;
+ inlen -= fill;
+ }
+ if (inlen > BLAKE2S_BLOCKBYTES) {
+ const size_t nblocks =
+ (inlen + BLAKE2S_BLOCKBYTES - 1) / BLAKE2S_BLOCKBYTES;
+ /* Hash one less (full) block than strictly possible */
+ blake2s_compress(state, in, nblocks - 1, BLAKE2S_BLOCKBYTES);
+ in += BLAKE2S_BLOCKBYTES * (nblocks - 1);
+ inlen -= BLAKE2S_BLOCKBYTES * (nblocks - 1);
+ }
+ memcpy(state->buf + state->buflen, in, inlen);
+ state->buflen += inlen;
+}
+EXPORT_SYMBOL(blake2s_update);
+
+void __blake2s_final(struct blake2s_state *state)
+{
+ blake2s_set_lastblock(state);
+ memset(state->buf + state->buflen, 0,
+ BLAKE2S_BLOCKBYTES - state->buflen); /* Padding */
+ blake2s_compress(state, state->buf, 1, state->buflen);
+}
+EXPORT_SYMBOL(__blake2s_final);
+
+void blake2s_hmac(u8 *out, const u8 *in, const u8 *key, const size_t outlen,
+ const size_t inlen, const size_t keylen)
+{
+ struct blake2s_state state;
+ u8 x_key[BLAKE2S_BLOCKBYTES] __aligned(__alignof__(u32)) = { 0 };
+ u8 i_hash[BLAKE2S_OUTBYTES] __aligned(__alignof__(u32));
+ int i;
+
+ if (keylen > BLAKE2S_BLOCKBYTES) {
+ blake2s_init(&state, BLAKE2S_OUTBYTES);
+ blake2s_update(&state, key, keylen);
+ blake2s_final(&state, x_key, BLAKE2S_OUTBYTES);
+ } else
+ memcpy(x_key, key, keylen);
+
+ for (i = 0; i < BLAKE2S_BLOCKBYTES; ++i)
+ x_key[i] ^= 0x36;
+
+ blake2s_init(&state, BLAKE2S_OUTBYTES);
+ blake2s_update(&state, x_key, BLAKE2S_BLOCKBYTES);
+ blake2s_update(&state, in, inlen);
+ blake2s_final(&state, i_hash, BLAKE2S_OUTBYTES);
+
+ for (i = 0; i < BLAKE2S_BLOCKBYTES; ++i)
+ x_key[i] ^= 0x5c ^ 0x36;
+
+ blake2s_init(&state, BLAKE2S_OUTBYTES);
+ blake2s_update(&state, x_key, BLAKE2S_BLOCKBYTES);
+ blake2s_update(&state, i_hash, BLAKE2S_OUTBYTES);
+ blake2s_final(&state, i_hash, BLAKE2S_OUTBYTES);
+
+ memcpy(out, i_hash, outlen);
+ memzero_explicit(x_key, BLAKE2S_BLOCKBYTES);
+ memzero_explicit(i_hash, BLAKE2S_OUTBYTES);
+}
+EXPORT_SYMBOL(blake2s_hmac);
+
+#include "../selftest/blake2s.h"