summaryrefslogtreecommitdiffhomepage
path: root/src
diff options
context:
space:
mode:
authorJason A. Donenfeld <Jason@zx2c4.com>2018-11-08 17:08:22 +0100
committerJason A. Donenfeld <Jason@zx2c4.com>2018-11-14 23:59:05 -0800
commitcc36bde00d67f15d8657c2fa6f450dccf4fb76b7 (patch)
treefeca77f876b7552bb4a83883c4e193e7f35020cc /src
parent8813c7d8d2608aca8d451b3cb4d7f3285c043691 (diff)
chacha20,poly1305: switch to perlasm originals on mips and arm
We also separate out Eric Biggers' Cortex A7 implementation into its own file. Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Diffstat (limited to 'src')
-rw-r--r--src/Makefile2
-rw-r--r--src/crypto/Kbuild.include8
-rw-r--r--src/crypto/zinc/chacha20/chacha20-arm.S1860
-rw-r--r--src/crypto/zinc/chacha20/chacha20-arm.pl1227
-rw-r--r--src/crypto/zinc/chacha20/chacha20-arm64.S1942
-rw-r--r--src/crypto/zinc/chacha20/chacha20-arm64.pl1164
-rw-r--r--src/crypto/zinc/chacha20/chacha20-unrolled-arm.S461
-rw-r--r--src/crypto/zinc/poly1305/poly1305-arm.S1117
-rw-r--r--src/crypto/zinc/poly1305/poly1305-arm.pl1272
-rw-r--r--src/crypto/zinc/poly1305/poly1305-arm64.S824
-rw-r--r--src/crypto/zinc/poly1305/poly1305-arm64.pl972
-rw-r--r--src/crypto/zinc/poly1305/poly1305-mips64.S360
-rw-r--r--src/crypto/zinc/poly1305/poly1305-mips64.pl467
-rw-r--r--src/tests/qemu/Makefile2
14 files changed, 5572 insertions, 6106 deletions
diff --git a/src/Makefile b/src/Makefile
index fcfa069..d26c6d1 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -52,7 +52,7 @@ install:
@$(MAKE) -C tools install
rwildcard=$(foreach d,$(wildcard $1*),$(call rwildcard,$d/,$2) $(filter $(subst *,%,$2),$d))
-DKMS_SOURCES := version.h Makefile Kbuild Kconfig dkms.conf $(filter-out version.h wireguard.mod.c tools/% tests/%,$(call rwildcard,,*.c *.h *.S *.include))
+DKMS_SOURCES := version.h Makefile Kbuild Kconfig dkms.conf $(filter-out version.h wireguard.mod.c tools/% tests/%,$(call rwildcard,,*.c *.h *.S *.pl *.include))
dkms-install: $(DKMS_SOURCES)
@$(foreach f,$(DKMS_SOURCES),install -v -m0644 -D $(f) $(DESTDIR)$(DKMSDIR)/$(f);)
diff --git a/src/crypto/Kbuild.include b/src/crypto/Kbuild.include
index ab5ecea..5fb9445 100644
--- a/src/crypto/Kbuild.include
+++ b/src/crypto/Kbuild.include
@@ -16,7 +16,7 @@ endif
zinc-y += chacha20/chacha20.o
zinc-$(CONFIG_ZINC_ARCH_X86_64) += chacha20/chacha20-x86_64.o
-zinc-$(CONFIG_ZINC_ARCH_ARM) += chacha20/chacha20-arm.o
+zinc-$(CONFIG_ZINC_ARCH_ARM) += chacha20/chacha20-arm.o chacha20/chacha20-unrolled-arm.o
zinc-$(CONFIG_ZINC_ARCH_ARM64) += chacha20/chacha20-arm64.o
zinc-$(CONFIG_ZINC_ARCH_MIPS) += chacha20/chacha20-mips.o
AFLAGS_chacha20-mips.o += -O2 # This is required to fill the branch delay slots
@@ -37,6 +37,12 @@ zinc-$(CONFIG_ZINC_ARCH_X86_64) += blake2s/blake2s-x86_64.o
zinc-y += curve25519/curve25519.o
zinc-$(CONFIG_ZINC_ARCH_ARM) += curve25519/curve25519-arm.o
+quiet_cmd_perlasm = PERLASM $@
+ cmd_perlasm = $(PERL) $< > $@
+%.S: %.pl
+ $(call cmd,perlasm)
+targets += $(patsubst %.o,crypto/zinc/%.S,$(zinc-y))
+
wireguard-y += $(addprefix crypto/zinc/,$(zinc-y))
ccflags-y += -I$(src)/crypto/include
ccflags-$(CONFIG_ZINC_ARCH_X86_64) += -DCONFIG_ZINC_ARCH_X86_64
diff --git a/src/crypto/zinc/chacha20/chacha20-arm.S b/src/crypto/zinc/chacha20/chacha20-arm.S
deleted file mode 100644
index 79ed18f..0000000
--- a/src/crypto/zinc/chacha20/chacha20-arm.S
+++ /dev/null
@@ -1,1860 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */
-/*
- * Copyright (C) 2018 Google, Inc.
- * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
- * Copyright (C) 2006-2017 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved.
- */
-
-#include <linux/linkage.h>
-#include <asm/assembler.h>
-
-/*
- * The following scalar routine was written by Eric Biggers.
- *
- * Design notes:
- *
- * 16 registers would be needed to hold the state matrix, but only 14 are
- * available because 'sp' and 'pc' cannot be used. So we spill the elements
- * (x8, x9) to the stack and swap them out with (x10, x11). This adds one
- * 'ldrd' and one 'strd' instruction per round.
- *
- * All rotates are performed using the implicit rotate operand accepted by the
- * 'add' and 'eor' instructions. This is faster than using explicit rotate
- * instructions. To make this work, we allow the values in the second and last
- * rows of the ChaCha state matrix (rows 'b' and 'd') to temporarily have the
- * wrong rotation amount. The rotation amount is then fixed up just in time
- * when the values are used. 'brot' is the number of bits the values in row 'b'
- * need to be rotated right to arrive at the correct values, and 'drot'
- * similarly for row 'd'. (brot, drot) start out as (0, 0) but we make it such
- * that they end up as (25, 24) after every round.
- */
-
- // ChaCha state registers
- X0 .req r0
- X1 .req r1
- X2 .req r2
- X3 .req r3
- X4 .req r4
- X5 .req r5
- X6 .req r6
- X7 .req r7
- X8_X10 .req r8 // shared by x8 and x10
- X9_X11 .req r9 // shared by x9 and x11
- X12 .req r10
- X13 .req r11
- X14 .req r12
- X15 .req r14
-
-.Lexpand_32byte_k:
- // "expand 32-byte k"
- .word 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574
-
-#ifdef __thumb2__
-# define adrl adr
-#endif
-
-.macro __rev out, in, t0, t1, t2
-.if __LINUX_ARM_ARCH__ >= 6
- rev \out, \in
-.else
- lsl \t0, \in, #24
- and \t1, \in, #0xff00
- and \t2, \in, #0xff0000
- orr \out, \t0, \in, lsr #24
- orr \out, \out, \t1, lsl #8
- orr \out, \out, \t2, lsr #8
-.endif
-.endm
-
-.macro _le32_bswap x, t0, t1, t2
-#ifdef __ARMEB__
- __rev \x, \x, \t0, \t1, \t2
-#endif
-.endm
-
-.macro _le32_bswap_4x a, b, c, d, t0, t1, t2
- _le32_bswap \a, \t0, \t1, \t2
- _le32_bswap \b, \t0, \t1, \t2
- _le32_bswap \c, \t0, \t1, \t2
- _le32_bswap \d, \t0, \t1, \t2
-.endm
-
-.macro __ldrd a, b, src, offset
-#if __LINUX_ARM_ARCH__ >= 6
- ldrd \a, \b, [\src, #\offset]
-#else
- ldr \a, [\src, #\offset]
- ldr \b, [\src, #\offset + 4]
-#endif
-.endm
-
-.macro __strd a, b, dst, offset
-#if __LINUX_ARM_ARCH__ >= 6
- strd \a, \b, [\dst, #\offset]
-#else
- str \a, [\dst, #\offset]
- str \b, [\dst, #\offset + 4]
-#endif
-.endm
-
-.macro _halfround a1, b1, c1, d1, a2, b2, c2, d2
-
- // a += b; d ^= a; d = rol(d, 16);
- add \a1, \a1, \b1, ror #brot
- add \a2, \a2, \b2, ror #brot
- eor \d1, \a1, \d1, ror #drot
- eor \d2, \a2, \d2, ror #drot
- // drot == 32 - 16 == 16
-
- // c += d; b ^= c; b = rol(b, 12);
- add \c1, \c1, \d1, ror #16
- add \c2, \c2, \d2, ror #16
- eor \b1, \c1, \b1, ror #brot
- eor \b2, \c2, \b2, ror #brot
- // brot == 32 - 12 == 20
-
- // a += b; d ^= a; d = rol(d, 8);
- add \a1, \a1, \b1, ror #20
- add \a2, \a2, \b2, ror #20
- eor \d1, \a1, \d1, ror #16
- eor \d2, \a2, \d2, ror #16
- // drot == 32 - 8 == 24
-
- // c += d; b ^= c; b = rol(b, 7);
- add \c1, \c1, \d1, ror #24
- add \c2, \c2, \d2, ror #24
- eor \b1, \c1, \b1, ror #20
- eor \b2, \c2, \b2, ror #20
- // brot == 32 - 7 == 25
-.endm
-
-.macro _doubleround
-
- // column round
-
- // quarterrounds: (x0, x4, x8, x12) and (x1, x5, x9, x13)
- _halfround X0, X4, X8_X10, X12, X1, X5, X9_X11, X13
-
- // save (x8, x9); restore (x10, x11)
- __strd X8_X10, X9_X11, sp, 0
- __ldrd X8_X10, X9_X11, sp, 8
-
- // quarterrounds: (x2, x6, x10, x14) and (x3, x7, x11, x15)
- _halfround X2, X6, X8_X10, X14, X3, X7, X9_X11, X15
-
- .set brot, 25
- .set drot, 24
-
- // diagonal round
-
- // quarterrounds: (x0, x5, x10, x15) and (x1, x6, x11, x12)
- _halfround X0, X5, X8_X10, X15, X1, X6, X9_X11, X12
-
- // save (x10, x11); restore (x8, x9)
- __strd X8_X10, X9_X11, sp, 8
- __ldrd X8_X10, X9_X11, sp, 0
-
- // quarterrounds: (x2, x7, x8, x13) and (x3, x4, x9, x14)
- _halfround X2, X7, X8_X10, X13, X3, X4, X9_X11, X14
-.endm
-
-.macro _chacha_permute nrounds
- .set brot, 0
- .set drot, 0
- .rept \nrounds / 2
- _doubleround
- .endr
-.endm
-
-.macro _chacha nrounds
-
-.Lnext_block\@:
- // Stack: unused0-unused1 x10-x11 x0-x15 OUT IN LEN
- // Registers contain x0-x9,x12-x15.
-
- // Do the core ChaCha permutation to update x0-x15.
- _chacha_permute \nrounds
-
- add sp, #8
- // Stack: x10-x11 orig_x0-orig_x15 OUT IN LEN
- // Registers contain x0-x9,x12-x15.
- // x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.
-
- // Free up some registers (r8-r12,r14) by pushing (x8-x9,x12-x15).
- push {X8_X10, X9_X11, X12, X13, X14, X15}
-
- // Load (OUT, IN, LEN).
- ldr r14, [sp, #96]
- ldr r12, [sp, #100]
- ldr r11, [sp, #104]
-
- orr r10, r14, r12
-
- // Use slow path if fewer than 64 bytes remain.
- cmp r11, #64
- blt .Lxor_slowpath\@
-
- // Use slow path if IN and/or OUT isn't 4-byte aligned. Needed even on
- // ARMv6+, since ldmia and stmia (used below) still require alignment.
- tst r10, #3
- bne .Lxor_slowpath\@
-
- // Fast path: XOR 64 bytes of aligned data.
-
- // Stack: x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN
- // Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is OUT.
- // x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.
-
- // x0-x3
- __ldrd r8, r9, sp, 32
- __ldrd r10, r11, sp, 40
- add X0, X0, r8
- add X1, X1, r9
- add X2, X2, r10
- add X3, X3, r11
- _le32_bswap_4x X0, X1, X2, X3, r8, r9, r10
- ldmia r12!, {r8-r11}
- eor X0, X0, r8
- eor X1, X1, r9
- eor X2, X2, r10
- eor X3, X3, r11
- stmia r14!, {X0-X3}
-
- // x4-x7
- __ldrd r8, r9, sp, 48
- __ldrd r10, r11, sp, 56
- add X4, r8, X4, ror #brot
- add X5, r9, X5, ror #brot
- ldmia r12!, {X0-X3}
- add X6, r10, X6, ror #brot
- add X7, r11, X7, ror #brot
- _le32_bswap_4x X4, X5, X6, X7, r8, r9, r10
- eor X4, X4, X0
- eor X5, X5, X1
- eor X6, X6, X2
- eor X7, X7, X3
- stmia r14!, {X4-X7}
-
- // x8-x15
- pop {r0-r7} // (x8-x9,x12-x15,x10-x11)
- __ldrd r8, r9, sp, 32
- __ldrd r10, r11, sp, 40
- add r0, r0, r8 // x8
- add r1, r1, r9 // x9
- add r6, r6, r10 // x10
- add r7, r7, r11 // x11
- _le32_bswap_4x r0, r1, r6, r7, r8, r9, r10
- ldmia r12!, {r8-r11}
- eor r0, r0, r8 // x8
- eor r1, r1, r9 // x9
- eor r6, r6, r10 // x10
- eor r7, r7, r11 // x11
- stmia r14!, {r0,r1,r6,r7}
- ldmia r12!, {r0,r1,r6,r7}
- __ldrd r8, r9, sp, 48
- __ldrd r10, r11, sp, 56
- add r2, r8, r2, ror #drot // x12
- add r3, r9, r3, ror #drot // x13
- add r4, r10, r4, ror #drot // x14
- add r5, r11, r5, ror #drot // x15
- _le32_bswap_4x r2, r3, r4, r5, r9, r10, r11
- ldr r9, [sp, #72] // load LEN
- eor r2, r2, r0 // x12
- eor r3, r3, r1 // x13
- eor r4, r4, r6 // x14
- eor r5, r5, r7 // x15
- subs r9, #64 // decrement and check LEN
- stmia r14!, {r2-r5}
-
- beq .Ldone\@
-
-.Lprepare_for_next_block\@:
-
- // Stack: x0-x15 OUT IN LEN
-
- // Increment block counter (x12)
- add r8, #1
-
- // Store updated (OUT, IN, LEN)
- str r14, [sp, #64]
- str r12, [sp, #68]
- str r9, [sp, #72]
-
- mov r14, sp
-
- // Store updated block counter (x12)
- str r8, [sp, #48]
-
- sub sp, #16
-
- // Reload state and do next block
- ldmia r14!, {r0-r11} // load x0-x11
- __strd r10, r11, sp, 8 // store x10-x11 before state
- ldmia r14, {r10-r12,r14} // load x12-x15
- b .Lnext_block\@
-
-.Lxor_slowpath\@:
- // Slow path: < 64 bytes remaining, or unaligned input or output buffer.
- // We handle it by storing the 64 bytes of keystream to the stack, then
- // XOR-ing the needed portion with the data.
-
- // Allocate keystream buffer
- sub sp, #64
- mov r14, sp
-
- // Stack: ks0-ks15 x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN
- // Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is &ks0.
- // x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.
-
- // Save keystream for x0-x3
- __ldrd r8, r9, sp, 96
- __ldrd r10, r11, sp, 104
- add X0, X0, r8
- add X1, X1, r9
- add X2, X2, r10
- add X3, X3, r11
- _le32_bswap_4x X0, X1, X2, X3, r8, r9, r10
- stmia r14!, {X0-X3}
-
- // Save keystream for x4-x7
- __ldrd r8, r9, sp, 112
- __ldrd r10, r11, sp, 120
- add X4, r8, X4, ror #brot
- add X5, r9, X5, ror #brot
- add X6, r10, X6, ror #brot
- add X7, r11, X7, ror #brot
- _le32_bswap_4x X4, X5, X6, X7, r8, r9, r10
- add r8, sp, #64
- stmia r14!, {X4-X7}
-
- // Save keystream for x8-x15
- ldm r8, {r0-r7} // (x8-x9,x12-x15,x10-x11)
- __ldrd r8, r9, sp, 128
- __ldrd r10, r11, sp, 136
- add r0, r0, r8 // x8
- add r1, r1, r9 // x9
- add r6, r6, r10 // x10
- add r7, r7, r11 // x11
- _le32_bswap_4x r0, r1, r6, r7, r8, r9, r10
- stmia r14!, {r0,r1,r6,r7}
- __ldrd r8, r9, sp, 144
- __ldrd r10, r11, sp, 152
- add r2, r8, r2, ror #drot // x12
- add r3, r9, r3, ror #drot // x13
- add r4, r10, r4, ror #drot // x14
- add r5, r11, r5, ror #drot // x15
- _le32_bswap_4x r2, r3, r4, r5, r9, r10, r11
- stmia r14, {r2-r5}
-
- // Stack: ks0-ks15 unused0-unused7 x0-x15 OUT IN LEN
- // Registers: r8 is block counter, r12 is IN.
-
- ldr r9, [sp, #168] // LEN
- ldr r14, [sp, #160] // OUT
- cmp r9, #64
- mov r0, sp
- movle r1, r9
- movgt r1, #64
- // r1 is number of bytes to XOR, in range [1, 64]
-
-.if __LINUX_ARM_ARCH__ < 6
- orr r2, r12, r14
- tst r2, #3 // IN or OUT misaligned?
- bne .Lxor_next_byte\@
-.endif
-
- // XOR a word at a time
-.rept 16
- subs r1, #4
- blt .Lxor_words_done\@
- ldr r2, [r12], #4
- ldr r3, [r0], #4
- eor r2, r2, r3
- str r2, [r14], #4
-.endr
- b .Lxor_slowpath_done\@
-.Lxor_words_done\@:
- ands r1, r1, #3
- beq .Lxor_slowpath_done\@
-
- // XOR a byte at a time
-.Lxor_next_byte\@:
- ldrb r2, [r12], #1
- ldrb r3, [r0], #1
- eor r2, r2, r3
- strb r2, [r14], #1
- subs r1, #1
- bne .Lxor_next_byte\@
-
-.Lxor_slowpath_done\@:
- subs r9, #64
- add sp, #96
- bgt .Lprepare_for_next_block\@
-
-.Ldone\@:
-.endm // _chacha
-
-/*
- * void chacha20_arm(u8 *out, const u8 *in, size_t len, const u32 key[8],
- * const u32 iv[4]);
- */
-ENTRY(chacha20_arm)
- cmp r2, #0 // len == 0?
- reteq lr
-
- push {r0-r2,r4-r11,lr}
-
- // Push state x0-x15 onto stack.
- // Also store an extra copy of x10-x11 just before the state.
-
- ldr r4, [sp, #48] // iv
- mov r0, sp
- sub sp, #80
-
- // iv: x12-x15
- ldm r4, {X12,X13,X14,X15}
- stmdb r0!, {X12,X13,X14,X15}
-
- // key: x4-x11
- __ldrd X8_X10, X9_X11, r3, 24
- __strd X8_X10, X9_X11, sp, 8
- stmdb r0!, {X8_X10, X9_X11}
- ldm r3, {X4-X9_X11}
- stmdb r0!, {X4-X9_X11}
-
- // constants: x0-x3
- adrl X3, .Lexpand_32byte_k
- ldm X3, {X0-X3}
- __strd X0, X1, sp, 16
- __strd X2, X3, sp, 24
-
- _chacha 20
-
- add sp, #76
- pop {r4-r11, pc}
-ENDPROC(chacha20_arm)
-
-/*
- * void hchacha20_arm(const u32 state[16], u32 out[8]);
- */
-ENTRY(hchacha20_arm)
- push {r1,r4-r11,lr}
-
- mov r14, r0
- ldmia r14!, {r0-r11} // load x0-x11
- push {r10-r11} // store x10-x11 to stack
- ldm r14, {r10-r12,r14} // load x12-x15
- sub sp, #8
-
- _chacha_permute 20
-
- // Skip over (unused0-unused1, x10-x11)
- add sp, #16
-
- // Fix up rotations of x12-x15
- ror X12, X12, #drot
- ror X13, X13, #drot
- pop {r4} // load 'out'
- ror X14, X14, #drot
- ror X15, X15, #drot
-
- // Store (x0-x3,x12-x15) to 'out'
- stm r4, {X0,X1,X2,X3,X12,X13,X14,X15}
-
- pop {r4-r11,pc}
-ENDPROC(hchacha20_arm)
-
-#ifdef CONFIG_KERNEL_MODE_NEON
-/*
- * This following NEON routine was ported from Andy Polyakov's implementation
- * from CRYPTOGAMS. It begins with parts of the CRYPTOGAMS scalar routine,
- * since certain NEON code paths actually branch to it.
- */
-
-.text
-#if defined(__thumb2__) || defined(__clang__)
-.syntax unified
-#endif
-#if defined(__thumb2__)
-.thumb
-#else
-.code 32
-#endif
-
-#if defined(__thumb2__) || defined(__clang__)
-#define ldrhsb ldrbhs
-#endif
-
-.align 4
-.Loop_outer:
- ldmia sp,{r0-r9} @ load key material
- str r11,[sp,#4*(32+2)] @ save len
- str r12, [sp,#4*(32+1)] @ save inp
- str r14, [sp,#4*(32+0)] @ save out
-.Loop_outer_enter:
- ldr r11, [sp,#4*(15)]
- mov r4,r4,ror#19 @ twist b[0..3]
- ldr r12,[sp,#4*(12)] @ modulo-scheduled load
- mov r5,r5,ror#19
- ldr r10, [sp,#4*(13)]
- mov r6,r6,ror#19
- ldr r14,[sp,#4*(14)]
- mov r7,r7,ror#19
- mov r11,r11,ror#8 @ twist d[0..3]
- mov r12,r12,ror#8
- mov r10,r10,ror#8
- mov r14,r14,ror#8
- str r11, [sp,#4*(16+15)]
- mov r11,#10
- b .Loop
-
-.align 4
-.Loop:
- subs r11,r11,#1
- add r0,r0,r4,ror#13
- add r1,r1,r5,ror#13
- eor r12,r0,r12,ror#24
- eor r10,r1,r10,ror#24
- add r8,r8,r12,ror#16
- add r9,r9,r10,ror#16
- eor r4,r8,r4,ror#13
- eor r5,r9,r5,ror#13
- add r0,r0,r4,ror#20
- add r1,r1,r5,ror#20
- eor r12,r0,r12,ror#16
- eor r10,r1,r10,ror#16
- add r8,r8,r12,ror#24
- str r10,[sp,#4*(16+13)]
- add r9,r9,r10,ror#24
- ldr r10,[sp,#4*(16+15)]
- str r8,[sp,#4*(16+8)]
- eor r4,r4,r8,ror#12
- str r9,[sp,#4*(16+9)]
- eor r5,r5,r9,ror#12
- ldr r8,[sp,#4*(16+10)]
- add r2,r2,r6,ror#13
- ldr r9,[sp,#4*(16+11)]
- add r3,r3,r7,ror#13
- eor r14,r2,r14,ror#24
- eor r10,r3,r10,ror#24
- add r8,r8,r14,ror#16
- add r9,r9,r10,ror#16
- eor r6,r8,r6,ror#13
- eor r7,r9,r7,ror#13
- add r2,r2,r6,ror#20
- add r3,r3,r7,ror#20
- eor r14,r2,r14,ror#16
- eor r10,r3,r10,ror#16
- add r8,r8,r14,ror#24
- add r9,r9,r10,ror#24
- eor r6,r6,r8,ror#12
- eor r7,r7,r9,ror#12
- add r0,r0,r5,ror#13
- add r1,r1,r6,ror#13
- eor r10,r0,r10,ror#24
- eor r12,r1,r12,ror#24
- add r8,r8,r10,ror#16
- add r9,r9,r12,ror#16
- eor r5,r8,r5,ror#13
- eor r6,r9,r6,ror#13
- add r0,r0,r5,ror#20
- add r1,r1,r6,ror#20
- eor r10,r0,r10,ror#16
- eor r12,r1,r12,ror#16
- str r10,[sp,#4*(16+15)]
- add r8,r8,r10,ror#24
- ldr r10,[sp,#4*(16+13)]
- add r9,r9,r12,ror#24
- str r8,[sp,#4*(16+10)]
- eor r5,r5,r8,ror#12
- str r9,[sp,#4*(16+11)]
- eor r6,r6,r9,ror#12
- ldr r8,[sp,#4*(16+8)]
- add r2,r2,r7,ror#13
- ldr r9,[sp,#4*(16+9)]
- add r3,r3,r4,ror#13
- eor r10,r2,r10,ror#24
- eor r14,r3,r14,ror#24
- add r8,r8,r10,ror#16
- add r9,r9,r14,ror#16
- eor r7,r8,r7,ror#13
- eor r4,r9,r4,ror#13
- add r2,r2,r7,ror#20
- add r3,r3,r4,ror#20
- eor r10,r2,r10,ror#16
- eor r14,r3,r14,ror#16
- add r8,r8,r10,ror#24
- add r9,r9,r14,ror#24
- eor r7,r7,r8,ror#12
- eor r4,r4,r9,ror#12
- bne .Loop
-
- ldr r11,[sp,#4*(32+2)] @ load len
-
- str r8, [sp,#4*(16+8)] @ modulo-scheduled store
- str r9, [sp,#4*(16+9)]
- str r12,[sp,#4*(16+12)]
- str r10, [sp,#4*(16+13)]
- str r14,[sp,#4*(16+14)]
-
- @ at this point we have first half of 512-bit result in
- @ rx and second half at sp+4*(16+8)
-
- cmp r11,#64 @ done yet?
-#ifdef __thumb2__
- itete lo
-#endif
- addlo r12,sp,#4*(0) @ shortcut or ...
- ldrhs r12,[sp,#4*(32+1)] @ ... load inp
- addlo r14,sp,#4*(0) @ shortcut or ...
- ldrhs r14,[sp,#4*(32+0)] @ ... load out
-
- ldr r8,[sp,#4*(0)] @ load key material
- ldr r9,[sp,#4*(1)]
-
-#if __LINUX_ARM_ARCH__ >= 6 || !defined(__ARMEB__)
-#if __LINUX_ARM_ARCH__ < 7
- orr r10,r12,r14
- tst r10,#3 @ are input and output aligned?
- ldr r10,[sp,#4*(2)]
- bne .Lunaligned
- cmp r11,#64 @ restore flags
-#else
- ldr r10,[sp,#4*(2)]
-#endif
- ldr r11,[sp,#4*(3)]
-
- add r0,r0,r8 @ accumulate key material
- add r1,r1,r9
-#ifdef __thumb2__
- itt hs
-#endif
- ldrhs r8,[r12],#16 @ load input
- ldrhs r9,[r12,#-12]
-
- add r2,r2,r10
- add r3,r3,r11
-#ifdef __thumb2__
- itt hs
-#endif
- ldrhs r10,[r12,#-8]
- ldrhs r11,[r12,#-4]
-#if __LINUX_ARM_ARCH__ >= 6 && defined(__ARMEB__)
- rev r0,r0
- rev r1,r1
- rev r2,r2
- rev r3,r3
-#endif
-#ifdef __thumb2__
- itt hs
-#endif
- eorhs r0,r0,r8 @ xor with input
- eorhs r1,r1,r9
- add r8,sp,#4*(4)
- str r0,[r14],#16 @ store output
-#ifdef __thumb2__
- itt hs
-#endif
- eorhs r2,r2,r10
- eorhs r3,r3,r11
- ldmia r8,{r8-r11} @ load key material
- str r1,[r14,#-12]
- str r2,[r14,#-8]
- str r3,[r14,#-4]
-
- add r4,r8,r4,ror#13 @ accumulate key material
- add r5,r9,r5,ror#13
-#ifdef __thumb2__
- itt hs
-#endif
- ldrhs r8,[r12],#16 @ load input
- ldrhs r9,[r12,#-12]
- add r6,r10,r6,ror#13
- add r7,r11,r7,ror#13
-#ifdef __thumb2__
- itt hs
-#endif
- ldrhs r10,[r12,#-8]
- ldrhs r11,[r12,#-4]
-#if __LINUX_ARM_ARCH__ >= 6 && defined(__ARMEB__)
- rev r4,r4
- rev r5,r5
- rev r6,r6
- rev r7,r7
-#endif
-#ifdef __thumb2__
- itt hs
-#endif
- eorhs r4,r4,r8
- eorhs r5,r5,r9
- add r8,sp,#4*(8)
- str r4,[r14],#16 @ store output
-#ifdef __thumb2__
- itt hs
-#endif
- eorhs r6,r6,r10
- eorhs r7,r7,r11
- str r5,[r14,#-12]
- ldmia r8,{r8-r11} @ load key material
- str r6,[r14,#-8]
- add r0,sp,#4*(16+8)
- str r7,[r14,#-4]
-
- ldmia r0,{r0-r7} @ load second half
-
- add r0,r0,r8 @ accumulate key material
- add r1,r1,r9
-#ifdef __thumb2__
- itt hs
-#endif
- ldrhs r8,[r12],#16 @ load input
- ldrhs r9,[r12,#-12]
-#ifdef __thumb2__
- itt hi
-#endif
- strhi r10,[sp,#4*(16+10)] @ copy "rx" while at it
- strhi r11,[sp,#4*(16+11)] @ copy "rx" while at it
- add r2,r2,r10
- add r3,r3,r11
-#ifdef __thumb2__
- itt hs
-#endif
- ldrhs r10,[r12,#-8]
- ldrhs r11,[r12,#-4]
-#if __LINUX_ARM_ARCH__ >= 6 && defined(__ARMEB__)
- rev r0,r0
- rev r1,r1
- rev r2,r2
- rev r3,r3
-#endif
-#ifdef __thumb2__
- itt hs
-#endif
- eorhs r0,r0,r8
- eorhs r1,r1,r9
- add r8,sp,#4*(12)
- str r0,[r14],#16 @ store output
-#ifdef __thumb2__
- itt hs
-#endif
- eorhs r2,r2,r10
- eorhs r3,r3,r11
- str r1,[r14,#-12]
- ldmia r8,{r8-r11} @ load key material
- str r2,[r14,#-8]
- str r3,[r14,#-4]
-
- add r4,r8,r4,ror#24 @ accumulate key material
- add r5,r9,r5,ror#24
-#ifdef __thumb2__
- itt hi
-#endif
- addhi r8,r8,#1 @ next counter value
- strhi r8,[sp,#4*(12)] @ save next counter value
-#ifdef __thumb2__
- itt hs
-#endif
- ldrhs r8,[r12],#16 @ load input
- ldrhs r9,[r12,#-12]
- add r6,r10,r6,ror#24
- add r7,r11,r7,ror#24
-#ifdef __thumb2__
- itt hs
-#endif
- ldrhs r10,[r12,#-8]
- ldrhs r11,[r12,#-4]
-#if __LINUX_ARM_ARCH__ >= 6 && defined(__ARMEB__)
- rev r4,r4
- rev r5,r5
- rev r6,r6
- rev r7,r7
-#endif
-#ifdef __thumb2__
- itt hs
-#endif
- eorhs r4,r4,r8
- eorhs r5,r5,r9
-#ifdef __thumb2__
- it ne
-#endif
- ldrne r8,[sp,#4*(32+2)] @ re-load len
-#ifdef __thumb2__
- itt hs
-#endif
- eorhs r6,r6,r10
- eorhs r7,r7,r11
- str r4,[r14],#16 @ store output
- str r5,[r14,#-12]
-#ifdef __thumb2__
- it hs
-#endif
- subhs r11,r8,#64 @ len-=64
- str r6,[r14,#-8]
- str r7,[r14,#-4]
- bhi .Loop_outer
-
- beq .Ldone
-#if __LINUX_ARM_ARCH__ < 7
- b .Ltail
-
-.align 4
-.Lunaligned: @ unaligned endian-neutral path
- cmp r11,#64 @ restore flags
-#endif
-#endif
-#if __LINUX_ARM_ARCH__ < 7
- ldr r11,[sp,#4*(3)]
- add r0,r8,r0 @ accumulate key material
- add r1,r9,r1
- add r2,r10,r2
-#ifdef __thumb2__
- itete lo
-#endif
- eorlo r8,r8,r8 @ zero or ...
- ldrhsb r8,[r12],#16 @ ... load input
- eorlo r9,r9,r9
- ldrhsb r9,[r12,#-12]
-
- add r3,r11,r3
-#ifdef __thumb2__
- itete lo
-#endif
- eorlo r10,r10,r10
- ldrhsb r10,[r12,#-8]
- eorlo r11,r11,r11
- ldrhsb r11,[r12,#-4]
-
- eor r0,r8,r0 @ xor with input (or zero)
- eor r1,r9,r1
-#ifdef __thumb2__
- itt hs
-#endif
- ldrhsb r8,[r12,#-15] @ load more input
- ldrhsb r9,[r12,#-11]
- eor r2,r10,r2
- strb r0,[r14],#16 @ store output
- eor r3,r11,r3
-#ifdef __thumb2__
- itt hs
-#endif
- ldrhsb r10,[r12,#-7]
- ldrhsb r11,[r12,#-3]
- strb r1,[r14,#-12]
- eor r0,r8,r0,lsr#8
- strb r2,[r14,#-8]
- eor r1,r9,r1,lsr#8
-#ifdef __thumb2__
- itt hs
-#endif
- ldrhsb r8,[r12,#-14] @ load more input
- ldrhsb r9,[r12,#-10]
- strb r3,[r14,#-4]
- eor r2,r10,r2,lsr#8
- strb r0,[r14,#-15]
- eor r3,r11,r3,lsr#8
-#ifdef __thumb2__
- itt hs
-#endif
- ldrhsb r10,[r12,#-6]
- ldrhsb r11,[r12,#-2]
- strb r1,[r14,#-11]
- eor r0,r8,r0,lsr#8
- strb r2,[r14,#-7]
- eor r1,r9,r1,lsr#8
-#ifdef __thumb2__
- itt hs
-#endif
- ldrhsb r8,[r12,#-13] @ load more input
- ldrhsb r9,[r12,#-9]
- strb r3,[r14,#-3]
- eor r2,r10,r2,lsr#8
- strb r0,[r14,#-14]
- eor r3,r11,r3,lsr#8
-#ifdef __thumb2__
- itt hs
-#endif
- ldrhsb r10,[r12,#-5]
- ldrhsb r11,[r12,#-1]
- strb r1,[r14,#-10]
- strb r2,[r14,#-6]
- eor r0,r8,r0,lsr#8
- strb r3,[r14,#-2]
- eor r1,r9,r1,lsr#8
- strb r0,[r14,#-13]
- eor r2,r10,r2,lsr#8
- strb r1,[r14,#-9]
- eor r3,r11,r3,lsr#8
- strb r2,[r14,#-5]
- strb r3,[r14,#-1]
- add r8,sp,#4*(4+0)
- ldmia r8,{r8-r11} @ load key material
- add r0,sp,#4*(16+8)
- add r4,r8,r4,ror#13 @ accumulate key material
- add r5,r9,r5,ror#13
- add r6,r10,r6,ror#13
-#ifdef __thumb2__
- itete lo
-#endif
- eorlo r8,r8,r8 @ zero or ...
- ldrhsb r8,[r12],#16 @ ... load input
- eorlo r9,r9,r9
- ldrhsb r9,[r12,#-12]
-
- add r7,r11,r7,ror#13
-#ifdef __thumb2__
- itete lo
-#endif
- eorlo r10,r10,r10
- ldrhsb r10,[r12,#-8]
- eorlo r11,r11,r11
- ldrhsb r11,[r12,#-4]
-
- eor r4,r8,r4 @ xor with input (or zero)
- eor r5,r9,r5
-#ifdef __thumb2__
- itt hs
-#endif
- ldrhsb r8,[r12,#-15] @ load more input
- ldrhsb r9,[r12,#-11]
- eor r6,r10,r6
- strb r4,[r14],#16 @ store output
- eor r7,r11,r7
-#ifdef __thumb2__
- itt hs
-#endif
- ldrhsb r10,[r12,#-7]
- ldrhsb r11,[r12,#-3]
- strb r5,[r14,#-12]
- eor r4,r8,r4,lsr#8
- strb r6,[r14,#-8]
- eor r5,r9,r5,lsr#8
-#ifdef __thumb2__
- itt hs
-#endif
- ldrhsb r8,[r12,#-14] @ load more input
- ldrhsb r9,[r12,#-10]
- strb r7,[r14,#-4]
- eor r6,r10,r6,lsr#8
- strb r4,[r14,#-15]
- eor r7,r11,r7,lsr#8
-#ifdef __thumb2__
- itt hs
-#endif
- ldrhsb r10,[r12,#-6]
- ldrhsb r11,[r12,#-2]
- strb r5,[r14,#-11]
- eor r4,r8,r4,lsr#8
- strb r6,[r14,#-7]
- eor r5,r9,r5,lsr#8
-#ifdef __thumb2__
- itt hs
-#endif
- ldrhsb r8,[r12,#-13] @ load more input
- ldrhsb r9,[r12,#-9]
- strb r7,[r14,#-3]
- eor r6,r10,r6,lsr#8
- strb r4,[r14,#-14]
- eor r7,r11,r7,lsr#8
-#ifdef __thumb2__
- itt hs
-#endif
- ldrhsb r10,[r12,#-5]
- ldrhsb r11,[r12,#-1]
- strb r5,[r14,#-10]
- strb r6,[r14,#-6]
- eor r4,r8,r4,lsr#8
- strb r7,[r14,#-2]
- eor r5,r9,r5,lsr#8
- strb r4,[r14,#-13]
- eor r6,r10,r6,lsr#8
- strb r5,[r14,#-9]
- eor r7,r11,r7,lsr#8
- strb r6,[r14,#-5]
- strb r7,[r14,#-1]
- add r8,sp,#4*(4+4)
- ldmia r8,{r8-r11} @ load key material
- ldmia r0,{r0-r7} @ load second half
-#ifdef __thumb2__
- itt hi
-#endif
- strhi r10,[sp,#4*(16+10)] @ copy "rx"
- strhi r11,[sp,#4*(16+11)] @ copy "rx"
- add r0,r8,r0 @ accumulate key material
- add r1,r9,r1
- add r2,r10,r2
-#ifdef __thumb2__
- itete lo
-#endif
- eorlo r8,r8,r8 @ zero or ...
- ldrhsb r8,[r12],#16 @ ... load input
- eorlo r9,r9,r9
- ldrhsb r9,[r12,#-12]
-
- add r3,r11,r3
-#ifdef __thumb2__
- itete lo
-#endif
- eorlo r10,r10,r10
- ldrhsb r10,[r12,#-8]
- eorlo r11,r11,r11
- ldrhsb r11,[r12,#-4]
-
- eor r0,r8,r0 @ xor with input (or zero)
- eor r1,r9,r1
-#ifdef __thumb2__
- itt hs
-#endif
- ldrhsb r8,[r12,#-15] @ load more input
- ldrhsb r9,[r12,#-11]
- eor r2,r10,r2
- strb r0,[r14],#16 @ store output
- eor r3,r11,r3
-#ifdef __thumb2__
- itt hs
-#endif
- ldrhsb r10,[r12,#-7]
- ldrhsb r11,[r12,#-3]
- strb r1,[r14,#-12]
- eor r0,r8,r0,lsr#8
- strb r2,[r14,#-8]
- eor r1,r9,r1,lsr#8
-#ifdef __thumb2__
- itt hs
-#endif
- ldrhsb r8,[r12,#-14] @ load more input
- ldrhsb r9,[r12,#-10]
- strb r3,[r14,#-4]
- eor r2,r10,r2,lsr#8
- strb r0,[r14,#-15]
- eor r3,r11,r3,lsr#8
-#ifdef __thumb2__
- itt hs
-#endif
- ldrhsb r10,[r12,#-6]
- ldrhsb r11,[r12,#-2]
- strb r1,[r14,#-11]
- eor r0,r8,r0,lsr#8
- strb r2,[r14,#-7]
- eor r1,r9,r1,lsr#8
-#ifdef __thumb2__
- itt hs
-#endif
- ldrhsb r8,[r12,#-13] @ load more input
- ldrhsb r9,[r12,#-9]
- strb r3,[r14,#-3]
- eor r2,r10,r2,lsr#8
- strb r0,[r14,#-14]
- eor r3,r11,r3,lsr#8
-#ifdef __thumb2__
- itt hs
-#endif
- ldrhsb r10,[r12,#-5]
- ldrhsb r11,[r12,#-1]
- strb r1,[r14,#-10]
- strb r2,[r14,#-6]
- eor r0,r8,r0,lsr#8
- strb r3,[r14,#-2]
- eor r1,r9,r1,lsr#8
- strb r0,[r14,#-13]
- eor r2,r10,r2,lsr#8
- strb r1,[r14,#-9]
- eor r3,r11,r3,lsr#8
- strb r2,[r14,#-5]
- strb r3,[r14,#-1]
- add r8,sp,#4*(4+8)
- ldmia r8,{r8-r11} @ load key material
- add r4,r8,r4,ror#24 @ accumulate key material
-#ifdef __thumb2__
- itt hi
-#endif
- addhi r8,r8,#1 @ next counter value
- strhi r8,[sp,#4*(12)] @ save next counter value
- add r5,r9,r5,ror#24
- add r6,r10,r6,ror#24
-#ifdef __thumb2__
- itete lo
-#endif
- eorlo r8,r8,r8 @ zero or ...
- ldrhsb r8,[r12],#16 @ ... load input
- eorlo r9,r9,r9
- ldrhsb r9,[r12,#-12]
-
- add r7,r11,r7,ror#24
-#ifdef __thumb2__
- itete lo
-#endif
- eorlo r10,r10,r10
- ldrhsb r10,[r12,#-8]
- eorlo r11,r11,r11
- ldrhsb r11,[r12,#-4]
-
- eor r4,r8,r4 @ xor with input (or zero)
- eor r5,r9,r5
-#ifdef __thumb2__
- itt hs
-#endif
- ldrhsb r8,[r12,#-15] @ load more input
- ldrhsb r9,[r12,#-11]
- eor r6,r10,r6
- strb r4,[r14],#16 @ store output
- eor r7,r11,r7
-#ifdef __thumb2__
- itt hs
-#endif
- ldrhsb r10,[r12,#-7]
- ldrhsb r11,[r12,#-3]
- strb r5,[r14,#-12]
- eor r4,r8,r4,lsr#8
- strb r6,[r14,#-8]
- eor r5,r9,r5,lsr#8
-#ifdef __thumb2__
- itt hs
-#endif
- ldrhsb r8,[r12,#-14] @ load more input
- ldrhsb r9,[r12,#-10]
- strb r7,[r14,#-4]
- eor r6,r10,r6,lsr#8
- strb r4,[r14,#-15]
- eor r7,r11,r7,lsr#8
-#ifdef __thumb2__
- itt hs
-#endif
- ldrhsb r10,[r12,#-6]
- ldrhsb r11,[r12,#-2]
- strb r5,[r14,#-11]
- eor r4,r8,r4,lsr#8
- strb r6,[r14,#-7]
- eor r5,r9,r5,lsr#8
-#ifdef __thumb2__
- itt hs
-#endif
- ldrhsb r8,[r12,#-13] @ load more input
- ldrhsb r9,[r12,#-9]
- strb r7,[r14,#-3]
- eor r6,r10,r6,lsr#8
- strb r4,[r14,#-14]
- eor r7,r11,r7,lsr#8
-#ifdef __thumb2__
- itt hs
-#endif
- ldrhsb r10,[r12,#-5]
- ldrhsb r11,[r12,#-1]
- strb r5,[r14,#-10]
- strb r6,[r14,#-6]
- eor r4,r8,r4,lsr#8
- strb r7,[r14,#-2]
- eor r5,r9,r5,lsr#8
- strb r4,[r14,#-13]
- eor r6,r10,r6,lsr#8
- strb r5,[r14,#-9]
- eor r7,r11,r7,lsr#8
- strb r6,[r14,#-5]
- strb r7,[r14,#-1]
-#ifdef __thumb2__
- it ne
-#endif
- ldrne r8,[sp,#4*(32+2)] @ re-load len
-#ifdef __thumb2__
- it hs
-#endif
- subhs r11,r8,#64 @ len-=64
- bhi .Loop_outer
-
- beq .Ldone
-#endif
-
-.Ltail:
- ldr r12,[sp,#4*(32+1)] @ load inp
- add r9,sp,#4*(0)
- ldr r14,[sp,#4*(32+0)] @ load out
-
-.Loop_tail:
- ldrb r10,[r9],#1 @ read buffer on stack
- ldrb r11,[r12],#1 @ read input
- subs r8,r8,#1
- eor r11,r11,r10
- strb r11,[r14],#1 @ store output
- bne .Loop_tail
-
-.Ldone:
- add sp,sp,#4*(32+3)
- ldmia sp!,{r4-r11,pc}
-
-.align 5
-.Lsigma2:
-.long 0x61707865,0x3320646e,0x79622d32,0x6b206574 @ endian-neutral
-.Lone2:
-.long 1,0,0,0
-.word -1
-
-.arch armv7-a
-.fpu neon
-
-.align 5
-ENTRY(chacha20_neon)
- ldr r12,[sp,#0] @ pull pointer to counter and nonce
- stmdb sp!,{r0-r2,r4-r11,lr}
- cmp r2,#0 @ len==0?
-#ifdef __thumb2__
- itt eq
-#endif
- addeq sp,sp,#4*3
- beq .Lno_data_neon
-.Lchacha20_neon_begin:
- adr r14,.Lsigma2
- vstmdb sp!,{d8-d15} @ ABI spec says so
- stmdb sp!,{r0-r3}
-
- vld1.32 {q1-q2},[r3] @ load key
- ldmia r3,{r4-r11} @ load key
-
- sub sp,sp,#4*(16+16)
- vld1.32 {q3},[r12] @ load counter and nonce
- add r12,sp,#4*8
- ldmia r14,{r0-r3} @ load sigma
- vld1.32 {q0},[r14]! @ load sigma
- vld1.32 {q12},[r14]! @ one
- @ vld1.32 {d30},[r14] @ rot8
- vst1.32 {q2-q3},[r12] @ copy 1/2key|counter|nonce
- vst1.32 {q0-q1},[sp] @ copy sigma|1/2key
-
- str r10,[sp,#4*(16+10)] @ off-load "rx"
- str r11,[sp,#4*(16+11)] @ off-load "rx"
- vshl.i32 d26,d24,#1 @ two
- vstr d24,[sp,#4*(16+0)]
- vshl.i32 d28,d24,#2 @ four
- vstr d26,[sp,#4*(16+2)]
- vmov q4,q0
- vstr d28,[sp,#4*(16+4)]
- vmov q8,q0
- @ vstr d30,[sp,#4*(16+6)]
- vmov q5,q1
- vmov q9,q1
- b .Loop_neon_enter
-
-.align 4
-.Loop_neon_outer:
- ldmia sp,{r0-r9} @ load key material
- cmp r11,#64*2 @ if len<=64*2
- bls .Lbreak_neon @ switch to integer-only
- @ vldr d30,[sp,#4*(16+6)] @ rot8
- vmov q4,q0
- str r11,[sp,#4*(32+2)] @ save len
- vmov q8,q0
- str r12, [sp,#4*(32+1)] @ save inp
- vmov q5,q1
- str r14, [sp,#4*(32+0)] @ save out
- vmov q9,q1
-.Loop_neon_enter:
- ldr r11, [sp,#4*(15)]
- mov r4,r4,ror#19 @ twist b[0..3]
- vadd.i32 q7,q3,q12 @ counter+1
- ldr r12,[sp,#4*(12)] @ modulo-scheduled load
- mov r5,r5,ror#19
- vmov q6,q2
- ldr r10, [sp,#4*(13)]
- mov r6,r6,ror#19
- vmov q10,q2
- ldr r14,[sp,#4*(14)]
- mov r7,r7,ror#19
- vadd.i32 q11,q7,q12 @ counter+2
- add r12,r12,#3 @ counter+3
- mov r11,r11,ror#8 @ twist d[0..3]
- mov r12,r12,ror#8
- mov r10,r10,ror#8
- mov r14,r14,ror#8
- str r11, [sp,#4*(16+15)]
- mov r11,#10
- b .Loop_neon
-
-.align 4
-.Loop_neon:
- subs r11,r11,#1
- vadd.i32 q0,q0,q1
- add r0,r0,r4,ror#13
- vadd.i32 q4,q4,q5
- add r1,r1,r5,ror#13
- vadd.i32 q8,q8,q9
- eor r12,r0,r12,ror#24
- veor q3,q3,q0
- eor r10,r1,r10,ror#24
- veor q7,q7,q4
- add r8,r8,r12,ror#16
- veor q11,q11,q8
- add r9,r9,r10,ror#16
- vrev32.16 q3,q3
- eor r4,r8,r4,ror#13
- vrev32.16 q7,q7
- eor r5,r9,r5,ror#13
- vrev32.16 q11,q11
- add r0,r0,r4,ror#20
- vadd.i32 q2,q2,q3
- add r1,r1,r5,ror#20
- vadd.i32 q6,q6,q7
- eor r12,r0,r12,ror#16
- vadd.i32 q10,q10,q11
- eor r10,r1,r10,ror#16
- veor q12,q1,q2
- add r8,r8,r12,ror#24
- veor q13,q5,q6
- str r10,[sp,#4*(16+13)]
- veor q14,q9,q10
- add r9,r9,r10,ror#24
- vshr.u32 q1,q12,#20
- ldr r10,[sp,#4*(16+15)]
- vshr.u32 q5,q13,#20
- str r8,[sp,#4*(16+8)]
- vshr.u32 q9,q14,#20
- eor r4,r4,r8,ror#12
- vsli.32 q1,q12,#12
- str r9,[sp,#4*(16+9)]
- vsli.32 q5,q13,#12
- eor r5,r5,r9,ror#12
- vsli.32 q9,q14,#12
- ldr r8,[sp,#4*(16+10)]
- vadd.i32 q0,q0,q1
- add r2,r2,r6,ror#13
- vadd.i32 q4,q4,q5
- ldr r9,[sp,#4*(16+11)]
- vadd.i32 q8,q8,q9
- add r3,r3,r7,ror#13
- veor q12,q3,q0
- eor r14,r2,r14,ror#24
- veor q13,q7,q4
- eor r10,r3,r10,ror#24
- veor q14,q11,q8
- add r8,r8,r14,ror#16
- vshr.u32 q3,q12,#24
- add r9,r9,r10,ror#16
- vshr.u32 q7,q13,#24
- eor r6,r8,r6,ror#13
- vshr.u32 q11,q14,#24
- eor r7,r9,r7,ror#13
- vsli.32 q3,q12,#8
- add r2,r2,r6,ror#20
- vsli.32 q7,q13,#8
- add r3,r3,r7,ror#20
- vsli.32 q11,q14,#8
- eor r14,r2,r14,ror#16
- vadd.i32 q2,q2,q3
- eor r10,r3,r10,ror#16
- vadd.i32 q6,q6,q7
- add r8,r8,r14,ror#24
- vadd.i32 q10,q10,q11
- add r9,r9,r10,ror#24
- veor q12,q1,q2
- eor r6,r6,r8,ror#12
- veor q13,q5,q6
- eor r7,r7,r9,ror#12
- veor q14,q9,q10
- vshr.u32 q1,q12,#25
- vshr.u32 q5,q13,#25
- vshr.u32 q9,q14,#25
- vsli.32 q1,q12,#7
- vsli.32 q5,q13,#7
- vsli.32 q9,q14,#7
- vext.8 q2,q2,q2,#8
- vext.8 q6,q6,q6,#8
- vext.8 q10,q10,q10,#8
- vext.8 q1,q1,q1,#4
- vext.8 q5,q5,q5,#4
- vext.8 q9,q9,q9,#4
- vext.8 q3,q3,q3,#12
- vext.8 q7,q7,q7,#12
- vext.8 q11,q11,q11,#12
- vadd.i32 q0,q0,q1
- add r0,r0,r5,ror#13
- vadd.i32 q4,q4,q5
- add r1,r1,r6,ror#13
- vadd.i32 q8,q8,q9
- eor r10,r0,r10,ror#24
- veor q3,q3,q0
- eor r12,r1,r12,ror#24
- veor q7,q7,q4
- add r8,r8,r10,ror#16
- veor q11,q11,q8
- add r9,r9,r12,ror#16
- vrev32.16 q3,q3
- eor r5,r8,r5,ror#13
- vrev32.16 q7,q7
- eor r6,r9,r6,ror#13
- vrev32.16 q11,q11
- add r0,r0,r5,ror#20
- vadd.i32 q2,q2,q3
- add r1,r1,r6,ror#20
- vadd.i32 q6,q6,q7
- eor r10,r0,r10,ror#16
- vadd.i32 q10,q10,q11
- eor r12,r1,r12,ror#16
- veor q12,q1,q2
- str r10,[sp,#4*(16+15)]
- veor q13,q5,q6
- add r8,r8,r10,ror#24
- veor q14,q9,q10
- ldr r10,[sp,#4*(16+13)]
- vshr.u32 q1,q12,#20
- add r9,r9,r12,ror#24
- vshr.u32 q5,q13,#20
- str r8,[sp,#4*(16+10)]
- vshr.u32 q9,q14,#20
- eor r5,r5,r8,ror#12
- vsli.32 q1,q12,#12
- str r9,[sp,#4*(16+11)]
- vsli.32 q5,q13,#12
- eor r6,r6,r9,ror#12
- vsli.32 q9,q14,#12
- ldr r8,[sp,#4*(16+8)]
- vadd.i32 q0,q0,q1
- add r2,r2,r7,ror#13
- vadd.i32 q4,q4,q5
- ldr r9,[sp,#4*(16+9)]
- vadd.i32 q8,q8,q9
- add r3,r3,r4,ror#13
- veor q12,q3,q0
- eor r10,r2,r10,ror#24
- veor q13,q7,q4
- eor r14,r3,r14,ror#24
- veor q14,q11,q8
- add r8,r8,r10,ror#16
- vshr.u32 q3,q12,#24
- add r9,r9,r14,ror#16
- vshr.u32 q7,q13,#24
- eor r7,r8,r7,ror#13
- vshr.u32 q11,q14,#24
- eor r4,r9,r4,ror#13
- vsli.32 q3,q12,#8
- add r2,r2,r7,ror#20
- vsli.32 q7,q13,#8
- add r3,r3,r4,ror#20
- vsli.32 q11,q14,#8
- eor r10,r2,r10,ror#16
- vadd.i32 q2,q2,q3
- eor r14,r3,r14,ror#16
- vadd.i32 q6,q6,q7
- add r8,r8,r10,ror#24
- vadd.i32 q10,q10,q11
- add r9,r9,r14,ror#24
- veor q12,q1,q2
- eor r7,r7,r8,ror#12
- veor q13,q5,q6
- eor r4,r4,r9,ror#12
- veor q14,q9,q10
- vshr.u32 q1,q12,#25
- vshr.u32 q5,q13,#25
- vshr.u32 q9,q14,#25
- vsli.32 q1,q12,#7
- vsli.32 q5,q13,#7
- vsli.32 q9,q14,#7
- vext.8 q2,q2,q2,#8
- vext.8 q6,q6,q6,#8
- vext.8 q10,q10,q10,#8
- vext.8 q1,q1,q1,#12
- vext.8 q5,q5,q5,#12
- vext.8 q9,q9,q9,#12
- vext.8 q3,q3,q3,#4
- vext.8 q7,q7,q7,#4
- vext.8 q11,q11,q11,#4
- bne .Loop_neon
-
- add r11,sp,#32
- vld1.32 {q12-q13},[sp] @ load key material
- vld1.32 {q14-q15},[r11]
-
- ldr r11,[sp,#4*(32+2)] @ load len
-
- str r8, [sp,#4*(16+8)] @ modulo-scheduled store
- str r9, [sp,#4*(16+9)]
- str r12,[sp,#4*(16+12)]
- str r10, [sp,#4*(16+13)]
- str r14,[sp,#4*(16+14)]
-
- @ at this point we have first half of 512-bit result in
- @ rx and second half at sp+4*(16+8)
-
- ldr r12,[sp,#4*(32+1)] @ load inp
- ldr r14,[sp,#4*(32+0)] @ load out
-
- vadd.i32 q0,q0,q12 @ accumulate key material
- vadd.i32 q4,q4,q12
- vadd.i32 q8,q8,q12
- vldr d24,[sp,#4*(16+0)] @ one
-
- vadd.i32 q1,q1,q13
- vadd.i32 q5,q5,q13
- vadd.i32 q9,q9,q13
- vldr d26,[sp,#4*(16+2)] @ two
-
- vadd.i32 q2,q2,q14
- vadd.i32 q6,q6,q14
- vadd.i32 q10,q10,q14
- vadd.i32 d14,d14,d24 @ counter+1
- vadd.i32 d22,d22,d26 @ counter+2
-
- vadd.i32 q3,q3,q15
- vadd.i32 q7,q7,q15
- vadd.i32 q11,q11,q15
-
- cmp r11,#64*4
- blo .Ltail_neon
-
- vld1.8 {q12-q13},[r12]! @ load input
- mov r11,sp
- vld1.8 {q14-q15},[r12]!
- veor q0,q0,q12 @ xor with input
- veor q1,q1,q13
- vld1.8 {q12-q13},[r12]!
- veor q2,q2,q14
- veor q3,q3,q15
- vld1.8 {q14-q15},[r12]!
-
- veor q4,q4,q12
- vst1.8 {q0-q1},[r14]! @ store output
- veor q5,q5,q13
- vld1.8 {q12-q13},[r12]!
- veor q6,q6,q14
- vst1.8 {q2-q3},[r14]!
- veor q7,q7,q15
- vld1.8 {q14-q15},[r12]!
-
- veor q8,q8,q12
- vld1.32 {q0-q1},[r11]! @ load for next iteration
- veor d25,d25,d25
- vldr d24,[sp,#4*(16+4)] @ four
- veor q9,q9,q13
- vld1.32 {q2-q3},[r11]
- veor q10,q10,q14
- vst1.8 {q4-q5},[r14]!
- veor q11,q11,q15
- vst1.8 {q6-q7},[r14]!
-
- vadd.i32 d6,d6,d24 @ next counter value
- vldr d24,[sp,#4*(16+0)] @ one
-
- ldmia sp,{r8-r11} @ load key material
- add r0,r0,r8 @ accumulate key material
- ldr r8,[r12],#16 @ load input
- vst1.8 {q8-q9},[r14]!
- add r1,r1,r9
- ldr r9,[r12,#-12]
- vst1.8 {q10-q11},[r14]!
- add r2,r2,r10
- ldr r10,[r12,#-8]
- add r3,r3,r11
- ldr r11,[r12,#-4]
-#ifdef __ARMEB__
- rev r0,r0
- rev r1,r1
- rev r2,r2
- rev r3,r3
-#endif
- eor r0,r0,r8 @ xor with input
- add r8,sp,#4*(4)
- eor r1,r1,r9
- str r0,[r14],#16 @ store output
- eor r2,r2,r10
- str r1,[r14,#-12]
- eor r3,r3,r11
- ldmia r8,{r8-r11} @ load key material
- str r2,[r14,#-8]
- str r3,[r14,#-4]
-
- add r4,r8,r4,ror#13 @ accumulate key material
- ldr r8,[r12],#16 @ load input
- add r5,r9,r5,ror#13
- ldr r9,[r12,#-12]
- add r6,r10,r6,ror#13
- ldr r10,[r12,#-8]
- add r7,r11,r7,ror#13
- ldr r11,[r12,#-4]
-#ifdef __ARMEB__
- rev r4,r4
- rev r5,r5
- rev r6,r6
- rev r7,r7
-#endif
- eor r4,r4,r8
- add r8,sp,#4*(8)
- eor r5,r5,r9
- str r4,[r14],#16 @ store output
- eor r6,r6,r10
- str r5,[r14,#-12]
- eor r7,r7,r11
- ldmia r8,{r8-r11} @ load key material
- str r6,[r14,#-8]
- add r0,sp,#4*(16+8)
- str r7,[r14,#-4]
-
- ldmia r0,{r0-r7} @ load second half
-
- add r0,r0,r8 @ accumulate key material
- ldr r8,[r12],#16 @ load input
- add r1,r1,r9
- ldr r9,[r12,#-12]
-#ifdef __thumb2__
- it hi
-#endif
- strhi r10,[sp,#4*(16+10)] @ copy "rx" while at it
- add r2,r2,r10
- ldr r10,[r12,#-8]
-#ifdef __thumb2__
- it hi
-#endif
- strhi r11,[sp,#4*(16+11)] @ copy "rx" while at it
- add r3,r3,r11
- ldr r11,[r12,#-4]
-#ifdef __ARMEB__
- rev r0,r0
- rev r1,r1
- rev r2,r2
- rev r3,r3
-#endif
- eor r0,r0,r8
- add r8,sp,#4*(12)
- eor r1,r1,r9
- str r0,[r14],#16 @ store output
- eor r2,r2,r10
- str r1,[r14,#-12]
- eor r3,r3,r11
- ldmia r8,{r8-r11} @ load key material
- str r2,[r14,#-8]
- str r3,[r14,#-4]
-
- add r4,r8,r4,ror#24 @ accumulate key material
- add r8,r8,#4 @ next counter value
- add r5,r9,r5,ror#24
- str r8,[sp,#4*(12)] @ save next counter value
- ldr r8,[r12],#16 @ load input
- add r6,r10,r6,ror#24
- add r4,r4,#3 @ counter+3
- ldr r9,[r12,#-12]
- add r7,r11,r7,ror#24
- ldr r10,[r12,#-8]
- ldr r11,[r12,#-4]
-#ifdef __ARMEB__
- rev r4,r4
- rev r5,r5
- rev r6,r6
- rev r7,r7
-#endif
- eor r4,r4,r8
-#ifdef __thumb2__
- it hi
-#endif
- ldrhi r8,[sp,#4*(32+2)] @ re-load len
- eor r5,r5,r9
- eor r6,r6,r10
- str r4,[r14],#16 @ store output
- eor r7,r7,r11
- str r5,[r14,#-12]
- sub r11,r8,#64*4 @ len-=64*4
- str r6,[r14,#-8]
- str r7,[r14,#-4]
- bhi .Loop_neon_outer
-
- b .Ldone_neon
-
-.align 4
-.Lbreak_neon:
- @ harmonize NEON and integer-only stack frames: load data
- @ from NEON frame, but save to integer-only one; distance
- @ between the two is 4*(32+4+16-32)=4*(20).
-
- str r11, [sp,#4*(20+32+2)] @ save len
- add r11,sp,#4*(32+4)
- str r12, [sp,#4*(20+32+1)] @ save inp
- str r14, [sp,#4*(20+32+0)] @ save out
-
- ldr r12,[sp,#4*(16+10)]
- ldr r14,[sp,#4*(16+11)]
- vldmia r11,{d8-d15} @ fulfill ABI requirement
- str r12,[sp,#4*(20+16+10)] @ copy "rx"
- str r14,[sp,#4*(20+16+11)] @ copy "rx"
-
- ldr r11, [sp,#4*(15)]
- mov r4,r4,ror#19 @ twist b[0..3]
- ldr r12,[sp,#4*(12)] @ modulo-scheduled load
- mov r5,r5,ror#19
- ldr r10, [sp,#4*(13)]
- mov r6,r6,ror#19
- ldr r14,[sp,#4*(14)]
- mov r7,r7,ror#19
- mov r11,r11,ror#8 @ twist d[0..3]
- mov r12,r12,ror#8
- mov r10,r10,ror#8
- mov r14,r14,ror#8
- str r11, [sp,#4*(20+16+15)]
- add r11,sp,#4*(20)
- vst1.32 {q0-q1},[r11]! @ copy key
- add sp,sp,#4*(20) @ switch frame
- vst1.32 {q2-q3},[r11]
- mov r11,#10
- b .Loop @ go integer-only
-
-.align 4
-.Ltail_neon:
- cmp r11,#64*3
- bhs .L192_or_more_neon
- cmp r11,#64*2
- bhs .L128_or_more_neon
- cmp r11,#64*1
- bhs .L64_or_more_neon
-
- add r8,sp,#4*(8)
- vst1.8 {q0-q1},[sp]
- add r10,sp,#4*(0)
- vst1.8 {q2-q3},[r8]
- b .Loop_tail_neon
-
-.align 4
-.L64_or_more_neon:
- vld1.8 {q12-q13},[r12]!
- vld1.8 {q14-q15},[r12]!
- veor q0,q0,q12
- veor q1,q1,q13
- veor q2,q2,q14
- veor q3,q3,q15
- vst1.8 {q0-q1},[r14]!
- vst1.8 {q2-q3},[r14]!
-
- beq .Ldone_neon
-
- add r8,sp,#4*(8)
- vst1.8 {q4-q5},[sp]
- add r10,sp,#4*(0)
- vst1.8 {q6-q7},[r8]
- sub r11,r11,#64*1 @ len-=64*1
- b .Loop_tail_neon
-
-.align 4
-.L128_or_more_neon:
- vld1.8 {q12-q13},[r12]!
- vld1.8 {q14-q15},[r12]!
- veor q0,q0,q12
- veor q1,q1,q13
- vld1.8 {q12-q13},[r12]!
- veor q2,q2,q14
- veor q3,q3,q15
- vld1.8 {q14-q15},[r12]!
-
- veor q4,q4,q12
- veor q5,q5,q13
- vst1.8 {q0-q1},[r14]!
- veor q6,q6,q14
- vst1.8 {q2-q3},[r14]!
- veor q7,q7,q15
- vst1.8 {q4-q5},[r14]!
- vst1.8 {q6-q7},[r14]!
-
- beq .Ldone_neon
-
- add r8,sp,#4*(8)
- vst1.8 {q8-q9},[sp]
- add r10,sp,#4*(0)
- vst1.8 {q10-q11},[r8]
- sub r11,r11,#64*2 @ len-=64*2
- b .Loop_tail_neon
-
-.align 4
-.L192_or_more_neon:
- vld1.8 {q12-q13},[r12]!
- vld1.8 {q14-q15},[r12]!
- veor q0,q0,q12
- veor q1,q1,q13
- vld1.8 {q12-q13},[r12]!
- veor q2,q2,q14
- veor q3,q3,q15
- vld1.8 {q14-q15},[r12]!
-
- veor q4,q4,q12
- veor q5,q5,q13
- vld1.8 {q12-q13},[r12]!
- veor q6,q6,q14
- vst1.8 {q0-q1},[r14]!
- veor q7,q7,q15
- vld1.8 {q14-q15},[r12]!
-
- veor q8,q8,q12
- vst1.8 {q2-q3},[r14]!
- veor q9,q9,q13
- vst1.8 {q4-q5},[r14]!
- veor q10,q10,q14
- vst1.8 {q6-q7},[r14]!
- veor q11,q11,q15
- vst1.8 {q8-q9},[r14]!
- vst1.8 {q10-q11},[r14]!
-
- beq .Ldone_neon
-
- ldmia sp,{r8-r11} @ load key material
- add r0,r0,r8 @ accumulate key material
- add r8,sp,#4*(4)
- add r1,r1,r9
- add r2,r2,r10
- add r3,r3,r11
- ldmia r8,{r8-r11} @ load key material
-
- add r4,r8,r4,ror#13 @ accumulate key material
- add r8,sp,#4*(8)
- add r5,r9,r5,ror#13
- add r6,r10,r6,ror#13
- add r7,r11,r7,ror#13
- ldmia r8,{r8-r11} @ load key material
-#ifdef __ARMEB__
- rev r0,r0
- rev r1,r1
- rev r2,r2
- rev r3,r3
- rev r4,r4
- rev r5,r5
- rev r6,r6
- rev r7,r7
-#endif
- stmia sp,{r0-r7}
- add r0,sp,#4*(16+8)
-
- ldmia r0,{r0-r7} @ load second half
-
- add r0,r0,r8 @ accumulate key material
- add r8,sp,#4*(12)
- add r1,r1,r9
- add r2,r2,r10
- add r3,r3,r11
- ldmia r8,{r8-r11} @ load key material
-
- add r4,r8,r4,ror#24 @ accumulate key material
- add r8,sp,#4*(8)
- add r5,r9,r5,ror#24
- add r4,r4,#3 @ counter+3
- add r6,r10,r6,ror#24
- add r7,r11,r7,ror#24
- ldr r11,[sp,#4*(32+2)] @ re-load len
-#ifdef __ARMEB__
- rev r0,r0
- rev r1,r1
- rev r2,r2
- rev r3,r3
- rev r4,r4
- rev r5,r5
- rev r6,r6
- rev r7,r7
-#endif
- stmia r8,{r0-r7}
- add r10,sp,#4*(0)
- sub r11,r11,#64*3 @ len-=64*3
-
-.Loop_tail_neon:
- ldrb r8,[r10],#1 @ read buffer on stack
- ldrb r9,[r12],#1 @ read input
- subs r11,r11,#1
- eor r8,r8,r9
- strb r8,[r14],#1 @ store output
- bne .Loop_tail_neon
-
-.Ldone_neon:
- add sp,sp,#4*(32+4)
- vldmia sp,{d8-d15}
- add sp,sp,#4*(16+3)
-.Lno_data_neon:
- ldmia sp!,{r4-r11,pc}
-ENDPROC(chacha20_neon)
-#endif
diff --git a/src/crypto/zinc/chacha20/chacha20-arm.pl b/src/crypto/zinc/chacha20/chacha20-arm.pl
new file mode 100644
index 0000000..3621957
--- /dev/null
+++ b/src/crypto/zinc/chacha20/chacha20-arm.pl
@@ -0,0 +1,1227 @@
+#!/usr/bin/env perl
+# SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
+#
+# This code is taken from the OpenSSL project but the author, Andy Polyakov,
+# has relicensed it under the licenses specified in the SPDX header above.
+# The original headers, including the original license headers, are
+# included below for completeness.
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# December 2014
+#
+# ChaCha20 for ARMv4.
+#
+# September 2018
+#
+# Improve scalar performance per Eric Biggers' suggestion to eliminate
+# separate rotates. This requires b[0..3] and d[0..3] to be maintained
+# pre-rotated, hence odd twists prior inner loop and when accumulating
+# key material. Since amount of instructions is reduced as result, even
+# NEON performance is improved somewhat, most notably by ~9% on low-end
+# Cortex-A5/A7. Full unroll was shown to provide even better scalar
+# performance on Cortex-A5/A7, naturally at the cost of manyfold size
+# increase. We let it be. Oversized code works in benchmarks, but is not
+# necessarily optimal in real life, when it's likely to be out-of-cache
+# upon entry and evict significant part of cache upon completion.
+#
+# Performance in cycles per byte out of large buffer.
+#
+# IALU/gcc-4.4 1xNEON 3xNEON+1xIALU
+#
+# Cortex-A5 14.2(*)/+160% 21.8 12.9(**)
+# Cortex-A8 10.2(*)/+190% 13.9 6.10
+# Cortex-A9 10.8(*)/+150% 14.3 6.50
+# Cortex-A15 11.0/+40% 16.0 4.90
+# Snapdragon S4 13.9(***)/+90% 13.6 4.90
+#
+# (*) most "favourable" result for aligned data on little-endian
+# processor, result for misaligned data is 10-15% lower;
+# (**) pure 4xNEON [with "vertical" layout] was shown to provide ~8%
+# better performance on Cortex-A5/A7, but not on others;
+# (***) it's 17% slower than original, trade-off is considered
+# acceptable, because of improvement on others, specifically
+# +36% on Cortex-A5/A7 and +20% on Cortex-A9;
+
+$flavour = shift;
+if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
+else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
+
+if ($flavour && $flavour ne "void") {
+ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+ ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+ ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+ die "can't locate arm-xlate.pl";
+
+ open STDOUT,"| \"$^X\" $xlate $flavour $output";
+} else {
+ open STDOUT,">$output";
+}
+
+sub AUTOLOAD() # thunk [simplified] x86-style perlasm
+{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
+ my $arg = pop;
+ $arg = "#$arg" if ($arg*1 eq $arg);
+ $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
+}
+
+my @x=map("r$_",(0..7,"x","x","x","x",12,"x",14,"x"));
+my @t=map("r$_",(8..11));
+
+sub ROUND {
+my ($a0,$b0,$c0,$d0)=@_;
+my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
+my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
+my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
+my $odd = $d0&1;
+my ($xc,$xc_) = (@t[0..1]);
+my ($xd,$xd_) = $odd ? (@t[2],@x[$d1]) : (@x[$d0],@t[2]);
+my @ret;
+
+ # Consider order in which variables are addressed by their
+ # index:
+ #
+ # a b c d
+ #
+ # 0 4 8 12 < even round
+ # 1 5 9 13
+ # 2 6 10 14
+ # 3 7 11 15
+ # 0 5 10 15 < odd round
+ # 1 6 11 12
+ # 2 7 8 13
+ # 3 4 9 14
+ #
+ # 'a', 'b' are permanently allocated in registers, @x[0..7],
+ # while 'c's and pair of 'd's are maintained in memory. If
+ # you observe 'c' column, you'll notice that pair of 'c's is
+ # invariant between rounds. This means that we have to reload
+ # them once per round, in the middle. This is why you'll see
+ # bunch of 'c' stores and loads in the middle, but none in
+ # the beginning or end. If you observe 'd' column, you'll
+ # notice that 15 and 13 are reused in next pair of rounds.
+ # This is why these two are chosen for offloading to memory,
+ # to make loads count more.
+ push @ret,(
+ "&add (@x[$a0],@x[$a0],@x[$b0],'ror#13')",
+ "&add (@x[$a1],@x[$a1],@x[$b1],'ror#13')",
+ "&eor ($xd,@x[$a0],$xd,'ror#24')",
+ "&eor ($xd_,@x[$a1],$xd_,'ror#24')",
+
+ "&add ($xc,$xc,$xd,'ror#16')",
+ "&add ($xc_,$xc_,$xd_,'ror#16')",
+ "&eor (@x[$b0],$xc, @x[$b0],'ror#13')",
+ "&eor (@x[$b1],$xc_,@x[$b1],'ror#13')",
+
+ "&add (@x[$a0],@x[$a0],@x[$b0],'ror#20')",
+ "&add (@x[$a1],@x[$a1],@x[$b1],'ror#20')",
+ "&eor ($xd,@x[$a0],$xd,'ror#16')",
+ "&eor ($xd_,@x[$a1],$xd_,'ror#16')" );
+ push @ret,(
+ "&str ($xd,'[sp,#4*(16+$d0)]')" ) if ($odd);
+ push @ret,(
+ "&add ($xc,$xc,$xd,'ror#24')" );
+ push @ret,(
+ "&ldr ($xd,'[sp,#4*(16+$d2)]')" ) if ($odd);
+ push @ret,(
+ "&str ($xd_,'[sp,#4*(16+$d1)]')" ) if (!$odd);
+ push @ret,(
+ "&add ($xc_,$xc_,$xd_,'ror#24')" );
+ push @ret,(
+ "&ldr ($xd_,'[sp,#4*(16+$d3)]')" ) if (!$odd);
+ push @ret,(
+ "&str ($xc,'[sp,#4*(16+$c0)]')",
+ "&eor (@x[$b0],@x[$b0],$xc,'ror#12')",
+ "&str ($xc_,'[sp,#4*(16+$c1)]')",
+ "&eor (@x[$b1],@x[$b1],$xc_,'ror#12')" );
+
+ $xd=@x[$d2] if (!$odd);
+ $xd_=@x[$d3] if ($odd);
+ push @ret,(
+ "&ldr ($xc,'[sp,#4*(16+$c2)]')",
+ "&add (@x[$a2],@x[$a2],@x[$b2],'ror#13')",
+ "&ldr ($xc_,'[sp,#4*(16+$c3)]')",
+ "&add (@x[$a3],@x[$a3],@x[$b3],'ror#13')",
+ "&eor ($xd,@x[$a2],$xd,'ror#24')",
+ "&eor ($xd_,@x[$a3],$xd_,'ror#24')",
+
+ "&add ($xc,$xc,$xd,'ror#16')",
+ "&add ($xc_,$xc_,$xd_,'ror#16')",
+ "&eor (@x[$b2],$xc, @x[$b2],'ror#13')",
+ "&eor (@x[$b3],$xc_,@x[$b3],'ror#13')",
+
+ "&add (@x[$a2],@x[$a2],@x[$b2],'ror#20')",
+ "&add (@x[$a3],@x[$a3],@x[$b3],'ror#20')",
+ "&eor ($xd,@x[$a2],$xd,'ror#16')",
+ "&eor ($xd_,@x[$a3],$xd_,'ror#16')",
+
+ "&add ($xc,$xc,$xd,'ror#24')",
+ "&add ($xc_,$xc_,$xd_,'ror#24')",
+ "&eor (@x[$b2],@x[$b2],$xc,'ror#12')",
+ "&eor (@x[$b3],@x[$b3],$xc_,'ror#12')" );
+
+ @ret;
+}
+
+$code.=<<___;
+#ifndef __KERNEL__
+# include "arm_arch.h"
+#else
+# define __ARM_ARCH__ __LINUX_ARM_ARCH__
+# define __ARM_MAX_ARCH__ __LINUX_ARM_ARCH__
+# define ChaCha20_ctr32 chacha20_arm_cryptogams
+# define ChaCha20_neon chacha20_neon
+#endif
+
+.text
+#if defined(__thumb2__) || defined(__clang__)
+.syntax unified
+# define ldrhsb ldrbhs
+#endif
+#if defined(__thumb2__)
+.thumb
+#else
+.code 32
+#endif
+
+.align 5
+.Lsigma:
+.long 0x61707865,0x3320646e,0x79622d32,0x6b206574 @ endian-neutral
+.Lone:
+.long 1,0,0,0
+.Lrot8:
+.long 0x02010003,0x06050407
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+.LOPENSSL_armcap:
+.word OPENSSL_armcap_P-.LChaCha20_ctr32
+#else
+.word -1
+#endif
+
+.globl ChaCha20_ctr32
+.type ChaCha20_ctr32,%function
+.align 5
+ChaCha20_ctr32:
+.LChaCha20_ctr32:
+ ldr r12,[sp,#0] @ pull pointer to counter and nonce
+ stmdb sp!,{r0-r2,r4-r11,lr}
+#if __ARM_ARCH__<7 && !defined(__thumb2__)
+ sub r14,pc,#16 @ ChaCha20_ctr32
+#else
+ adr r14,.LChaCha20_ctr32
+#endif
+ cmp r2,#0 @ len==0?
+#ifdef __thumb2__
+ itt eq
+#endif
+ addeq sp,sp,#4*3
+ beq .Lno_data
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+ cmp r2,#192 @ test len
+ bls .Lshort
+ ldr r4,[r14,#-24]
+ ldr r4,[r14,r4]
+# ifdef __APPLE__
+ ldr r4,[r4]
+# endif
+ tst r4,#ARMV7_NEON
+ bne .LChaCha20_neon
+.Lshort:
+#endif
+ ldmia r12,{r4-r7} @ load counter and nonce
+ sub sp,sp,#4*(16) @ off-load area
+ sub r14,r14,#64 @ .Lsigma
+ stmdb sp!,{r4-r7} @ copy counter and nonce
+ ldmia r3,{r4-r11} @ load key
+ ldmia r14,{r0-r3} @ load sigma
+ stmdb sp!,{r4-r11} @ copy key
+ stmdb sp!,{r0-r3} @ copy sigma
+ str r10,[sp,#4*(16+10)] @ off-load "@x[10]"
+ str r11,[sp,#4*(16+11)] @ off-load "@x[11]"
+ b .Loop_outer_enter
+
+.align 4
+.Loop_outer:
+ ldmia sp,{r0-r9} @ load key material
+ str @t[3],[sp,#4*(32+2)] @ save len
+ str r12, [sp,#4*(32+1)] @ save inp
+ str r14, [sp,#4*(32+0)] @ save out
+.Loop_outer_enter:
+ ldr @t[3], [sp,#4*(15)]
+ mov @x[4],@x[4],ror#19 @ twist b[0..3]
+ ldr @x[12],[sp,#4*(12)] @ modulo-scheduled load
+ mov @x[5],@x[5],ror#19
+ ldr @t[2], [sp,#4*(13)]
+ mov @x[6],@x[6],ror#19
+ ldr @x[14],[sp,#4*(14)]
+ mov @x[7],@x[7],ror#19
+ mov @t[3],@t[3],ror#8 @ twist d[0..3]
+ mov @x[12],@x[12],ror#8
+ mov @t[2],@t[2],ror#8
+ mov @x[14],@x[14],ror#8
+ str @t[3], [sp,#4*(16+15)]
+ mov @t[3],#10
+ b .Loop
+
+.align 4
+.Loop:
+ subs @t[3],@t[3],#1
+___
+ foreach (&ROUND(0, 4, 8,12)) { eval; }
+ foreach (&ROUND(0, 5,10,15)) { eval; }
+$code.=<<___;
+ bne .Loop
+
+ ldr @t[3],[sp,#4*(32+2)] @ load len
+
+ str @t[0], [sp,#4*(16+8)] @ modulo-scheduled store
+ str @t[1], [sp,#4*(16+9)]
+ str @x[12],[sp,#4*(16+12)]
+ str @t[2], [sp,#4*(16+13)]
+ str @x[14],[sp,#4*(16+14)]
+
+ @ at this point we have first half of 512-bit result in
+ @ @x[0-7] and second half at sp+4*(16+8)
+
+ cmp @t[3],#64 @ done yet?
+#ifdef __thumb2__
+ itete lo
+#endif
+ addlo r12,sp,#4*(0) @ shortcut or ...
+ ldrhs r12,[sp,#4*(32+1)] @ ... load inp
+ addlo r14,sp,#4*(0) @ shortcut or ...
+ ldrhs r14,[sp,#4*(32+0)] @ ... load out
+
+ ldr @t[0],[sp,#4*(0)] @ load key material
+ ldr @t[1],[sp,#4*(1)]
+
+#if __ARM_ARCH__>=6 || !defined(__ARMEB__)
+# if __ARM_ARCH__<7
+ orr @t[2],r12,r14
+ tst @t[2],#3 @ are input and output aligned?
+ ldr @t[2],[sp,#4*(2)]
+ bne .Lunaligned
+ cmp @t[3],#64 @ restore flags
+# else
+ ldr @t[2],[sp,#4*(2)]
+# endif
+ ldr @t[3],[sp,#4*(3)]
+
+ add @x[0],@x[0],@t[0] @ accumulate key material
+ add @x[1],@x[1],@t[1]
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhs @t[0],[r12],#16 @ load input
+ ldrhs @t[1],[r12,#-12]
+
+ add @x[2],@x[2],@t[2]
+ add @x[3],@x[3],@t[3]
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhs @t[2],[r12,#-8]
+ ldrhs @t[3],[r12,#-4]
+# if __ARM_ARCH__>=6 && defined(__ARMEB__)
+ rev @x[0],@x[0]
+ rev @x[1],@x[1]
+ rev @x[2],@x[2]
+ rev @x[3],@x[3]
+# endif
+# ifdef __thumb2__
+ itt hs
+# endif
+ eorhs @x[0],@x[0],@t[0] @ xor with input
+ eorhs @x[1],@x[1],@t[1]
+ add @t[0],sp,#4*(4)
+ str @x[0],[r14],#16 @ store output
+# ifdef __thumb2__
+ itt hs
+# endif
+ eorhs @x[2],@x[2],@t[2]
+ eorhs @x[3],@x[3],@t[3]
+ ldmia @t[0],{@t[0]-@t[3]} @ load key material
+ str @x[1],[r14,#-12]
+ str @x[2],[r14,#-8]
+ str @x[3],[r14,#-4]
+
+ add @x[4],@t[0],@x[4],ror#13 @ accumulate key material
+ add @x[5],@t[1],@x[5],ror#13
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhs @t[0],[r12],#16 @ load input
+ ldrhs @t[1],[r12,#-12]
+ add @x[6],@t[2],@x[6],ror#13
+ add @x[7],@t[3],@x[7],ror#13
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhs @t[2],[r12,#-8]
+ ldrhs @t[3],[r12,#-4]
+# if __ARM_ARCH__>=6 && defined(__ARMEB__)
+ rev @x[4],@x[4]
+ rev @x[5],@x[5]
+ rev @x[6],@x[6]
+ rev @x[7],@x[7]
+# endif
+# ifdef __thumb2__
+ itt hs
+# endif
+ eorhs @x[4],@x[4],@t[0]
+ eorhs @x[5],@x[5],@t[1]
+ add @t[0],sp,#4*(8)
+ str @x[4],[r14],#16 @ store output
+# ifdef __thumb2__
+ itt hs
+# endif
+ eorhs @x[6],@x[6],@t[2]
+ eorhs @x[7],@x[7],@t[3]
+ str @x[5],[r14,#-12]
+ ldmia @t[0],{@t[0]-@t[3]} @ load key material
+ str @x[6],[r14,#-8]
+ add @x[0],sp,#4*(16+8)
+ str @x[7],[r14,#-4]
+
+ ldmia @x[0],{@x[0]-@x[7]} @ load second half
+
+ add @x[0],@x[0],@t[0] @ accumulate key material
+ add @x[1],@x[1],@t[1]
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhs @t[0],[r12],#16 @ load input
+ ldrhs @t[1],[r12,#-12]
+# ifdef __thumb2__
+ itt hi
+# endif
+ strhi @t[2],[sp,#4*(16+10)] @ copy "@x[10]" while at it
+ strhi @t[3],[sp,#4*(16+11)] @ copy "@x[11]" while at it
+ add @x[2],@x[2],@t[2]
+ add @x[3],@x[3],@t[3]
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhs @t[2],[r12,#-8]
+ ldrhs @t[3],[r12,#-4]
+# if __ARM_ARCH__>=6 && defined(__ARMEB__)
+ rev @x[0],@x[0]
+ rev @x[1],@x[1]
+ rev @x[2],@x[2]
+ rev @x[3],@x[3]
+# endif
+# ifdef __thumb2__
+ itt hs
+# endif
+ eorhs @x[0],@x[0],@t[0]
+ eorhs @x[1],@x[1],@t[1]
+ add @t[0],sp,#4*(12)
+ str @x[0],[r14],#16 @ store output
+# ifdef __thumb2__
+ itt hs
+# endif
+ eorhs @x[2],@x[2],@t[2]
+ eorhs @x[3],@x[3],@t[3]
+ str @x[1],[r14,#-12]
+ ldmia @t[0],{@t[0]-@t[3]} @ load key material
+ str @x[2],[r14,#-8]
+ str @x[3],[r14,#-4]
+
+ add @x[4],@t[0],@x[4],ror#24 @ accumulate key material
+ add @x[5],@t[1],@x[5],ror#24
+# ifdef __thumb2__
+ itt hi
+# endif
+ addhi @t[0],@t[0],#1 @ next counter value
+ strhi @t[0],[sp,#4*(12)] @ save next counter value
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhs @t[0],[r12],#16 @ load input
+ ldrhs @t[1],[r12,#-12]
+ add @x[6],@t[2],@x[6],ror#24
+ add @x[7],@t[3],@x[7],ror#24
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhs @t[2],[r12,#-8]
+ ldrhs @t[3],[r12,#-4]
+# if __ARM_ARCH__>=6 && defined(__ARMEB__)
+ rev @x[4],@x[4]
+ rev @x[5],@x[5]
+ rev @x[6],@x[6]
+ rev @x[7],@x[7]
+# endif
+# ifdef __thumb2__
+ itt hs
+# endif
+ eorhs @x[4],@x[4],@t[0]
+ eorhs @x[5],@x[5],@t[1]
+# ifdef __thumb2__
+ it ne
+# endif
+ ldrne @t[0],[sp,#4*(32+2)] @ re-load len
+# ifdef __thumb2__
+ itt hs
+# endif
+ eorhs @x[6],@x[6],@t[2]
+ eorhs @x[7],@x[7],@t[3]
+ str @x[4],[r14],#16 @ store output
+ str @x[5],[r14,#-12]
+# ifdef __thumb2__
+ it hs
+# endif
+ subhs @t[3],@t[0],#64 @ len-=64
+ str @x[6],[r14,#-8]
+ str @x[7],[r14,#-4]
+ bhi .Loop_outer
+
+ beq .Ldone
+# if __ARM_ARCH__<7
+ b .Ltail
+
+.align 4
+.Lunaligned: @ unaligned endian-neutral path
+ cmp @t[3],#64 @ restore flags
+# endif
+#endif
+#if __ARM_ARCH__<7
+ ldr @t[3],[sp,#4*(3)]
+___
+for ($i=0;$i<16;$i+=4) {
+my $j=$i&0x7;
+my $twist="";
+if ($i==4) { $twist = ",ror#13"; }
+elsif ($i==12) { $twist = ",ror#24"; }
+
+$code.=<<___ if ($i==4);
+ add @x[0],sp,#4*(16+8)
+___
+$code.=<<___ if ($i==8);
+ ldmia @x[0],{@x[0]-@x[7]} @ load second half
+# ifdef __thumb2__
+ itt hi
+# endif
+ strhi @t[2],[sp,#4*(16+10)] @ copy "@x[10]"
+ strhi @t[3],[sp,#4*(16+11)] @ copy "@x[11]"
+___
+$code.=<<___;
+ add @x[$j+0],@t[0],@x[$j+0]$twist @ accumulate key material
+___
+$code.=<<___ if ($i==12);
+# ifdef __thumb2__
+ itt hi
+# endif
+ addhi @t[0],@t[0],#1 @ next counter value
+ strhi @t[0],[sp,#4*(12)] @ save next counter value
+___
+$code.=<<___;
+ add @x[$j+1],@t[1],@x[$j+1]$twist
+ add @x[$j+2],@t[2],@x[$j+2]$twist
+# ifdef __thumb2__
+ itete lo
+# endif
+ eorlo @t[0],@t[0],@t[0] @ zero or ...
+ ldrhsb @t[0],[r12],#16 @ ... load input
+ eorlo @t[1],@t[1],@t[1]
+ ldrhsb @t[1],[r12,#-12]
+
+ add @x[$j+3],@t[3],@x[$j+3]$twist
+# ifdef __thumb2__
+ itete lo
+# endif
+ eorlo @t[2],@t[2],@t[2]
+ ldrhsb @t[2],[r12,#-8]
+ eorlo @t[3],@t[3],@t[3]
+ ldrhsb @t[3],[r12,#-4]
+
+ eor @x[$j+0],@t[0],@x[$j+0] @ xor with input (or zero)
+ eor @x[$j+1],@t[1],@x[$j+1]
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhsb @t[0],[r12,#-15] @ load more input
+ ldrhsb @t[1],[r12,#-11]
+ eor @x[$j+2],@t[2],@x[$j+2]
+ strb @x[$j+0],[r14],#16 @ store output
+ eor @x[$j+3],@t[3],@x[$j+3]
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhsb @t[2],[r12,#-7]
+ ldrhsb @t[3],[r12,#-3]
+ strb @x[$j+1],[r14,#-12]
+ eor @x[$j+0],@t[0],@x[$j+0],lsr#8
+ strb @x[$j+2],[r14,#-8]
+ eor @x[$j+1],@t[1],@x[$j+1],lsr#8
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhsb @t[0],[r12,#-14] @ load more input
+ ldrhsb @t[1],[r12,#-10]
+ strb @x[$j+3],[r14,#-4]
+ eor @x[$j+2],@t[2],@x[$j+2],lsr#8
+ strb @x[$j+0],[r14,#-15]
+ eor @x[$j+3],@t[3],@x[$j+3],lsr#8
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhsb @t[2],[r12,#-6]
+ ldrhsb @t[3],[r12,#-2]
+ strb @x[$j+1],[r14,#-11]
+ eor @x[$j+0],@t[0],@x[$j+0],lsr#8
+ strb @x[$j+2],[r14,#-7]
+ eor @x[$j+1],@t[1],@x[$j+1],lsr#8
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhsb @t[0],[r12,#-13] @ load more input
+ ldrhsb @t[1],[r12,#-9]
+ strb @x[$j+3],[r14,#-3]
+ eor @x[$j+2],@t[2],@x[$j+2],lsr#8
+ strb @x[$j+0],[r14,#-14]
+ eor @x[$j+3],@t[3],@x[$j+3],lsr#8
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhsb @t[2],[r12,#-5]
+ ldrhsb @t[3],[r12,#-1]
+ strb @x[$j+1],[r14,#-10]
+ strb @x[$j+2],[r14,#-6]
+ eor @x[$j+0],@t[0],@x[$j+0],lsr#8
+ strb @x[$j+3],[r14,#-2]
+ eor @x[$j+1],@t[1],@x[$j+1],lsr#8
+ strb @x[$j+0],[r14,#-13]
+ eor @x[$j+2],@t[2],@x[$j+2],lsr#8
+ strb @x[$j+1],[r14,#-9]
+ eor @x[$j+3],@t[3],@x[$j+3],lsr#8
+ strb @x[$j+2],[r14,#-5]
+ strb @x[$j+3],[r14,#-1]
+___
+$code.=<<___ if ($i<12);
+ add @t[0],sp,#4*(4+$i)
+ ldmia @t[0],{@t[0]-@t[3]} @ load key material
+___
+}
+$code.=<<___;
+# ifdef __thumb2__
+ it ne
+# endif
+ ldrne @t[0],[sp,#4*(32+2)] @ re-load len
+# ifdef __thumb2__
+ it hs
+# endif
+ subhs @t[3],@t[0],#64 @ len-=64
+ bhi .Loop_outer
+
+ beq .Ldone
+#endif
+
+.Ltail:
+ ldr r12,[sp,#4*(32+1)] @ load inp
+ add @t[1],sp,#4*(0)
+ ldr r14,[sp,#4*(32+0)] @ load out
+
+.Loop_tail:
+ ldrb @t[2],[@t[1]],#1 @ read buffer on stack
+ ldrb @t[3],[r12],#1 @ read input
+ subs @t[0],@t[0],#1
+ eor @t[3],@t[3],@t[2]
+ strb @t[3],[r14],#1 @ store output
+ bne .Loop_tail
+
+.Ldone:
+ add sp,sp,#4*(32+3)
+.Lno_data:
+#if __ARM_ARCH__>=5
+ ldmia sp!,{r4-r11,pc}
+#else
+ ldmia sp!,{r4-r12,lr}
+ tst lr,#1
+ moveq pc,lr @ be binary compatible with V4, yet
+ .long 0xe12fff1e @ interoperable with Thumb ISA:-)
+#endif
+.size ChaCha20_ctr32,.-ChaCha20_ctr32
+___
+
+{{{
+my ($a0,$b0,$c0,$d0,$a1,$b1,$c1,$d1,$a2,$b2,$c2,$d2,$t0,$t1,$t2,$t3) =
+ map("q$_",(0..15));
+
+# This can replace vshr-by-24+vsli-by-8. It gives ~3% improvement on
+# Cortex-A5/A7, but hurts Cortex-A9 by 5% and Snapdragon S4 by 14%!
+sub vperm()
+{ my ($dst,$src,$tbl) = @_;
+ $code .= " vtbl.8 $dst#lo,{$src#lo},$tbl#lo\n";
+ $code .= " vtbl.8 $dst#hi,{$src#hi},$tbl#lo\n";
+}
+
+sub NEONROUND {
+my $odd = pop;
+my ($a,$b,$c,$d,$t)=@_;
+
+ (
+ "&vadd_i32 ($a,$a,$b)",
+ "&veor ($d,$d,$a)",
+ "&vrev32_16 ($d,$d)", # vrot ($d,16)
+
+ "&vadd_i32 ($c,$c,$d)",
+ "&veor ($t,$b,$c)",
+ "&vshr_u32 ($b,$t,20)",
+ "&vsli_32 ($b,$t,12)",
+
+ "&vadd_i32 ($a,$a,$b)",
+ "&veor ($t,$d,$a)",
+ "&vshr_u32 ($d,$t,24)",
+ "&vsli_32 ($d,$t,8)",
+ #"&vperm ($d,$t,$t3)",
+
+ "&vadd_i32 ($c,$c,$d)",
+ "&veor ($t,$b,$c)",
+ "&vshr_u32 ($b,$t,25)",
+ "&vsli_32 ($b,$t,7)",
+
+ "&vext_8 ($c,$c,$c,8)",
+ "&vext_8 ($b,$b,$b,$odd?12:4)",
+ "&vext_8 ($d,$d,$d,$odd?4:12)"
+ );
+}
+
+$code.=<<___;
+#if __ARM_MAX_ARCH__>=7
+.arch armv7-a
+.fpu neon
+
+# ifdef __KERNEL__
+.globl ChaCha20_neon
+@ For optimal performance it's appropriate for caller to enforce
+@ minimum input length, 193 bytes is suggested.
+# endif
+.type ChaCha20_neon,%function
+.align 5
+ChaCha20_neon:
+ ldr r12,[sp,#0] @ pull pointer to counter and nonce
+ stmdb sp!,{r0-r2,r4-r11,lr}
+.LChaCha20_neon:
+ adr r14,.Lsigma
+ vstmdb sp!,{d8-d15} @ ABI spec says so
+ stmdb sp!,{r0-r3}
+
+ vld1.32 {$b0-$c0},[r3] @ load key
+ ldmia r3,{r4-r11} @ load key
+
+ sub sp,sp,#4*(16+16)
+ vld1.32 {$d0},[r12] @ load counter and nonce
+ add r12,sp,#4*8
+ ldmia r14,{r0-r3} @ load sigma
+ vld1.32 {$a0},[r14]! @ load sigma
+ vld1.32 {$t0},[r14]! @ one
+ @ vld1.32 {$t3#lo},[r14] @ rot8
+ vst1.32 {$c0-$d0},[r12] @ copy 1/2key|counter|nonce
+ vst1.32 {$a0-$b0},[sp] @ copy sigma|1/2key
+
+ str r10,[sp,#4*(16+10)] @ off-load "@x[10]"
+ str r11,[sp,#4*(16+11)] @ off-load "@x[11]"
+ vshl.i32 $t1#lo,$t0#lo,#1 @ two
+ vstr $t0#lo,[sp,#4*(16+0)]
+ vshl.i32 $t2#lo,$t0#lo,#2 @ four
+ vstr $t1#lo,[sp,#4*(16+2)]
+ vmov $a1,$a0
+ vstr $t2#lo,[sp,#4*(16+4)]
+ vmov $a2,$a0
+ @ vstr $t3#lo,[sp,#4*(16+6)]
+ vmov $b1,$b0
+ vmov $b2,$b0
+ b .Loop_neon_enter
+
+.align 4
+.Loop_neon_outer:
+ ldmia sp,{r0-r9} @ load key material
+ cmp @t[3],#64*2 @ if len<=64*2
+ bls .Lbreak_neon @ switch to integer-only
+ @ vldr $t3#lo,[sp,#4*(16+6)] @ rot8
+ vmov $a1,$a0
+ str @t[3],[sp,#4*(32+2)] @ save len
+ vmov $a2,$a0
+ str r12, [sp,#4*(32+1)] @ save inp
+ vmov $b1,$b0
+ str r14, [sp,#4*(32+0)] @ save out
+ vmov $b2,$b0
+.Loop_neon_enter:
+ ldr @t[3], [sp,#4*(15)]
+ mov @x[4],@x[4],ror#19 @ twist b[0..3]
+ vadd.i32 $d1,$d0,$t0 @ counter+1
+ ldr @x[12],[sp,#4*(12)] @ modulo-scheduled load
+ mov @x[5],@x[5],ror#19
+ vmov $c1,$c0
+ ldr @t[2], [sp,#4*(13)]
+ mov @x[6],@x[6],ror#19
+ vmov $c2,$c0
+ ldr @x[14],[sp,#4*(14)]
+ mov @x[7],@x[7],ror#19
+ vadd.i32 $d2,$d1,$t0 @ counter+2
+ add @x[12],@x[12],#3 @ counter+3
+ mov @t[3],@t[3],ror#8 @ twist d[0..3]
+ mov @x[12],@x[12],ror#8
+ mov @t[2],@t[2],ror#8
+ mov @x[14],@x[14],ror#8
+ str @t[3], [sp,#4*(16+15)]
+ mov @t[3],#10
+ b .Loop_neon
+
+.align 4
+.Loop_neon:
+ subs @t[3],@t[3],#1
+___
+ my @thread0=&NEONROUND($a0,$b0,$c0,$d0,$t0,0);
+ my @thread1=&NEONROUND($a1,$b1,$c1,$d1,$t1,0);
+ my @thread2=&NEONROUND($a2,$b2,$c2,$d2,$t2,0);
+ my @thread3=&ROUND(0,4,8,12);
+
+ foreach (@thread0) {
+ eval; eval(shift(@thread3));
+ eval(shift(@thread1)); eval(shift(@thread3));
+ eval(shift(@thread2)); eval(shift(@thread3));
+ }
+
+ @thread0=&NEONROUND($a0,$b0,$c0,$d0,$t0,1);
+ @thread1=&NEONROUND($a1,$b1,$c1,$d1,$t1,1);
+ @thread2=&NEONROUND($a2,$b2,$c2,$d2,$t2,1);
+ @thread3=&ROUND(0,5,10,15);
+
+ foreach (@thread0) {
+ eval; eval(shift(@thread3));
+ eval(shift(@thread1)); eval(shift(@thread3));
+ eval(shift(@thread2)); eval(shift(@thread3));
+ }
+$code.=<<___;
+ bne .Loop_neon
+
+ add @t[3],sp,#32
+ vld1.32 {$t0-$t1},[sp] @ load key material
+ vld1.32 {$t2-$t3},[@t[3]]
+
+ ldr @t[3],[sp,#4*(32+2)] @ load len
+
+ str @t[0], [sp,#4*(16+8)] @ modulo-scheduled store
+ str @t[1], [sp,#4*(16+9)]
+ str @x[12],[sp,#4*(16+12)]
+ str @t[2], [sp,#4*(16+13)]
+ str @x[14],[sp,#4*(16+14)]
+
+ @ at this point we have first half of 512-bit result in
+ @ @x[0-7] and second half at sp+4*(16+8)
+
+ ldr r12,[sp,#4*(32+1)] @ load inp
+ ldr r14,[sp,#4*(32+0)] @ load out
+
+ vadd.i32 $a0,$a0,$t0 @ accumulate key material
+ vadd.i32 $a1,$a1,$t0
+ vadd.i32 $a2,$a2,$t0
+ vldr $t0#lo,[sp,#4*(16+0)] @ one
+
+ vadd.i32 $b0,$b0,$t1
+ vadd.i32 $b1,$b1,$t1
+ vadd.i32 $b2,$b2,$t1
+ vldr $t1#lo,[sp,#4*(16+2)] @ two
+
+ vadd.i32 $c0,$c0,$t2
+ vadd.i32 $c1,$c1,$t2
+ vadd.i32 $c2,$c2,$t2
+ vadd.i32 $d1#lo,$d1#lo,$t0#lo @ counter+1
+ vadd.i32 $d2#lo,$d2#lo,$t1#lo @ counter+2
+
+ vadd.i32 $d0,$d0,$t3
+ vadd.i32 $d1,$d1,$t3
+ vadd.i32 $d2,$d2,$t3
+
+ cmp @t[3],#64*4
+ blo .Ltail_neon
+
+ vld1.8 {$t0-$t1},[r12]! @ load input
+ mov @t[3],sp
+ vld1.8 {$t2-$t3},[r12]!
+ veor $a0,$a0,$t0 @ xor with input
+ veor $b0,$b0,$t1
+ vld1.8 {$t0-$t1},[r12]!
+ veor $c0,$c0,$t2
+ veor $d0,$d0,$t3
+ vld1.8 {$t2-$t3},[r12]!
+
+ veor $a1,$a1,$t0
+ vst1.8 {$a0-$b0},[r14]! @ store output
+ veor $b1,$b1,$t1
+ vld1.8 {$t0-$t1},[r12]!
+ veor $c1,$c1,$t2
+ vst1.8 {$c0-$d0},[r14]!
+ veor $d1,$d1,$t3
+ vld1.8 {$t2-$t3},[r12]!
+
+ veor $a2,$a2,$t0
+ vld1.32 {$a0-$b0},[@t[3]]! @ load for next iteration
+ veor $t0#hi,$t0#hi,$t0#hi
+ vldr $t0#lo,[sp,#4*(16+4)] @ four
+ veor $b2,$b2,$t1
+ vld1.32 {$c0-$d0},[@t[3]]
+ veor $c2,$c2,$t2
+ vst1.8 {$a1-$b1},[r14]!
+ veor $d2,$d2,$t3
+ vst1.8 {$c1-$d1},[r14]!
+
+ vadd.i32 $d0#lo,$d0#lo,$t0#lo @ next counter value
+ vldr $t0#lo,[sp,#4*(16+0)] @ one
+
+ ldmia sp,{@t[0]-@t[3]} @ load key material
+ add @x[0],@x[0],@t[0] @ accumulate key material
+ ldr @t[0],[r12],#16 @ load input
+ vst1.8 {$a2-$b2},[r14]!
+ add @x[1],@x[1],@t[1]
+ ldr @t[1],[r12,#-12]
+ vst1.8 {$c2-$d2},[r14]!
+ add @x[2],@x[2],@t[2]
+ ldr @t[2],[r12,#-8]
+ add @x[3],@x[3],@t[3]
+ ldr @t[3],[r12,#-4]
+# ifdef __ARMEB__
+ rev @x[0],@x[0]
+ rev @x[1],@x[1]
+ rev @x[2],@x[2]
+ rev @x[3],@x[3]
+# endif
+ eor @x[0],@x[0],@t[0] @ xor with input
+ add @t[0],sp,#4*(4)
+ eor @x[1],@x[1],@t[1]
+ str @x[0],[r14],#16 @ store output
+ eor @x[2],@x[2],@t[2]
+ str @x[1],[r14,#-12]
+ eor @x[3],@x[3],@t[3]
+ ldmia @t[0],{@t[0]-@t[3]} @ load key material
+ str @x[2],[r14,#-8]
+ str @x[3],[r14,#-4]
+
+ add @x[4],@t[0],@x[4],ror#13 @ accumulate key material
+ ldr @t[0],[r12],#16 @ load input
+ add @x[5],@t[1],@x[5],ror#13
+ ldr @t[1],[r12,#-12]
+ add @x[6],@t[2],@x[6],ror#13
+ ldr @t[2],[r12,#-8]
+ add @x[7],@t[3],@x[7],ror#13
+ ldr @t[3],[r12,#-4]
+# ifdef __ARMEB__
+ rev @x[4],@x[4]
+ rev @x[5],@x[5]
+ rev @x[6],@x[6]
+ rev @x[7],@x[7]
+# endif
+ eor @x[4],@x[4],@t[0]
+ add @t[0],sp,#4*(8)
+ eor @x[5],@x[5],@t[1]
+ str @x[4],[r14],#16 @ store output
+ eor @x[6],@x[6],@t[2]
+ str @x[5],[r14,#-12]
+ eor @x[7],@x[7],@t[3]
+ ldmia @t[0],{@t[0]-@t[3]} @ load key material
+ str @x[6],[r14,#-8]
+ add @x[0],sp,#4*(16+8)
+ str @x[7],[r14,#-4]
+
+ ldmia @x[0],{@x[0]-@x[7]} @ load second half
+
+ add @x[0],@x[0],@t[0] @ accumulate key material
+ ldr @t[0],[r12],#16 @ load input
+ add @x[1],@x[1],@t[1]
+ ldr @t[1],[r12,#-12]
+# ifdef __thumb2__
+ it hi
+# endif
+ strhi @t[2],[sp,#4*(16+10)] @ copy "@x[10]" while at it
+ add @x[2],@x[2],@t[2]
+ ldr @t[2],[r12,#-8]
+# ifdef __thumb2__
+ it hi
+# endif
+ strhi @t[3],[sp,#4*(16+11)] @ copy "@x[11]" while at it
+ add @x[3],@x[3],@t[3]
+ ldr @t[3],[r12,#-4]
+# ifdef __ARMEB__
+ rev @x[0],@x[0]
+ rev @x[1],@x[1]
+ rev @x[2],@x[2]
+ rev @x[3],@x[3]
+# endif
+ eor @x[0],@x[0],@t[0]
+ add @t[0],sp,#4*(12)
+ eor @x[1],@x[1],@t[1]
+ str @x[0],[r14],#16 @ store output
+ eor @x[2],@x[2],@t[2]
+ str @x[1],[r14,#-12]
+ eor @x[3],@x[3],@t[3]
+ ldmia @t[0],{@t[0]-@t[3]} @ load key material
+ str @x[2],[r14,#-8]
+ str @x[3],[r14,#-4]
+
+ add @x[4],@t[0],@x[4],ror#24 @ accumulate key material
+ add @t[0],@t[0],#4 @ next counter value
+ add @x[5],@t[1],@x[5],ror#24
+ str @t[0],[sp,#4*(12)] @ save next counter value
+ ldr @t[0],[r12],#16 @ load input
+ add @x[6],@t[2],@x[6],ror#24
+ add @x[4],@x[4],#3 @ counter+3
+ ldr @t[1],[r12,#-12]
+ add @x[7],@t[3],@x[7],ror#24
+ ldr @t[2],[r12,#-8]
+ ldr @t[3],[r12,#-4]
+# ifdef __ARMEB__
+ rev @x[4],@x[4]
+ rev @x[5],@x[5]
+ rev @x[6],@x[6]
+ rev @x[7],@x[7]
+# endif
+ eor @x[4],@x[4],@t[0]
+# ifdef __thumb2__
+ it hi
+# endif
+ ldrhi @t[0],[sp,#4*(32+2)] @ re-load len
+ eor @x[5],@x[5],@t[1]
+ eor @x[6],@x[6],@t[2]
+ str @x[4],[r14],#16 @ store output
+ eor @x[7],@x[7],@t[3]
+ str @x[5],[r14,#-12]
+ sub @t[3],@t[0],#64*4 @ len-=64*4
+ str @x[6],[r14,#-8]
+ str @x[7],[r14,#-4]
+ bhi .Loop_neon_outer
+
+ b .Ldone_neon
+
+.align 4
+.Lbreak_neon:
+ @ harmonize NEON and integer-only stack frames: load data
+ @ from NEON frame, but save to integer-only one; distance
+ @ between the two is 4*(32+4+16-32)=4*(20).
+
+ str @t[3], [sp,#4*(20+32+2)] @ save len
+ add @t[3],sp,#4*(32+4)
+ str r12, [sp,#4*(20+32+1)] @ save inp
+ str r14, [sp,#4*(20+32+0)] @ save out
+
+ ldr @x[12],[sp,#4*(16+10)]
+ ldr @x[14],[sp,#4*(16+11)]
+ vldmia @t[3],{d8-d15} @ fulfill ABI requirement
+ str @x[12],[sp,#4*(20+16+10)] @ copy "@x[10]"
+ str @x[14],[sp,#4*(20+16+11)] @ copy "@x[11]"
+
+ ldr @t[3], [sp,#4*(15)]
+ mov @x[4],@x[4],ror#19 @ twist b[0..3]
+ ldr @x[12],[sp,#4*(12)] @ modulo-scheduled load
+ mov @x[5],@x[5],ror#19
+ ldr @t[2], [sp,#4*(13)]
+ mov @x[6],@x[6],ror#19
+ ldr @x[14],[sp,#4*(14)]
+ mov @x[7],@x[7],ror#19
+ mov @t[3],@t[3],ror#8 @ twist d[0..3]
+ mov @x[12],@x[12],ror#8
+ mov @t[2],@t[2],ror#8
+ mov @x[14],@x[14],ror#8
+ str @t[3], [sp,#4*(20+16+15)]
+ add @t[3],sp,#4*(20)
+ vst1.32 {$a0-$b0},[@t[3]]! @ copy key
+ add sp,sp,#4*(20) @ switch frame
+ vst1.32 {$c0-$d0},[@t[3]]
+ mov @t[3],#10
+ b .Loop @ go integer-only
+
+.align 4
+.Ltail_neon:
+ cmp @t[3],#64*3
+ bhs .L192_or_more_neon
+ cmp @t[3],#64*2
+ bhs .L128_or_more_neon
+ cmp @t[3],#64*1
+ bhs .L64_or_more_neon
+
+ add @t[0],sp,#4*(8)
+ vst1.8 {$a0-$b0},[sp]
+ add @t[2],sp,#4*(0)
+ vst1.8 {$c0-$d0},[@t[0]]
+ b .Loop_tail_neon
+
+.align 4
+.L64_or_more_neon:
+ vld1.8 {$t0-$t1},[r12]!
+ vld1.8 {$t2-$t3},[r12]!
+ veor $a0,$a0,$t0
+ veor $b0,$b0,$t1
+ veor $c0,$c0,$t2
+ veor $d0,$d0,$t3
+ vst1.8 {$a0-$b0},[r14]!
+ vst1.8 {$c0-$d0},[r14]!
+
+ beq .Ldone_neon
+
+ add @t[0],sp,#4*(8)
+ vst1.8 {$a1-$b1},[sp]
+ add @t[2],sp,#4*(0)
+ vst1.8 {$c1-$d1},[@t[0]]
+ sub @t[3],@t[3],#64*1 @ len-=64*1
+ b .Loop_tail_neon
+
+.align 4
+.L128_or_more_neon:
+ vld1.8 {$t0-$t1},[r12]!
+ vld1.8 {$t2-$t3},[r12]!
+ veor $a0,$a0,$t0
+ veor $b0,$b0,$t1
+ vld1.8 {$t0-$t1},[r12]!
+ veor $c0,$c0,$t2
+ veor $d0,$d0,$t3
+ vld1.8 {$t2-$t3},[r12]!
+
+ veor $a1,$a1,$t0
+ veor $b1,$b1,$t1
+ vst1.8 {$a0-$b0},[r14]!
+ veor $c1,$c1,$t2
+ vst1.8 {$c0-$d0},[r14]!
+ veor $d1,$d1,$t3
+ vst1.8 {$a1-$b1},[r14]!
+ vst1.8 {$c1-$d1},[r14]!
+
+ beq .Ldone_neon
+
+ add @t[0],sp,#4*(8)
+ vst1.8 {$a2-$b2},[sp]
+ add @t[2],sp,#4*(0)
+ vst1.8 {$c2-$d2},[@t[0]]
+ sub @t[3],@t[3],#64*2 @ len-=64*2
+ b .Loop_tail_neon
+
+.align 4
+.L192_or_more_neon:
+ vld1.8 {$t0-$t1},[r12]!
+ vld1.8 {$t2-$t3},[r12]!
+ veor $a0,$a0,$t0
+ veor $b0,$b0,$t1
+ vld1.8 {$t0-$t1},[r12]!
+ veor $c0,$c0,$t2
+ veor $d0,$d0,$t3
+ vld1.8 {$t2-$t3},[r12]!
+
+ veor $a1,$a1,$t0
+ veor $b1,$b1,$t1
+ vld1.8 {$t0-$t1},[r12]!
+ veor $c1,$c1,$t2
+ vst1.8 {$a0-$b0},[r14]!
+ veor $d1,$d1,$t3
+ vld1.8 {$t2-$t3},[r12]!
+
+ veor $a2,$a2,$t0
+ vst1.8 {$c0-$d0},[r14]!
+ veor $b2,$b2,$t1
+ vst1.8 {$a1-$b1},[r14]!
+ veor $c2,$c2,$t2
+ vst1.8 {$c1-$d1},[r14]!
+ veor $d2,$d2,$t3
+ vst1.8 {$a2-$b2},[r14]!
+ vst1.8 {$c2-$d2},[r14]!
+
+ beq .Ldone_neon
+
+ ldmia sp,{@t[0]-@t[3]} @ load key material
+ add @x[0],@x[0],@t[0] @ accumulate key material
+ add @t[0],sp,#4*(4)
+ add @x[1],@x[1],@t[1]
+ add @x[2],@x[2],@t[2]
+ add @x[3],@x[3],@t[3]
+ ldmia @t[0],{@t[0]-@t[3]} @ load key material
+
+ add @x[4],@t[0],@x[4],ror#13 @ accumulate key material
+ add @t[0],sp,#4*(8)
+ add @x[5],@t[1],@x[5],ror#13
+ add @x[6],@t[2],@x[6],ror#13
+ add @x[7],@t[3],@x[7],ror#13
+ ldmia @t[0],{@t[0]-@t[3]} @ load key material
+# ifdef __ARMEB__
+ rev @x[0],@x[0]
+ rev @x[1],@x[1]
+ rev @x[2],@x[2]
+ rev @x[3],@x[3]
+ rev @x[4],@x[4]
+ rev @x[5],@x[5]
+ rev @x[6],@x[6]
+ rev @x[7],@x[7]
+# endif
+ stmia sp,{@x[0]-@x[7]}
+ add @x[0],sp,#4*(16+8)
+
+ ldmia @x[0],{@x[0]-@x[7]} @ load second half
+
+ add @x[0],@x[0],@t[0] @ accumulate key material
+ add @t[0],sp,#4*(12)
+ add @x[1],@x[1],@t[1]
+ add @x[2],@x[2],@t[2]
+ add @x[3],@x[3],@t[3]
+ ldmia @t[0],{@t[0]-@t[3]} @ load key material
+
+ add @x[4],@t[0],@x[4],ror#24 @ accumulate key material
+ add @t[0],sp,#4*(8)
+ add @x[5],@t[1],@x[5],ror#24
+ add @x[4],@x[4],#3 @ counter+3
+ add @x[6],@t[2],@x[6],ror#24
+ add @x[7],@t[3],@x[7],ror#24
+ ldr @t[3],[sp,#4*(32+2)] @ re-load len
+# ifdef __ARMEB__
+ rev @x[0],@x[0]
+ rev @x[1],@x[1]
+ rev @x[2],@x[2]
+ rev @x[3],@x[3]
+ rev @x[4],@x[4]
+ rev @x[5],@x[5]
+ rev @x[6],@x[6]
+ rev @x[7],@x[7]
+# endif
+ stmia @t[0],{@x[0]-@x[7]}
+ add @t[2],sp,#4*(0)
+ sub @t[3],@t[3],#64*3 @ len-=64*3
+
+.Loop_tail_neon:
+ ldrb @t[0],[@t[2]],#1 @ read buffer on stack
+ ldrb @t[1],[r12],#1 @ read input
+ subs @t[3],@t[3],#1
+ eor @t[0],@t[0],@t[1]
+ strb @t[0],[r14],#1 @ store output
+ bne .Loop_tail_neon
+
+.Ldone_neon:
+ add sp,sp,#4*(32+4)
+ vldmia sp,{d8-d15}
+ add sp,sp,#4*(16+3)
+ ldmia sp!,{r4-r11,pc}
+.size ChaCha20_neon,.-ChaCha20_neon
+# ifndef __KERNEL__
+.comm OPENSSL_armcap_P,4,4
+# endif
+#endif
+___
+}}}
+
+open SELF,$0;
+while(<SELF>) {
+ next if (/^#!/);
+ last if (!s/^#/@/ and !/^$/);
+ print;
+}
+close SELF;
+
+foreach (split("\n",$code)) {
+ s/\`([^\`]*)\`/eval $1/geo;
+
+ s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo;
+
+ print $_,"\n";
+}
+close STDOUT;
diff --git a/src/crypto/zinc/chacha20/chacha20-arm64.S b/src/crypto/zinc/chacha20/chacha20-arm64.S
deleted file mode 100644
index 1ae11a5..0000000
--- a/src/crypto/zinc/chacha20/chacha20-arm64.S
+++ /dev/null
@@ -1,1942 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */
-/*
- * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
- * Copyright (C) 2006-2017 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved.
- *
- * This is based in part on Andy Polyakov's implementation from CRYPTOGAMS.
- */
-
-#include <linux/linkage.h>
-
-.text
-.align 5
-.Lsigma:
-.quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral
-.Lone:
-.long 1,0,0,0
-
-.align 5
-ENTRY(chacha20_arm)
- cbz x2,.Labort
-
- stp x29,x30,[sp,#-96]!
- add x29,sp,#0
-
- adr x5,.Lsigma
- stp x19,x20,[sp,#16]
- stp x21,x22,[sp,#32]
- stp x23,x24,[sp,#48]
- stp x25,x26,[sp,#64]
- stp x27,x28,[sp,#80]
- sub sp,sp,#64
-
- ldp x22,x23,[x5] // load sigma
- ldp x24,x25,[x3] // load key
- ldp x26,x27,[x3,#16]
- ldp x28,x30,[x4] // load counter
-#ifdef __AARCH64EB__
- ror x24,x24,#32
- ror x25,x25,#32
- ror x26,x26,#32
- ror x27,x27,#32
- ror x28,x28,#32
- ror x30,x30,#32
-#endif
-
-.Loop_outer:
- mov w5,w22 // unpack key block
- lsr x6,x22,#32
- mov w7,w23
- lsr x8,x23,#32
- mov w9,w24
- lsr x10,x24,#32
- mov w11,w25
- lsr x12,x25,#32
- mov w13,w26
- lsr x14,x26,#32
- mov w15,w27
- lsr x16,x27,#32
- mov w17,w28
- lsr x19,x28,#32
- mov w20,w30
- lsr x21,x30,#32
-
- mov x4,#10
- subs x2,x2,#64
-.Loop:
- sub x4,x4,#1
- add w5,w5,w9
- add w6,w6,w10
- add w7,w7,w11
- add w8,w8,w12
- eor w17,w17,w5
- eor w19,w19,w6
- eor w20,w20,w7
- eor w21,w21,w8
- ror w17,w17,#16
- ror w19,w19,#16
- ror w20,w20,#16
- ror w21,w21,#16
- add w13,w13,w17
- add w14,w14,w19
- add w15,w15,w20
- add w16,w16,w21
- eor w9,w9,w13
- eor w10,w10,w14
- eor w11,w11,w15
- eor w12,w12,w16
- ror w9,w9,#20
- ror w10,w10,#20
- ror w11,w11,#20
- ror w12,w12,#20
- add w5,w5,w9
- add w6,w6,w10
- add w7,w7,w11
- add w8,w8,w12
- eor w17,w17,w5
- eor w19,w19,w6
- eor w20,w20,w7
- eor w21,w21,w8
- ror w17,w17,#24
- ror w19,w19,#24
- ror w20,w20,#24
- ror w21,w21,#24
- add w13,w13,w17
- add w14,w14,w19
- add w15,w15,w20
- add w16,w16,w21
- eor w9,w9,w13
- eor w10,w10,w14
- eor w11,w11,w15
- eor w12,w12,w16
- ror w9,w9,#25
- ror w10,w10,#25
- ror w11,w11,#25
- ror w12,w12,#25
- add w5,w5,w10
- add w6,w6,w11
- add w7,w7,w12
- add w8,w8,w9
- eor w21,w21,w5
- eor w17,w17,w6
- eor w19,w19,w7
- eor w20,w20,w8
- ror w21,w21,#16
- ror w17,w17,#16
- ror w19,w19,#16
- ror w20,w20,#16
- add w15,w15,w21
- add w16,w16,w17
- add w13,w13,w19
- add w14,w14,w20
- eor w10,w10,w15
- eor w11,w11,w16
- eor w12,w12,w13
- eor w9,w9,w14
- ror w10,w10,#20
- ror w11,w11,#20
- ror w12,w12,#20
- ror w9,w9,#20
- add w5,w5,w10
- add w6,w6,w11
- add w7,w7,w12
- add w8,w8,w9
- eor w21,w21,w5
- eor w17,w17,w6
- eor w19,w19,w7
- eor w20,w20,w8
- ror w21,w21,#24
- ror w17,w17,#24
- ror w19,w19,#24
- ror w20,w20,#24
- add w15,w15,w21
- add w16,w16,w17
- add w13,w13,w19
- add w14,w14,w20
- eor w10,w10,w15
- eor w11,w11,w16
- eor w12,w12,w13
- eor w9,w9,w14
- ror w10,w10,#25
- ror w11,w11,#25
- ror w12,w12,#25
- ror w9,w9,#25
- cbnz x4,.Loop
-
- add w5,w5,w22 // accumulate key block
- add x6,x6,x22,lsr#32
- add w7,w7,w23
- add x8,x8,x23,lsr#32
- add w9,w9,w24
- add x10,x10,x24,lsr#32
- add w11,w11,w25
- add x12,x12,x25,lsr#32
- add w13,w13,w26
- add x14,x14,x26,lsr#32
- add w15,w15,w27
- add x16,x16,x27,lsr#32
- add w17,w17,w28
- add x19,x19,x28,lsr#32
- add w20,w20,w30
- add x21,x21,x30,lsr#32
-
- b.lo .Ltail
-
- add x5,x5,x6,lsl#32 // pack
- add x7,x7,x8,lsl#32
- ldp x6,x8,[x1,#0] // load input
- add x9,x9,x10,lsl#32
- add x11,x11,x12,lsl#32
- ldp x10,x12,[x1,#16]
- add x13,x13,x14,lsl#32
- add x15,x15,x16,lsl#32
- ldp x14,x16,[x1,#32]
- add x17,x17,x19,lsl#32
- add x20,x20,x21,lsl#32
- ldp x19,x21,[x1,#48]
- add x1,x1,#64
-#ifdef __AARCH64EB__
- rev x5,x5
- rev x7,x7
- rev x9,x9
- rev x11,x11
- rev x13,x13
- rev x15,x15
- rev x17,x17
- rev x20,x20
-#endif
- eor x5,x5,x6
- eor x7,x7,x8
- eor x9,x9,x10
- eor x11,x11,x12
- eor x13,x13,x14
- eor x15,x15,x16
- eor x17,x17,x19
- eor x20,x20,x21
-
- stp x5,x7,[x0,#0] // store output
- add x28,x28,#1 // increment counter
- stp x9,x11,[x0,#16]
- stp x13,x15,[x0,#32]
- stp x17,x20,[x0,#48]
- add x0,x0,#64
-
- b.hi .Loop_outer
-
- ldp x19,x20,[x29,#16]
- add sp,sp,#64
- ldp x21,x22,[x29,#32]
- ldp x23,x24,[x29,#48]
- ldp x25,x26,[x29,#64]
- ldp x27,x28,[x29,#80]
- ldp x29,x30,[sp],#96
-.Labort:
- ret
-
-.align 4
-.Ltail:
- add x2,x2,#64
-.Less_than_64:
- sub x0,x0,#1
- add x1,x1,x2
- add x0,x0,x2
- add x4,sp,x2
- neg x2,x2
-
- add x5,x5,x6,lsl#32 // pack
- add x7,x7,x8,lsl#32
- add x9,x9,x10,lsl#32
- add x11,x11,x12,lsl#32
- add x13,x13,x14,lsl#32
- add x15,x15,x16,lsl#32
- add x17,x17,x19,lsl#32
- add x20,x20,x21,lsl#32
-#ifdef __AARCH64EB__
- rev x5,x5
- rev x7,x7
- rev x9,x9
- rev x11,x11
- rev x13,x13
- rev x15,x15
- rev x17,x17
- rev x20,x20
-#endif
- stp x5,x7,[sp,#0]
- stp x9,x11,[sp,#16]
- stp x13,x15,[sp,#32]
- stp x17,x20,[sp,#48]
-
-.Loop_tail:
- ldrb w10,[x1,x2]
- ldrb w11,[x4,x2]
- add x2,x2,#1
- eor w10,w10,w11
- strb w10,[x0,x2]
- cbnz x2,.Loop_tail
-
- stp xzr,xzr,[sp,#0]
- stp xzr,xzr,[sp,#16]
- stp xzr,xzr,[sp,#32]
- stp xzr,xzr,[sp,#48]
-
- ldp x19,x20,[x29,#16]
- add sp,sp,#64
- ldp x21,x22,[x29,#32]
- ldp x23,x24,[x29,#48]
- ldp x25,x26,[x29,#64]
- ldp x27,x28,[x29,#80]
- ldp x29,x30,[sp],#96
- ret
-ENDPROC(chacha20_arm)
-
-#ifdef CONFIG_KERNEL_MODE_NEON
-.align 5
-ENTRY(chacha20_neon)
- cbz x2,.Labort_neon
-
- stp x29,x30,[sp,#-96]!
- add x29,sp,#0
-
- adr x5,.Lsigma
- stp x19,x20,[sp,#16]
- stp x21,x22,[sp,#32]
- stp x23,x24,[sp,#48]
- stp x25,x26,[sp,#64]
- stp x27,x28,[sp,#80]
- cmp x2,#512
- b.hs .L512_or_more_neon
-
- sub sp,sp,#64
-
- ldp x22,x23,[x5] // load sigma
- ld1 {v24.4s},[x5],#16
- ldp x24,x25,[x3] // load key
- ldp x26,x27,[x3,#16]
- ld1 {v25.4s,v26.4s},[x3]
- ldp x28,x30,[x4] // load counter
- ld1 {v27.4s},[x4]
- ld1 {v31.4s},[x5]
-#ifdef __AARCH64EB__
- rev64 v24.4s,v24.4s
- ror x24,x24,#32
- ror x25,x25,#32
- ror x26,x26,#32
- ror x27,x27,#32
- ror x28,x28,#32
- ror x30,x30,#32
-#endif
- add v27.4s,v27.4s,v31.4s // += 1
- add v28.4s,v27.4s,v31.4s
- add v29.4s,v28.4s,v31.4s
- shl v31.4s,v31.4s,#2 // 1 -> 4
-
-.Loop_outer_neon:
- mov w5,w22 // unpack key block
- lsr x6,x22,#32
- mov v0.16b,v24.16b
- mov w7,w23
- lsr x8,x23,#32
- mov v4.16b,v24.16b
- mov w9,w24
- lsr x10,x24,#32
- mov v16.16b,v24.16b
- mov w11,w25
- mov v1.16b,v25.16b
- lsr x12,x25,#32
- mov v5.16b,v25.16b
- mov w13,w26
- mov v17.16b,v25.16b
- lsr x14,x26,#32
- mov v3.16b,v27.16b
- mov w15,w27
- mov v7.16b,v28.16b
- lsr x16,x27,#32
- mov v19.16b,v29.16b
- mov w17,w28
- mov v2.16b,v26.16b
- lsr x19,x28,#32
- mov v6.16b,v26.16b
- mov w20,w30
- mov v18.16b,v26.16b
- lsr x21,x30,#32
-
- mov x4,#10
- subs x2,x2,#256
-.Loop_neon:
- sub x4,x4,#1
- add v0.4s,v0.4s,v1.4s
- add w5,w5,w9
- add v4.4s,v4.4s,v5.4s
- add w6,w6,w10
- add v16.4s,v16.4s,v17.4s
- add w7,w7,w11
- eor v3.16b,v3.16b,v0.16b
- add w8,w8,w12
- eor v7.16b,v7.16b,v4.16b
- eor w17,w17,w5
- eor v19.16b,v19.16b,v16.16b
- eor w19,w19,w6
- rev32 v3.8h,v3.8h
- eor w20,w20,w7
- rev32 v7.8h,v7.8h
- eor w21,w21,w8
- rev32 v19.8h,v19.8h
- ror w17,w17,#16
- add v2.4s,v2.4s,v3.4s
- ror w19,w19,#16
- add v6.4s,v6.4s,v7.4s
- ror w20,w20,#16
- add v18.4s,v18.4s,v19.4s
- ror w21,w21,#16
- eor v20.16b,v1.16b,v2.16b
- add w13,w13,w17
- eor v21.16b,v5.16b,v6.16b
- add w14,w14,w19
- eor v22.16b,v17.16b,v18.16b
- add w15,w15,w20
- ushr v1.4s,v20.4s,#20
- add w16,w16,w21
- ushr v5.4s,v21.4s,#20
- eor w9,w9,w13
- ushr v17.4s,v22.4s,#20
- eor w10,w10,w14
- sli v1.4s,v20.4s,#12
- eor w11,w11,w15
- sli v5.4s,v21.4s,#12
- eor w12,w12,w16
- sli v17.4s,v22.4s,#12
- ror w9,w9,#20
- add v0.4s,v0.4s,v1.4s
- ror w10,w10,#20
- add v4.4s,v4.4s,v5.4s
- ror w11,w11,#20
- add v16.4s,v16.4s,v17.4s
- ror w12,w12,#20
- eor v20.16b,v3.16b,v0.16b
- add w5,w5,w9
- eor v21.16b,v7.16b,v4.16b
- add w6,w6,w10
- eor v22.16b,v19.16b,v16.16b
- add w7,w7,w11
- ushr v3.4s,v20.4s,#24
- add w8,w8,w12
- ushr v7.4s,v21.4s,#24
- eor w17,w17,w5
- ushr v19.4s,v22.4s,#24
- eor w19,w19,w6
- sli v3.4s,v20.4s,#8
- eor w20,w20,w7
- sli v7.4s,v21.4s,#8
- eor w21,w21,w8
- sli v19.4s,v22.4s,#8
- ror w17,w17,#24
- add v2.4s,v2.4s,v3.4s
- ror w19,w19,#24
- add v6.4s,v6.4s,v7.4s
- ror w20,w20,#24
- add v18.4s,v18.4s,v19.4s
- ror w21,w21,#24
- eor v20.16b,v1.16b,v2.16b
- add w13,w13,w17
- eor v21.16b,v5.16b,v6.16b
- add w14,w14,w19
- eor v22.16b,v17.16b,v18.16b
- add w15,w15,w20
- ushr v1.4s,v20.4s,#25
- add w16,w16,w21
- ushr v5.4s,v21.4s,#25
- eor w9,w9,w13
- ushr v17.4s,v22.4s,#25
- eor w10,w10,w14
- sli v1.4s,v20.4s,#7
- eor w11,w11,w15
- sli v5.4s,v21.4s,#7
- eor w12,w12,w16
- sli v17.4s,v22.4s,#7
- ror w9,w9,#25
- ext v2.16b,v2.16b,v2.16b,#8
- ror w10,w10,#25
- ext v6.16b,v6.16b,v6.16b,#8
- ror w11,w11,#25
- ext v18.16b,v18.16b,v18.16b,#8
- ror w12,w12,#25
- ext v3.16b,v3.16b,v3.16b,#12
- ext v7.16b,v7.16b,v7.16b,#12
- ext v19.16b,v19.16b,v19.16b,#12
- ext v1.16b,v1.16b,v1.16b,#4
- ext v5.16b,v5.16b,v5.16b,#4
- ext v17.16b,v17.16b,v17.16b,#4
- add v0.4s,v0.4s,v1.4s
- add w5,w5,w10
- add v4.4s,v4.4s,v5.4s
- add w6,w6,w11
- add v16.4s,v16.4s,v17.4s
- add w7,w7,w12
- eor v3.16b,v3.16b,v0.16b
- add w8,w8,w9
- eor v7.16b,v7.16b,v4.16b
- eor w21,w21,w5
- eor v19.16b,v19.16b,v16.16b
- eor w17,w17,w6
- rev32 v3.8h,v3.8h
- eor w19,w19,w7
- rev32 v7.8h,v7.8h
- eor w20,w20,w8
- rev32 v19.8h,v19.8h
- ror w21,w21,#16
- add v2.4s,v2.4s,v3.4s
- ror w17,w17,#16
- add v6.4s,v6.4s,v7.4s
- ror w19,w19,#16
- add v18.4s,v18.4s,v19.4s
- ror w20,w20,#16
- eor v20.16b,v1.16b,v2.16b
- add w15,w15,w21
- eor v21.16b,v5.16b,v6.16b
- add w16,w16,w17
- eor v22.16b,v17.16b,v18.16b
- add w13,w13,w19
- ushr v1.4s,v20.4s,#20
- add w14,w14,w20
- ushr v5.4s,v21.4s,#20
- eor w10,w10,w15
- ushr v17.4s,v22.4s,#20
- eor w11,w11,w16
- sli v1.4s,v20.4s,#12
- eor w12,w12,w13
- sli v5.4s,v21.4s,#12
- eor w9,w9,w14
- sli v17.4s,v22.4s,#12
- ror w10,w10,#20
- add v0.4s,v0.4s,v1.4s
- ror w11,w11,#20
- add v4.4s,v4.4s,v5.4s
- ror w12,w12,#20
- add v16.4s,v16.4s,v17.4s
- ror w9,w9,#20
- eor v20.16b,v3.16b,v0.16b
- add w5,w5,w10
- eor v21.16b,v7.16b,v4.16b
- add w6,w6,w11
- eor v22.16b,v19.16b,v16.16b
- add w7,w7,w12
- ushr v3.4s,v20.4s,#24
- add w8,w8,w9
- ushr v7.4s,v21.4s,#24
- eor w21,w21,w5
- ushr v19.4s,v22.4s,#24
- eor w17,w17,w6
- sli v3.4s,v20.4s,#8
- eor w19,w19,w7
- sli v7.4s,v21.4s,#8
- eor w20,w20,w8
- sli v19.4s,v22.4s,#8
- ror w21,w21,#24
- add v2.4s,v2.4s,v3.4s
- ror w17,w17,#24
- add v6.4s,v6.4s,v7.4s
- ror w19,w19,#24
- add v18.4s,v18.4s,v19.4s
- ror w20,w20,#24
- eor v20.16b,v1.16b,v2.16b
- add w15,w15,w21
- eor v21.16b,v5.16b,v6.16b
- add w16,w16,w17
- eor v22.16b,v17.16b,v18.16b
- add w13,w13,w19
- ushr v1.4s,v20.4s,#25
- add w14,w14,w20
- ushr v5.4s,v21.4s,#25
- eor w10,w10,w15
- ushr v17.4s,v22.4s,#25
- eor w11,w11,w16
- sli v1.4s,v20.4s,#7
- eor w12,w12,w13
- sli v5.4s,v21.4s,#7
- eor w9,w9,w14
- sli v17.4s,v22.4s,#7
- ror w10,w10,#25
- ext v2.16b,v2.16b,v2.16b,#8
- ror w11,w11,#25
- ext v6.16b,v6.16b,v6.16b,#8
- ror w12,w12,#25
- ext v18.16b,v18.16b,v18.16b,#8
- ror w9,w9,#25
- ext v3.16b,v3.16b,v3.16b,#4
- ext v7.16b,v7.16b,v7.16b,#4
- ext v19.16b,v19.16b,v19.16b,#4
- ext v1.16b,v1.16b,v1.16b,#12
- ext v5.16b,v5.16b,v5.16b,#12
- ext v17.16b,v17.16b,v17.16b,#12
- cbnz x4,.Loop_neon
-
- add w5,w5,w22 // accumulate key block
- add v0.4s,v0.4s,v24.4s
- add x6,x6,x22,lsr#32
- add v4.4s,v4.4s,v24.4s
- add w7,w7,w23
- add v16.4s,v16.4s,v24.4s
- add x8,x8,x23,lsr#32
- add v2.4s,v2.4s,v26.4s
- add w9,w9,w24
- add v6.4s,v6.4s,v26.4s
- add x10,x10,x24,lsr#32
- add v18.4s,v18.4s,v26.4s
- add w11,w11,w25
- add v3.4s,v3.4s,v27.4s
- add x12,x12,x25,lsr#32
- add w13,w13,w26
- add v7.4s,v7.4s,v28.4s
- add x14,x14,x26,lsr#32
- add w15,w15,w27
- add v19.4s,v19.4s,v29.4s
- add x16,x16,x27,lsr#32
- add w17,w17,w28
- add v1.4s,v1.4s,v25.4s
- add x19,x19,x28,lsr#32
- add w20,w20,w30
- add v5.4s,v5.4s,v25.4s
- add x21,x21,x30,lsr#32
- add v17.4s,v17.4s,v25.4s
-
- b.lo .Ltail_neon
-
- add x5,x5,x6,lsl#32 // pack
- add x7,x7,x8,lsl#32
- ldp x6,x8,[x1,#0] // load input
- add x9,x9,x10,lsl#32
- add x11,x11,x12,lsl#32
- ldp x10,x12,[x1,#16]
- add x13,x13,x14,lsl#32
- add x15,x15,x16,lsl#32
- ldp x14,x16,[x1,#32]
- add x17,x17,x19,lsl#32
- add x20,x20,x21,lsl#32
- ldp x19,x21,[x1,#48]
- add x1,x1,#64
-#ifdef __AARCH64EB__
- rev x5,x5
- rev x7,x7
- rev x9,x9
- rev x11,x11
- rev x13,x13
- rev x15,x15
- rev x17,x17
- rev x20,x20
-#endif
- ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
- eor x5,x5,x6
- eor x7,x7,x8
- eor x9,x9,x10
- eor x11,x11,x12
- eor x13,x13,x14
- eor v0.16b,v0.16b,v20.16b
- eor x15,x15,x16
- eor v1.16b,v1.16b,v21.16b
- eor x17,x17,x19
- eor v2.16b,v2.16b,v22.16b
- eor x20,x20,x21
- eor v3.16b,v3.16b,v23.16b
- ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
-
- stp x5,x7,[x0,#0] // store output
- add x28,x28,#4 // increment counter
- stp x9,x11,[x0,#16]
- add v27.4s,v27.4s,v31.4s // += 4
- stp x13,x15,[x0,#32]
- add v28.4s,v28.4s,v31.4s
- stp x17,x20,[x0,#48]
- add v29.4s,v29.4s,v31.4s
- add x0,x0,#64
-
- st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
- ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
-
- eor v4.16b,v4.16b,v20.16b
- eor v5.16b,v5.16b,v21.16b
- eor v6.16b,v6.16b,v22.16b
- eor v7.16b,v7.16b,v23.16b
- st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
-
- eor v16.16b,v16.16b,v0.16b
- eor v17.16b,v17.16b,v1.16b
- eor v18.16b,v18.16b,v2.16b
- eor v19.16b,v19.16b,v3.16b
- st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
-
- b.hi .Loop_outer_neon
-
- ldp x19,x20,[x29,#16]
- add sp,sp,#64
- ldp x21,x22,[x29,#32]
- ldp x23,x24,[x29,#48]
- ldp x25,x26,[x29,#64]
- ldp x27,x28,[x29,#80]
- ldp x29,x30,[sp],#96
- ret
-
-.Ltail_neon:
- add x2,x2,#256
- cmp x2,#64
- b.lo .Less_than_64
-
- add x5,x5,x6,lsl#32 // pack
- add x7,x7,x8,lsl#32
- ldp x6,x8,[x1,#0] // load input
- add x9,x9,x10,lsl#32
- add x11,x11,x12,lsl#32
- ldp x10,x12,[x1,#16]
- add x13,x13,x14,lsl#32
- add x15,x15,x16,lsl#32
- ldp x14,x16,[x1,#32]
- add x17,x17,x19,lsl#32
- add x20,x20,x21,lsl#32
- ldp x19,x21,[x1,#48]
- add x1,x1,#64
-#ifdef __AARCH64EB__
- rev x5,x5
- rev x7,x7
- rev x9,x9
- rev x11,x11
- rev x13,x13
- rev x15,x15
- rev x17,x17
- rev x20,x20
-#endif
- eor x5,x5,x6
- eor x7,x7,x8
- eor x9,x9,x10
- eor x11,x11,x12
- eor x13,x13,x14
- eor x15,x15,x16
- eor x17,x17,x19
- eor x20,x20,x21
-
- stp x5,x7,[x0,#0] // store output
- add x28,x28,#4 // increment counter
- stp x9,x11,[x0,#16]
- stp x13,x15,[x0,#32]
- stp x17,x20,[x0,#48]
- add x0,x0,#64
- b.eq .Ldone_neon
- sub x2,x2,#64
- cmp x2,#64
- b.lo .Less_than_128
-
- ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
- eor v0.16b,v0.16b,v20.16b
- eor v1.16b,v1.16b,v21.16b
- eor v2.16b,v2.16b,v22.16b
- eor v3.16b,v3.16b,v23.16b
- st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
- b.eq .Ldone_neon
- sub x2,x2,#64
- cmp x2,#64
- b.lo .Less_than_192
-
- ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
- eor v4.16b,v4.16b,v20.16b
- eor v5.16b,v5.16b,v21.16b
- eor v6.16b,v6.16b,v22.16b
- eor v7.16b,v7.16b,v23.16b
- st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
- b.eq .Ldone_neon
- sub x2,x2,#64
-
- st1 {v16.16b,v17.16b,v18.16b,v19.16b},[sp]
- b .Last_neon
-
-.Less_than_128:
- st1 {v0.16b,v1.16b,v2.16b,v3.16b},[sp]
- b .Last_neon
-.Less_than_192:
- st1 {v4.16b,v5.16b,v6.16b,v7.16b},[sp]
- b .Last_neon
-
-.align 4
-.Last_neon:
- sub x0,x0,#1
- add x1,x1,x2
- add x0,x0,x2
- add x4,sp,x2
- neg x2,x2
-
-.Loop_tail_neon:
- ldrb w10,[x1,x2]
- ldrb w11,[x4,x2]
- add x2,x2,#1
- eor w10,w10,w11
- strb w10,[x0,x2]
- cbnz x2,.Loop_tail_neon
-
- stp xzr,xzr,[sp,#0]
- stp xzr,xzr,[sp,#16]
- stp xzr,xzr,[sp,#32]
- stp xzr,xzr,[sp,#48]
-
-.Ldone_neon:
- ldp x19,x20,[x29,#16]
- add sp,sp,#64
- ldp x21,x22,[x29,#32]
- ldp x23,x24,[x29,#48]
- ldp x25,x26,[x29,#64]
- ldp x27,x28,[x29,#80]
- ldp x29,x30,[sp],#96
- ret
-
-.L512_or_more_neon:
- sub sp,sp,#128+64
-
- ldp x22,x23,[x5] // load sigma
- ld1 {v24.4s},[x5],#16
- ldp x24,x25,[x3] // load key
- ldp x26,x27,[x3,#16]
- ld1 {v25.4s,v26.4s},[x3]
- ldp x28,x30,[x4] // load counter
- ld1 {v27.4s},[x4]
- ld1 {v31.4s},[x5]
-#ifdef __AARCH64EB__
- rev64 v24.4s,v24.4s
- ror x24,x24,#32
- ror x25,x25,#32
- ror x26,x26,#32
- ror x27,x27,#32
- ror x28,x28,#32
- ror x30,x30,#32
-#endif
- add v27.4s,v27.4s,v31.4s // += 1
- stp q24,q25,[sp,#0] // off-load key block, invariant part
- add v27.4s,v27.4s,v31.4s // not typo
- str q26,[sp,#32]
- add v28.4s,v27.4s,v31.4s
- add v29.4s,v28.4s,v31.4s
- add v30.4s,v29.4s,v31.4s
- shl v31.4s,v31.4s,#2 // 1 -> 4
-
- stp d8,d9,[sp,#128+0] // meet ABI requirements
- stp d10,d11,[sp,#128+16]
- stp d12,d13,[sp,#128+32]
- stp d14,d15,[sp,#128+48]
-
- sub x2,x2,#512 // not typo
-
-.Loop_outer_512_neon:
- mov v0.16b,v24.16b
- mov v4.16b,v24.16b
- mov v8.16b,v24.16b
- mov v12.16b,v24.16b
- mov v16.16b,v24.16b
- mov v20.16b,v24.16b
- mov v1.16b,v25.16b
- mov w5,w22 // unpack key block
- mov v5.16b,v25.16b
- lsr x6,x22,#32
- mov v9.16b,v25.16b
- mov w7,w23
- mov v13.16b,v25.16b
- lsr x8,x23,#32
- mov v17.16b,v25.16b
- mov w9,w24
- mov v21.16b,v25.16b
- lsr x10,x24,#32
- mov v3.16b,v27.16b
- mov w11,w25
- mov v7.16b,v28.16b
- lsr x12,x25,#32
- mov v11.16b,v29.16b
- mov w13,w26
- mov v15.16b,v30.16b
- lsr x14,x26,#32
- mov v2.16b,v26.16b
- mov w15,w27
- mov v6.16b,v26.16b
- lsr x16,x27,#32
- add v19.4s,v3.4s,v31.4s // +4
- mov w17,w28
- add v23.4s,v7.4s,v31.4s // +4
- lsr x19,x28,#32
- mov v10.16b,v26.16b
- mov w20,w30
- mov v14.16b,v26.16b
- lsr x21,x30,#32
- mov v18.16b,v26.16b
- stp q27,q28,[sp,#48] // off-load key block, variable part
- mov v22.16b,v26.16b
- str q29,[sp,#80]
-
- mov x4,#5
- subs x2,x2,#512
-.Loop_upper_neon:
- sub x4,x4,#1
- add v0.4s,v0.4s,v1.4s
- add w5,w5,w9
- add v4.4s,v4.4s,v5.4s
- add w6,w6,w10
- add v8.4s,v8.4s,v9.4s
- add w7,w7,w11
- add v12.4s,v12.4s,v13.4s
- add w8,w8,w12
- add v16.4s,v16.4s,v17.4s
- eor w17,w17,w5
- add v20.4s,v20.4s,v21.4s
- eor w19,w19,w6
- eor v3.16b,v3.16b,v0.16b
- eor w20,w20,w7
- eor v7.16b,v7.16b,v4.16b
- eor w21,w21,w8
- eor v11.16b,v11.16b,v8.16b
- ror w17,w17,#16
- eor v15.16b,v15.16b,v12.16b
- ror w19,w19,#16
- eor v19.16b,v19.16b,v16.16b
- ror w20,w20,#16
- eor v23.16b,v23.16b,v20.16b
- ror w21,w21,#16
- rev32 v3.8h,v3.8h
- add w13,w13,w17
- rev32 v7.8h,v7.8h
- add w14,w14,w19
- rev32 v11.8h,v11.8h
- add w15,w15,w20
- rev32 v15.8h,v15.8h
- add w16,w16,w21
- rev32 v19.8h,v19.8h
- eor w9,w9,w13
- rev32 v23.8h,v23.8h
- eor w10,w10,w14
- add v2.4s,v2.4s,v3.4s
- eor w11,w11,w15
- add v6.4s,v6.4s,v7.4s
- eor w12,w12,w16
- add v10.4s,v10.4s,v11.4s
- ror w9,w9,#20
- add v14.4s,v14.4s,v15.4s
- ror w10,w10,#20
- add v18.4s,v18.4s,v19.4s
- ror w11,w11,#20
- add v22.4s,v22.4s,v23.4s
- ror w12,w12,#20
- eor v24.16b,v1.16b,v2.16b
- add w5,w5,w9
- eor v25.16b,v5.16b,v6.16b
- add w6,w6,w10
- eor v26.16b,v9.16b,v10.16b
- add w7,w7,w11
- eor v27.16b,v13.16b,v14.16b
- add w8,w8,w12
- eor v28.16b,v17.16b,v18.16b
- eor w17,w17,w5
- eor v29.16b,v21.16b,v22.16b
- eor w19,w19,w6
- ushr v1.4s,v24.4s,#20
- eor w20,w20,w7
- ushr v5.4s,v25.4s,#20
- eor w21,w21,w8
- ushr v9.4s,v26.4s,#20
- ror w17,w17,#24
- ushr v13.4s,v27.4s,#20
- ror w19,w19,#24
- ushr v17.4s,v28.4s,#20
- ror w20,w20,#24
- ushr v21.4s,v29.4s,#20
- ror w21,w21,#24
- sli v1.4s,v24.4s,#12
- add w13,w13,w17
- sli v5.4s,v25.4s,#12
- add w14,w14,w19
- sli v9.4s,v26.4s,#12
- add w15,w15,w20
- sli v13.4s,v27.4s,#12
- add w16,w16,w21
- sli v17.4s,v28.4s,#12
- eor w9,w9,w13
- sli v21.4s,v29.4s,#12
- eor w10,w10,w14
- add v0.4s,v0.4s,v1.4s
- eor w11,w11,w15
- add v4.4s,v4.4s,v5.4s
- eor w12,w12,w16
- add v8.4s,v8.4s,v9.4s
- ror w9,w9,#25
- add v12.4s,v12.4s,v13.4s
- ror w10,w10,#25
- add v16.4s,v16.4s,v17.4s
- ror w11,w11,#25
- add v20.4s,v20.4s,v21.4s
- ror w12,w12,#25
- eor v24.16b,v3.16b,v0.16b
- add w5,w5,w10
- eor v25.16b,v7.16b,v4.16b
- add w6,w6,w11
- eor v26.16b,v11.16b,v8.16b
- add w7,w7,w12
- eor v27.16b,v15.16b,v12.16b
- add w8,w8,w9
- eor v28.16b,v19.16b,v16.16b
- eor w21,w21,w5
- eor v29.16b,v23.16b,v20.16b
- eor w17,w17,w6
- ushr v3.4s,v24.4s,#24
- eor w19,w19,w7
- ushr v7.4s,v25.4s,#24
- eor w20,w20,w8
- ushr v11.4s,v26.4s,#24
- ror w21,w21,#16
- ushr v15.4s,v27.4s,#24
- ror w17,w17,#16
- ushr v19.4s,v28.4s,#24
- ror w19,w19,#16
- ushr v23.4s,v29.4s,#24
- ror w20,w20,#16
- sli v3.4s,v24.4s,#8
- add w15,w15,w21
- sli v7.4s,v25.4s,#8
- add w16,w16,w17
- sli v11.4s,v26.4s,#8
- add w13,w13,w19
- sli v15.4s,v27.4s,#8
- add w14,w14,w20
- sli v19.4s,v28.4s,#8
- eor w10,w10,w15
- sli v23.4s,v29.4s,#8
- eor w11,w11,w16
- add v2.4s,v2.4s,v3.4s
- eor w12,w12,w13
- add v6.4s,v6.4s,v7.4s
- eor w9,w9,w14
- add v10.4s,v10.4s,v11.4s
- ror w10,w10,#20
- add v14.4s,v14.4s,v15.4s
- ror w11,w11,#20
- add v18.4s,v18.4s,v19.4s
- ror w12,w12,#20
- add v22.4s,v22.4s,v23.4s
- ror w9,w9,#20
- eor v24.16b,v1.16b,v2.16b
- add w5,w5,w10
- eor v25.16b,v5.16b,v6.16b
- add w6,w6,w11
- eor v26.16b,v9.16b,v10.16b
- add w7,w7,w12
- eor v27.16b,v13.16b,v14.16b
- add w8,w8,w9
- eor v28.16b,v17.16b,v18.16b
- eor w21,w21,w5
- eor v29.16b,v21.16b,v22.16b
- eor w17,w17,w6
- ushr v1.4s,v24.4s,#25
- eor w19,w19,w7
- ushr v5.4s,v25.4s,#25
- eor w20,w20,w8
- ushr v9.4s,v26.4s,#25
- ror w21,w21,#24
- ushr v13.4s,v27.4s,#25
- ror w17,w17,#24
- ushr v17.4s,v28.4s,#25
- ror w19,w19,#24
- ushr v21.4s,v29.4s,#25
- ror w20,w20,#24
- sli v1.4s,v24.4s,#7
- add w15,w15,w21
- sli v5.4s,v25.4s,#7
- add w16,w16,w17
- sli v9.4s,v26.4s,#7
- add w13,w13,w19
- sli v13.4s,v27.4s,#7
- add w14,w14,w20
- sli v17.4s,v28.4s,#7
- eor w10,w10,w15
- sli v21.4s,v29.4s,#7
- eor w11,w11,w16
- ext v2.16b,v2.16b,v2.16b,#8
- eor w12,w12,w13
- ext v6.16b,v6.16b,v6.16b,#8
- eor w9,w9,w14
- ext v10.16b,v10.16b,v10.16b,#8
- ror w10,w10,#25
- ext v14.16b,v14.16b,v14.16b,#8
- ror w11,w11,#25
- ext v18.16b,v18.16b,v18.16b,#8
- ror w12,w12,#25
- ext v22.16b,v22.16b,v22.16b,#8
- ror w9,w9,#25
- ext v3.16b,v3.16b,v3.16b,#12
- ext v7.16b,v7.16b,v7.16b,#12
- ext v11.16b,v11.16b,v11.16b,#12
- ext v15.16b,v15.16b,v15.16b,#12
- ext v19.16b,v19.16b,v19.16b,#12
- ext v23.16b,v23.16b,v23.16b,#12
- ext v1.16b,v1.16b,v1.16b,#4
- ext v5.16b,v5.16b,v5.16b,#4
- ext v9.16b,v9.16b,v9.16b,#4
- ext v13.16b,v13.16b,v13.16b,#4
- ext v17.16b,v17.16b,v17.16b,#4
- ext v21.16b,v21.16b,v21.16b,#4
- add v0.4s,v0.4s,v1.4s
- add w5,w5,w9
- add v4.4s,v4.4s,v5.4s
- add w6,w6,w10
- add v8.4s,v8.4s,v9.4s
- add w7,w7,w11
- add v12.4s,v12.4s,v13.4s
- add w8,w8,w12
- add v16.4s,v16.4s,v17.4s
- eor w17,w17,w5
- add v20.4s,v20.4s,v21.4s
- eor w19,w19,w6
- eor v3.16b,v3.16b,v0.16b
- eor w20,w20,w7
- eor v7.16b,v7.16b,v4.16b
- eor w21,w21,w8
- eor v11.16b,v11.16b,v8.16b
- ror w17,w17,#16
- eor v15.16b,v15.16b,v12.16b
- ror w19,w19,#16
- eor v19.16b,v19.16b,v16.16b
- ror w20,w20,#16
- eor v23.16b,v23.16b,v20.16b
- ror w21,w21,#16
- rev32 v3.8h,v3.8h
- add w13,w13,w17
- rev32 v7.8h,v7.8h
- add w14,w14,w19
- rev32 v11.8h,v11.8h
- add w15,w15,w20
- rev32 v15.8h,v15.8h
- add w16,w16,w21
- rev32 v19.8h,v19.8h
- eor w9,w9,w13
- rev32 v23.8h,v23.8h
- eor w10,w10,w14
- add v2.4s,v2.4s,v3.4s
- eor w11,w11,w15
- add v6.4s,v6.4s,v7.4s
- eor w12,w12,w16
- add v10.4s,v10.4s,v11.4s
- ror w9,w9,#20
- add v14.4s,v14.4s,v15.4s
- ror w10,w10,#20
- add v18.4s,v18.4s,v19.4s
- ror w11,w11,#20
- add v22.4s,v22.4s,v23.4s
- ror w12,w12,#20
- eor v24.16b,v1.16b,v2.16b
- add w5,w5,w9
- eor v25.16b,v5.16b,v6.16b
- add w6,w6,w10
- eor v26.16b,v9.16b,v10.16b
- add w7,w7,w11
- eor v27.16b,v13.16b,v14.16b
- add w8,w8,w12
- eor v28.16b,v17.16b,v18.16b
- eor w17,w17,w5
- eor v29.16b,v21.16b,v22.16b
- eor w19,w19,w6
- ushr v1.4s,v24.4s,#20
- eor w20,w20,w7
- ushr v5.4s,v25.4s,#20
- eor w21,w21,w8
- ushr v9.4s,v26.4s,#20
- ror w17,w17,#24
- ushr v13.4s,v27.4s,#20
- ror w19,w19,#24
- ushr v17.4s,v28.4s,#20
- ror w20,w20,#24
- ushr v21.4s,v29.4s,#20
- ror w21,w21,#24
- sli v1.4s,v24.4s,#12
- add w13,w13,w17
- sli v5.4s,v25.4s,#12
- add w14,w14,w19
- sli v9.4s,v26.4s,#12
- add w15,w15,w20
- sli v13.4s,v27.4s,#12
- add w16,w16,w21
- sli v17.4s,v28.4s,#12
- eor w9,w9,w13
- sli v21.4s,v29.4s,#12
- eor w10,w10,w14
- add v0.4s,v0.4s,v1.4s
- eor w11,w11,w15
- add v4.4s,v4.4s,v5.4s
- eor w12,w12,w16
- add v8.4s,v8.4s,v9.4s
- ror w9,w9,#25
- add v12.4s,v12.4s,v13.4s
- ror w10,w10,#25
- add v16.4s,v16.4s,v17.4s
- ror w11,w11,#25
- add v20.4s,v20.4s,v21.4s
- ror w12,w12,#25
- eor v24.16b,v3.16b,v0.16b
- add w5,w5,w10
- eor v25.16b,v7.16b,v4.16b
- add w6,w6,w11
- eor v26.16b,v11.16b,v8.16b
- add w7,w7,w12
- eor v27.16b,v15.16b,v12.16b
- add w8,w8,w9
- eor v28.16b,v19.16b,v16.16b
- eor w21,w21,w5
- eor v29.16b,v23.16b,v20.16b
- eor w17,w17,w6
- ushr v3.4s,v24.4s,#24
- eor w19,w19,w7
- ushr v7.4s,v25.4s,#24
- eor w20,w20,w8
- ushr v11.4s,v26.4s,#24
- ror w21,w21,#16
- ushr v15.4s,v27.4s,#24
- ror w17,w17,#16
- ushr v19.4s,v28.4s,#24
- ror w19,w19,#16
- ushr v23.4s,v29.4s,#24
- ror w20,w20,#16
- sli v3.4s,v24.4s,#8
- add w15,w15,w21
- sli v7.4s,v25.4s,#8
- add w16,w16,w17
- sli v11.4s,v26.4s,#8
- add w13,w13,w19
- sli v15.4s,v27.4s,#8
- add w14,w14,w20
- sli v19.4s,v28.4s,#8
- eor w10,w10,w15
- sli v23.4s,v29.4s,#8
- eor w11,w11,w16
- add v2.4s,v2.4s,v3.4s
- eor w12,w12,w13
- add v6.4s,v6.4s,v7.4s
- eor w9,w9,w14
- add v10.4s,v10.4s,v11.4s
- ror w10,w10,#20
- add v14.4s,v14.4s,v15.4s
- ror w11,w11,#20
- add v18.4s,v18.4s,v19.4s
- ror w12,w12,#20
- add v22.4s,v22.4s,v23.4s
- ror w9,w9,#20
- eor v24.16b,v1.16b,v2.16b
- add w5,w5,w10
- eor v25.16b,v5.16b,v6.16b
- add w6,w6,w11
- eor v26.16b,v9.16b,v10.16b
- add w7,w7,w12
- eor v27.16b,v13.16b,v14.16b
- add w8,w8,w9
- eor v28.16b,v17.16b,v18.16b
- eor w21,w21,w5
- eor v29.16b,v21.16b,v22.16b
- eor w17,w17,w6
- ushr v1.4s,v24.4s,#25
- eor w19,w19,w7
- ushr v5.4s,v25.4s,#25
- eor w20,w20,w8
- ushr v9.4s,v26.4s,#25
- ror w21,w21,#24
- ushr v13.4s,v27.4s,#25
- ror w17,w17,#24
- ushr v17.4s,v28.4s,#25
- ror w19,w19,#24
- ushr v21.4s,v29.4s,#25
- ror w20,w20,#24
- sli v1.4s,v24.4s,#7
- add w15,w15,w21
- sli v5.4s,v25.4s,#7
- add w16,w16,w17
- sli v9.4s,v26.4s,#7
- add w13,w13,w19
- sli v13.4s,v27.4s,#7
- add w14,w14,w20
- sli v17.4s,v28.4s,#7
- eor w10,w10,w15
- sli v21.4s,v29.4s,#7
- eor w11,w11,w16
- ext v2.16b,v2.16b,v2.16b,#8
- eor w12,w12,w13
- ext v6.16b,v6.16b,v6.16b,#8
- eor w9,w9,w14
- ext v10.16b,v10.16b,v10.16b,#8
- ror w10,w10,#25
- ext v14.16b,v14.16b,v14.16b,#8
- ror w11,w11,#25
- ext v18.16b,v18.16b,v18.16b,#8
- ror w12,w12,#25
- ext v22.16b,v22.16b,v22.16b,#8
- ror w9,w9,#25
- ext v3.16b,v3.16b,v3.16b,#4
- ext v7.16b,v7.16b,v7.16b,#4
- ext v11.16b,v11.16b,v11.16b,#4
- ext v15.16b,v15.16b,v15.16b,#4
- ext v19.16b,v19.16b,v19.16b,#4
- ext v23.16b,v23.16b,v23.16b,#4
- ext v1.16b,v1.16b,v1.16b,#12
- ext v5.16b,v5.16b,v5.16b,#12
- ext v9.16b,v9.16b,v9.16b,#12
- ext v13.16b,v13.16b,v13.16b,#12
- ext v17.16b,v17.16b,v17.16b,#12
- ext v21.16b,v21.16b,v21.16b,#12
- cbnz x4,.Loop_upper_neon
-
- add w5,w5,w22 // accumulate key block
- add x6,x6,x22,lsr#32
- add w7,w7,w23
- add x8,x8,x23,lsr#32
- add w9,w9,w24
- add x10,x10,x24,lsr#32
- add w11,w11,w25
- add x12,x12,x25,lsr#32
- add w13,w13,w26
- add x14,x14,x26,lsr#32
- add w15,w15,w27
- add x16,x16,x27,lsr#32
- add w17,w17,w28
- add x19,x19,x28,lsr#32
- add w20,w20,w30
- add x21,x21,x30,lsr#32
-
- add x5,x5,x6,lsl#32 // pack
- add x7,x7,x8,lsl#32
- ldp x6,x8,[x1,#0] // load input
- add x9,x9,x10,lsl#32
- add x11,x11,x12,lsl#32
- ldp x10,x12,[x1,#16]
- add x13,x13,x14,lsl#32
- add x15,x15,x16,lsl#32
- ldp x14,x16,[x1,#32]
- add x17,x17,x19,lsl#32
- add x20,x20,x21,lsl#32
- ldp x19,x21,[x1,#48]
- add x1,x1,#64
-#ifdef __AARCH64EB__
- rev x5,x5
- rev x7,x7
- rev x9,x9
- rev x11,x11
- rev x13,x13
- rev x15,x15
- rev x17,x17
- rev x20,x20
-#endif
- eor x5,x5,x6
- eor x7,x7,x8
- eor x9,x9,x10
- eor x11,x11,x12
- eor x13,x13,x14
- eor x15,x15,x16
- eor x17,x17,x19
- eor x20,x20,x21
-
- stp x5,x7,[x0,#0] // store output
- add x28,x28,#1 // increment counter
- mov w5,w22 // unpack key block
- lsr x6,x22,#32
- stp x9,x11,[x0,#16]
- mov w7,w23
- lsr x8,x23,#32
- stp x13,x15,[x0,#32]
- mov w9,w24
- lsr x10,x24,#32
- stp x17,x20,[x0,#48]
- add x0,x0,#64
- mov w11,w25
- lsr x12,x25,#32
- mov w13,w26
- lsr x14,x26,#32
- mov w15,w27
- lsr x16,x27,#32
- mov w17,w28
- lsr x19,x28,#32
- mov w20,w30
- lsr x21,x30,#32
-
- mov x4,#5
-.Loop_lower_neon:
- sub x4,x4,#1
- add v0.4s,v0.4s,v1.4s
- add w5,w5,w9
- add v4.4s,v4.4s,v5.4s
- add w6,w6,w10
- add v8.4s,v8.4s,v9.4s
- add w7,w7,w11
- add v12.4s,v12.4s,v13.4s
- add w8,w8,w12
- add v16.4s,v16.4s,v17.4s
- eor w17,w17,w5
- add v20.4s,v20.4s,v21.4s
- eor w19,w19,w6
- eor v3.16b,v3.16b,v0.16b
- eor w20,w20,w7
- eor v7.16b,v7.16b,v4.16b
- eor w21,w21,w8
- eor v11.16b,v11.16b,v8.16b
- ror w17,w17,#16
- eor v15.16b,v15.16b,v12.16b
- ror w19,w19,#16
- eor v19.16b,v19.16b,v16.16b
- ror w20,w20,#16
- eor v23.16b,v23.16b,v20.16b
- ror w21,w21,#16
- rev32 v3.8h,v3.8h
- add w13,w13,w17
- rev32 v7.8h,v7.8h
- add w14,w14,w19
- rev32 v11.8h,v11.8h
- add w15,w15,w20
- rev32 v15.8h,v15.8h
- add w16,w16,w21
- rev32 v19.8h,v19.8h
- eor w9,w9,w13
- rev32 v23.8h,v23.8h
- eor w10,w10,w14
- add v2.4s,v2.4s,v3.4s
- eor w11,w11,w15
- add v6.4s,v6.4s,v7.4s
- eor w12,w12,w16
- add v10.4s,v10.4s,v11.4s
- ror w9,w9,#20
- add v14.4s,v14.4s,v15.4s
- ror w10,w10,#20
- add v18.4s,v18.4s,v19.4s
- ror w11,w11,#20
- add v22.4s,v22.4s,v23.4s
- ror w12,w12,#20
- eor v24.16b,v1.16b,v2.16b
- add w5,w5,w9
- eor v25.16b,v5.16b,v6.16b
- add w6,w6,w10
- eor v26.16b,v9.16b,v10.16b
- add w7,w7,w11
- eor v27.16b,v13.16b,v14.16b
- add w8,w8,w12
- eor v28.16b,v17.16b,v18.16b
- eor w17,w17,w5
- eor v29.16b,v21.16b,v22.16b
- eor w19,w19,w6
- ushr v1.4s,v24.4s,#20
- eor w20,w20,w7
- ushr v5.4s,v25.4s,#20
- eor w21,w21,w8
- ushr v9.4s,v26.4s,#20
- ror w17,w17,#24
- ushr v13.4s,v27.4s,#20
- ror w19,w19,#24
- ushr v17.4s,v28.4s,#20
- ror w20,w20,#24
- ushr v21.4s,v29.4s,#20
- ror w21,w21,#24
- sli v1.4s,v24.4s,#12
- add w13,w13,w17
- sli v5.4s,v25.4s,#12
- add w14,w14,w19
- sli v9.4s,v26.4s,#12
- add w15,w15,w20
- sli v13.4s,v27.4s,#12
- add w16,w16,w21
- sli v17.4s,v28.4s,#12
- eor w9,w9,w13
- sli v21.4s,v29.4s,#12
- eor w10,w10,w14
- add v0.4s,v0.4s,v1.4s
- eor w11,w11,w15
- add v4.4s,v4.4s,v5.4s
- eor w12,w12,w16
- add v8.4s,v8.4s,v9.4s
- ror w9,w9,#25
- add v12.4s,v12.4s,v13.4s
- ror w10,w10,#25
- add v16.4s,v16.4s,v17.4s
- ror w11,w11,#25
- add v20.4s,v20.4s,v21.4s
- ror w12,w12,#25
- eor v24.16b,v3.16b,v0.16b
- add w5,w5,w10
- eor v25.16b,v7.16b,v4.16b
- add w6,w6,w11
- eor v26.16b,v11.16b,v8.16b
- add w7,w7,w12
- eor v27.16b,v15.16b,v12.16b
- add w8,w8,w9
- eor v28.16b,v19.16b,v16.16b
- eor w21,w21,w5
- eor v29.16b,v23.16b,v20.16b
- eor w17,w17,w6
- ushr v3.4s,v24.4s,#24
- eor w19,w19,w7
- ushr v7.4s,v25.4s,#24
- eor w20,w20,w8
- ushr v11.4s,v26.4s,#24
- ror w21,w21,#16
- ushr v15.4s,v27.4s,#24
- ror w17,w17,#16
- ushr v19.4s,v28.4s,#24
- ror w19,w19,#16
- ushr v23.4s,v29.4s,#24
- ror w20,w20,#16
- sli v3.4s,v24.4s,#8
- add w15,w15,w21
- sli v7.4s,v25.4s,#8
- add w16,w16,w17
- sli v11.4s,v26.4s,#8
- add w13,w13,w19
- sli v15.4s,v27.4s,#8
- add w14,w14,w20
- sli v19.4s,v28.4s,#8
- eor w10,w10,w15
- sli v23.4s,v29.4s,#8
- eor w11,w11,w16
- add v2.4s,v2.4s,v3.4s
- eor w12,w12,w13
- add v6.4s,v6.4s,v7.4s
- eor w9,w9,w14
- add v10.4s,v10.4s,v11.4s
- ror w10,w10,#20
- add v14.4s,v14.4s,v15.4s
- ror w11,w11,#20
- add v18.4s,v18.4s,v19.4s
- ror w12,w12,#20
- add v22.4s,v22.4s,v23.4s
- ror w9,w9,#20
- eor v24.16b,v1.16b,v2.16b
- add w5,w5,w10
- eor v25.16b,v5.16b,v6.16b
- add w6,w6,w11
- eor v26.16b,v9.16b,v10.16b
- add w7,w7,w12
- eor v27.16b,v13.16b,v14.16b
- add w8,w8,w9
- eor v28.16b,v17.16b,v18.16b
- eor w21,w21,w5
- eor v29.16b,v21.16b,v22.16b
- eor w17,w17,w6
- ushr v1.4s,v24.4s,#25
- eor w19,w19,w7
- ushr v5.4s,v25.4s,#25
- eor w20,w20,w8
- ushr v9.4s,v26.4s,#25
- ror w21,w21,#24
- ushr v13.4s,v27.4s,#25
- ror w17,w17,#24
- ushr v17.4s,v28.4s,#25
- ror w19,w19,#24
- ushr v21.4s,v29.4s,#25
- ror w20,w20,#24
- sli v1.4s,v24.4s,#7
- add w15,w15,w21
- sli v5.4s,v25.4s,#7
- add w16,w16,w17
- sli v9.4s,v26.4s,#7
- add w13,w13,w19
- sli v13.4s,v27.4s,#7
- add w14,w14,w20
- sli v17.4s,v28.4s,#7
- eor w10,w10,w15
- sli v21.4s,v29.4s,#7
- eor w11,w11,w16
- ext v2.16b,v2.16b,v2.16b,#8
- eor w12,w12,w13
- ext v6.16b,v6.16b,v6.16b,#8
- eor w9,w9,w14
- ext v10.16b,v10.16b,v10.16b,#8
- ror w10,w10,#25
- ext v14.16b,v14.16b,v14.16b,#8
- ror w11,w11,#25
- ext v18.16b,v18.16b,v18.16b,#8
- ror w12,w12,#25
- ext v22.16b,v22.16b,v22.16b,#8
- ror w9,w9,#25
- ext v3.16b,v3.16b,v3.16b,#12
- ext v7.16b,v7.16b,v7.16b,#12
- ext v11.16b,v11.16b,v11.16b,#12
- ext v15.16b,v15.16b,v15.16b,#12
- ext v19.16b,v19.16b,v19.16b,#12
- ext v23.16b,v23.16b,v23.16b,#12
- ext v1.16b,v1.16b,v1.16b,#4
- ext v5.16b,v5.16b,v5.16b,#4
- ext v9.16b,v9.16b,v9.16b,#4
- ext v13.16b,v13.16b,v13.16b,#4
- ext v17.16b,v17.16b,v17.16b,#4
- ext v21.16b,v21.16b,v21.16b,#4
- add v0.4s,v0.4s,v1.4s
- add w5,w5,w9
- add v4.4s,v4.4s,v5.4s
- add w6,w6,w10
- add v8.4s,v8.4s,v9.4s
- add w7,w7,w11
- add v12.4s,v12.4s,v13.4s
- add w8,w8,w12
- add v16.4s,v16.4s,v17.4s
- eor w17,w17,w5
- add v20.4s,v20.4s,v21.4s
- eor w19,w19,w6
- eor v3.16b,v3.16b,v0.16b
- eor w20,w20,w7
- eor v7.16b,v7.16b,v4.16b
- eor w21,w21,w8
- eor v11.16b,v11.16b,v8.16b
- ror w17,w17,#16
- eor v15.16b,v15.16b,v12.16b
- ror w19,w19,#16
- eor v19.16b,v19.16b,v16.16b
- ror w20,w20,#16
- eor v23.16b,v23.16b,v20.16b
- ror w21,w21,#16
- rev32 v3.8h,v3.8h
- add w13,w13,w17
- rev32 v7.8h,v7.8h
- add w14,w14,w19
- rev32 v11.8h,v11.8h
- add w15,w15,w20
- rev32 v15.8h,v15.8h
- add w16,w16,w21
- rev32 v19.8h,v19.8h
- eor w9,w9,w13
- rev32 v23.8h,v23.8h
- eor w10,w10,w14
- add v2.4s,v2.4s,v3.4s
- eor w11,w11,w15
- add v6.4s,v6.4s,v7.4s
- eor w12,w12,w16
- add v10.4s,v10.4s,v11.4s
- ror w9,w9,#20
- add v14.4s,v14.4s,v15.4s
- ror w10,w10,#20
- add v18.4s,v18.4s,v19.4s
- ror w11,w11,#20
- add v22.4s,v22.4s,v23.4s
- ror w12,w12,#20
- eor v24.16b,v1.16b,v2.16b
- add w5,w5,w9
- eor v25.16b,v5.16b,v6.16b
- add w6,w6,w10
- eor v26.16b,v9.16b,v10.16b
- add w7,w7,w11
- eor v27.16b,v13.16b,v14.16b
- add w8,w8,w12
- eor v28.16b,v17.16b,v18.16b
- eor w17,w17,w5
- eor v29.16b,v21.16b,v22.16b
- eor w19,w19,w6
- ushr v1.4s,v24.4s,#20
- eor w20,w20,w7
- ushr v5.4s,v25.4s,#20
- eor w21,w21,w8
- ushr v9.4s,v26.4s,#20
- ror w17,w17,#24
- ushr v13.4s,v27.4s,#20
- ror w19,w19,#24
- ushr v17.4s,v28.4s,#20
- ror w20,w20,#24
- ushr v21.4s,v29.4s,#20
- ror w21,w21,#24
- sli v1.4s,v24.4s,#12
- add w13,w13,w17
- sli v5.4s,v25.4s,#12
- add w14,w14,w19
- sli v9.4s,v26.4s,#12
- add w15,w15,w20
- sli v13.4s,v27.4s,#12
- add w16,w16,w21
- sli v17.4s,v28.4s,#12
- eor w9,w9,w13
- sli v21.4s,v29.4s,#12
- eor w10,w10,w14
- add v0.4s,v0.4s,v1.4s
- eor w11,w11,w15
- add v4.4s,v4.4s,v5.4s
- eor w12,w12,w16
- add v8.4s,v8.4s,v9.4s
- ror w9,w9,#25
- add v12.4s,v12.4s,v13.4s
- ror w10,w10,#25
- add v16.4s,v16.4s,v17.4s
- ror w11,w11,#25
- add v20.4s,v20.4s,v21.4s
- ror w12,w12,#25
- eor v24.16b,v3.16b,v0.16b
- add w5,w5,w10
- eor v25.16b,v7.16b,v4.16b
- add w6,w6,w11
- eor v26.16b,v11.16b,v8.16b
- add w7,w7,w12
- eor v27.16b,v15.16b,v12.16b
- add w8,w8,w9
- eor v28.16b,v19.16b,v16.16b
- eor w21,w21,w5
- eor v29.16b,v23.16b,v20.16b
- eor w17,w17,w6
- ushr v3.4s,v24.4s,#24
- eor w19,w19,w7
- ushr v7.4s,v25.4s,#24
- eor w20,w20,w8
- ushr v11.4s,v26.4s,#24
- ror w21,w21,#16
- ushr v15.4s,v27.4s,#24
- ror w17,w17,#16
- ushr v19.4s,v28.4s,#24
- ror w19,w19,#16
- ushr v23.4s,v29.4s,#24
- ror w20,w20,#16
- sli v3.4s,v24.4s,#8
- add w15,w15,w21
- sli v7.4s,v25.4s,#8
- add w16,w16,w17
- sli v11.4s,v26.4s,#8
- add w13,w13,w19
- sli v15.4s,v27.4s,#8
- add w14,w14,w20
- sli v19.4s,v28.4s,#8
- eor w10,w10,w15
- sli v23.4s,v29.4s,#8
- eor w11,w11,w16
- add v2.4s,v2.4s,v3.4s
- eor w12,w12,w13
- add v6.4s,v6.4s,v7.4s
- eor w9,w9,w14
- add v10.4s,v10.4s,v11.4s
- ror w10,w10,#20
- add v14.4s,v14.4s,v15.4s
- ror w11,w11,#20
- add v18.4s,v18.4s,v19.4s
- ror w12,w12,#20
- add v22.4s,v22.4s,v23.4s
- ror w9,w9,#20
- eor v24.16b,v1.16b,v2.16b
- add w5,w5,w10
- eor v25.16b,v5.16b,v6.16b
- add w6,w6,w11
- eor v26.16b,v9.16b,v10.16b
- add w7,w7,w12
- eor v27.16b,v13.16b,v14.16b
- add w8,w8,w9
- eor v28.16b,v17.16b,v18.16b
- eor w21,w21,w5
- eor v29.16b,v21.16b,v22.16b
- eor w17,w17,w6
- ushr v1.4s,v24.4s,#25
- eor w19,w19,w7
- ushr v5.4s,v25.4s,#25
- eor w20,w20,w8
- ushr v9.4s,v26.4s,#25
- ror w21,w21,#24
- ushr v13.4s,v27.4s,#25
- ror w17,w17,#24
- ushr v17.4s,v28.4s,#25
- ror w19,w19,#24
- ushr v21.4s,v29.4s,#25
- ror w20,w20,#24
- sli v1.4s,v24.4s,#7
- add w15,w15,w21
- sli v5.4s,v25.4s,#7
- add w16,w16,w17
- sli v9.4s,v26.4s,#7
- add w13,w13,w19
- sli v13.4s,v27.4s,#7
- add w14,w14,w20
- sli v17.4s,v28.4s,#7
- eor w10,w10,w15
- sli v21.4s,v29.4s,#7
- eor w11,w11,w16
- ext v2.16b,v2.16b,v2.16b,#8
- eor w12,w12,w13
- ext v6.16b,v6.16b,v6.16b,#8
- eor w9,w9,w14
- ext v10.16b,v10.16b,v10.16b,#8
- ror w10,w10,#25
- ext v14.16b,v14.16b,v14.16b,#8
- ror w11,w11,#25
- ext v18.16b,v18.16b,v18.16b,#8
- ror w12,w12,#25
- ext v22.16b,v22.16b,v22.16b,#8
- ror w9,w9,#25
- ext v3.16b,v3.16b,v3.16b,#4
- ext v7.16b,v7.16b,v7.16b,#4
- ext v11.16b,v11.16b,v11.16b,#4
- ext v15.16b,v15.16b,v15.16b,#4
- ext v19.16b,v19.16b,v19.16b,#4
- ext v23.16b,v23.16b,v23.16b,#4
- ext v1.16b,v1.16b,v1.16b,#12
- ext v5.16b,v5.16b,v5.16b,#12
- ext v9.16b,v9.16b,v9.16b,#12
- ext v13.16b,v13.16b,v13.16b,#12
- ext v17.16b,v17.16b,v17.16b,#12
- ext v21.16b,v21.16b,v21.16b,#12
- cbnz x4,.Loop_lower_neon
-
- add w5,w5,w22 // accumulate key block
- ldp q24,q25,[sp,#0]
- add x6,x6,x22,lsr#32
- ldp q26,q27,[sp,#32]
- add w7,w7,w23
- ldp q28,q29,[sp,#64]
- add x8,x8,x23,lsr#32
- add v0.4s,v0.4s,v24.4s
- add w9,w9,w24
- add v4.4s,v4.4s,v24.4s
- add x10,x10,x24,lsr#32
- add v8.4s,v8.4s,v24.4s
- add w11,w11,w25
- add v12.4s,v12.4s,v24.4s
- add x12,x12,x25,lsr#32
- add v16.4s,v16.4s,v24.4s
- add w13,w13,w26
- add v20.4s,v20.4s,v24.4s
- add x14,x14,x26,lsr#32
- add v2.4s,v2.4s,v26.4s
- add w15,w15,w27
- add v6.4s,v6.4s,v26.4s
- add x16,x16,x27,lsr#32
- add v10.4s,v10.4s,v26.4s
- add w17,w17,w28
- add v14.4s,v14.4s,v26.4s
- add x19,x19,x28,lsr#32
- add v18.4s,v18.4s,v26.4s
- add w20,w20,w30
- add v22.4s,v22.4s,v26.4s
- add x21,x21,x30,lsr#32
- add v19.4s,v19.4s,v31.4s // +4
- add x5,x5,x6,lsl#32 // pack
- add v23.4s,v23.4s,v31.4s // +4
- add x7,x7,x8,lsl#32
- add v3.4s,v3.4s,v27.4s
- ldp x6,x8,[x1,#0] // load input
- add v7.4s,v7.4s,v28.4s
- add x9,x9,x10,lsl#32
- add v11.4s,v11.4s,v29.4s
- add x11,x11,x12,lsl#32
- add v15.4s,v15.4s,v30.4s
- ldp x10,x12,[x1,#16]
- add v19.4s,v19.4s,v27.4s
- add x13,x13,x14,lsl#32
- add v23.4s,v23.4s,v28.4s
- add x15,x15,x16,lsl#32
- add v1.4s,v1.4s,v25.4s
- ldp x14,x16,[x1,#32]
- add v5.4s,v5.4s,v25.4s
- add x17,x17,x19,lsl#32
- add v9.4s,v9.4s,v25.4s
- add x20,x20,x21,lsl#32
- add v13.4s,v13.4s,v25.4s
- ldp x19,x21,[x1,#48]
- add v17.4s,v17.4s,v25.4s
- add x1,x1,#64
- add v21.4s,v21.4s,v25.4s
-
-#ifdef __AARCH64EB__
- rev x5,x5
- rev x7,x7
- rev x9,x9
- rev x11,x11
- rev x13,x13
- rev x15,x15
- rev x17,x17
- rev x20,x20
-#endif
- ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64
- eor x5,x5,x6
- eor x7,x7,x8
- eor x9,x9,x10
- eor x11,x11,x12
- eor x13,x13,x14
- eor v0.16b,v0.16b,v24.16b
- eor x15,x15,x16
- eor v1.16b,v1.16b,v25.16b
- eor x17,x17,x19
- eor v2.16b,v2.16b,v26.16b
- eor x20,x20,x21
- eor v3.16b,v3.16b,v27.16b
- ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64
-
- stp x5,x7,[x0,#0] // store output
- add x28,x28,#7 // increment counter
- stp x9,x11,[x0,#16]
- stp x13,x15,[x0,#32]
- stp x17,x20,[x0,#48]
- add x0,x0,#64
- st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
-
- ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
- eor v4.16b,v4.16b,v24.16b
- eor v5.16b,v5.16b,v25.16b
- eor v6.16b,v6.16b,v26.16b
- eor v7.16b,v7.16b,v27.16b
- st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
-
- ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
- eor v8.16b,v8.16b,v0.16b
- ldp q24,q25,[sp,#0]
- eor v9.16b,v9.16b,v1.16b
- ldp q26,q27,[sp,#32]
- eor v10.16b,v10.16b,v2.16b
- eor v11.16b,v11.16b,v3.16b
- st1 {v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64
-
- ld1 {v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64
- eor v12.16b,v12.16b,v4.16b
- eor v13.16b,v13.16b,v5.16b
- eor v14.16b,v14.16b,v6.16b
- eor v15.16b,v15.16b,v7.16b
- st1 {v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64
-
- ld1 {v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64
- eor v16.16b,v16.16b,v8.16b
- eor v17.16b,v17.16b,v9.16b
- eor v18.16b,v18.16b,v10.16b
- eor v19.16b,v19.16b,v11.16b
- st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
-
- shl v0.4s,v31.4s,#1 // 4 -> 8
- eor v20.16b,v20.16b,v12.16b
- eor v21.16b,v21.16b,v13.16b
- eor v22.16b,v22.16b,v14.16b
- eor v23.16b,v23.16b,v15.16b
- st1 {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64
-
- add v27.4s,v27.4s,v0.4s // += 8
- add v28.4s,v28.4s,v0.4s
- add v29.4s,v29.4s,v0.4s
- add v30.4s,v30.4s,v0.4s
-
- b.hs .Loop_outer_512_neon
-
- adds x2,x2,#512
- ushr v0.4s,v31.4s,#2 // 4 -> 1
-
- ldp d8,d9,[sp,#128+0] // meet ABI requirements
- ldp d10,d11,[sp,#128+16]
- ldp d12,d13,[sp,#128+32]
- ldp d14,d15,[sp,#128+48]
-
- stp q24,q31,[sp,#0] // wipe off-load area
- stp q24,q31,[sp,#32]
- stp q24,q31,[sp,#64]
-
- b.eq .Ldone_512_neon
-
- cmp x2,#192
- sub v27.4s,v27.4s,v0.4s // -= 1
- sub v28.4s,v28.4s,v0.4s
- sub v29.4s,v29.4s,v0.4s
- add sp,sp,#128
- b.hs .Loop_outer_neon
-
- eor v25.16b,v25.16b,v25.16b
- eor v26.16b,v26.16b,v26.16b
- eor v27.16b,v27.16b,v27.16b
- eor v28.16b,v28.16b,v28.16b
- eor v29.16b,v29.16b,v29.16b
- eor v30.16b,v30.16b,v30.16b
- b .Loop_outer
-
-.Ldone_512_neon:
- ldp x19,x20,[x29,#16]
- add sp,sp,#128+64
- ldp x21,x22,[x29,#32]
- ldp x23,x24,[x29,#48]
- ldp x25,x26,[x29,#64]
- ldp x27,x28,[x29,#80]
- ldp x29,x30,[sp],#96
-.Labort_neon:
- ret
-ENDPROC(chacha20_neon)
-#endif
diff --git a/src/crypto/zinc/chacha20/chacha20-arm64.pl b/src/crypto/zinc/chacha20/chacha20-arm64.pl
new file mode 100644
index 0000000..7926c8d
--- /dev/null
+++ b/src/crypto/zinc/chacha20/chacha20-arm64.pl
@@ -0,0 +1,1164 @@
+#!/usr/bin/env perl
+# SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
+#
+# This code is taken from the OpenSSL project but the author, Andy Polyakov,
+# has relicensed it under the licenses specified in the SPDX header above.
+# The original headers, including the original license headers, are
+# included below for completeness.
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# June 2015
+#
+# ChaCha20 for ARMv8.
+#
+# Performance in cycles per byte out of large buffer.
+#
+# IALU/gcc-4.9 3xNEON+1xIALU 6xNEON+2xIALU(*)
+#
+# Apple A7 5.50/+49% 3.33 1.70
+# Cortex-A53 8.40/+80% 4.72 4.72(**)
+# Cortex-A57 8.06/+43% 4.90 4.43(***)
+# Denver 4.50/+82% 2.63 2.67(**)
+# X-Gene 9.50/+46% 8.82 8.89(**)
+# Mongoose 8.00/+44% 3.64 3.25(***)
+# Kryo 8.17/+50% 4.83 4.65(***)
+#
+# (*) since no non-Apple processor exhibits significantly better
+# performance, the code path is #ifdef __APPLE__-ed;
+# (**) it's expected that doubling interleave factor doesn't help
+# all processors, only those with higher NEON latency and
+# higher instruction issue rate;
+# (***) expected improvement was actually higher;
+
+$flavour=shift;
+if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
+else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
+
+if ($flavour && $flavour ne "void") {
+ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+ ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+ ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+ die "can't locate arm-xlate.pl";
+
+ open STDOUT,"| \"$^X\" $xlate $flavour $output";
+} else {
+ open STDOUT,">$output";
+}
+
+sub AUTOLOAD() # thunk [simplified] x86-style perlasm
+{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
+ my $arg = pop;
+ $arg = "#$arg" if ($arg*1 eq $arg);
+ $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
+}
+
+my ($out,$inp,$len,$key,$ctr) = map("x$_",(0..4));
+
+my @x=map("x$_",(5..17,19..21));
+my @d=map("x$_",(22..28,30));
+
+sub ROUND {
+my ($a0,$b0,$c0,$d0)=@_;
+my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
+my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
+my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
+
+ (
+ "&add_32 (@x[$a0],@x[$a0],@x[$b0])",
+ "&add_32 (@x[$a1],@x[$a1],@x[$b1])",
+ "&add_32 (@x[$a2],@x[$a2],@x[$b2])",
+ "&add_32 (@x[$a3],@x[$a3],@x[$b3])",
+ "&eor_32 (@x[$d0],@x[$d0],@x[$a0])",
+ "&eor_32 (@x[$d1],@x[$d1],@x[$a1])",
+ "&eor_32 (@x[$d2],@x[$d2],@x[$a2])",
+ "&eor_32 (@x[$d3],@x[$d3],@x[$a3])",
+ "&ror_32 (@x[$d0],@x[$d0],16)",
+ "&ror_32 (@x[$d1],@x[$d1],16)",
+ "&ror_32 (@x[$d2],@x[$d2],16)",
+ "&ror_32 (@x[$d3],@x[$d3],16)",
+
+ "&add_32 (@x[$c0],@x[$c0],@x[$d0])",
+ "&add_32 (@x[$c1],@x[$c1],@x[$d1])",
+ "&add_32 (@x[$c2],@x[$c2],@x[$d2])",
+ "&add_32 (@x[$c3],@x[$c3],@x[$d3])",
+ "&eor_32 (@x[$b0],@x[$b0],@x[$c0])",
+ "&eor_32 (@x[$b1],@x[$b1],@x[$c1])",
+ "&eor_32 (@x[$b2],@x[$b2],@x[$c2])",
+ "&eor_32 (@x[$b3],@x[$b3],@x[$c3])",
+ "&ror_32 (@x[$b0],@x[$b0],20)",
+ "&ror_32 (@x[$b1],@x[$b1],20)",
+ "&ror_32 (@x[$b2],@x[$b2],20)",
+ "&ror_32 (@x[$b3],@x[$b3],20)",
+
+ "&add_32 (@x[$a0],@x[$a0],@x[$b0])",
+ "&add_32 (@x[$a1],@x[$a1],@x[$b1])",
+ "&add_32 (@x[$a2],@x[$a2],@x[$b2])",
+ "&add_32 (@x[$a3],@x[$a3],@x[$b3])",
+ "&eor_32 (@x[$d0],@x[$d0],@x[$a0])",
+ "&eor_32 (@x[$d1],@x[$d1],@x[$a1])",
+ "&eor_32 (@x[$d2],@x[$d2],@x[$a2])",
+ "&eor_32 (@x[$d3],@x[$d3],@x[$a3])",
+ "&ror_32 (@x[$d0],@x[$d0],24)",
+ "&ror_32 (@x[$d1],@x[$d1],24)",
+ "&ror_32 (@x[$d2],@x[$d2],24)",
+ "&ror_32 (@x[$d3],@x[$d3],24)",
+
+ "&add_32 (@x[$c0],@x[$c0],@x[$d0])",
+ "&add_32 (@x[$c1],@x[$c1],@x[$d1])",
+ "&add_32 (@x[$c2],@x[$c2],@x[$d2])",
+ "&add_32 (@x[$c3],@x[$c3],@x[$d3])",
+ "&eor_32 (@x[$b0],@x[$b0],@x[$c0])",
+ "&eor_32 (@x[$b1],@x[$b1],@x[$c1])",
+ "&eor_32 (@x[$b2],@x[$b2],@x[$c2])",
+ "&eor_32 (@x[$b3],@x[$b3],@x[$c3])",
+ "&ror_32 (@x[$b0],@x[$b0],25)",
+ "&ror_32 (@x[$b1],@x[$b1],25)",
+ "&ror_32 (@x[$b2],@x[$b2],25)",
+ "&ror_32 (@x[$b3],@x[$b3],25)"
+ );
+}
+
+$code.=<<___;
+#ifndef __KERNEL__
+# include "arm_arch.h"
+.extern OPENSSL_armcap_P
+#else
+# define ChaCha20_ctr32 chacha20_arm
+# define ChaCha20_neon chacha20_neon
+#endif
+
+.text
+
+.align 5
+.Lsigma:
+.quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral
+.Lone:
+.long 1,0,0,0
+#ifndef __KERNEL__
+.LOPENSSL_armcap_P:
+# ifdef __ILP32__
+.long OPENSSL_armcap_P-.
+# else
+.quad OPENSSL_armcap_P-.
+# endif
+#endif
+
+.globl ChaCha20_ctr32
+.type ChaCha20_ctr32,%function
+.align 5
+ChaCha20_ctr32:
+ cbz $len,.Labort
+#ifndef __KERNEL__
+ adr @x[0],.LOPENSSL_armcap_P
+ cmp $len,#192
+ b.lo .Lshort
+# ifdef __ILP32__
+ ldrsw @x[1],[@x[0]]
+# else
+ ldr @x[1],[@x[0]]
+# endif
+ ldr w17,[@x[1],@x[0]]
+ tst w17,#ARMV7_NEON
+ b.ne ChaCha20_neon
+
+.Lshort:
+#endif
+ stp x29,x30,[sp,#-96]!
+ add x29,sp,#0
+
+ adr @x[0],.Lsigma
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+ stp x23,x24,[sp,#48]
+ stp x25,x26,[sp,#64]
+ stp x27,x28,[sp,#80]
+ sub sp,sp,#64
+
+ ldp @d[0],@d[1],[@x[0]] // load sigma
+ ldp @d[2],@d[3],[$key] // load key
+ ldp @d[4],@d[5],[$key,#16]
+ ldp @d[6],@d[7],[$ctr] // load counter
+#ifdef __AARCH64EB__
+ ror @d[2],@d[2],#32
+ ror @d[3],@d[3],#32
+ ror @d[4],@d[4],#32
+ ror @d[5],@d[5],#32
+ ror @d[6],@d[6],#32
+ ror @d[7],@d[7],#32
+#endif
+
+.Loop_outer:
+ mov.32 @x[0],@d[0] // unpack key block
+ lsr @x[1],@d[0],#32
+ mov.32 @x[2],@d[1]
+ lsr @x[3],@d[1],#32
+ mov.32 @x[4],@d[2]
+ lsr @x[5],@d[2],#32
+ mov.32 @x[6],@d[3]
+ lsr @x[7],@d[3],#32
+ mov.32 @x[8],@d[4]
+ lsr @x[9],@d[4],#32
+ mov.32 @x[10],@d[5]
+ lsr @x[11],@d[5],#32
+ mov.32 @x[12],@d[6]
+ lsr @x[13],@d[6],#32
+ mov.32 @x[14],@d[7]
+ lsr @x[15],@d[7],#32
+
+ mov $ctr,#10
+ subs $len,$len,#64
+.Loop:
+ sub $ctr,$ctr,#1
+___
+ foreach (&ROUND(0, 4, 8,12)) { eval; }
+ foreach (&ROUND(0, 5,10,15)) { eval; }
+$code.=<<___;
+ cbnz $ctr,.Loop
+
+ add.32 @x[0],@x[0],@d[0] // accumulate key block
+ add @x[1],@x[1],@d[0],lsr#32
+ add.32 @x[2],@x[2],@d[1]
+ add @x[3],@x[3],@d[1],lsr#32
+ add.32 @x[4],@x[4],@d[2]
+ add @x[5],@x[5],@d[2],lsr#32
+ add.32 @x[6],@x[6],@d[3]
+ add @x[7],@x[7],@d[3],lsr#32
+ add.32 @x[8],@x[8],@d[4]
+ add @x[9],@x[9],@d[4],lsr#32
+ add.32 @x[10],@x[10],@d[5]
+ add @x[11],@x[11],@d[5],lsr#32
+ add.32 @x[12],@x[12],@d[6]
+ add @x[13],@x[13],@d[6],lsr#32
+ add.32 @x[14],@x[14],@d[7]
+ add @x[15],@x[15],@d[7],lsr#32
+
+ b.lo .Ltail
+
+ add @x[0],@x[0],@x[1],lsl#32 // pack
+ add @x[2],@x[2],@x[3],lsl#32
+ ldp @x[1],@x[3],[$inp,#0] // load input
+ add @x[4],@x[4],@x[5],lsl#32
+ add @x[6],@x[6],@x[7],lsl#32
+ ldp @x[5],@x[7],[$inp,#16]
+ add @x[8],@x[8],@x[9],lsl#32
+ add @x[10],@x[10],@x[11],lsl#32
+ ldp @x[9],@x[11],[$inp,#32]
+ add @x[12],@x[12],@x[13],lsl#32
+ add @x[14],@x[14],@x[15],lsl#32
+ ldp @x[13],@x[15],[$inp,#48]
+ add $inp,$inp,#64
+#ifdef __AARCH64EB__
+ rev @x[0],@x[0]
+ rev @x[2],@x[2]
+ rev @x[4],@x[4]
+ rev @x[6],@x[6]
+ rev @x[8],@x[8]
+ rev @x[10],@x[10]
+ rev @x[12],@x[12]
+ rev @x[14],@x[14]
+#endif
+ eor @x[0],@x[0],@x[1]
+ eor @x[2],@x[2],@x[3]
+ eor @x[4],@x[4],@x[5]
+ eor @x[6],@x[6],@x[7]
+ eor @x[8],@x[8],@x[9]
+ eor @x[10],@x[10],@x[11]
+ eor @x[12],@x[12],@x[13]
+ eor @x[14],@x[14],@x[15]
+
+ stp @x[0],@x[2],[$out,#0] // store output
+ add @d[6],@d[6],#1 // increment counter
+ stp @x[4],@x[6],[$out,#16]
+ stp @x[8],@x[10],[$out,#32]
+ stp @x[12],@x[14],[$out,#48]
+ add $out,$out,#64
+
+ b.hi .Loop_outer
+
+ ldp x19,x20,[x29,#16]
+ add sp,sp,#64
+ ldp x21,x22,[x29,#32]
+ ldp x23,x24,[x29,#48]
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldp x29,x30,[sp],#96
+.Labort:
+ ret
+
+.align 4
+.Ltail:
+ add $len,$len,#64
+.Less_than_64:
+ sub $out,$out,#1
+ add $inp,$inp,$len
+ add $out,$out,$len
+ add $ctr,sp,$len
+ neg $len,$len
+
+ add @x[0],@x[0],@x[1],lsl#32 // pack
+ add @x[2],@x[2],@x[3],lsl#32
+ add @x[4],@x[4],@x[5],lsl#32
+ add @x[6],@x[6],@x[7],lsl#32
+ add @x[8],@x[8],@x[9],lsl#32
+ add @x[10],@x[10],@x[11],lsl#32
+ add @x[12],@x[12],@x[13],lsl#32
+ add @x[14],@x[14],@x[15],lsl#32
+#ifdef __AARCH64EB__
+ rev @x[0],@x[0]
+ rev @x[2],@x[2]
+ rev @x[4],@x[4]
+ rev @x[6],@x[6]
+ rev @x[8],@x[8]
+ rev @x[10],@x[10]
+ rev @x[12],@x[12]
+ rev @x[14],@x[14]
+#endif
+ stp @x[0],@x[2],[sp,#0]
+ stp @x[4],@x[6],[sp,#16]
+ stp @x[8],@x[10],[sp,#32]
+ stp @x[12],@x[14],[sp,#48]
+
+.Loop_tail:
+ ldrb w10,[$inp,$len]
+ ldrb w11,[$ctr,$len]
+ add $len,$len,#1
+ eor w10,w10,w11
+ strb w10,[$out,$len]
+ cbnz $len,.Loop_tail
+
+ stp xzr,xzr,[sp,#0]
+ stp xzr,xzr,[sp,#16]
+ stp xzr,xzr,[sp,#32]
+ stp xzr,xzr,[sp,#48]
+
+ ldp x19,x20,[x29,#16]
+ add sp,sp,#64
+ ldp x21,x22,[x29,#32]
+ ldp x23,x24,[x29,#48]
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldp x29,x30,[sp],#96
+ ret
+.size ChaCha20_ctr32,.-ChaCha20_ctr32
+___
+
+{{{
+my ($A0,$B0,$C0,$D0,$A1,$B1,$C1,$D1,$A2,$B2,$C2,$D2,$T0,$T1,$T2,$T3) =
+ map("v$_.4s",(0..7,16..23));
+my (@K)=map("v$_.4s",(24..30));
+my $ONE="v31.4s";
+
+sub NEONROUND {
+my $odd = pop;
+my ($a,$b,$c,$d,$t)=@_;
+
+ (
+ "&add ('$a','$a','$b')",
+ "&eor ('$d','$d','$a')",
+ "&rev32_16 ('$d','$d')", # vrot ($d,16)
+
+ "&add ('$c','$c','$d')",
+ "&eor ('$t','$b','$c')",
+ "&ushr ('$b','$t',20)",
+ "&sli ('$b','$t',12)",
+
+ "&add ('$a','$a','$b')",
+ "&eor ('$t','$d','$a')",
+ "&ushr ('$d','$t',24)",
+ "&sli ('$d','$t',8)",
+
+ "&add ('$c','$c','$d')",
+ "&eor ('$t','$b','$c')",
+ "&ushr ('$b','$t',25)",
+ "&sli ('$b','$t',7)",
+
+ "&ext ('$c','$c','$c',8)",
+ "&ext ('$d','$d','$d',$odd?4:12)",
+ "&ext ('$b','$b','$b',$odd?12:4)"
+ );
+}
+
+$code.=<<___;
+
+#ifdef __KERNEL__
+.globl ChaCha20_neon
+.type ChaCha20_neon,%function
+#endif
+.type ChaCha20_neon,%function
+.align 5
+ChaCha20_neon:
+ stp x29,x30,[sp,#-96]!
+ add x29,sp,#0
+
+ adr @x[0],.Lsigma
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+ stp x23,x24,[sp,#48]
+ stp x25,x26,[sp,#64]
+ stp x27,x28,[sp,#80]
+#ifdef __APPLE__
+ cmp $len,#512
+ b.hs .L512_or_more_neon
+#endif
+
+ sub sp,sp,#64
+
+ ldp @d[0],@d[1],[@x[0]] // load sigma
+ ld1 {@K[0]},[@x[0]],#16
+ ldp @d[2],@d[3],[$key] // load key
+ ldp @d[4],@d[5],[$key,#16]
+ ld1 {@K[1],@K[2]},[$key]
+ ldp @d[6],@d[7],[$ctr] // load counter
+ ld1 {@K[3]},[$ctr]
+ ld1 {$ONE},[@x[0]]
+#ifdef __AARCH64EB__
+ rev64 @K[0],@K[0]
+ ror @d[2],@d[2],#32
+ ror @d[3],@d[3],#32
+ ror @d[4],@d[4],#32
+ ror @d[5],@d[5],#32
+ ror @d[6],@d[6],#32
+ ror @d[7],@d[7],#32
+#endif
+ add @K[3],@K[3],$ONE // += 1
+ add @K[4],@K[3],$ONE
+ add @K[5],@K[4],$ONE
+ shl $ONE,$ONE,#2 // 1 -> 4
+
+.Loop_outer_neon:
+ mov.32 @x[0],@d[0] // unpack key block
+ lsr @x[1],@d[0],#32
+ mov $A0,@K[0]
+ mov.32 @x[2],@d[1]
+ lsr @x[3],@d[1],#32
+ mov $A1,@K[0]
+ mov.32 @x[4],@d[2]
+ lsr @x[5],@d[2],#32
+ mov $A2,@K[0]
+ mov.32 @x[6],@d[3]
+ mov $B0,@K[1]
+ lsr @x[7],@d[3],#32
+ mov $B1,@K[1]
+ mov.32 @x[8],@d[4]
+ mov $B2,@K[1]
+ lsr @x[9],@d[4],#32
+ mov $D0,@K[3]
+ mov.32 @x[10],@d[5]
+ mov $D1,@K[4]
+ lsr @x[11],@d[5],#32
+ mov $D2,@K[5]
+ mov.32 @x[12],@d[6]
+ mov $C0,@K[2]
+ lsr @x[13],@d[6],#32
+ mov $C1,@K[2]
+ mov.32 @x[14],@d[7]
+ mov $C2,@K[2]
+ lsr @x[15],@d[7],#32
+
+ mov $ctr,#10
+ subs $len,$len,#256
+.Loop_neon:
+ sub $ctr,$ctr,#1
+___
+ my @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,0);
+ my @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,0);
+ my @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,0);
+ my @thread3=&ROUND(0,4,8,12);
+
+ foreach (@thread0) {
+ eval; eval(shift(@thread3));
+ eval(shift(@thread1)); eval(shift(@thread3));
+ eval(shift(@thread2)); eval(shift(@thread3));
+ }
+
+ @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,1);
+ @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,1);
+ @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,1);
+ @thread3=&ROUND(0,5,10,15);
+
+ foreach (@thread0) {
+ eval; eval(shift(@thread3));
+ eval(shift(@thread1)); eval(shift(@thread3));
+ eval(shift(@thread2)); eval(shift(@thread3));
+ }
+$code.=<<___;
+ cbnz $ctr,.Loop_neon
+
+ add.32 @x[0],@x[0],@d[0] // accumulate key block
+ add $A0,$A0,@K[0]
+ add @x[1],@x[1],@d[0],lsr#32
+ add $A1,$A1,@K[0]
+ add.32 @x[2],@x[2],@d[1]
+ add $A2,$A2,@K[0]
+ add @x[3],@x[3],@d[1],lsr#32
+ add $C0,$C0,@K[2]
+ add.32 @x[4],@x[4],@d[2]
+ add $C1,$C1,@K[2]
+ add @x[5],@x[5],@d[2],lsr#32
+ add $C2,$C2,@K[2]
+ add.32 @x[6],@x[6],@d[3]
+ add $D0,$D0,@K[3]
+ add @x[7],@x[7],@d[3],lsr#32
+ add.32 @x[8],@x[8],@d[4]
+ add $D1,$D1,@K[4]
+ add @x[9],@x[9],@d[4],lsr#32
+ add.32 @x[10],@x[10],@d[5]
+ add $D2,$D2,@K[5]
+ add @x[11],@x[11],@d[5],lsr#32
+ add.32 @x[12],@x[12],@d[6]
+ add $B0,$B0,@K[1]
+ add @x[13],@x[13],@d[6],lsr#32
+ add.32 @x[14],@x[14],@d[7]
+ add $B1,$B1,@K[1]
+ add @x[15],@x[15],@d[7],lsr#32
+ add $B2,$B2,@K[1]
+
+ b.lo .Ltail_neon
+
+ add @x[0],@x[0],@x[1],lsl#32 // pack
+ add @x[2],@x[2],@x[3],lsl#32
+ ldp @x[1],@x[3],[$inp,#0] // load input
+ add @x[4],@x[4],@x[5],lsl#32
+ add @x[6],@x[6],@x[7],lsl#32
+ ldp @x[5],@x[7],[$inp,#16]
+ add @x[8],@x[8],@x[9],lsl#32
+ add @x[10],@x[10],@x[11],lsl#32
+ ldp @x[9],@x[11],[$inp,#32]
+ add @x[12],@x[12],@x[13],lsl#32
+ add @x[14],@x[14],@x[15],lsl#32
+ ldp @x[13],@x[15],[$inp,#48]
+ add $inp,$inp,#64
+#ifdef __AARCH64EB__
+ rev @x[0],@x[0]
+ rev @x[2],@x[2]
+ rev @x[4],@x[4]
+ rev @x[6],@x[6]
+ rev @x[8],@x[8]
+ rev @x[10],@x[10]
+ rev @x[12],@x[12]
+ rev @x[14],@x[14]
+#endif
+ ld1.8 {$T0-$T3},[$inp],#64
+ eor @x[0],@x[0],@x[1]
+ eor @x[2],@x[2],@x[3]
+ eor @x[4],@x[4],@x[5]
+ eor @x[6],@x[6],@x[7]
+ eor @x[8],@x[8],@x[9]
+ eor $A0,$A0,$T0
+ eor @x[10],@x[10],@x[11]
+ eor $B0,$B0,$T1
+ eor @x[12],@x[12],@x[13]
+ eor $C0,$C0,$T2
+ eor @x[14],@x[14],@x[15]
+ eor $D0,$D0,$T3
+ ld1.8 {$T0-$T3},[$inp],#64
+
+ stp @x[0],@x[2],[$out,#0] // store output
+ add @d[6],@d[6],#4 // increment counter
+ stp @x[4],@x[6],[$out,#16]
+ add @K[3],@K[3],$ONE // += 4
+ stp @x[8],@x[10],[$out,#32]
+ add @K[4],@K[4],$ONE
+ stp @x[12],@x[14],[$out,#48]
+ add @K[5],@K[5],$ONE
+ add $out,$out,#64
+
+ st1.8 {$A0-$D0},[$out],#64
+ ld1.8 {$A0-$D0},[$inp],#64
+
+ eor $A1,$A1,$T0
+ eor $B1,$B1,$T1
+ eor $C1,$C1,$T2
+ eor $D1,$D1,$T3
+ st1.8 {$A1-$D1},[$out],#64
+
+ eor $A2,$A2,$A0
+ eor $B2,$B2,$B0
+ eor $C2,$C2,$C0
+ eor $D2,$D2,$D0
+ st1.8 {$A2-$D2},[$out],#64
+
+ b.hi .Loop_outer_neon
+
+ ldp x19,x20,[x29,#16]
+ add sp,sp,#64
+ ldp x21,x22,[x29,#32]
+ ldp x23,x24,[x29,#48]
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldp x29,x30,[sp],#96
+ ret
+
+.Ltail_neon:
+ add $len,$len,#256
+ cmp $len,#64
+ b.lo .Less_than_64
+
+ add @x[0],@x[0],@x[1],lsl#32 // pack
+ add @x[2],@x[2],@x[3],lsl#32
+ ldp @x[1],@x[3],[$inp,#0] // load input
+ add @x[4],@x[4],@x[5],lsl#32
+ add @x[6],@x[6],@x[7],lsl#32
+ ldp @x[5],@x[7],[$inp,#16]
+ add @x[8],@x[8],@x[9],lsl#32
+ add @x[10],@x[10],@x[11],lsl#32
+ ldp @x[9],@x[11],[$inp,#32]
+ add @x[12],@x[12],@x[13],lsl#32
+ add @x[14],@x[14],@x[15],lsl#32
+ ldp @x[13],@x[15],[$inp,#48]
+ add $inp,$inp,#64
+#ifdef __AARCH64EB__
+ rev @x[0],@x[0]
+ rev @x[2],@x[2]
+ rev @x[4],@x[4]
+ rev @x[6],@x[6]
+ rev @x[8],@x[8]
+ rev @x[10],@x[10]
+ rev @x[12],@x[12]
+ rev @x[14],@x[14]
+#endif
+ eor @x[0],@x[0],@x[1]
+ eor @x[2],@x[2],@x[3]
+ eor @x[4],@x[4],@x[5]
+ eor @x[6],@x[6],@x[7]
+ eor @x[8],@x[8],@x[9]
+ eor @x[10],@x[10],@x[11]
+ eor @x[12],@x[12],@x[13]
+ eor @x[14],@x[14],@x[15]
+
+ stp @x[0],@x[2],[$out,#0] // store output
+ add @d[6],@d[6],#4 // increment counter
+ stp @x[4],@x[6],[$out,#16]
+ stp @x[8],@x[10],[$out,#32]
+ stp @x[12],@x[14],[$out,#48]
+ add $out,$out,#64
+ b.eq .Ldone_neon
+ sub $len,$len,#64
+ cmp $len,#64
+ b.lo .Less_than_128
+
+ ld1.8 {$T0-$T3},[$inp],#64
+ eor $A0,$A0,$T0
+ eor $B0,$B0,$T1
+ eor $C0,$C0,$T2
+ eor $D0,$D0,$T3
+ st1.8 {$A0-$D0},[$out],#64
+ b.eq .Ldone_neon
+ sub $len,$len,#64
+ cmp $len,#64
+ b.lo .Less_than_192
+
+ ld1.8 {$T0-$T3},[$inp],#64
+ eor $A1,$A1,$T0
+ eor $B1,$B1,$T1
+ eor $C1,$C1,$T2
+ eor $D1,$D1,$T3
+ st1.8 {$A1-$D1},[$out],#64
+ b.eq .Ldone_neon
+ sub $len,$len,#64
+
+ st1.8 {$A2-$D2},[sp]
+ b .Last_neon
+
+.Less_than_128:
+ st1.8 {$A0-$D0},[sp]
+ b .Last_neon
+.Less_than_192:
+ st1.8 {$A1-$D1},[sp]
+ b .Last_neon
+
+.align 4
+.Last_neon:
+ sub $out,$out,#1
+ add $inp,$inp,$len
+ add $out,$out,$len
+ add $ctr,sp,$len
+ neg $len,$len
+
+.Loop_tail_neon:
+ ldrb w10,[$inp,$len]
+ ldrb w11,[$ctr,$len]
+ add $len,$len,#1
+ eor w10,w10,w11
+ strb w10,[$out,$len]
+ cbnz $len,.Loop_tail_neon
+
+ stp xzr,xzr,[sp,#0]
+ stp xzr,xzr,[sp,#16]
+ stp xzr,xzr,[sp,#32]
+ stp xzr,xzr,[sp,#48]
+
+.Ldone_neon:
+ ldp x19,x20,[x29,#16]
+ add sp,sp,#64
+ ldp x21,x22,[x29,#32]
+ ldp x23,x24,[x29,#48]
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldp x29,x30,[sp],#96
+ ret
+.size ChaCha20_neon,.-ChaCha20_neon
+___
+{
+my ($T0,$T1,$T2,$T3,$T4,$T5)=@K;
+my ($A0,$B0,$C0,$D0,$A1,$B1,$C1,$D1,$A2,$B2,$C2,$D2,
+ $A3,$B3,$C3,$D3,$A4,$B4,$C4,$D4,$A5,$B5,$C5,$D5) = map("v$_.4s",(0..23));
+
+$code.=<<___;
+#ifdef __APPLE__
+.type ChaCha20_512_neon,%function
+.align 5
+ChaCha20_512_neon:
+ stp x29,x30,[sp,#-96]!
+ add x29,sp,#0
+
+ adr @x[0],.Lsigma
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+ stp x23,x24,[sp,#48]
+ stp x25,x26,[sp,#64]
+ stp x27,x28,[sp,#80]
+
+.L512_or_more_neon:
+ sub sp,sp,#128+64
+
+ ldp @d[0],@d[1],[@x[0]] // load sigma
+ ld1 {@K[0]},[@x[0]],#16
+ ldp @d[2],@d[3],[$key] // load key
+ ldp @d[4],@d[5],[$key,#16]
+ ld1 {@K[1],@K[2]},[$key]
+ ldp @d[6],@d[7],[$ctr] // load counter
+ ld1 {@K[3]},[$ctr]
+ ld1 {$ONE},[@x[0]]
+# ifdef __AARCH64EB__
+ rev64 @K[0],@K[0]
+ ror @d[2],@d[2],#32
+ ror @d[3],@d[3],#32
+ ror @d[4],@d[4],#32
+ ror @d[5],@d[5],#32
+ ror @d[6],@d[6],#32
+ ror @d[7],@d[7],#32
+# endif
+ add @K[3],@K[3],$ONE // += 1
+ stp @K[0],@K[1],[sp,#0] // off-load key block, invariant part
+ add @K[3],@K[3],$ONE // not typo
+ str @K[2],[sp,#32]
+ add @K[4],@K[3],$ONE
+ add @K[5],@K[4],$ONE
+ add @K[6],@K[5],$ONE
+ shl $ONE,$ONE,#2 // 1 -> 4
+
+ stp d8,d9,[sp,#128+0] // meet ABI requirements
+ stp d10,d11,[sp,#128+16]
+ stp d12,d13,[sp,#128+32]
+ stp d14,d15,[sp,#128+48]
+
+ sub $len,$len,#512 // not typo
+
+.Loop_outer_512_neon:
+ mov $A0,@K[0]
+ mov $A1,@K[0]
+ mov $A2,@K[0]
+ mov $A3,@K[0]
+ mov $A4,@K[0]
+ mov $A5,@K[0]
+ mov $B0,@K[1]
+ mov.32 @x[0],@d[0] // unpack key block
+ mov $B1,@K[1]
+ lsr @x[1],@d[0],#32
+ mov $B2,@K[1]
+ mov.32 @x[2],@d[1]
+ mov $B3,@K[1]
+ lsr @x[3],@d[1],#32
+ mov $B4,@K[1]
+ mov.32 @x[4],@d[2]
+ mov $B5,@K[1]
+ lsr @x[5],@d[2],#32
+ mov $D0,@K[3]
+ mov.32 @x[6],@d[3]
+ mov $D1,@K[4]
+ lsr @x[7],@d[3],#32
+ mov $D2,@K[5]
+ mov.32 @x[8],@d[4]
+ mov $D3,@K[6]
+ lsr @x[9],@d[4],#32
+ mov $C0,@K[2]
+ mov.32 @x[10],@d[5]
+ mov $C1,@K[2]
+ lsr @x[11],@d[5],#32
+ add $D4,$D0,$ONE // +4
+ mov.32 @x[12],@d[6]
+ add $D5,$D1,$ONE // +4
+ lsr @x[13],@d[6],#32
+ mov $C2,@K[2]
+ mov.32 @x[14],@d[7]
+ mov $C3,@K[2]
+ lsr @x[15],@d[7],#32
+ mov $C4,@K[2]
+ stp @K[3],@K[4],[sp,#48] // off-load key block, variable part
+ mov $C5,@K[2]
+ str @K[5],[sp,#80]
+
+ mov $ctr,#5
+ subs $len,$len,#512
+.Loop_upper_neon:
+ sub $ctr,$ctr,#1
+___
+ my @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,0);
+ my @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,0);
+ my @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,0);
+ my @thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,0);
+ my @thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,0);
+ my @thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,0);
+ my @thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15));
+ my $diff = ($#thread0+1)*6 - $#thread67 - 1;
+ my $i = 0;
+
+ foreach (@thread0) {
+ eval; eval(shift(@thread67));
+ eval(shift(@thread1)); eval(shift(@thread67));
+ eval(shift(@thread2)); eval(shift(@thread67));
+ eval(shift(@thread3)); eval(shift(@thread67));
+ eval(shift(@thread4)); eval(shift(@thread67));
+ eval(shift(@thread5)); eval(shift(@thread67));
+ }
+
+ @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,1);
+ @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,1);
+ @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,1);
+ @thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,1);
+ @thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,1);
+ @thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,1);
+ @thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15));
+
+ foreach (@thread0) {
+ eval; eval(shift(@thread67));
+ eval(shift(@thread1)); eval(shift(@thread67));
+ eval(shift(@thread2)); eval(shift(@thread67));
+ eval(shift(@thread3)); eval(shift(@thread67));
+ eval(shift(@thread4)); eval(shift(@thread67));
+ eval(shift(@thread5)); eval(shift(@thread67));
+ }
+$code.=<<___;
+ cbnz $ctr,.Loop_upper_neon
+
+ add.32 @x[0],@x[0],@d[0] // accumulate key block
+ add @x[1],@x[1],@d[0],lsr#32
+ add.32 @x[2],@x[2],@d[1]
+ add @x[3],@x[3],@d[1],lsr#32
+ add.32 @x[4],@x[4],@d[2]
+ add @x[5],@x[5],@d[2],lsr#32
+ add.32 @x[6],@x[6],@d[3]
+ add @x[7],@x[7],@d[3],lsr#32
+ add.32 @x[8],@x[8],@d[4]
+ add @x[9],@x[9],@d[4],lsr#32
+ add.32 @x[10],@x[10],@d[5]
+ add @x[11],@x[11],@d[5],lsr#32
+ add.32 @x[12],@x[12],@d[6]
+ add @x[13],@x[13],@d[6],lsr#32
+ add.32 @x[14],@x[14],@d[7]
+ add @x[15],@x[15],@d[7],lsr#32
+
+ add @x[0],@x[0],@x[1],lsl#32 // pack
+ add @x[2],@x[2],@x[3],lsl#32
+ ldp @x[1],@x[3],[$inp,#0] // load input
+ add @x[4],@x[4],@x[5],lsl#32
+ add @x[6],@x[6],@x[7],lsl#32
+ ldp @x[5],@x[7],[$inp,#16]
+ add @x[8],@x[8],@x[9],lsl#32
+ add @x[10],@x[10],@x[11],lsl#32
+ ldp @x[9],@x[11],[$inp,#32]
+ add @x[12],@x[12],@x[13],lsl#32
+ add @x[14],@x[14],@x[15],lsl#32
+ ldp @x[13],@x[15],[$inp,#48]
+ add $inp,$inp,#64
+# ifdef __AARCH64EB__
+ rev @x[0],@x[0]
+ rev @x[2],@x[2]
+ rev @x[4],@x[4]
+ rev @x[6],@x[6]
+ rev @x[8],@x[8]
+ rev @x[10],@x[10]
+ rev @x[12],@x[12]
+ rev @x[14],@x[14]
+# endif
+ eor @x[0],@x[0],@x[1]
+ eor @x[2],@x[2],@x[3]
+ eor @x[4],@x[4],@x[5]
+ eor @x[6],@x[6],@x[7]
+ eor @x[8],@x[8],@x[9]
+ eor @x[10],@x[10],@x[11]
+ eor @x[12],@x[12],@x[13]
+ eor @x[14],@x[14],@x[15]
+
+ stp @x[0],@x[2],[$out,#0] // store output
+ add @d[6],@d[6],#1 // increment counter
+ mov.32 @x[0],@d[0] // unpack key block
+ lsr @x[1],@d[0],#32
+ stp @x[4],@x[6],[$out,#16]
+ mov.32 @x[2],@d[1]
+ lsr @x[3],@d[1],#32
+ stp @x[8],@x[10],[$out,#32]
+ mov.32 @x[4],@d[2]
+ lsr @x[5],@d[2],#32
+ stp @x[12],@x[14],[$out,#48]
+ add $out,$out,#64
+ mov.32 @x[6],@d[3]
+ lsr @x[7],@d[3],#32
+ mov.32 @x[8],@d[4]
+ lsr @x[9],@d[4],#32
+ mov.32 @x[10],@d[5]
+ lsr @x[11],@d[5],#32
+ mov.32 @x[12],@d[6]
+ lsr @x[13],@d[6],#32
+ mov.32 @x[14],@d[7]
+ lsr @x[15],@d[7],#32
+
+ mov $ctr,#5
+.Loop_lower_neon:
+ sub $ctr,$ctr,#1
+___
+ @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,0);
+ @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,0);
+ @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,0);
+ @thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,0);
+ @thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,0);
+ @thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,0);
+ @thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15));
+
+ foreach (@thread0) {
+ eval; eval(shift(@thread67));
+ eval(shift(@thread1)); eval(shift(@thread67));
+ eval(shift(@thread2)); eval(shift(@thread67));
+ eval(shift(@thread3)); eval(shift(@thread67));
+ eval(shift(@thread4)); eval(shift(@thread67));
+ eval(shift(@thread5)); eval(shift(@thread67));
+ }
+
+ @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,1);
+ @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,1);
+ @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,1);
+ @thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,1);
+ @thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,1);
+ @thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,1);
+ @thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15));
+
+ foreach (@thread0) {
+ eval; eval(shift(@thread67));
+ eval(shift(@thread1)); eval(shift(@thread67));
+ eval(shift(@thread2)); eval(shift(@thread67));
+ eval(shift(@thread3)); eval(shift(@thread67));
+ eval(shift(@thread4)); eval(shift(@thread67));
+ eval(shift(@thread5)); eval(shift(@thread67));
+ }
+$code.=<<___;
+ cbnz $ctr,.Loop_lower_neon
+
+ add.32 @x[0],@x[0],@d[0] // accumulate key block
+ ldp @K[0],@K[1],[sp,#0]
+ add @x[1],@x[1],@d[0],lsr#32
+ ldp @K[2],@K[3],[sp,#32]
+ add.32 @x[2],@x[2],@d[1]
+ ldp @K[4],@K[5],[sp,#64]
+ add @x[3],@x[3],@d[1],lsr#32
+ add $A0,$A0,@K[0]
+ add.32 @x[4],@x[4],@d[2]
+ add $A1,$A1,@K[0]
+ add @x[5],@x[5],@d[2],lsr#32
+ add $A2,$A2,@K[0]
+ add.32 @x[6],@x[6],@d[3]
+ add $A3,$A3,@K[0]
+ add @x[7],@x[7],@d[3],lsr#32
+ add $A4,$A4,@K[0]
+ add.32 @x[8],@x[8],@d[4]
+ add $A5,$A5,@K[0]
+ add @x[9],@x[9],@d[4],lsr#32
+ add $C0,$C0,@K[2]
+ add.32 @x[10],@x[10],@d[5]
+ add $C1,$C1,@K[2]
+ add @x[11],@x[11],@d[5],lsr#32
+ add $C2,$C2,@K[2]
+ add.32 @x[12],@x[12],@d[6]
+ add $C3,$C3,@K[2]
+ add @x[13],@x[13],@d[6],lsr#32
+ add $C4,$C4,@K[2]
+ add.32 @x[14],@x[14],@d[7]
+ add $C5,$C5,@K[2]
+ add @x[15],@x[15],@d[7],lsr#32
+ add $D4,$D4,$ONE // +4
+ add @x[0],@x[0],@x[1],lsl#32 // pack
+ add $D5,$D5,$ONE // +4
+ add @x[2],@x[2],@x[3],lsl#32
+ add $D0,$D0,@K[3]
+ ldp @x[1],@x[3],[$inp,#0] // load input
+ add $D1,$D1,@K[4]
+ add @x[4],@x[4],@x[5],lsl#32
+ add $D2,$D2,@K[5]
+ add @x[6],@x[6],@x[7],lsl#32
+ add $D3,$D3,@K[6]
+ ldp @x[5],@x[7],[$inp,#16]
+ add $D4,$D4,@K[3]
+ add @x[8],@x[8],@x[9],lsl#32
+ add $D5,$D5,@K[4]
+ add @x[10],@x[10],@x[11],lsl#32
+ add $B0,$B0,@K[1]
+ ldp @x[9],@x[11],[$inp,#32]
+ add $B1,$B1,@K[1]
+ add @x[12],@x[12],@x[13],lsl#32
+ add $B2,$B2,@K[1]
+ add @x[14],@x[14],@x[15],lsl#32
+ add $B3,$B3,@K[1]
+ ldp @x[13],@x[15],[$inp,#48]
+ add $B4,$B4,@K[1]
+ add $inp,$inp,#64
+ add $B5,$B5,@K[1]
+
+# ifdef __AARCH64EB__
+ rev @x[0],@x[0]
+ rev @x[2],@x[2]
+ rev @x[4],@x[4]
+ rev @x[6],@x[6]
+ rev @x[8],@x[8]
+ rev @x[10],@x[10]
+ rev @x[12],@x[12]
+ rev @x[14],@x[14]
+# endif
+ ld1.8 {$T0-$T3},[$inp],#64
+ eor @x[0],@x[0],@x[1]
+ eor @x[2],@x[2],@x[3]
+ eor @x[4],@x[4],@x[5]
+ eor @x[6],@x[6],@x[7]
+ eor @x[8],@x[8],@x[9]
+ eor $A0,$A0,$T0
+ eor @x[10],@x[10],@x[11]
+ eor $B0,$B0,$T1
+ eor @x[12],@x[12],@x[13]
+ eor $C0,$C0,$T2
+ eor @x[14],@x[14],@x[15]
+ eor $D0,$D0,$T3
+ ld1.8 {$T0-$T3},[$inp],#64
+
+ stp @x[0],@x[2],[$out,#0] // store output
+ add @d[6],@d[6],#7 // increment counter
+ stp @x[4],@x[6],[$out,#16]
+ stp @x[8],@x[10],[$out,#32]
+ stp @x[12],@x[14],[$out,#48]
+ add $out,$out,#64
+ st1.8 {$A0-$D0},[$out],#64
+
+ ld1.8 {$A0-$D0},[$inp],#64
+ eor $A1,$A1,$T0
+ eor $B1,$B1,$T1
+ eor $C1,$C1,$T2
+ eor $D1,$D1,$T3
+ st1.8 {$A1-$D1},[$out],#64
+
+ ld1.8 {$A1-$D1},[$inp],#64
+ eor $A2,$A2,$A0
+ ldp @K[0],@K[1],[sp,#0]
+ eor $B2,$B2,$B0
+ ldp @K[2],@K[3],[sp,#32]
+ eor $C2,$C2,$C0
+ eor $D2,$D2,$D0
+ st1.8 {$A2-$D2},[$out],#64
+
+ ld1.8 {$A2-$D2},[$inp],#64
+ eor $A3,$A3,$A1
+ eor $B3,$B3,$B1
+ eor $C3,$C3,$C1
+ eor $D3,$D3,$D1
+ st1.8 {$A3-$D3},[$out],#64
+
+ ld1.8 {$A3-$D3},[$inp],#64
+ eor $A4,$A4,$A2
+ eor $B4,$B4,$B2
+ eor $C4,$C4,$C2
+ eor $D4,$D4,$D2
+ st1.8 {$A4-$D4},[$out],#64
+
+ shl $A0,$ONE,#1 // 4 -> 8
+ eor $A5,$A5,$A3
+ eor $B5,$B5,$B3
+ eor $C5,$C5,$C3
+ eor $D5,$D5,$D3
+ st1.8 {$A5-$D5},[$out],#64
+
+ add @K[3],@K[3],$A0 // += 8
+ add @K[4],@K[4],$A0
+ add @K[5],@K[5],$A0
+ add @K[6],@K[6],$A0
+
+ b.hs .Loop_outer_512_neon
+
+ adds $len,$len,#512
+ ushr $A0,$ONE,#2 // 4 -> 1
+
+ ldp d8,d9,[sp,#128+0] // meet ABI requirements
+ ldp d10,d11,[sp,#128+16]
+ ldp d12,d13,[sp,#128+32]
+ ldp d14,d15,[sp,#128+48]
+
+ stp @K[0],$ONE,[sp,#0] // wipe off-load area
+ stp @K[0],$ONE,[sp,#32]
+ stp @K[0],$ONE,[sp,#64]
+
+ b.eq .Ldone_512_neon
+
+ cmp $len,#192
+ sub @K[3],@K[3],$A0 // -= 1
+ sub @K[4],@K[4],$A0
+ sub @K[5],@K[5],$A0
+ add sp,sp,#128
+ b.hs .Loop_outer_neon
+
+ eor @K[1],@K[1],@K[1]
+ eor @K[2],@K[2],@K[2]
+ eor @K[3],@K[3],@K[3]
+ eor @K[4],@K[4],@K[4]
+ eor @K[5],@K[5],@K[5]
+ eor @K[6],@K[6],@K[6]
+ b .Loop_outer
+
+.Ldone_512_neon:
+ ldp x19,x20,[x29,#16]
+ add sp,sp,#128+64
+ ldp x21,x22,[x29,#32]
+ ldp x23,x24,[x29,#48]
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldp x29,x30,[sp],#96
+ ret
+.size ChaCha20_512_neon,.-ChaCha20_512_neon
+#endif
+___
+}
+}}}
+
+open SELF,$0;
+while(<SELF>) {
+ next if (/^#!/);
+ last if (!s/^#/\/\// and !/^$/);
+ print;
+}
+close SELF;
+
+foreach (split("\n",$code)) {
+ s/\`([^\`]*)\`/eval $1/geo;
+
+ (s/\b([a-z]+)\.32\b/$1/ and (s/x([0-9]+)/w$1/g or 1)) or
+ (m/\b(eor|ext|mov)\b/ and (s/\.4s/\.16b/g or 1)) or
+ (s/\b((?:ld|st)1)\.8\b/$1/ and (s/\.4s/\.16b/g or 1)) or
+ (m/\b(ld|st)[rp]\b/ and (s/v([0-9]+)\.4s/q$1/g or 1)) or
+ (s/\brev32\.16\b/rev32/ and (s/\.4s/\.8h/g or 1));
+
+ #s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo;
+
+ print $_,"\n";
+}
+close STDOUT; # flush
diff --git a/src/crypto/zinc/chacha20/chacha20-unrolled-arm.S b/src/crypto/zinc/chacha20/chacha20-unrolled-arm.S
new file mode 100644
index 0000000..2140319
--- /dev/null
+++ b/src/crypto/zinc/chacha20/chacha20-unrolled-arm.S
@@ -0,0 +1,461 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2018 Google, Inc.
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+/*
+ * Design notes:
+ *
+ * 16 registers would be needed to hold the state matrix, but only 14 are
+ * available because 'sp' and 'pc' cannot be used. So we spill the elements
+ * (x8, x9) to the stack and swap them out with (x10, x11). This adds one
+ * 'ldrd' and one 'strd' instruction per round.
+ *
+ * All rotates are performed using the implicit rotate operand accepted by the
+ * 'add' and 'eor' instructions. This is faster than using explicit rotate
+ * instructions. To make this work, we allow the values in the second and last
+ * rows of the ChaCha state matrix (rows 'b' and 'd') to temporarily have the
+ * wrong rotation amount. The rotation amount is then fixed up just in time
+ * when the values are used. 'brot' is the number of bits the values in row 'b'
+ * need to be rotated right to arrive at the correct values, and 'drot'
+ * similarly for row 'd'. (brot, drot) start out as (0, 0) but we make it such
+ * that they end up as (25, 24) after every round.
+ */
+
+ // ChaCha state registers
+ X0 .req r0
+ X1 .req r1
+ X2 .req r2
+ X3 .req r3
+ X4 .req r4
+ X5 .req r5
+ X6 .req r6
+ X7 .req r7
+ X8_X10 .req r8 // shared by x8 and x10
+ X9_X11 .req r9 // shared by x9 and x11
+ X12 .req r10
+ X13 .req r11
+ X14 .req r12
+ X15 .req r14
+
+.Lexpand_32byte_k:
+ // "expand 32-byte k"
+ .word 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574
+
+#ifdef __thumb2__
+# define adrl adr
+#endif
+
+.macro __rev out, in, t0, t1, t2
+.if __LINUX_ARM_ARCH__ >= 6
+ rev \out, \in
+.else
+ lsl \t0, \in, #24
+ and \t1, \in, #0xff00
+ and \t2, \in, #0xff0000
+ orr \out, \t0, \in, lsr #24
+ orr \out, \out, \t1, lsl #8
+ orr \out, \out, \t2, lsr #8
+.endif
+.endm
+
+.macro _le32_bswap x, t0, t1, t2
+#ifdef __ARMEB__
+ __rev \x, \x, \t0, \t1, \t2
+#endif
+.endm
+
+.macro _le32_bswap_4x a, b, c, d, t0, t1, t2
+ _le32_bswap \a, \t0, \t1, \t2
+ _le32_bswap \b, \t0, \t1, \t2
+ _le32_bswap \c, \t0, \t1, \t2
+ _le32_bswap \d, \t0, \t1, \t2
+.endm
+
+.macro __ldrd a, b, src, offset
+#if __LINUX_ARM_ARCH__ >= 6
+ ldrd \a, \b, [\src, #\offset]
+#else
+ ldr \a, [\src, #\offset]
+ ldr \b, [\src, #\offset + 4]
+#endif
+.endm
+
+.macro __strd a, b, dst, offset
+#if __LINUX_ARM_ARCH__ >= 6
+ strd \a, \b, [\dst, #\offset]
+#else
+ str \a, [\dst, #\offset]
+ str \b, [\dst, #\offset + 4]
+#endif
+.endm
+
+.macro _halfround a1, b1, c1, d1, a2, b2, c2, d2
+
+ // a += b; d ^= a; d = rol(d, 16);
+ add \a1, \a1, \b1, ror #brot
+ add \a2, \a2, \b2, ror #brot
+ eor \d1, \a1, \d1, ror #drot
+ eor \d2, \a2, \d2, ror #drot
+ // drot == 32 - 16 == 16
+
+ // c += d; b ^= c; b = rol(b, 12);
+ add \c1, \c1, \d1, ror #16
+ add \c2, \c2, \d2, ror #16
+ eor \b1, \c1, \b1, ror #brot
+ eor \b2, \c2, \b2, ror #brot
+ // brot == 32 - 12 == 20
+
+ // a += b; d ^= a; d = rol(d, 8);
+ add \a1, \a1, \b1, ror #20
+ add \a2, \a2, \b2, ror #20
+ eor \d1, \a1, \d1, ror #16
+ eor \d2, \a2, \d2, ror #16
+ // drot == 32 - 8 == 24
+
+ // c += d; b ^= c; b = rol(b, 7);
+ add \c1, \c1, \d1, ror #24
+ add \c2, \c2, \d2, ror #24
+ eor \b1, \c1, \b1, ror #20
+ eor \b2, \c2, \b2, ror #20
+ // brot == 32 - 7 == 25
+.endm
+
+.macro _doubleround
+
+ // column round
+
+ // quarterrounds: (x0, x4, x8, x12) and (x1, x5, x9, x13)
+ _halfround X0, X4, X8_X10, X12, X1, X5, X9_X11, X13
+
+ // save (x8, x9); restore (x10, x11)
+ __strd X8_X10, X9_X11, sp, 0
+ __ldrd X8_X10, X9_X11, sp, 8
+
+ // quarterrounds: (x2, x6, x10, x14) and (x3, x7, x11, x15)
+ _halfround X2, X6, X8_X10, X14, X3, X7, X9_X11, X15
+
+ .set brot, 25
+ .set drot, 24
+
+ // diagonal round
+
+ // quarterrounds: (x0, x5, x10, x15) and (x1, x6, x11, x12)
+ _halfround X0, X5, X8_X10, X15, X1, X6, X9_X11, X12
+
+ // save (x10, x11); restore (x8, x9)
+ __strd X8_X10, X9_X11, sp, 8
+ __ldrd X8_X10, X9_X11, sp, 0
+
+ // quarterrounds: (x2, x7, x8, x13) and (x3, x4, x9, x14)
+ _halfround X2, X7, X8_X10, X13, X3, X4, X9_X11, X14
+.endm
+
+.macro _chacha_permute nrounds
+ .set brot, 0
+ .set drot, 0
+ .rept \nrounds / 2
+ _doubleround
+ .endr
+.endm
+
+.macro _chacha nrounds
+
+.Lnext_block\@:
+ // Stack: unused0-unused1 x10-x11 x0-x15 OUT IN LEN
+ // Registers contain x0-x9,x12-x15.
+
+ // Do the core ChaCha permutation to update x0-x15.
+ _chacha_permute \nrounds
+
+ add sp, #8
+ // Stack: x10-x11 orig_x0-orig_x15 OUT IN LEN
+ // Registers contain x0-x9,x12-x15.
+ // x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.
+
+ // Free up some registers (r8-r12,r14) by pushing (x8-x9,x12-x15).
+ push {X8_X10, X9_X11, X12, X13, X14, X15}
+
+ // Load (OUT, IN, LEN).
+ ldr r14, [sp, #96]
+ ldr r12, [sp, #100]
+ ldr r11, [sp, #104]
+
+ orr r10, r14, r12
+
+ // Use slow path if fewer than 64 bytes remain.
+ cmp r11, #64
+ blt .Lxor_slowpath\@
+
+ // Use slow path if IN and/or OUT isn't 4-byte aligned. Needed even on
+ // ARMv6+, since ldmia and stmia (used below) still require alignment.
+ tst r10, #3
+ bne .Lxor_slowpath\@
+
+ // Fast path: XOR 64 bytes of aligned data.
+
+ // Stack: x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN
+ // Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is OUT.
+ // x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.
+
+ // x0-x3
+ __ldrd r8, r9, sp, 32
+ __ldrd r10, r11, sp, 40
+ add X0, X0, r8
+ add X1, X1, r9
+ add X2, X2, r10
+ add X3, X3, r11
+ _le32_bswap_4x X0, X1, X2, X3, r8, r9, r10
+ ldmia r12!, {r8-r11}
+ eor X0, X0, r8
+ eor X1, X1, r9
+ eor X2, X2, r10
+ eor X3, X3, r11
+ stmia r14!, {X0-X3}
+
+ // x4-x7
+ __ldrd r8, r9, sp, 48
+ __ldrd r10, r11, sp, 56
+ add X4, r8, X4, ror #brot
+ add X5, r9, X5, ror #brot
+ ldmia r12!, {X0-X3}
+ add X6, r10, X6, ror #brot
+ add X7, r11, X7, ror #brot
+ _le32_bswap_4x X4, X5, X6, X7, r8, r9, r10
+ eor X4, X4, X0
+ eor X5, X5, X1
+ eor X6, X6, X2
+ eor X7, X7, X3
+ stmia r14!, {X4-X7}
+
+ // x8-x15
+ pop {r0-r7} // (x8-x9,x12-x15,x10-x11)
+ __ldrd r8, r9, sp, 32
+ __ldrd r10, r11, sp, 40
+ add r0, r0, r8 // x8
+ add r1, r1, r9 // x9
+ add r6, r6, r10 // x10
+ add r7, r7, r11 // x11
+ _le32_bswap_4x r0, r1, r6, r7, r8, r9, r10
+ ldmia r12!, {r8-r11}
+ eor r0, r0, r8 // x8
+ eor r1, r1, r9 // x9
+ eor r6, r6, r10 // x10
+ eor r7, r7, r11 // x11
+ stmia r14!, {r0,r1,r6,r7}
+ ldmia r12!, {r0,r1,r6,r7}
+ __ldrd r8, r9, sp, 48
+ __ldrd r10, r11, sp, 56
+ add r2, r8, r2, ror #drot // x12
+ add r3, r9, r3, ror #drot // x13
+ add r4, r10, r4, ror #drot // x14
+ add r5, r11, r5, ror #drot // x15
+ _le32_bswap_4x r2, r3, r4, r5, r9, r10, r11
+ ldr r9, [sp, #72] // load LEN
+ eor r2, r2, r0 // x12
+ eor r3, r3, r1 // x13
+ eor r4, r4, r6 // x14
+ eor r5, r5, r7 // x15
+ subs r9, #64 // decrement and check LEN
+ stmia r14!, {r2-r5}
+
+ beq .Ldone\@
+
+.Lprepare_for_next_block\@:
+
+ // Stack: x0-x15 OUT IN LEN
+
+ // Increment block counter (x12)
+ add r8, #1
+
+ // Store updated (OUT, IN, LEN)
+ str r14, [sp, #64]
+ str r12, [sp, #68]
+ str r9, [sp, #72]
+
+ mov r14, sp
+
+ // Store updated block counter (x12)
+ str r8, [sp, #48]
+
+ sub sp, #16
+
+ // Reload state and do next block
+ ldmia r14!, {r0-r11} // load x0-x11
+ __strd r10, r11, sp, 8 // store x10-x11 before state
+ ldmia r14, {r10-r12,r14} // load x12-x15
+ b .Lnext_block\@
+
+.Lxor_slowpath\@:
+ // Slow path: < 64 bytes remaining, or unaligned input or output buffer.
+ // We handle it by storing the 64 bytes of keystream to the stack, then
+ // XOR-ing the needed portion with the data.
+
+ // Allocate keystream buffer
+ sub sp, #64
+ mov r14, sp
+
+ // Stack: ks0-ks15 x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN
+ // Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is &ks0.
+ // x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.
+
+ // Save keystream for x0-x3
+ __ldrd r8, r9, sp, 96
+ __ldrd r10, r11, sp, 104
+ add X0, X0, r8
+ add X1, X1, r9
+ add X2, X2, r10
+ add X3, X3, r11
+ _le32_bswap_4x X0, X1, X2, X3, r8, r9, r10
+ stmia r14!, {X0-X3}
+
+ // Save keystream for x4-x7
+ __ldrd r8, r9, sp, 112
+ __ldrd r10, r11, sp, 120
+ add X4, r8, X4, ror #brot
+ add X5, r9, X5, ror #brot
+ add X6, r10, X6, ror #brot
+ add X7, r11, X7, ror #brot
+ _le32_bswap_4x X4, X5, X6, X7, r8, r9, r10
+ add r8, sp, #64
+ stmia r14!, {X4-X7}
+
+ // Save keystream for x8-x15
+ ldm r8, {r0-r7} // (x8-x9,x12-x15,x10-x11)
+ __ldrd r8, r9, sp, 128
+ __ldrd r10, r11, sp, 136
+ add r0, r0, r8 // x8
+ add r1, r1, r9 // x9
+ add r6, r6, r10 // x10
+ add r7, r7, r11 // x11
+ _le32_bswap_4x r0, r1, r6, r7, r8, r9, r10
+ stmia r14!, {r0,r1,r6,r7}
+ __ldrd r8, r9, sp, 144
+ __ldrd r10, r11, sp, 152
+ add r2, r8, r2, ror #drot // x12
+ add r3, r9, r3, ror #drot // x13
+ add r4, r10, r4, ror #drot // x14
+ add r5, r11, r5, ror #drot // x15
+ _le32_bswap_4x r2, r3, r4, r5, r9, r10, r11
+ stmia r14, {r2-r5}
+
+ // Stack: ks0-ks15 unused0-unused7 x0-x15 OUT IN LEN
+ // Registers: r8 is block counter, r12 is IN.
+
+ ldr r9, [sp, #168] // LEN
+ ldr r14, [sp, #160] // OUT
+ cmp r9, #64
+ mov r0, sp
+ movle r1, r9
+ movgt r1, #64
+ // r1 is number of bytes to XOR, in range [1, 64]
+
+.if __LINUX_ARM_ARCH__ < 6
+ orr r2, r12, r14
+ tst r2, #3 // IN or OUT misaligned?
+ bne .Lxor_next_byte\@
+.endif
+
+ // XOR a word at a time
+.rept 16
+ subs r1, #4
+ blt .Lxor_words_done\@
+ ldr r2, [r12], #4
+ ldr r3, [r0], #4
+ eor r2, r2, r3
+ str r2, [r14], #4
+.endr
+ b .Lxor_slowpath_done\@
+.Lxor_words_done\@:
+ ands r1, r1, #3
+ beq .Lxor_slowpath_done\@
+
+ // XOR a byte at a time
+.Lxor_next_byte\@:
+ ldrb r2, [r12], #1
+ ldrb r3, [r0], #1
+ eor r2, r2, r3
+ strb r2, [r14], #1
+ subs r1, #1
+ bne .Lxor_next_byte\@
+
+.Lxor_slowpath_done\@:
+ subs r9, #64
+ add sp, #96
+ bgt .Lprepare_for_next_block\@
+
+.Ldone\@:
+.endm // _chacha
+
+/*
+ * void chacha20_arm(u8 *out, const u8 *in, size_t len, const u32 key[8],
+ * const u32 iv[4]);
+ */
+ENTRY(chacha20_arm)
+ cmp r2, #0 // len == 0?
+ reteq lr
+
+ push {r0-r2,r4-r11,lr}
+
+ // Push state x0-x15 onto stack.
+ // Also store an extra copy of x10-x11 just before the state.
+
+ ldr r4, [sp, #48] // iv
+ mov r0, sp
+ sub sp, #80
+
+ // iv: x12-x15
+ ldm r4, {X12,X13,X14,X15}
+ stmdb r0!, {X12,X13,X14,X15}
+
+ // key: x4-x11
+ __ldrd X8_X10, X9_X11, r3, 24
+ __strd X8_X10, X9_X11, sp, 8
+ stmdb r0!, {X8_X10, X9_X11}
+ ldm r3, {X4-X9_X11}
+ stmdb r0!, {X4-X9_X11}
+
+ // constants: x0-x3
+ adrl X3, .Lexpand_32byte_k
+ ldm X3, {X0-X3}
+ __strd X0, X1, sp, 16
+ __strd X2, X3, sp, 24
+
+ _chacha 20
+
+ add sp, #76
+ pop {r4-r11, pc}
+ENDPROC(chacha20_arm)
+
+/*
+ * void hchacha20_arm(const u32 state[16], u32 out[8]);
+ */
+ENTRY(hchacha20_arm)
+ push {r1,r4-r11,lr}
+
+ mov r14, r0
+ ldmia r14!, {r0-r11} // load x0-x11
+ push {r10-r11} // store x10-x11 to stack
+ ldm r14, {r10-r12,r14} // load x12-x15
+ sub sp, #8
+
+ _chacha_permute 20
+
+ // Skip over (unused0-unused1, x10-x11)
+ add sp, #16
+
+ // Fix up rotations of x12-x15
+ ror X12, X12, #drot
+ ror X13, X13, #drot
+ pop {r4} // load 'out'
+ ror X14, X14, #drot
+ ror X15, X15, #drot
+
+ // Store (x0-x3,x12-x15) to 'out'
+ stm r4, {X0,X1,X2,X3,X12,X13,X14,X15}
+
+ pop {r4-r11,pc}
+ENDPROC(hchacha20_arm)
diff --git a/src/crypto/zinc/poly1305/poly1305-arm.S b/src/crypto/zinc/poly1305/poly1305-arm.S
deleted file mode 100644
index 4a0e9d4..0000000
--- a/src/crypto/zinc/poly1305/poly1305-arm.S
+++ /dev/null
@@ -1,1117 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */
-/*
- * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
- * Copyright (C) 2006-2017 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved.
- *
- * This is based in part on Andy Polyakov's implementation from CRYPTOGAMS.
- */
-
-#include <linux/linkage.h>
-
-.text
-#if defined(__thumb2__)
-.syntax unified
-.thumb
-#else
-.code 32
-#endif
-
-.align 5
-ENTRY(poly1305_init_arm)
- stmdb sp!,{r4-r11}
-
- eor r3,r3,r3
- cmp r1,#0
- str r3,[r0,#0] @ zero hash value
- str r3,[r0,#4]
- str r3,[r0,#8]
- str r3,[r0,#12]
- str r3,[r0,#16]
- str r3,[r0,#36] @ is_base2_26
- add r0,r0,#20
-
-#ifdef __thumb2__
- it eq
-#endif
- moveq r0,#0
- beq .Lno_key
-
- ldrb r4,[r1,#0]
- mov r10,#0x0fffffff
- ldrb r5,[r1,#1]
- and r3,r10,#-4 @ 0x0ffffffc
- ldrb r6,[r1,#2]
- ldrb r7,[r1,#3]
- orr r4,r4,r5,lsl#8
- ldrb r5,[r1,#4]
- orr r4,r4,r6,lsl#16
- ldrb r6,[r1,#5]
- orr r4,r4,r7,lsl#24
- ldrb r7,[r1,#6]
- and r4,r4,r10
-
- ldrb r8,[r1,#7]
- orr r5,r5,r6,lsl#8
- ldrb r6,[r1,#8]
- orr r5,r5,r7,lsl#16
- ldrb r7,[r1,#9]
- orr r5,r5,r8,lsl#24
- ldrb r8,[r1,#10]
- and r5,r5,r3
-
- ldrb r9,[r1,#11]
- orr r6,r6,r7,lsl#8
- ldrb r7,[r1,#12]
- orr r6,r6,r8,lsl#16
- ldrb r8,[r1,#13]
- orr r6,r6,r9,lsl#24
- ldrb r9,[r1,#14]
- and r6,r6,r3
-
- ldrb r10,[r1,#15]
- orr r7,r7,r8,lsl#8
- str r4,[r0,#0]
- orr r7,r7,r9,lsl#16
- str r5,[r0,#4]
- orr r7,r7,r10,lsl#24
- str r6,[r0,#8]
- and r7,r7,r3
- str r7,[r0,#12]
-.Lno_key:
- ldmia sp!,{r4-r11}
-#if __LINUX_ARM_ARCH__ >= 5
- bx lr @ bx lr
-#else
- tst lr,#1
- moveq pc,lr @ be binary compatible with V4, yet
- .word 0xe12fff1e @ interoperable with Thumb ISA:-)
-#endif
-ENDPROC(poly1305_init_arm)
-
-.align 5
-ENTRY(poly1305_blocks_arm)
-.Lpoly1305_blocks_arm:
- stmdb sp!,{r3-r11,lr}
-
- ands r2,r2,#-16
- beq .Lno_data
-
- cmp r3,#0
- add r2,r2,r1 @ end pointer
- sub sp,sp,#32
-
- ldmia r0,{r4-r12} @ load context
-
- str r0,[sp,#12] @ offload stuff
- mov lr,r1
- str r2,[sp,#16]
- str r10,[sp,#20]
- str r11,[sp,#24]
- str r12,[sp,#28]
- b .Loop
-
-.Loop:
-#if __LINUX_ARM_ARCH__ < 7
- ldrb r0,[lr],#16 @ load input
-#ifdef __thumb2__
- it hi
-#endif
- addhi r8,r8,#1 @ 1<<128
- ldrb r1,[lr,#-15]
- ldrb r2,[lr,#-14]
- ldrb r3,[lr,#-13]
- orr r1,r0,r1,lsl#8
- ldrb r0,[lr,#-12]
- orr r2,r1,r2,lsl#16
- ldrb r1,[lr,#-11]
- orr r3,r2,r3,lsl#24
- ldrb r2,[lr,#-10]
- adds r4,r4,r3 @ accumulate input
-
- ldrb r3,[lr,#-9]
- orr r1,r0,r1,lsl#8
- ldrb r0,[lr,#-8]
- orr r2,r1,r2,lsl#16
- ldrb r1,[lr,#-7]
- orr r3,r2,r3,lsl#24
- ldrb r2,[lr,#-6]
- adcs r5,r5,r3
-
- ldrb r3,[lr,#-5]
- orr r1,r0,r1,lsl#8
- ldrb r0,[lr,#-4]
- orr r2,r1,r2,lsl#16
- ldrb r1,[lr,#-3]
- orr r3,r2,r3,lsl#24
- ldrb r2,[lr,#-2]
- adcs r6,r6,r3
-
- ldrb r3,[lr,#-1]
- orr r1,r0,r1,lsl#8
- str lr,[sp,#8] @ offload input pointer
- orr r2,r1,r2,lsl#16
- add r10,r10,r10,lsr#2
- orr r3,r2,r3,lsl#24
-#else
- ldr r0,[lr],#16 @ load input
-#ifdef __thumb2__
- it hi
-#endif
- addhi r8,r8,#1 @ padbit
- ldr r1,[lr,#-12]
- ldr r2,[lr,#-8]
- ldr r3,[lr,#-4]
-#ifdef __ARMEB__
- rev r0,r0
- rev r1,r1
- rev r2,r2
- rev r3,r3
-#endif
- adds r4,r4,r0 @ accumulate input
- str lr,[sp,#8] @ offload input pointer
- adcs r5,r5,r1
- add r10,r10,r10,lsr#2
- adcs r6,r6,r2
-#endif
- add r11,r11,r11,lsr#2
- adcs r7,r7,r3
- add r12,r12,r12,lsr#2
-
- umull r2,r3,r5,r9
- adc r8,r8,#0
- umull r0,r1,r4,r9
- umlal r2,r3,r8,r10
- umlal r0,r1,r7,r10
- ldr r10,[sp,#20] @ reload r10
- umlal r2,r3,r6,r12
- umlal r0,r1,r5,r12
- umlal r2,r3,r7,r11
- umlal r0,r1,r6,r11
- umlal r2,r3,r4,r10
- str r0,[sp,#0] @ future r4
- mul r0,r11,r8
- ldr r11,[sp,#24] @ reload r11
- adds r2,r2,r1 @ d1+=d0>>32
- eor r1,r1,r1
- adc lr,r3,#0 @ future r6
- str r2,[sp,#4] @ future r5
-
- mul r2,r12,r8
- eor r3,r3,r3
- umlal r0,r1,r7,r12
- ldr r12,[sp,#28] @ reload r12
- umlal r2,r3,r7,r9
- umlal r0,r1,r6,r9
- umlal r2,r3,r6,r10
- umlal r0,r1,r5,r10
- umlal r2,r3,r5,r11
- umlal r0,r1,r4,r11
- umlal r2,r3,r4,r12
- ldr r4,[sp,#0]
- mul r8,r9,r8
- ldr r5,[sp,#4]
-
- adds r6,lr,r0 @ d2+=d1>>32
- ldr lr,[sp,#8] @ reload input pointer
- adc r1,r1,#0
- adds r7,r2,r1 @ d3+=d2>>32
- ldr r0,[sp,#16] @ reload end pointer
- adc r3,r3,#0
- add r8,r8,r3 @ h4+=d3>>32
-
- and r1,r8,#-4
- and r8,r8,#3
- add r1,r1,r1,lsr#2 @ *=5
- adds r4,r4,r1
- adcs r5,r5,#0
- adcs r6,r6,#0
- adcs r7,r7,#0
- adc r8,r8,#0
-
- cmp r0,lr @ done yet?
- bhi .Loop
-
- ldr r0,[sp,#12]
- add sp,sp,#32
- stmia r0,{r4-r8} @ store the result
-
-.Lno_data:
-#if __LINUX_ARM_ARCH__ >= 5
- ldmia sp!,{r3-r11,pc}
-#else
- ldmia sp!,{r3-r11,lr}
- tst lr,#1
- moveq pc,lr @ be binary compatible with V4, yet
- .word 0xe12fff1e @ interoperable with Thumb ISA:-)
-#endif
-ENDPROC(poly1305_blocks_arm)
-
-.align 5
-ENTRY(poly1305_emit_arm)
- stmdb sp!,{r4-r11}
-.Lpoly1305_emit_enter:
- ldmia r0,{r3-r7}
- adds r8,r3,#5 @ compare to modulus
- adcs r9,r4,#0
- adcs r10,r5,#0
- adcs r11,r6,#0
- adc r7,r7,#0
- tst r7,#4 @ did it carry/borrow?
-
-#ifdef __thumb2__
- it ne
-#endif
- movne r3,r8
- ldr r8,[r2,#0]
-#ifdef __thumb2__
- it ne
-#endif
- movne r4,r9
- ldr r9,[r2,#4]
-#ifdef __thumb2__
- it ne
-#endif
- movne r5,r10
- ldr r10,[r2,#8]
-#ifdef __thumb2__
- it ne
-#endif
- movne r6,r11
- ldr r11,[r2,#12]
-
- adds r3,r3,r8
- adcs r4,r4,r9
- adcs r5,r5,r10
- adc r6,r6,r11
-
-#if __LINUX_ARM_ARCH__ >= 7
-#ifdef __ARMEB__
- rev r3,r3
- rev r4,r4
- rev r5,r5
- rev r6,r6
-#endif
- str r3,[r1,#0]
- str r4,[r1,#4]
- str r5,[r1,#8]
- str r6,[r1,#12]
-#else
- strb r3,[r1,#0]
- mov r3,r3,lsr#8
- strb r4,[r1,#4]
- mov r4,r4,lsr#8
- strb r5,[r1,#8]
- mov r5,r5,lsr#8
- strb r6,[r1,#12]
- mov r6,r6,lsr#8
-
- strb r3,[r1,#1]
- mov r3,r3,lsr#8
- strb r4,[r1,#5]
- mov r4,r4,lsr#8
- strb r5,[r1,#9]
- mov r5,r5,lsr#8
- strb r6,[r1,#13]
- mov r6,r6,lsr#8
-
- strb r3,[r1,#2]
- mov r3,r3,lsr#8
- strb r4,[r1,#6]
- mov r4,r4,lsr#8
- strb r5,[r1,#10]
- mov r5,r5,lsr#8
- strb r6,[r1,#14]
- mov r6,r6,lsr#8
-
- strb r3,[r1,#3]
- strb r4,[r1,#7]
- strb r5,[r1,#11]
- strb r6,[r1,#15]
-#endif
- ldmia sp!,{r4-r11}
-#if __LINUX_ARM_ARCH__ >= 5
- bx lr @ bx lr
-#else
- tst lr,#1
- moveq pc,lr @ be binary compatible with V4, yet
- .word 0xe12fff1e @ interoperable with Thumb ISA:-)
-#endif
-ENDPROC(poly1305_emit_arm)
-
-
-#ifdef CONFIG_KERNEL_MODE_NEON
-.fpu neon
-
-.align 5
-ENTRY(poly1305_init_neon)
-.Lpoly1305_init_neon:
- ldr r4,[r0,#20] @ load key base 2^32
- ldr r5,[r0,#24]
- ldr r6,[r0,#28]
- ldr r7,[r0,#32]
-
- and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26
- mov r3,r4,lsr#26
- mov r4,r5,lsr#20
- orr r3,r3,r5,lsl#6
- mov r5,r6,lsr#14
- orr r4,r4,r6,lsl#12
- mov r6,r7,lsr#8
- orr r5,r5,r7,lsl#18
- and r3,r3,#0x03ffffff
- and r4,r4,#0x03ffffff
- and r5,r5,#0x03ffffff
-
- vdup.32 d0,r2 @ r^1 in both lanes
- add r2,r3,r3,lsl#2 @ *5
- vdup.32 d1,r3
- add r3,r4,r4,lsl#2
- vdup.32 d2,r2
- vdup.32 d3,r4
- add r4,r5,r5,lsl#2
- vdup.32 d4,r3
- vdup.32 d5,r5
- add r5,r6,r6,lsl#2
- vdup.32 d6,r4
- vdup.32 d7,r6
- vdup.32 d8,r5
-
- mov r5,#2 @ counter
-
-.Lsquare_neon:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
- @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
- @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
- @ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
- @ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
-
- vmull.u32 q5,d0,d0[1]
- vmull.u32 q6,d1,d0[1]
- vmull.u32 q7,d3,d0[1]
- vmull.u32 q8,d5,d0[1]
- vmull.u32 q9,d7,d0[1]
-
- vmlal.u32 q5,d7,d2[1]
- vmlal.u32 q6,d0,d1[1]
- vmlal.u32 q7,d1,d1[1]
- vmlal.u32 q8,d3,d1[1]
- vmlal.u32 q9,d5,d1[1]
-
- vmlal.u32 q5,d5,d4[1]
- vmlal.u32 q6,d7,d4[1]
- vmlal.u32 q8,d1,d3[1]
- vmlal.u32 q7,d0,d3[1]
- vmlal.u32 q9,d3,d3[1]
-
- vmlal.u32 q5,d3,d6[1]
- vmlal.u32 q8,d0,d5[1]
- vmlal.u32 q6,d5,d6[1]
- vmlal.u32 q7,d7,d6[1]
- vmlal.u32 q9,d1,d5[1]
-
- vmlal.u32 q8,d7,d8[1]
- vmlal.u32 q5,d1,d8[1]
- vmlal.u32 q6,d3,d8[1]
- vmlal.u32 q7,d5,d8[1]
- vmlal.u32 q9,d0,d7[1]
-
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
- @ and P. Schwabe
- @
- @ H0>>+H1>>+H2>>+H3>>+H4
- @ H3>>+H4>>*5+H0>>+H1
- @
- @ Trivia.
- @
- @ Result of multiplication of n-bit number by m-bit number is
- @ n+m bits wide. However! Even though 2^n is a n+1-bit number,
- @ m-bit number multiplied by 2^n is still n+m bits wide.
- @
- @ Sum of two n-bit numbers is n+1 bits wide, sum of three - n+2,
- @ and so is sum of four. Sum of 2^m n-m-bit numbers and n-bit
- @ one is n+1 bits wide.
- @
- @ >>+ denotes Hnext += Hn>>26, Hn &= 0x3ffffff. This means that
- @ H0, H2, H3 are guaranteed to be 26 bits wide, while H1 and H4
- @ can be 27. However! In cases when their width exceeds 26 bits
- @ they are limited by 2^26+2^6. This in turn means that *sum*
- @ of the products with these values can still be viewed as sum
- @ of 52-bit numbers as long as the amount of addends is not a
- @ power of 2. For example,
- @
- @ H4 = H4*R0 + H3*R1 + H2*R2 + H1*R3 + H0 * R4,
- @
- @ which can't be larger than 5 * (2^26 + 2^6) * (2^26 + 2^6), or
- @ 5 * (2^52 + 2*2^32 + 2^12), which in turn is smaller than
- @ 8 * (2^52) or 2^55. However, the value is then multiplied by
- @ by 5, so we should be looking at 5 * 5 * (2^52 + 2^33 + 2^12),
- @ which is less than 32 * (2^52) or 2^57. And when processing
- @ data we are looking at triple as many addends...
- @
- @ In key setup procedure pre-reduced H0 is limited by 5*4+1 and
- @ 5*H4 - by 5*5 52-bit addends, or 57 bits. But when hashing the
- @ input H0 is limited by (5*4+1)*3 addends, or 58 bits, while
- @ 5*H4 by 5*5*3, or 59[!] bits. How is this relevant? vmlal.u32
- @ instruction accepts 2x32-bit input and writes 2x64-bit result.
- @ This means that result of reduction have to be compressed upon
- @ loop wrap-around. This can be done in the process of reduction
- @ to minimize amount of instructions [as well as amount of
- @ 128-bit instructions, which benefits low-end processors], but
- @ one has to watch for H2 (which is narrower than H0) and 5*H4
- @ not being wider than 58 bits, so that result of right shift
- @ by 26 bits fits in 32 bits. This is also useful on x86,
- @ because it allows to use paddd in place for paddq, which
- @ benefits Atom, where paddq is ridiculously slow.
-
- vshr.u64 q15,q8,#26
- vmovn.i64 d16,q8
- vshr.u64 q4,q5,#26
- vmovn.i64 d10,q5
- vadd.i64 q9,q9,q15 @ h3 -> h4
- vbic.i32 d16,#0xfc000000 @ &=0x03ffffff
- vadd.i64 q6,q6,q4 @ h0 -> h1
- vbic.i32 d10,#0xfc000000
-
- vshrn.u64 d30,q9,#26
- vmovn.i64 d18,q9
- vshr.u64 q4,q6,#26
- vmovn.i64 d12,q6
- vadd.i64 q7,q7,q4 @ h1 -> h2
- vbic.i32 d18,#0xfc000000
- vbic.i32 d12,#0xfc000000
-
- vadd.i32 d10,d10,d30
- vshl.u32 d30,d30,#2
- vshrn.u64 d8,q7,#26
- vmovn.i64 d14,q7
- vadd.i32 d10,d10,d30 @ h4 -> h0
- vadd.i32 d16,d16,d8 @ h2 -> h3
- vbic.i32 d14,#0xfc000000
-
- vshr.u32 d30,d10,#26
- vbic.i32 d10,#0xfc000000
- vshr.u32 d8,d16,#26
- vbic.i32 d16,#0xfc000000
- vadd.i32 d12,d12,d30 @ h0 -> h1
- vadd.i32 d18,d18,d8 @ h3 -> h4
-
- subs r5,r5,#1
- beq .Lsquare_break_neon
-
- add r6,r0,#(48+0*9*4)
- add r7,r0,#(48+1*9*4)
-
- vtrn.32 d0,d10 @ r^2:r^1
- vtrn.32 d3,d14
- vtrn.32 d5,d16
- vtrn.32 d1,d12
- vtrn.32 d7,d18
-
- vshl.u32 d4,d3,#2 @ *5
- vshl.u32 d6,d5,#2
- vshl.u32 d2,d1,#2
- vshl.u32 d8,d7,#2
- vadd.i32 d4,d4,d3
- vadd.i32 d2,d2,d1
- vadd.i32 d6,d6,d5
- vadd.i32 d8,d8,d7
-
- vst4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]!
- vst4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]!
- vst4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]!
- vst4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]!
- vst1.32 {d8[0]},[r6,:32]
- vst1.32 {d8[1]},[r7,:32]
-
- b .Lsquare_neon
-
-.align 4
-.Lsquare_break_neon:
- add r6,r0,#(48+2*4*9)
- add r7,r0,#(48+3*4*9)
-
- vmov d0,d10 @ r^4:r^3
- vshl.u32 d2,d12,#2 @ *5
- vmov d1,d12
- vshl.u32 d4,d14,#2
- vmov d3,d14
- vshl.u32 d6,d16,#2
- vmov d5,d16
- vshl.u32 d8,d18,#2
- vmov d7,d18
- vadd.i32 d2,d2,d12
- vadd.i32 d4,d4,d14
- vadd.i32 d6,d6,d16
- vadd.i32 d8,d8,d18
-
- vst4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]!
- vst4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]!
- vst4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]!
- vst4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]!
- vst1.32 {d8[0]},[r6]
- vst1.32 {d8[1]},[r7]
-
- bx lr @ bx lr
-ENDPROC(poly1305_init_neon)
-
-.align 5
-ENTRY(poly1305_blocks_neon)
- ldr ip,[r0,#36] @ is_base2_26
- ands r2,r2,#-16
- beq .Lno_data_neon
-
- cmp r2,#64
- bhs .Lenter_neon
- tst ip,ip @ is_base2_26?
- beq .Lpoly1305_blocks_arm
-
-.Lenter_neon:
- stmdb sp!,{r4-r7}
- vstmdb sp!,{d8-d15} @ ABI specification says so
-
- tst ip,ip @ is_base2_26?
- bne .Lbase2_26_neon
-
- stmdb sp!,{r1-r3,lr}
- bl .Lpoly1305_init_neon
-
- ldr r4,[r0,#0] @ load hash value base 2^32
- ldr r5,[r0,#4]
- ldr r6,[r0,#8]
- ldr r7,[r0,#12]
- ldr ip,[r0,#16]
-
- and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26
- mov r3,r4,lsr#26
- veor d10,d10,d10
- mov r4,r5,lsr#20
- orr r3,r3,r5,lsl#6
- veor d12,d12,d12
- mov r5,r6,lsr#14
- orr r4,r4,r6,lsl#12
- veor d14,d14,d14
- mov r6,r7,lsr#8
- orr r5,r5,r7,lsl#18
- veor d16,d16,d16
- and r3,r3,#0x03ffffff
- orr r6,r6,ip,lsl#24
- veor d18,d18,d18
- and r4,r4,#0x03ffffff
- mov r1,#1
- and r5,r5,#0x03ffffff
- str r1,[r0,#36] @ is_base2_26
-
- vmov.32 d10[0],r2
- vmov.32 d12[0],r3
- vmov.32 d14[0],r4
- vmov.32 d16[0],r5
- vmov.32 d18[0],r6
- adr r5,.Lzeros
-
- ldmia sp!,{r1-r3,lr}
- b .Lbase2_32_neon
-
-.align 4
-.Lbase2_26_neon:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ load hash value
-
- veor d10,d10,d10
- veor d12,d12,d12
- veor d14,d14,d14
- veor d16,d16,d16
- veor d18,d18,d18
- vld4.32 {d10[0],d12[0],d14[0],d16[0]},[r0]!
- adr r5,.Lzeros
- vld1.32 {d18[0]},[r0]
- sub r0,r0,#16 @ rewind
-
-.Lbase2_32_neon:
- add r4,r1,#32
- mov r3,r3,lsl#24
- tst r2,#31
- beq .Leven
-
- vld4.32 {d20[0],d22[0],d24[0],d26[0]},[r1]!
- vmov.32 d28[0],r3
- sub r2,r2,#16
- add r4,r1,#32
-
-#ifdef __ARMEB__
- vrev32.8 q10,q10
- vrev32.8 q13,q13
- vrev32.8 q11,q11
- vrev32.8 q12,q12
-#endif
- vsri.u32 d28,d26,#8 @ base 2^32 -> base 2^26
- vshl.u32 d26,d26,#18
-
- vsri.u32 d26,d24,#14
- vshl.u32 d24,d24,#12
- vadd.i32 d29,d28,d18 @ add hash value and move to #hi
-
- vbic.i32 d26,#0xfc000000
- vsri.u32 d24,d22,#20
- vshl.u32 d22,d22,#6
-
- vbic.i32 d24,#0xfc000000
- vsri.u32 d22,d20,#26
- vadd.i32 d27,d26,d16
-
- vbic.i32 d20,#0xfc000000
- vbic.i32 d22,#0xfc000000
- vadd.i32 d25,d24,d14
-
- vadd.i32 d21,d20,d10
- vadd.i32 d23,d22,d12
-
- mov r7,r5
- add r6,r0,#48
-
- cmp r2,r2
- b .Long_tail
-
-.align 4
-.Leven:
- subs r2,r2,#64
- it lo
- movlo r4,r5
-
- vmov.i32 q14,#1<<24 @ padbit, yes, always
- vld4.32 {d20,d22,d24,d26},[r1] @ inp[0:1]
- add r1,r1,#64
- vld4.32 {d21,d23,d25,d27},[r4] @ inp[2:3] (or 0)
- add r4,r4,#64
- itt hi
- addhi r7,r0,#(48+1*9*4)
- addhi r6,r0,#(48+3*9*4)
-
-#ifdef __ARMEB__
- vrev32.8 q10,q10
- vrev32.8 q13,q13
- vrev32.8 q11,q11
- vrev32.8 q12,q12
-#endif
- vsri.u32 q14,q13,#8 @ base 2^32 -> base 2^26
- vshl.u32 q13,q13,#18
-
- vsri.u32 q13,q12,#14
- vshl.u32 q12,q12,#12
-
- vbic.i32 q13,#0xfc000000
- vsri.u32 q12,q11,#20
- vshl.u32 q11,q11,#6
-
- vbic.i32 q12,#0xfc000000
- vsri.u32 q11,q10,#26
-
- vbic.i32 q10,#0xfc000000
- vbic.i32 q11,#0xfc000000
-
- bls .Lskip_loop
-
- vld4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]! @ load r^2
- vld4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]! @ load r^4
- vld4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]!
- vld4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]!
- b .Loop_neon
-
-.align 5
-.Loop_neon:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
- @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
- @ ___________________/
- @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
- @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
- @ ___________________/ ____________________/
- @
- @ Note that we start with inp[2:3]*r^2. This is because it
- @ doesn't depend on reduction in previous iteration.
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
- @ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
- @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
- @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
- @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
-
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ inp[2:3]*r^2
-
- vadd.i32 d24,d24,d14 @ accumulate inp[0:1]
- vmull.u32 q7,d25,d0[1]
- vadd.i32 d20,d20,d10
- vmull.u32 q5,d21,d0[1]
- vadd.i32 d26,d26,d16
- vmull.u32 q8,d27,d0[1]
- vmlal.u32 q7,d23,d1[1]
- vadd.i32 d22,d22,d12
- vmull.u32 q6,d23,d0[1]
-
- vadd.i32 d28,d28,d18
- vmull.u32 q9,d29,d0[1]
- subs r2,r2,#64
- vmlal.u32 q5,d29,d2[1]
- it lo
- movlo r4,r5
- vmlal.u32 q8,d25,d1[1]
- vld1.32 d8[1],[r7,:32]
- vmlal.u32 q6,d21,d1[1]
- vmlal.u32 q9,d27,d1[1]
-
- vmlal.u32 q5,d27,d4[1]
- vmlal.u32 q8,d23,d3[1]
- vmlal.u32 q9,d25,d3[1]
- vmlal.u32 q6,d29,d4[1]
- vmlal.u32 q7,d21,d3[1]
-
- vmlal.u32 q8,d21,d5[1]
- vmlal.u32 q5,d25,d6[1]
- vmlal.u32 q9,d23,d5[1]
- vmlal.u32 q6,d27,d6[1]
- vmlal.u32 q7,d29,d6[1]
-
- vmlal.u32 q8,d29,d8[1]
- vmlal.u32 q5,d23,d8[1]
- vmlal.u32 q9,d21,d7[1]
- vmlal.u32 q6,d25,d8[1]
- vmlal.u32 q7,d27,d8[1]
-
- vld4.32 {d21,d23,d25,d27},[r4] @ inp[2:3] (or 0)
- add r4,r4,#64
-
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ (hash+inp[0:1])*r^4 and accumulate
-
- vmlal.u32 q8,d26,d0[0]
- vmlal.u32 q5,d20,d0[0]
- vmlal.u32 q9,d28,d0[0]
- vmlal.u32 q6,d22,d0[0]
- vmlal.u32 q7,d24,d0[0]
- vld1.32 d8[0],[r6,:32]
-
- vmlal.u32 q8,d24,d1[0]
- vmlal.u32 q5,d28,d2[0]
- vmlal.u32 q9,d26,d1[0]
- vmlal.u32 q6,d20,d1[0]
- vmlal.u32 q7,d22,d1[0]
-
- vmlal.u32 q8,d22,d3[0]
- vmlal.u32 q5,d26,d4[0]
- vmlal.u32 q9,d24,d3[0]
- vmlal.u32 q6,d28,d4[0]
- vmlal.u32 q7,d20,d3[0]
-
- vmlal.u32 q8,d20,d5[0]
- vmlal.u32 q5,d24,d6[0]
- vmlal.u32 q9,d22,d5[0]
- vmlal.u32 q6,d26,d6[0]
- vmlal.u32 q8,d28,d8[0]
-
- vmlal.u32 q7,d28,d6[0]
- vmlal.u32 q5,d22,d8[0]
- vmlal.u32 q9,d20,d7[0]
- vmov.i32 q14,#1<<24 @ padbit, yes, always
- vmlal.u32 q6,d24,d8[0]
- vmlal.u32 q7,d26,d8[0]
-
- vld4.32 {d20,d22,d24,d26},[r1] @ inp[0:1]
- add r1,r1,#64
-#ifdef __ARMEB__
- vrev32.8 q10,q10
- vrev32.8 q11,q11
- vrev32.8 q12,q12
- vrev32.8 q13,q13
-#endif
-
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ lazy reduction interleaved with base 2^32 -> base 2^26 of
- @ inp[0:3] previously loaded to q10-q13 and smashed to q10-q14.
-
- vshr.u64 q15,q8,#26
- vmovn.i64 d16,q8
- vshr.u64 q4,q5,#26
- vmovn.i64 d10,q5
- vadd.i64 q9,q9,q15 @ h3 -> h4
- vbic.i32 d16,#0xfc000000
- vsri.u32 q14,q13,#8 @ base 2^32 -> base 2^26
- vadd.i64 q6,q6,q4 @ h0 -> h1
- vshl.u32 q13,q13,#18
- vbic.i32 d10,#0xfc000000
-
- vshrn.u64 d30,q9,#26
- vmovn.i64 d18,q9
- vshr.u64 q4,q6,#26
- vmovn.i64 d12,q6
- vadd.i64 q7,q7,q4 @ h1 -> h2
- vsri.u32 q13,q12,#14
- vbic.i32 d18,#0xfc000000
- vshl.u32 q12,q12,#12
- vbic.i32 d12,#0xfc000000
-
- vadd.i32 d10,d10,d30
- vshl.u32 d30,d30,#2
- vbic.i32 q13,#0xfc000000
- vshrn.u64 d8,q7,#26
- vmovn.i64 d14,q7
- vaddl.u32 q5,d10,d30 @ h4 -> h0 [widen for a sec]
- vsri.u32 q12,q11,#20
- vadd.i32 d16,d16,d8 @ h2 -> h3
- vshl.u32 q11,q11,#6
- vbic.i32 d14,#0xfc000000
- vbic.i32 q12,#0xfc000000
-
- vshrn.u64 d30,q5,#26 @ re-narrow
- vmovn.i64 d10,q5
- vsri.u32 q11,q10,#26
- vbic.i32 q10,#0xfc000000
- vshr.u32 d8,d16,#26
- vbic.i32 d16,#0xfc000000
- vbic.i32 d10,#0xfc000000
- vadd.i32 d12,d12,d30 @ h0 -> h1
- vadd.i32 d18,d18,d8 @ h3 -> h4
- vbic.i32 q11,#0xfc000000
-
- bhi .Loop_neon
-
-.Lskip_loop:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
-
- add r7,r0,#(48+0*9*4)
- add r6,r0,#(48+1*9*4)
- adds r2,r2,#32
- it ne
- movne r2,#0
- bne .Long_tail
-
- vadd.i32 d25,d24,d14 @ add hash value and move to #hi
- vadd.i32 d21,d20,d10
- vadd.i32 d27,d26,d16
- vadd.i32 d23,d22,d12
- vadd.i32 d29,d28,d18
-
-.Long_tail:
- vld4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]! @ load r^1
- vld4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]! @ load r^2
-
- vadd.i32 d24,d24,d14 @ can be redundant
- vmull.u32 q7,d25,d0
- vadd.i32 d20,d20,d10
- vmull.u32 q5,d21,d0
- vadd.i32 d26,d26,d16
- vmull.u32 q8,d27,d0
- vadd.i32 d22,d22,d12
- vmull.u32 q6,d23,d0
- vadd.i32 d28,d28,d18
- vmull.u32 q9,d29,d0
-
- vmlal.u32 q5,d29,d2
- vld4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]!
- vmlal.u32 q8,d25,d1
- vld4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]!
- vmlal.u32 q6,d21,d1
- vmlal.u32 q9,d27,d1
- vmlal.u32 q7,d23,d1
-
- vmlal.u32 q8,d23,d3
- vld1.32 d8[1],[r7,:32]
- vmlal.u32 q5,d27,d4
- vld1.32 d8[0],[r6,:32]
- vmlal.u32 q9,d25,d3
- vmlal.u32 q6,d29,d4
- vmlal.u32 q7,d21,d3
-
- vmlal.u32 q8,d21,d5
- it ne
- addne r7,r0,#(48+2*9*4)
- vmlal.u32 q5,d25,d6
- it ne
- addne r6,r0,#(48+3*9*4)
- vmlal.u32 q9,d23,d5
- vmlal.u32 q6,d27,d6
- vmlal.u32 q7,d29,d6
-
- vmlal.u32 q8,d29,d8
- vorn q0,q0,q0 @ all-ones, can be redundant
- vmlal.u32 q5,d23,d8
- vshr.u64 q0,q0,#38
- vmlal.u32 q9,d21,d7
- vmlal.u32 q6,d25,d8
- vmlal.u32 q7,d27,d8
-
- beq .Lshort_tail
-
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ (hash+inp[0:1])*r^4:r^3 and accumulate
-
- vld4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]! @ load r^3
- vld4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]! @ load r^4
-
- vmlal.u32 q7,d24,d0
- vmlal.u32 q5,d20,d0
- vmlal.u32 q8,d26,d0
- vmlal.u32 q6,d22,d0
- vmlal.u32 q9,d28,d0
-
- vmlal.u32 q5,d28,d2
- vld4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]!
- vmlal.u32 q8,d24,d1
- vld4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]!
- vmlal.u32 q6,d20,d1
- vmlal.u32 q9,d26,d1
- vmlal.u32 q7,d22,d1
-
- vmlal.u32 q8,d22,d3
- vld1.32 d8[1],[r7,:32]
- vmlal.u32 q5,d26,d4
- vld1.32 d8[0],[r6,:32]
- vmlal.u32 q9,d24,d3
- vmlal.u32 q6,d28,d4
- vmlal.u32 q7,d20,d3
-
- vmlal.u32 q8,d20,d5
- vmlal.u32 q5,d24,d6
- vmlal.u32 q9,d22,d5
- vmlal.u32 q6,d26,d6
- vmlal.u32 q7,d28,d6
-
- vmlal.u32 q8,d28,d8
- vorn q0,q0,q0 @ all-ones
- vmlal.u32 q5,d22,d8
- vshr.u64 q0,q0,#38
- vmlal.u32 q9,d20,d7
- vmlal.u32 q6,d24,d8
- vmlal.u32 q7,d26,d8
-
-.Lshort_tail:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ horizontal addition
-
- vadd.i64 d16,d16,d17
- vadd.i64 d10,d10,d11
- vadd.i64 d18,d18,d19
- vadd.i64 d12,d12,d13
- vadd.i64 d14,d14,d15
-
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ lazy reduction, but without narrowing
-
- vshr.u64 q15,q8,#26
- vand.i64 q8,q8,q0
- vshr.u64 q4,q5,#26
- vand.i64 q5,q5,q0
- vadd.i64 q9,q9,q15 @ h3 -> h4
- vadd.i64 q6,q6,q4 @ h0 -> h1
-
- vshr.u64 q15,q9,#26
- vand.i64 q9,q9,q0
- vshr.u64 q4,q6,#26
- vand.i64 q6,q6,q0
- vadd.i64 q7,q7,q4 @ h1 -> h2
-
- vadd.i64 q5,q5,q15
- vshl.u64 q15,q15,#2
- vshr.u64 q4,q7,#26
- vand.i64 q7,q7,q0
- vadd.i64 q5,q5,q15 @ h4 -> h0
- vadd.i64 q8,q8,q4 @ h2 -> h3
-
- vshr.u64 q15,q5,#26
- vand.i64 q5,q5,q0
- vshr.u64 q4,q8,#26
- vand.i64 q8,q8,q0
- vadd.i64 q6,q6,q15 @ h0 -> h1
- vadd.i64 q9,q9,q4 @ h3 -> h4
-
- cmp r2,#0
- bne .Leven
-
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ store hash value
-
- vst4.32 {d10[0],d12[0],d14[0],d16[0]},[r0]!
- vst1.32 {d18[0]},[r0]
-
- vldmia sp!,{d8-d15} @ epilogue
- ldmia sp!,{r4-r7}
-.Lno_data_neon:
- bx lr @ bx lr
-ENDPROC(poly1305_blocks_neon)
-
-.align 5
-ENTRY(poly1305_emit_neon)
- ldr ip,[r0,#36] @ is_base2_26
-
- stmdb sp!,{r4-r11}
-
- tst ip,ip
- beq .Lpoly1305_emit_enter
-
- ldmia r0,{r3-r7}
- eor r8,r8,r8
-
- adds r3,r3,r4,lsl#26 @ base 2^26 -> base 2^32
- mov r4,r4,lsr#6
- adcs r4,r4,r5,lsl#20
- mov r5,r5,lsr#12
- adcs r5,r5,r6,lsl#14
- mov r6,r6,lsr#18
- adcs r6,r6,r7,lsl#8
- adc r7,r8,r7,lsr#24 @ can be partially reduced ...
-
- and r8,r7,#-4 @ ... so reduce
- and r7,r6,#3
- add r8,r8,r8,lsr#2 @ *= 5
- adds r3,r3,r8
- adcs r4,r4,#0
- adcs r5,r5,#0
- adcs r6,r6,#0
- adc r7,r7,#0
-
- adds r8,r3,#5 @ compare to modulus
- adcs r9,r4,#0
- adcs r10,r5,#0
- adcs r11,r6,#0
- adc r7,r7,#0
- tst r7,#4 @ did it carry/borrow?
-
- it ne
- movne r3,r8
- ldr r8,[r2,#0]
- it ne
- movne r4,r9
- ldr r9,[r2,#4]
- it ne
- movne r5,r10
- ldr r10,[r2,#8]
- it ne
- movne r6,r11
- ldr r11,[r2,#12]
-
- adds r3,r3,r8 @ accumulate nonce
- adcs r4,r4,r9
- adcs r5,r5,r10
- adc r6,r6,r11
-
-#ifdef __ARMEB__
- rev r3,r3
- rev r4,r4
- rev r5,r5
- rev r6,r6
-#endif
- str r3,[r1,#0] @ store the result
- str r4,[r1,#4]
- str r5,[r1,#8]
- str r6,[r1,#12]
-
- ldmia sp!,{r4-r11}
- bx lr @ bx lr
-ENDPROC(poly1305_emit_neon)
-
-.align 5
-.Lzeros:
-.long 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
-#endif
diff --git a/src/crypto/zinc/poly1305/poly1305-arm.pl b/src/crypto/zinc/poly1305/poly1305-arm.pl
new file mode 100644
index 0000000..88a4260
--- /dev/null
+++ b/src/crypto/zinc/poly1305/poly1305-arm.pl
@@ -0,0 +1,1272 @@
+#!/usr/bin/env perl
+# SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
+#
+# This code is taken from the OpenSSL project but the author, Andy Polyakov,
+# has relicensed it under the licenses specified in the SPDX header above.
+# The original headers, including the original license headers, are
+# included below for completeness.
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# IALU(*)/gcc-4.4 NEON
+#
+# ARM11xx(ARMv6) 7.78/+100% -
+# Cortex-A5 6.35/+130% 3.00
+# Cortex-A8 6.25/+115% 2.36
+# Cortex-A9 5.10/+95% 2.55
+# Cortex-A15 3.85/+85% 1.25(**)
+# Snapdragon S4 5.70/+100% 1.48(**)
+#
+# (*) this is for -march=armv6, i.e. with bunch of ldrb loading data;
+# (**) these are trade-off results, they can be improved by ~8% but at
+# the cost of 15/12% regression on Cortex-A5/A7, it's even possible
+# to improve Cortex-A9 result, but then A5/A7 loose more than 20%;
+
+$flavour = shift;
+if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
+else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
+
+if ($flavour && $flavour ne "void") {
+ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+ ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+ ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+ die "can't locate arm-xlate.pl";
+
+ open STDOUT,"| \"$^X\" $xlate $flavour $output";
+} else {
+ open STDOUT,">$output";
+}
+
+($ctx,$inp,$len,$padbit)=map("r$_",(0..3));
+
+$code.=<<___;
+#ifndef __KERNEL__
+# include "arm_arch.h"
+#else
+# define __ARM_ARCH__ __LINUX_ARM_ARCH__
+# define __ARM_MAX_ARCH__ __LINUX_ARM_ARCH__
+# define poly1305_init poly1305_init_arm
+# define poly1305_blocks poly1305_blocks_arm
+# define poly1305_emit poly1305_emit_arm
+.globl poly1305_emit_neon
+.globl poly1305_blocks_neon
+#endif
+
+.text
+#if defined(__thumb2__)
+.syntax unified
+.thumb
+#else
+.code 32
+#endif
+
+.globl poly1305_emit
+.globl poly1305_blocks
+.globl poly1305_init
+.type poly1305_init,%function
+.align 5
+poly1305_init:
+.Lpoly1305_init:
+ stmdb sp!,{r4-r11}
+
+ eor r3,r3,r3
+ cmp $inp,#0
+ str r3,[$ctx,#0] @ zero hash value
+ str r3,[$ctx,#4]
+ str r3,[$ctx,#8]
+ str r3,[$ctx,#12]
+ str r3,[$ctx,#16]
+ str r3,[$ctx,#36] @ is_base2_26
+ add $ctx,$ctx,#20
+
+#ifdef __thumb2__
+ it eq
+#endif
+ moveq r0,#0
+ beq .Lno_key
+
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+ adr r11,.Lpoly1305_init
+ ldr r12,.LOPENSSL_armcap
+#endif
+ ldrb r4,[$inp,#0]
+ mov r10,#0x0fffffff
+ ldrb r5,[$inp,#1]
+ and r3,r10,#-4 @ 0x0ffffffc
+ ldrb r6,[$inp,#2]
+ ldrb r7,[$inp,#3]
+ orr r4,r4,r5,lsl#8
+ ldrb r5,[$inp,#4]
+ orr r4,r4,r6,lsl#16
+ ldrb r6,[$inp,#5]
+ orr r4,r4,r7,lsl#24
+ ldrb r7,[$inp,#6]
+ and r4,r4,r10
+
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+ ldr r12,[r11,r12] @ OPENSSL_armcap_P
+# ifdef __APPLE__
+ ldr r12,[r12]
+# endif
+#endif
+ ldrb r8,[$inp,#7]
+ orr r5,r5,r6,lsl#8
+ ldrb r6,[$inp,#8]
+ orr r5,r5,r7,lsl#16
+ ldrb r7,[$inp,#9]
+ orr r5,r5,r8,lsl#24
+ ldrb r8,[$inp,#10]
+ and r5,r5,r3
+
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+ tst r12,#ARMV7_NEON @ check for NEON
+# ifdef __APPLE__
+ adr r9,poly1305_blocks_neon
+ adr r11,poly1305_blocks
+# ifdef __thumb2__
+ it ne
+# endif
+ movne r11,r9
+ adr r12,poly1305_emit
+ adr r10,poly1305_emit_neon
+# ifdef __thumb2__
+ it ne
+# endif
+ movne r12,r10
+# else
+# ifdef __thumb2__
+ itete eq
+# endif
+ addeq r12,r11,#(poly1305_emit-.Lpoly1305_init)
+ addne r12,r11,#(poly1305_emit_neon-.Lpoly1305_init)
+ addeq r11,r11,#(poly1305_blocks-.Lpoly1305_init)
+ addne r11,r11,#(poly1305_blocks_neon-.Lpoly1305_init)
+# endif
+# ifdef __thumb2__
+ orr r12,r12,#1 @ thumb-ify address
+ orr r11,r11,#1
+# endif
+#endif
+ ldrb r9,[$inp,#11]
+ orr r6,r6,r7,lsl#8
+ ldrb r7,[$inp,#12]
+ orr r6,r6,r8,lsl#16
+ ldrb r8,[$inp,#13]
+ orr r6,r6,r9,lsl#24
+ ldrb r9,[$inp,#14]
+ and r6,r6,r3
+
+ ldrb r10,[$inp,#15]
+ orr r7,r7,r8,lsl#8
+ str r4,[$ctx,#0]
+ orr r7,r7,r9,lsl#16
+ str r5,[$ctx,#4]
+ orr r7,r7,r10,lsl#24
+ str r6,[$ctx,#8]
+ and r7,r7,r3
+ str r7,[$ctx,#12]
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+ stmia r2,{r11,r12} @ fill functions table
+ mov r0,#1
+#else
+ mov r0,#0
+#endif
+.Lno_key:
+ ldmia sp!,{r4-r11}
+#if __ARM_ARCH__>=5
+ ret @ bx lr
+#else
+ tst lr,#1
+ moveq pc,lr @ be binary compatible with V4, yet
+ bx lr @ interoperable with Thumb ISA:-)
+#endif
+.size poly1305_init,.-poly1305_init
+___
+{
+my ($h0,$h1,$h2,$h3,$h4,$r0,$r1,$r2,$r3)=map("r$_",(4..12));
+my ($s1,$s2,$s3)=($r1,$r2,$r3);
+
+$code.=<<___;
+.type poly1305_blocks,%function
+.align 5
+poly1305_blocks:
+.Lpoly1305_blocks:
+ stmdb sp!,{r3-r11,lr}
+
+ ands $len,$len,#-16
+ beq .Lno_data
+
+ cmp $padbit,#0
+ add $len,$len,$inp @ end pointer
+ sub sp,sp,#32
+
+ ldmia $ctx,{$h0-$r3} @ load context
+
+ str $ctx,[sp,#12] @ offload stuff
+ mov lr,$inp
+ str $len,[sp,#16]
+ str $r1,[sp,#20]
+ str $r2,[sp,#24]
+ str $r3,[sp,#28]
+ b .Loop
+
+.Loop:
+#if __ARM_ARCH__<7
+ ldrb r0,[lr],#16 @ load input
+# ifdef __thumb2__
+ it hi
+# endif
+ addhi $h4,$h4,#1 @ 1<<128
+ ldrb r1,[lr,#-15]
+ ldrb r2,[lr,#-14]
+ ldrb r3,[lr,#-13]
+ orr r1,r0,r1,lsl#8
+ ldrb r0,[lr,#-12]
+ orr r2,r1,r2,lsl#16
+ ldrb r1,[lr,#-11]
+ orr r3,r2,r3,lsl#24
+ ldrb r2,[lr,#-10]
+ adds $h0,$h0,r3 @ accumulate input
+
+ ldrb r3,[lr,#-9]
+ orr r1,r0,r1,lsl#8
+ ldrb r0,[lr,#-8]
+ orr r2,r1,r2,lsl#16
+ ldrb r1,[lr,#-7]
+ orr r3,r2,r3,lsl#24
+ ldrb r2,[lr,#-6]
+ adcs $h1,$h1,r3
+
+ ldrb r3,[lr,#-5]
+ orr r1,r0,r1,lsl#8
+ ldrb r0,[lr,#-4]
+ orr r2,r1,r2,lsl#16
+ ldrb r1,[lr,#-3]
+ orr r3,r2,r3,lsl#24
+ ldrb r2,[lr,#-2]
+ adcs $h2,$h2,r3
+
+ ldrb r3,[lr,#-1]
+ orr r1,r0,r1,lsl#8
+ str lr,[sp,#8] @ offload input pointer
+ orr r2,r1,r2,lsl#16
+ add $s1,$r1,$r1,lsr#2
+ orr r3,r2,r3,lsl#24
+#else
+ ldr r0,[lr],#16 @ load input
+# ifdef __thumb2__
+ it hi
+# endif
+ addhi $h4,$h4,#1 @ padbit
+ ldr r1,[lr,#-12]
+ ldr r2,[lr,#-8]
+ ldr r3,[lr,#-4]
+# ifdef __ARMEB__
+ rev r0,r0
+ rev r1,r1
+ rev r2,r2
+ rev r3,r3
+# endif
+ adds $h0,$h0,r0 @ accumulate input
+ str lr,[sp,#8] @ offload input pointer
+ adcs $h1,$h1,r1
+ add $s1,$r1,$r1,lsr#2
+ adcs $h2,$h2,r2
+#endif
+ add $s2,$r2,$r2,lsr#2
+ adcs $h3,$h3,r3
+ add $s3,$r3,$r3,lsr#2
+
+ umull r2,r3,$h1,$r0
+ adc $h4,$h4,#0
+ umull r0,r1,$h0,$r0
+ umlal r2,r3,$h4,$s1
+ umlal r0,r1,$h3,$s1
+ ldr $r1,[sp,#20] @ reload $r1
+ umlal r2,r3,$h2,$s3
+ umlal r0,r1,$h1,$s3
+ umlal r2,r3,$h3,$s2
+ umlal r0,r1,$h2,$s2
+ umlal r2,r3,$h0,$r1
+ str r0,[sp,#0] @ future $h0
+ mul r0,$s2,$h4
+ ldr $r2,[sp,#24] @ reload $r2
+ adds r2,r2,r1 @ d1+=d0>>32
+ eor r1,r1,r1
+ adc lr,r3,#0 @ future $h2
+ str r2,[sp,#4] @ future $h1
+
+ mul r2,$s3,$h4
+ eor r3,r3,r3
+ umlal r0,r1,$h3,$s3
+ ldr $r3,[sp,#28] @ reload $r3
+ umlal r2,r3,$h3,$r0
+ umlal r0,r1,$h2,$r0
+ umlal r2,r3,$h2,$r1
+ umlal r0,r1,$h1,$r1
+ umlal r2,r3,$h1,$r2
+ umlal r0,r1,$h0,$r2
+ umlal r2,r3,$h0,$r3
+ ldr $h0,[sp,#0]
+ mul $h4,$r0,$h4
+ ldr $h1,[sp,#4]
+
+ adds $h2,lr,r0 @ d2+=d1>>32
+ ldr lr,[sp,#8] @ reload input pointer
+ adc r1,r1,#0
+ adds $h3,r2,r1 @ d3+=d2>>32
+ ldr r0,[sp,#16] @ reload end pointer
+ adc r3,r3,#0
+ add $h4,$h4,r3 @ h4+=d3>>32
+
+ and r1,$h4,#-4
+ and $h4,$h4,#3
+ add r1,r1,r1,lsr#2 @ *=5
+ adds $h0,$h0,r1
+ adcs $h1,$h1,#0
+ adcs $h2,$h2,#0
+ adcs $h3,$h3,#0
+ adc $h4,$h4,#0
+
+ cmp r0,lr @ done yet?
+ bhi .Loop
+
+ ldr $ctx,[sp,#12]
+ add sp,sp,#32
+ stmia $ctx,{$h0-$h4} @ store the result
+
+.Lno_data:
+#if __ARM_ARCH__>=5
+ ldmia sp!,{r3-r11,pc}
+#else
+ ldmia sp!,{r3-r11,lr}
+ tst lr,#1
+ moveq pc,lr @ be binary compatible with V4, yet
+ bx lr @ interoperable with Thumb ISA:-)
+#endif
+.size poly1305_blocks,.-poly1305_blocks
+___
+}
+{
+my ($ctx,$mac,$nonce)=map("r$_",(0..2));
+my ($h0,$h1,$h2,$h3,$h4,$g0,$g1,$g2,$g3)=map("r$_",(3..11));
+my $g4=$h4;
+
+$code.=<<___;
+.type poly1305_emit,%function
+.align 5
+poly1305_emit:
+ stmdb sp!,{r4-r11}
+.Lpoly1305_emit_enter:
+
+ ldmia $ctx,{$h0-$h4}
+ adds $g0,$h0,#5 @ compare to modulus
+ adcs $g1,$h1,#0
+ adcs $g2,$h2,#0
+ adcs $g3,$h3,#0
+ adc $g4,$h4,#0
+ tst $g4,#4 @ did it carry/borrow?
+
+#ifdef __thumb2__
+ it ne
+#endif
+ movne $h0,$g0
+ ldr $g0,[$nonce,#0]
+#ifdef __thumb2__
+ it ne
+#endif
+ movne $h1,$g1
+ ldr $g1,[$nonce,#4]
+#ifdef __thumb2__
+ it ne
+#endif
+ movne $h2,$g2
+ ldr $g2,[$nonce,#8]
+#ifdef __thumb2__
+ it ne
+#endif
+ movne $h3,$g3
+ ldr $g3,[$nonce,#12]
+
+ adds $h0,$h0,$g0
+ adcs $h1,$h1,$g1
+ adcs $h2,$h2,$g2
+ adc $h3,$h3,$g3
+
+#if __ARM_ARCH__>=7
+# ifdef __ARMEB__
+ rev $h0,$h0
+ rev $h1,$h1
+ rev $h2,$h2
+ rev $h3,$h3
+# endif
+ str $h0,[$mac,#0]
+ str $h1,[$mac,#4]
+ str $h2,[$mac,#8]
+ str $h3,[$mac,#12]
+#else
+ strb $h0,[$mac,#0]
+ mov $h0,$h0,lsr#8
+ strb $h1,[$mac,#4]
+ mov $h1,$h1,lsr#8
+ strb $h2,[$mac,#8]
+ mov $h2,$h2,lsr#8
+ strb $h3,[$mac,#12]
+ mov $h3,$h3,lsr#8
+
+ strb $h0,[$mac,#1]
+ mov $h0,$h0,lsr#8
+ strb $h1,[$mac,#5]
+ mov $h1,$h1,lsr#8
+ strb $h2,[$mac,#9]
+ mov $h2,$h2,lsr#8
+ strb $h3,[$mac,#13]
+ mov $h3,$h3,lsr#8
+
+ strb $h0,[$mac,#2]
+ mov $h0,$h0,lsr#8
+ strb $h1,[$mac,#6]
+ mov $h1,$h1,lsr#8
+ strb $h2,[$mac,#10]
+ mov $h2,$h2,lsr#8
+ strb $h3,[$mac,#14]
+ mov $h3,$h3,lsr#8
+
+ strb $h0,[$mac,#3]
+ strb $h1,[$mac,#7]
+ strb $h2,[$mac,#11]
+ strb $h3,[$mac,#15]
+#endif
+ ldmia sp!,{r4-r11}
+#if __ARM_ARCH__>=5
+ ret @ bx lr
+#else
+ tst lr,#1
+ moveq pc,lr @ be binary compatible with V4, yet
+ bx lr @ interoperable with Thumb ISA:-)
+#endif
+.size poly1305_emit,.-poly1305_emit
+___
+{
+my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("d$_",(0..9));
+my ($D0,$D1,$D2,$D3,$D4, $H0,$H1,$H2,$H3,$H4) = map("q$_",(5..14));
+my ($T0,$T1,$MASK) = map("q$_",(15,4,0));
+
+my ($in2,$zeros,$tbl0,$tbl1) = map("r$_",(4..7));
+
+$code.=<<___;
+#if __ARM_MAX_ARCH__>=7
+.fpu neon
+
+.type poly1305_init_neon,%function
+.align 5
+poly1305_init_neon:
+.Lpoly1305_init_neon:
+ ldr r4,[$ctx,#20] @ load key base 2^32
+ ldr r5,[$ctx,#24]
+ ldr r6,[$ctx,#28]
+ ldr r7,[$ctx,#32]
+
+ and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26
+ mov r3,r4,lsr#26
+ mov r4,r5,lsr#20
+ orr r3,r3,r5,lsl#6
+ mov r5,r6,lsr#14
+ orr r4,r4,r6,lsl#12
+ mov r6,r7,lsr#8
+ orr r5,r5,r7,lsl#18
+ and r3,r3,#0x03ffffff
+ and r4,r4,#0x03ffffff
+ and r5,r5,#0x03ffffff
+
+ vdup.32 $R0,r2 @ r^1 in both lanes
+ add r2,r3,r3,lsl#2 @ *5
+ vdup.32 $R1,r3
+ add r3,r4,r4,lsl#2
+ vdup.32 $S1,r2
+ vdup.32 $R2,r4
+ add r4,r5,r5,lsl#2
+ vdup.32 $S2,r3
+ vdup.32 $R3,r5
+ add r5,r6,r6,lsl#2
+ vdup.32 $S3,r4
+ vdup.32 $R4,r6
+ vdup.32 $S4,r5
+
+ mov $zeros,#2 @ counter
+
+.Lsquare_neon:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
+ @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
+ @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
+ @ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
+ @ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
+
+ vmull.u32 $D0,$R0,${R0}[1]
+ vmull.u32 $D1,$R1,${R0}[1]
+ vmull.u32 $D2,$R2,${R0}[1]
+ vmull.u32 $D3,$R3,${R0}[1]
+ vmull.u32 $D4,$R4,${R0}[1]
+
+ vmlal.u32 $D0,$R4,${S1}[1]
+ vmlal.u32 $D1,$R0,${R1}[1]
+ vmlal.u32 $D2,$R1,${R1}[1]
+ vmlal.u32 $D3,$R2,${R1}[1]
+ vmlal.u32 $D4,$R3,${R1}[1]
+
+ vmlal.u32 $D0,$R3,${S2}[1]
+ vmlal.u32 $D1,$R4,${S2}[1]
+ vmlal.u32 $D3,$R1,${R2}[1]
+ vmlal.u32 $D2,$R0,${R2}[1]
+ vmlal.u32 $D4,$R2,${R2}[1]
+
+ vmlal.u32 $D0,$R2,${S3}[1]
+ vmlal.u32 $D3,$R0,${R3}[1]
+ vmlal.u32 $D1,$R3,${S3}[1]
+ vmlal.u32 $D2,$R4,${S3}[1]
+ vmlal.u32 $D4,$R1,${R3}[1]
+
+ vmlal.u32 $D3,$R4,${S4}[1]
+ vmlal.u32 $D0,$R1,${S4}[1]
+ vmlal.u32 $D1,$R2,${S4}[1]
+ vmlal.u32 $D2,$R3,${S4}[1]
+ vmlal.u32 $D4,$R0,${R4}[1]
+
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
+ @ and P. Schwabe
+ @
+ @ H0>>+H1>>+H2>>+H3>>+H4
+ @ H3>>+H4>>*5+H0>>+H1
+ @
+ @ Trivia.
+ @
+ @ Result of multiplication of n-bit number by m-bit number is
+ @ n+m bits wide. However! Even though 2^n is a n+1-bit number,
+ @ m-bit number multiplied by 2^n is still n+m bits wide.
+ @
+ @ Sum of two n-bit numbers is n+1 bits wide, sum of three - n+2,
+ @ and so is sum of four. Sum of 2^m n-m-bit numbers and n-bit
+ @ one is n+1 bits wide.
+ @
+ @ >>+ denotes Hnext += Hn>>26, Hn &= 0x3ffffff. This means that
+ @ H0, H2, H3 are guaranteed to be 26 bits wide, while H1 and H4
+ @ can be 27. However! In cases when their width exceeds 26 bits
+ @ they are limited by 2^26+2^6. This in turn means that *sum*
+ @ of the products with these values can still be viewed as sum
+ @ of 52-bit numbers as long as the amount of addends is not a
+ @ power of 2. For example,
+ @
+ @ H4 = H4*R0 + H3*R1 + H2*R2 + H1*R3 + H0 * R4,
+ @
+ @ which can't be larger than 5 * (2^26 + 2^6) * (2^26 + 2^6), or
+ @ 5 * (2^52 + 2*2^32 + 2^12), which in turn is smaller than
+ @ 8 * (2^52) or 2^55. However, the value is then multiplied by
+ @ by 5, so we should be looking at 5 * 5 * (2^52 + 2^33 + 2^12),
+ @ which is less than 32 * (2^52) or 2^57. And when processing
+ @ data we are looking at triple as many addends...
+ @
+ @ In key setup procedure pre-reduced H0 is limited by 5*4+1 and
+ @ 5*H4 - by 5*5 52-bit addends, or 57 bits. But when hashing the
+ @ input H0 is limited by (5*4+1)*3 addends, or 58 bits, while
+ @ 5*H4 by 5*5*3, or 59[!] bits. How is this relevant? vmlal.u32
+ @ instruction accepts 2x32-bit input and writes 2x64-bit result.
+ @ This means that result of reduction have to be compressed upon
+ @ loop wrap-around. This can be done in the process of reduction
+ @ to minimize amount of instructions [as well as amount of
+ @ 128-bit instructions, which benefits low-end processors], but
+ @ one has to watch for H2 (which is narrower than H0) and 5*H4
+ @ not being wider than 58 bits, so that result of right shift
+ @ by 26 bits fits in 32 bits. This is also useful on x86,
+ @ because it allows to use paddd in place for paddq, which
+ @ benefits Atom, where paddq is ridiculously slow.
+
+ vshr.u64 $T0,$D3,#26
+ vmovn.i64 $D3#lo,$D3
+ vshr.u64 $T1,$D0,#26
+ vmovn.i64 $D0#lo,$D0
+ vadd.i64 $D4,$D4,$T0 @ h3 -> h4
+ vbic.i32 $D3#lo,#0xfc000000 @ &=0x03ffffff
+ vadd.i64 $D1,$D1,$T1 @ h0 -> h1
+ vbic.i32 $D0#lo,#0xfc000000
+
+ vshrn.u64 $T0#lo,$D4,#26
+ vmovn.i64 $D4#lo,$D4
+ vshr.u64 $T1,$D1,#26
+ vmovn.i64 $D1#lo,$D1
+ vadd.i64 $D2,$D2,$T1 @ h1 -> h2
+ vbic.i32 $D4#lo,#0xfc000000
+ vbic.i32 $D1#lo,#0xfc000000
+
+ vadd.i32 $D0#lo,$D0#lo,$T0#lo
+ vshl.u32 $T0#lo,$T0#lo,#2
+ vshrn.u64 $T1#lo,$D2,#26
+ vmovn.i64 $D2#lo,$D2
+ vadd.i32 $D0#lo,$D0#lo,$T0#lo @ h4 -> h0
+ vadd.i32 $D3#lo,$D3#lo,$T1#lo @ h2 -> h3
+ vbic.i32 $D2#lo,#0xfc000000
+
+ vshr.u32 $T0#lo,$D0#lo,#26
+ vbic.i32 $D0#lo,#0xfc000000
+ vshr.u32 $T1#lo,$D3#lo,#26
+ vbic.i32 $D3#lo,#0xfc000000
+ vadd.i32 $D1#lo,$D1#lo,$T0#lo @ h0 -> h1
+ vadd.i32 $D4#lo,$D4#lo,$T1#lo @ h3 -> h4
+
+ subs $zeros,$zeros,#1
+ beq .Lsquare_break_neon
+
+ add $tbl0,$ctx,#(48+0*9*4)
+ add $tbl1,$ctx,#(48+1*9*4)
+
+ vtrn.32 $R0,$D0#lo @ r^2:r^1
+ vtrn.32 $R2,$D2#lo
+ vtrn.32 $R3,$D3#lo
+ vtrn.32 $R1,$D1#lo
+ vtrn.32 $R4,$D4#lo
+
+ vshl.u32 $S2,$R2,#2 @ *5
+ vshl.u32 $S3,$R3,#2
+ vshl.u32 $S1,$R1,#2
+ vshl.u32 $S4,$R4,#2
+ vadd.i32 $S2,$S2,$R2
+ vadd.i32 $S1,$S1,$R1
+ vadd.i32 $S3,$S3,$R3
+ vadd.i32 $S4,$S4,$R4
+
+ vst4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!
+ vst4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!
+ vst4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
+ vst4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
+ vst1.32 {${S4}[0]},[$tbl0,:32]
+ vst1.32 {${S4}[1]},[$tbl1,:32]
+
+ b .Lsquare_neon
+
+.align 4
+.Lsquare_break_neon:
+ add $tbl0,$ctx,#(48+2*4*9)
+ add $tbl1,$ctx,#(48+3*4*9)
+
+ vmov $R0,$D0#lo @ r^4:r^3
+ vshl.u32 $S1,$D1#lo,#2 @ *5
+ vmov $R1,$D1#lo
+ vshl.u32 $S2,$D2#lo,#2
+ vmov $R2,$D2#lo
+ vshl.u32 $S3,$D3#lo,#2
+ vmov $R3,$D3#lo
+ vshl.u32 $S4,$D4#lo,#2
+ vmov $R4,$D4#lo
+ vadd.i32 $S1,$S1,$D1#lo
+ vadd.i32 $S2,$S2,$D2#lo
+ vadd.i32 $S3,$S3,$D3#lo
+ vadd.i32 $S4,$S4,$D4#lo
+
+ vst4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!
+ vst4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!
+ vst4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
+ vst4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
+ vst1.32 {${S4}[0]},[$tbl0]
+ vst1.32 {${S4}[1]},[$tbl1]
+
+ ret @ bx lr
+.size poly1305_init_neon,.-poly1305_init_neon
+
+.type poly1305_blocks_neon,%function
+.align 5
+poly1305_blocks_neon:
+ ldr ip,[$ctx,#36] @ is_base2_26
+ ands $len,$len,#-16
+ beq .Lno_data_neon
+
+ cmp $len,#64
+ bhs .Lenter_neon
+ tst ip,ip @ is_base2_26?
+ beq .Lpoly1305_blocks
+
+.Lenter_neon:
+ stmdb sp!,{r4-r7}
+ vstmdb sp!,{d8-d15} @ ABI specification says so
+
+ tst ip,ip @ is_base2_26?
+ bne .Lbase2_26_neon
+
+ stmdb sp!,{r1-r3,lr}
+ bl .Lpoly1305_init_neon
+
+ ldr r4,[$ctx,#0] @ load hash value base 2^32
+ ldr r5,[$ctx,#4]
+ ldr r6,[$ctx,#8]
+ ldr r7,[$ctx,#12]
+ ldr ip,[$ctx,#16]
+
+ and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26
+ mov r3,r4,lsr#26
+ veor $D0#lo,$D0#lo,$D0#lo
+ mov r4,r5,lsr#20
+ orr r3,r3,r5,lsl#6
+ veor $D1#lo,$D1#lo,$D1#lo
+ mov r5,r6,lsr#14
+ orr r4,r4,r6,lsl#12
+ veor $D2#lo,$D2#lo,$D2#lo
+ mov r6,r7,lsr#8
+ orr r5,r5,r7,lsl#18
+ veor $D3#lo,$D3#lo,$D3#lo
+ and r3,r3,#0x03ffffff
+ orr r6,r6,ip,lsl#24
+ veor $D4#lo,$D4#lo,$D4#lo
+ and r4,r4,#0x03ffffff
+ mov r1,#1
+ and r5,r5,#0x03ffffff
+ str r1,[$ctx,#36] @ is_base2_26
+
+ vmov.32 $D0#lo[0],r2
+ vmov.32 $D1#lo[0],r3
+ vmov.32 $D2#lo[0],r4
+ vmov.32 $D3#lo[0],r5
+ vmov.32 $D4#lo[0],r6
+ adr $zeros,.Lzeros
+
+ ldmia sp!,{r1-r3,lr}
+ b .Lbase2_32_neon
+
+.align 4
+.Lbase2_26_neon:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ load hash value
+
+ veor $D0#lo,$D0#lo,$D0#lo
+ veor $D1#lo,$D1#lo,$D1#lo
+ veor $D2#lo,$D2#lo,$D2#lo
+ veor $D3#lo,$D3#lo,$D3#lo
+ veor $D4#lo,$D4#lo,$D4#lo
+ vld4.32 {$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]!
+ adr $zeros,.Lzeros
+ vld1.32 {$D4#lo[0]},[$ctx]
+ sub $ctx,$ctx,#16 @ rewind
+
+.Lbase2_32_neon:
+ add $in2,$inp,#32
+ mov $padbit,$padbit,lsl#24
+ tst $len,#31
+ beq .Leven
+
+ vld4.32 {$H0#lo[0],$H1#lo[0],$H2#lo[0],$H3#lo[0]},[$inp]!
+ vmov.32 $H4#lo[0],$padbit
+ sub $len,$len,#16
+ add $in2,$inp,#32
+
+# ifdef __ARMEB__
+ vrev32.8 $H0,$H0
+ vrev32.8 $H3,$H3
+ vrev32.8 $H1,$H1
+ vrev32.8 $H2,$H2
+# endif
+ vsri.u32 $H4#lo,$H3#lo,#8 @ base 2^32 -> base 2^26
+ vshl.u32 $H3#lo,$H3#lo,#18
+
+ vsri.u32 $H3#lo,$H2#lo,#14
+ vshl.u32 $H2#lo,$H2#lo,#12
+ vadd.i32 $H4#hi,$H4#lo,$D4#lo @ add hash value and move to #hi
+
+ vbic.i32 $H3#lo,#0xfc000000
+ vsri.u32 $H2#lo,$H1#lo,#20
+ vshl.u32 $H1#lo,$H1#lo,#6
+
+ vbic.i32 $H2#lo,#0xfc000000
+ vsri.u32 $H1#lo,$H0#lo,#26
+ vadd.i32 $H3#hi,$H3#lo,$D3#lo
+
+ vbic.i32 $H0#lo,#0xfc000000
+ vbic.i32 $H1#lo,#0xfc000000
+ vadd.i32 $H2#hi,$H2#lo,$D2#lo
+
+ vadd.i32 $H0#hi,$H0#lo,$D0#lo
+ vadd.i32 $H1#hi,$H1#lo,$D1#lo
+
+ mov $tbl1,$zeros
+ add $tbl0,$ctx,#48
+
+ cmp $len,$len
+ b .Long_tail
+
+.align 4
+.Leven:
+ subs $len,$len,#64
+ it lo
+ movlo $in2,$zeros
+
+ vmov.i32 $H4,#1<<24 @ padbit, yes, always
+ vld4.32 {$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp] @ inp[0:1]
+ add $inp,$inp,#64
+ vld4.32 {$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2] @ inp[2:3] (or 0)
+ add $in2,$in2,#64
+ itt hi
+ addhi $tbl1,$ctx,#(48+1*9*4)
+ addhi $tbl0,$ctx,#(48+3*9*4)
+
+# ifdef __ARMEB__
+ vrev32.8 $H0,$H0
+ vrev32.8 $H3,$H3
+ vrev32.8 $H1,$H1
+ vrev32.8 $H2,$H2
+# endif
+ vsri.u32 $H4,$H3,#8 @ base 2^32 -> base 2^26
+ vshl.u32 $H3,$H3,#18
+
+ vsri.u32 $H3,$H2,#14
+ vshl.u32 $H2,$H2,#12
+
+ vbic.i32 $H3,#0xfc000000
+ vsri.u32 $H2,$H1,#20
+ vshl.u32 $H1,$H1,#6
+
+ vbic.i32 $H2,#0xfc000000
+ vsri.u32 $H1,$H0,#26
+
+ vbic.i32 $H0,#0xfc000000
+ vbic.i32 $H1,#0xfc000000
+
+ bls .Lskip_loop
+
+ vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^2
+ vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^4
+ vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
+ vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
+ b .Loop_neon
+
+.align 5
+.Loop_neon:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
+ @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
+ @ \___________________/
+ @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
+ @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
+ @ \___________________/ \____________________/
+ @
+ @ Note that we start with inp[2:3]*r^2. This is because it
+ @ doesn't depend on reduction in previous iteration.
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
+ @ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
+ @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
+ @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
+ @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
+
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ inp[2:3]*r^2
+
+ vadd.i32 $H2#lo,$H2#lo,$D2#lo @ accumulate inp[0:1]
+ vmull.u32 $D2,$H2#hi,${R0}[1]
+ vadd.i32 $H0#lo,$H0#lo,$D0#lo
+ vmull.u32 $D0,$H0#hi,${R0}[1]
+ vadd.i32 $H3#lo,$H3#lo,$D3#lo
+ vmull.u32 $D3,$H3#hi,${R0}[1]
+ vmlal.u32 $D2,$H1#hi,${R1}[1]
+ vadd.i32 $H1#lo,$H1#lo,$D1#lo
+ vmull.u32 $D1,$H1#hi,${R0}[1]
+
+ vadd.i32 $H4#lo,$H4#lo,$D4#lo
+ vmull.u32 $D4,$H4#hi,${R0}[1]
+ subs $len,$len,#64
+ vmlal.u32 $D0,$H4#hi,${S1}[1]
+ it lo
+ movlo $in2,$zeros
+ vmlal.u32 $D3,$H2#hi,${R1}[1]
+ vld1.32 ${S4}[1],[$tbl1,:32]
+ vmlal.u32 $D1,$H0#hi,${R1}[1]
+ vmlal.u32 $D4,$H3#hi,${R1}[1]
+
+ vmlal.u32 $D0,$H3#hi,${S2}[1]
+ vmlal.u32 $D3,$H1#hi,${R2}[1]
+ vmlal.u32 $D4,$H2#hi,${R2}[1]
+ vmlal.u32 $D1,$H4#hi,${S2}[1]
+ vmlal.u32 $D2,$H0#hi,${R2}[1]
+
+ vmlal.u32 $D3,$H0#hi,${R3}[1]
+ vmlal.u32 $D0,$H2#hi,${S3}[1]
+ vmlal.u32 $D4,$H1#hi,${R3}[1]
+ vmlal.u32 $D1,$H3#hi,${S3}[1]
+ vmlal.u32 $D2,$H4#hi,${S3}[1]
+
+ vmlal.u32 $D3,$H4#hi,${S4}[1]
+ vmlal.u32 $D0,$H1#hi,${S4}[1]
+ vmlal.u32 $D4,$H0#hi,${R4}[1]
+ vmlal.u32 $D1,$H2#hi,${S4}[1]
+ vmlal.u32 $D2,$H3#hi,${S4}[1]
+
+ vld4.32 {$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2] @ inp[2:3] (or 0)
+ add $in2,$in2,#64
+
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ (hash+inp[0:1])*r^4 and accumulate
+
+ vmlal.u32 $D3,$H3#lo,${R0}[0]
+ vmlal.u32 $D0,$H0#lo,${R0}[0]
+ vmlal.u32 $D4,$H4#lo,${R0}[0]
+ vmlal.u32 $D1,$H1#lo,${R0}[0]
+ vmlal.u32 $D2,$H2#lo,${R0}[0]
+ vld1.32 ${S4}[0],[$tbl0,:32]
+
+ vmlal.u32 $D3,$H2#lo,${R1}[0]
+ vmlal.u32 $D0,$H4#lo,${S1}[0]
+ vmlal.u32 $D4,$H3#lo,${R1}[0]
+ vmlal.u32 $D1,$H0#lo,${R1}[0]
+ vmlal.u32 $D2,$H1#lo,${R1}[0]
+
+ vmlal.u32 $D3,$H1#lo,${R2}[0]
+ vmlal.u32 $D0,$H3#lo,${S2}[0]
+ vmlal.u32 $D4,$H2#lo,${R2}[0]
+ vmlal.u32 $D1,$H4#lo,${S2}[0]
+ vmlal.u32 $D2,$H0#lo,${R2}[0]
+
+ vmlal.u32 $D3,$H0#lo,${R3}[0]
+ vmlal.u32 $D0,$H2#lo,${S3}[0]
+ vmlal.u32 $D4,$H1#lo,${R3}[0]
+ vmlal.u32 $D1,$H3#lo,${S3}[0]
+ vmlal.u32 $D3,$H4#lo,${S4}[0]
+
+ vmlal.u32 $D2,$H4#lo,${S3}[0]
+ vmlal.u32 $D0,$H1#lo,${S4}[0]
+ vmlal.u32 $D4,$H0#lo,${R4}[0]
+ vmov.i32 $H4,#1<<24 @ padbit, yes, always
+ vmlal.u32 $D1,$H2#lo,${S4}[0]
+ vmlal.u32 $D2,$H3#lo,${S4}[0]
+
+ vld4.32 {$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp] @ inp[0:1]
+ add $inp,$inp,#64
+# ifdef __ARMEB__
+ vrev32.8 $H0,$H0
+ vrev32.8 $H1,$H1
+ vrev32.8 $H2,$H2
+ vrev32.8 $H3,$H3
+# endif
+
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ lazy reduction interleaved with base 2^32 -> base 2^26 of
+ @ inp[0:3] previously loaded to $H0-$H3 and smashed to $H0-$H4.
+
+ vshr.u64 $T0,$D3,#26
+ vmovn.i64 $D3#lo,$D3
+ vshr.u64 $T1,$D0,#26
+ vmovn.i64 $D0#lo,$D0
+ vadd.i64 $D4,$D4,$T0 @ h3 -> h4
+ vbic.i32 $D3#lo,#0xfc000000
+ vsri.u32 $H4,$H3,#8 @ base 2^32 -> base 2^26
+ vadd.i64 $D1,$D1,$T1 @ h0 -> h1
+ vshl.u32 $H3,$H3,#18
+ vbic.i32 $D0#lo,#0xfc000000
+
+ vshrn.u64 $T0#lo,$D4,#26
+ vmovn.i64 $D4#lo,$D4
+ vshr.u64 $T1,$D1,#26
+ vmovn.i64 $D1#lo,$D1
+ vadd.i64 $D2,$D2,$T1 @ h1 -> h2
+ vsri.u32 $H3,$H2,#14
+ vbic.i32 $D4#lo,#0xfc000000
+ vshl.u32 $H2,$H2,#12
+ vbic.i32 $D1#lo,#0xfc000000
+
+ vadd.i32 $D0#lo,$D0#lo,$T0#lo
+ vshl.u32 $T0#lo,$T0#lo,#2
+ vbic.i32 $H3,#0xfc000000
+ vshrn.u64 $T1#lo,$D2,#26
+ vmovn.i64 $D2#lo,$D2
+ vaddl.u32 $D0,$D0#lo,$T0#lo @ h4 -> h0 [widen for a sec]
+ vsri.u32 $H2,$H1,#20
+ vadd.i32 $D3#lo,$D3#lo,$T1#lo @ h2 -> h3
+ vshl.u32 $H1,$H1,#6
+ vbic.i32 $D2#lo,#0xfc000000
+ vbic.i32 $H2,#0xfc000000
+
+ vshrn.u64 $T0#lo,$D0,#26 @ re-narrow
+ vmovn.i64 $D0#lo,$D0
+ vsri.u32 $H1,$H0,#26
+ vbic.i32 $H0,#0xfc000000
+ vshr.u32 $T1#lo,$D3#lo,#26
+ vbic.i32 $D3#lo,#0xfc000000
+ vbic.i32 $D0#lo,#0xfc000000
+ vadd.i32 $D1#lo,$D1#lo,$T0#lo @ h0 -> h1
+ vadd.i32 $D4#lo,$D4#lo,$T1#lo @ h3 -> h4
+ vbic.i32 $H1,#0xfc000000
+
+ bhi .Loop_neon
+
+.Lskip_loop:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
+
+ add $tbl1,$ctx,#(48+0*9*4)
+ add $tbl0,$ctx,#(48+1*9*4)
+ adds $len,$len,#32
+ it ne
+ movne $len,#0
+ bne .Long_tail
+
+ vadd.i32 $H2#hi,$H2#lo,$D2#lo @ add hash value and move to #hi
+ vadd.i32 $H0#hi,$H0#lo,$D0#lo
+ vadd.i32 $H3#hi,$H3#lo,$D3#lo
+ vadd.i32 $H1#hi,$H1#lo,$D1#lo
+ vadd.i32 $H4#hi,$H4#lo,$D4#lo
+
+.Long_tail:
+ vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^1
+ vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^2
+
+ vadd.i32 $H2#lo,$H2#lo,$D2#lo @ can be redundant
+ vmull.u32 $D2,$H2#hi,$R0
+ vadd.i32 $H0#lo,$H0#lo,$D0#lo
+ vmull.u32 $D0,$H0#hi,$R0
+ vadd.i32 $H3#lo,$H3#lo,$D3#lo
+ vmull.u32 $D3,$H3#hi,$R0
+ vadd.i32 $H1#lo,$H1#lo,$D1#lo
+ vmull.u32 $D1,$H1#hi,$R0
+ vadd.i32 $H4#lo,$H4#lo,$D4#lo
+ vmull.u32 $D4,$H4#hi,$R0
+
+ vmlal.u32 $D0,$H4#hi,$S1
+ vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
+ vmlal.u32 $D3,$H2#hi,$R1
+ vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
+ vmlal.u32 $D1,$H0#hi,$R1
+ vmlal.u32 $D4,$H3#hi,$R1
+ vmlal.u32 $D2,$H1#hi,$R1
+
+ vmlal.u32 $D3,$H1#hi,$R2
+ vld1.32 ${S4}[1],[$tbl1,:32]
+ vmlal.u32 $D0,$H3#hi,$S2
+ vld1.32 ${S4}[0],[$tbl0,:32]
+ vmlal.u32 $D4,$H2#hi,$R2
+ vmlal.u32 $D1,$H4#hi,$S2
+ vmlal.u32 $D2,$H0#hi,$R2
+
+ vmlal.u32 $D3,$H0#hi,$R3
+ it ne
+ addne $tbl1,$ctx,#(48+2*9*4)
+ vmlal.u32 $D0,$H2#hi,$S3
+ it ne
+ addne $tbl0,$ctx,#(48+3*9*4)
+ vmlal.u32 $D4,$H1#hi,$R3
+ vmlal.u32 $D1,$H3#hi,$S3
+ vmlal.u32 $D2,$H4#hi,$S3
+
+ vmlal.u32 $D3,$H4#hi,$S4
+ vorn $MASK,$MASK,$MASK @ all-ones, can be redundant
+ vmlal.u32 $D0,$H1#hi,$S4
+ vshr.u64 $MASK,$MASK,#38
+ vmlal.u32 $D4,$H0#hi,$R4
+ vmlal.u32 $D1,$H2#hi,$S4
+ vmlal.u32 $D2,$H3#hi,$S4
+
+ beq .Lshort_tail
+
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ (hash+inp[0:1])*r^4:r^3 and accumulate
+
+ vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^3
+ vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^4
+
+ vmlal.u32 $D2,$H2#lo,$R0
+ vmlal.u32 $D0,$H0#lo,$R0
+ vmlal.u32 $D3,$H3#lo,$R0
+ vmlal.u32 $D1,$H1#lo,$R0
+ vmlal.u32 $D4,$H4#lo,$R0
+
+ vmlal.u32 $D0,$H4#lo,$S1
+ vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
+ vmlal.u32 $D3,$H2#lo,$R1
+ vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
+ vmlal.u32 $D1,$H0#lo,$R1
+ vmlal.u32 $D4,$H3#lo,$R1
+ vmlal.u32 $D2,$H1#lo,$R1
+
+ vmlal.u32 $D3,$H1#lo,$R2
+ vld1.32 ${S4}[1],[$tbl1,:32]
+ vmlal.u32 $D0,$H3#lo,$S2
+ vld1.32 ${S4}[0],[$tbl0,:32]
+ vmlal.u32 $D4,$H2#lo,$R2
+ vmlal.u32 $D1,$H4#lo,$S2
+ vmlal.u32 $D2,$H0#lo,$R2
+
+ vmlal.u32 $D3,$H0#lo,$R3
+ vmlal.u32 $D0,$H2#lo,$S3
+ vmlal.u32 $D4,$H1#lo,$R3
+ vmlal.u32 $D1,$H3#lo,$S3
+ vmlal.u32 $D2,$H4#lo,$S3
+
+ vmlal.u32 $D3,$H4#lo,$S4
+ vorn $MASK,$MASK,$MASK @ all-ones
+ vmlal.u32 $D0,$H1#lo,$S4
+ vshr.u64 $MASK,$MASK,#38
+ vmlal.u32 $D4,$H0#lo,$R4
+ vmlal.u32 $D1,$H2#lo,$S4
+ vmlal.u32 $D2,$H3#lo,$S4
+
+.Lshort_tail:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ horizontal addition
+
+ vadd.i64 $D3#lo,$D3#lo,$D3#hi
+ vadd.i64 $D0#lo,$D0#lo,$D0#hi
+ vadd.i64 $D4#lo,$D4#lo,$D4#hi
+ vadd.i64 $D1#lo,$D1#lo,$D1#hi
+ vadd.i64 $D2#lo,$D2#lo,$D2#hi
+
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ lazy reduction, but without narrowing
+
+ vshr.u64 $T0,$D3,#26
+ vand.i64 $D3,$D3,$MASK
+ vshr.u64 $T1,$D0,#26
+ vand.i64 $D0,$D0,$MASK
+ vadd.i64 $D4,$D4,$T0 @ h3 -> h4
+ vadd.i64 $D1,$D1,$T1 @ h0 -> h1
+
+ vshr.u64 $T0,$D4,#26
+ vand.i64 $D4,$D4,$MASK
+ vshr.u64 $T1,$D1,#26
+ vand.i64 $D1,$D1,$MASK
+ vadd.i64 $D2,$D2,$T1 @ h1 -> h2
+
+ vadd.i64 $D0,$D0,$T0
+ vshl.u64 $T0,$T0,#2
+ vshr.u64 $T1,$D2,#26
+ vand.i64 $D2,$D2,$MASK
+ vadd.i64 $D0,$D0,$T0 @ h4 -> h0
+ vadd.i64 $D3,$D3,$T1 @ h2 -> h3
+
+ vshr.u64 $T0,$D0,#26
+ vand.i64 $D0,$D0,$MASK
+ vshr.u64 $T1,$D3,#26
+ vand.i64 $D3,$D3,$MASK
+ vadd.i64 $D1,$D1,$T0 @ h0 -> h1
+ vadd.i64 $D4,$D4,$T1 @ h3 -> h4
+
+ cmp $len,#0
+ bne .Leven
+
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ store hash value
+
+ vst4.32 {$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]!
+ vst1.32 {$D4#lo[0]},[$ctx]
+
+ vldmia sp!,{d8-d15} @ epilogue
+ ldmia sp!,{r4-r7}
+.Lno_data_neon:
+ ret @ bx lr
+.size poly1305_blocks_neon,.-poly1305_blocks_neon
+
+.type poly1305_emit_neon,%function
+.align 5
+poly1305_emit_neon:
+ ldr ip,[$ctx,#36] @ is_base2_26
+
+ stmdb sp!,{r4-r11}
+
+ tst ip,ip
+ beq .Lpoly1305_emit_enter
+
+ ldmia $ctx,{$h0-$h4}
+ eor $g0,$g0,$g0
+
+ adds $h0,$h0,$h1,lsl#26 @ base 2^26 -> base 2^32
+ mov $h1,$h1,lsr#6
+ adcs $h1,$h1,$h2,lsl#20
+ mov $h2,$h2,lsr#12
+ adcs $h2,$h2,$h3,lsl#14
+ mov $h3,$h3,lsr#18
+ adcs $h3,$h3,$h4,lsl#8
+ adc $h4,$g0,$h4,lsr#24 @ can be partially reduced ...
+
+ and $g0,$h4,#-4 @ ... so reduce
+ and $h4,$h3,#3
+ add $g0,$g0,$g0,lsr#2 @ *= 5
+ adds $h0,$h0,$g0
+ adcs $h1,$h1,#0
+ adcs $h2,$h2,#0
+ adcs $h3,$h3,#0
+ adc $h4,$h4,#0
+
+ adds $g0,$h0,#5 @ compare to modulus
+ adcs $g1,$h1,#0
+ adcs $g2,$h2,#0
+ adcs $g3,$h3,#0
+ adc $g4,$h4,#0
+ tst $g4,#4 @ did it carry/borrow?
+
+ it ne
+ movne $h0,$g0
+ ldr $g0,[$nonce,#0]
+ it ne
+ movne $h1,$g1
+ ldr $g1,[$nonce,#4]
+ it ne
+ movne $h2,$g2
+ ldr $g2,[$nonce,#8]
+ it ne
+ movne $h3,$g3
+ ldr $g3,[$nonce,#12]
+
+ adds $h0,$h0,$g0 @ accumulate nonce
+ adcs $h1,$h1,$g1
+ adcs $h2,$h2,$g2
+ adc $h3,$h3,$g3
+
+# ifdef __ARMEB__
+ rev $h0,$h0
+ rev $h1,$h1
+ rev $h2,$h2
+ rev $h3,$h3
+# endif
+ str $h0,[$mac,#0] @ store the result
+ str $h1,[$mac,#4]
+ str $h2,[$mac,#8]
+ str $h3,[$mac,#12]
+
+ ldmia sp!,{r4-r11}
+ ret @ bx lr
+.size poly1305_emit_neon,.-poly1305_emit_neon
+
+.align 5
+.Lzeros:
+.long 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+# ifndef __KERNEL__
+.LOPENSSL_armcap:
+.word OPENSSL_armcap_P-.Lpoly1305_init
+# endif
+#endif
+___
+} }
+$code.=<<___;
+.align 2
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+.comm OPENSSL_armcap_P,4,4
+#endif
+___
+
+open SELF,$0;
+while(<SELF>) {
+ next if (/^#!/);
+ last if (!s/^#/@/ and !/^$/);
+ print;
+}
+close SELF;
+
+foreach (split("\n",$code)) {
+ s/\`([^\`]*)\`/eval $1/geo;
+
+ s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or
+ s/\bret\b/bx lr/go or
+ s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4
+
+ print $_,"\n";
+}
+close STDOUT; # enforce flush
diff --git a/src/crypto/zinc/poly1305/poly1305-arm64.S b/src/crypto/zinc/poly1305/poly1305-arm64.S
deleted file mode 100644
index 5f4e7fb..0000000
--- a/src/crypto/zinc/poly1305/poly1305-arm64.S
+++ /dev/null
@@ -1,824 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */
-/*
- * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
- * Copyright (C) 2006-2017 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved.
- *
- * This is based in part on Andy Polyakov's implementation from CRYPTOGAMS.
- */
-
-#include <linux/linkage.h>
-.text
-
-.align 5
-ENTRY(poly1305_init_arm)
- cmp x1,xzr
- stp xzr,xzr,[x0] // zero hash value
- stp xzr,xzr,[x0,#16] // [along with is_base2_26]
-
- csel x0,xzr,x0,eq
- b.eq .Lno_key
-
- ldp x7,x8,[x1] // load key
- mov x9,#0xfffffffc0fffffff
- movk x9,#0x0fff,lsl#48
-#ifdef __AARCH64EB__
- rev x7,x7 // flip bytes
- rev x8,x8
-#endif
- and x7,x7,x9 // &=0ffffffc0fffffff
- and x9,x9,#-4
- and x8,x8,x9 // &=0ffffffc0ffffffc
- stp x7,x8,[x0,#32] // save key value
-
-.Lno_key:
- ret
-ENDPROC(poly1305_init_arm)
-
-.align 5
-ENTRY(poly1305_blocks_arm)
- ands x2,x2,#-16
- b.eq .Lno_data
-
- ldp x4,x5,[x0] // load hash value
- ldp x7,x8,[x0,#32] // load key value
- ldr x6,[x0,#16]
- add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)
- b .Loop
-
-.align 5
-.Loop:
- ldp x10,x11,[x1],#16 // load input
- sub x2,x2,#16
-#ifdef __AARCH64EB__
- rev x10,x10
- rev x11,x11
-#endif
- adds x4,x4,x10 // accumulate input
- adcs x5,x5,x11
-
- mul x12,x4,x7 // h0*r0
- adc x6,x6,x3
- umulh x13,x4,x7
-
- mul x10,x5,x9 // h1*5*r1
- umulh x11,x5,x9
-
- adds x12,x12,x10
- mul x10,x4,x8 // h0*r1
- adc x13,x13,x11
- umulh x14,x4,x8
-
- adds x13,x13,x10
- mul x10,x5,x7 // h1*r0
- adc x14,x14,xzr
- umulh x11,x5,x7
-
- adds x13,x13,x10
- mul x10,x6,x9 // h2*5*r1
- adc x14,x14,x11
- mul x11,x6,x7 // h2*r0
-
- adds x13,x13,x10
- adc x14,x14,x11
-
- and x10,x14,#-4 // final reduction
- and x6,x14,#3
- add x10,x10,x14,lsr#2
- adds x4,x12,x10
- adcs x5,x13,xzr
- adc x6,x6,xzr
-
- cbnz x2,.Loop
-
- stp x4,x5,[x0] // store hash value
- str x6,[x0,#16]
-
-.Lno_data:
- ret
-ENDPROC(poly1305_blocks_arm)
-
-.align 5
-ENTRY(poly1305_emit_arm)
- ldp x4,x5,[x0] // load hash base 2^64
- ldr x6,[x0,#16]
- ldp x10,x11,[x2] // load nonce
-
- adds x12,x4,#5 // compare to modulus
- adcs x13,x5,xzr
- adc x14,x6,xzr
-
- tst x14,#-4 // see if it's carried/borrowed
-
- csel x4,x4,x12,eq
- csel x5,x5,x13,eq
-
-#ifdef __AARCH64EB__
- ror x10,x10,#32 // flip nonce words
- ror x11,x11,#32
-#endif
- adds x4,x4,x10 // accumulate nonce
- adc x5,x5,x11
-#ifdef __AARCH64EB__
- rev x4,x4 // flip output bytes
- rev x5,x5
-#endif
- stp x4,x5,[x1] // write result
-
- ret
-ENDPROC(poly1305_emit_arm)
-
-.align 5
-__poly1305_mult:
- mul x12,x4,x7 // h0*r0
- umulh x13,x4,x7
-
- mul x10,x5,x9 // h1*5*r1
- umulh x11,x5,x9
-
- adds x12,x12,x10
- mul x10,x4,x8 // h0*r1
- adc x13,x13,x11
- umulh x14,x4,x8
-
- adds x13,x13,x10
- mul x10,x5,x7 // h1*r0
- adc x14,x14,xzr
- umulh x11,x5,x7
-
- adds x13,x13,x10
- mul x10,x6,x9 // h2*5*r1
- adc x14,x14,x11
- mul x11,x6,x7 // h2*r0
-
- adds x13,x13,x10
- adc x14,x14,x11
-
- and x10,x14,#-4 // final reduction
- and x6,x14,#3
- add x10,x10,x14,lsr#2
- adds x4,x12,x10
- adcs x5,x13,xzr
- adc x6,x6,xzr
-
- ret
-
-__poly1305_splat:
- and x12,x4,#0x03ffffff // base 2^64 -> base 2^26
- ubfx x13,x4,#26,#26
- extr x14,x5,x4,#52
- and x14,x14,#0x03ffffff
- ubfx x15,x5,#14,#26
- extr x16,x6,x5,#40
-
- str w12,[x0,#16*0] // r0
- add w12,w13,w13,lsl#2 // r1*5
- str w13,[x0,#16*1] // r1
- add w13,w14,w14,lsl#2 // r2*5
- str w12,[x0,#16*2] // s1
- str w14,[x0,#16*3] // r2
- add w14,w15,w15,lsl#2 // r3*5
- str w13,[x0,#16*4] // s2
- str w15,[x0,#16*5] // r3
- add w15,w16,w16,lsl#2 // r4*5
- str w14,[x0,#16*6] // s3
- str w16,[x0,#16*7] // r4
- str w15,[x0,#16*8] // s4
-
- ret
-
-#ifdef CONFIG_KERNEL_MODE_NEON
-.align 5
-ENTRY(poly1305_blocks_neon)
- ldr x17,[x0,#24]
- cmp x2,#128
- b.hs .Lblocks_neon
- cbz x17,poly1305_blocks_arm
-
-.Lblocks_neon:
- stp x29,x30,[sp,#-80]!
- add x29,sp,#0
-
- ands x2,x2,#-16
- b.eq .Lno_data_neon
-
- cbz x17,.Lbase2_64_neon
-
- ldp w10,w11,[x0] // load hash value base 2^26
- ldp w12,w13,[x0,#8]
- ldr w14,[x0,#16]
-
- tst x2,#31
- b.eq .Leven_neon
-
- ldp x7,x8,[x0,#32] // load key value
-
- add x4,x10,x11,lsl#26 // base 2^26 -> base 2^64
- lsr x5,x12,#12
- adds x4,x4,x12,lsl#52
- add x5,x5,x13,lsl#14
- adc x5,x5,xzr
- lsr x6,x14,#24
- adds x5,x5,x14,lsl#40
- adc x14,x6,xzr // can be partially reduced...
-
- ldp x12,x13,[x1],#16 // load input
- sub x2,x2,#16
- add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)
-
- and x10,x14,#-4 // ... so reduce
- and x6,x14,#3
- add x10,x10,x14,lsr#2
- adds x4,x4,x10
- adcs x5,x5,xzr
- adc x6,x6,xzr
-
-#ifdef __AARCH64EB__
- rev x12,x12
- rev x13,x13
-#endif
- adds x4,x4,x12 // accumulate input
- adcs x5,x5,x13
- adc x6,x6,x3
-
- bl __poly1305_mult
- ldr x30,[sp,#8]
-
- cbz x3,.Lstore_base2_64_neon
-
- and x10,x4,#0x03ffffff // base 2^64 -> base 2^26
- ubfx x11,x4,#26,#26
- extr x12,x5,x4,#52
- and x12,x12,#0x03ffffff
- ubfx x13,x5,#14,#26
- extr x14,x6,x5,#40
-
- cbnz x2,.Leven_neon
-
- stp w10,w11,[x0] // store hash value base 2^26
- stp w12,w13,[x0,#8]
- str w14,[x0,#16]
- b .Lno_data_neon
-
-.align 4
-.Lstore_base2_64_neon:
- stp x4,x5,[x0] // store hash value base 2^64
- stp x6,xzr,[x0,#16] // note that is_base2_26 is zeroed
- b .Lno_data_neon
-
-.align 4
-.Lbase2_64_neon:
- ldp x7,x8,[x0,#32] // load key value
-
- ldp x4,x5,[x0] // load hash value base 2^64
- ldr x6,[x0,#16]
-
- tst x2,#31
- b.eq .Linit_neon
-
- ldp x12,x13,[x1],#16 // load input
- sub x2,x2,#16
- add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)
-#ifdef __AARCH64EB__
- rev x12,x12
- rev x13,x13
-#endif
- adds x4,x4,x12 // accumulate input
- adcs x5,x5,x13
- adc x6,x6,x3
-
- bl __poly1305_mult
-
-.Linit_neon:
- and x10,x4,#0x03ffffff // base 2^64 -> base 2^26
- ubfx x11,x4,#26,#26
- extr x12,x5,x4,#52
- and x12,x12,#0x03ffffff
- ubfx x13,x5,#14,#26
- extr x14,x6,x5,#40
-
- stp d8,d9,[sp,#16] // meet ABI requirements
- stp d10,d11,[sp,#32]
- stp d12,d13,[sp,#48]
- stp d14,d15,[sp,#64]
-
- fmov d24,x10
- fmov d25,x11
- fmov d26,x12
- fmov d27,x13
- fmov d28,x14
-
- ////////////////////////////////// initialize r^n table
- mov x4,x7 // r^1
- add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)
- mov x5,x8
- mov x6,xzr
- add x0,x0,#48+12
- bl __poly1305_splat
-
- bl __poly1305_mult // r^2
- sub x0,x0,#4
- bl __poly1305_splat
-
- bl __poly1305_mult // r^3
- sub x0,x0,#4
- bl __poly1305_splat
-
- bl __poly1305_mult // r^4
- sub x0,x0,#4
- bl __poly1305_splat
- ldr x30,[sp,#8]
-
- add x16,x1,#32
- adr x17,.Lzeros
- subs x2,x2,#64
- csel x16,x17,x16,lo
-
- mov x4,#1
- str x4,[x0,#-24] // set is_base2_26
- sub x0,x0,#48 // restore original x0
- b .Ldo_neon
-
-.align 4
-.Leven_neon:
- add x16,x1,#32
- adr x17,.Lzeros
- subs x2,x2,#64
- csel x16,x17,x16,lo
-
- stp d8,d9,[sp,#16] // meet ABI requirements
- stp d10,d11,[sp,#32]
- stp d12,d13,[sp,#48]
- stp d14,d15,[sp,#64]
-
- fmov d24,x10
- fmov d25,x11
- fmov d26,x12
- fmov d27,x13
- fmov d28,x14
-
-.Ldo_neon:
- ldp x8,x12,[x16],#16 // inp[2:3] (or zero)
- ldp x9,x13,[x16],#48
-
- lsl x3,x3,#24
- add x15,x0,#48
-
-#ifdef __AARCH64EB__
- rev x8,x8
- rev x12,x12
- rev x9,x9
- rev x13,x13
-#endif
- and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
- and x5,x9,#0x03ffffff
- ubfx x6,x8,#26,#26
- ubfx x7,x9,#26,#26
- add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
- extr x8,x12,x8,#52
- extr x9,x13,x9,#52
- add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
- fmov d14,x4
- and x8,x8,#0x03ffffff
- and x9,x9,#0x03ffffff
- ubfx x10,x12,#14,#26
- ubfx x11,x13,#14,#26
- add x12,x3,x12,lsr#40
- add x13,x3,x13,lsr#40
- add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
- fmov d15,x6
- add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
- add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
- fmov d16,x8
- fmov d17,x10
- fmov d18,x12
-
- ldp x8,x12,[x1],#16 // inp[0:1]
- ldp x9,x13,[x1],#48
-
- ld1 {v0.4s,v1.4s,v2.4s,v3.4s},[x15],#64
- ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x15],#64
- ld1 {v8.4s},[x15]
-
-#ifdef __AARCH64EB__
- rev x8,x8
- rev x12,x12
- rev x9,x9
- rev x13,x13
-#endif
- and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
- and x5,x9,#0x03ffffff
- ubfx x6,x8,#26,#26
- ubfx x7,x9,#26,#26
- add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
- extr x8,x12,x8,#52
- extr x9,x13,x9,#52
- add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
- fmov d9,x4
- and x8,x8,#0x03ffffff
- and x9,x9,#0x03ffffff
- ubfx x10,x12,#14,#26
- ubfx x11,x13,#14,#26
- add x12,x3,x12,lsr#40
- add x13,x3,x13,lsr#40
- add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
- fmov d10,x6
- add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
- add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
- movi v31.2d,#-1
- fmov d11,x8
- fmov d12,x10
- fmov d13,x12
- ushr v31.2d,v31.2d,#38
-
- b.ls .Lskip_loop
-
-.align 4
-.Loop_neon:
- ////////////////////////////////////////////////////////////////
- // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
- // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
- // ___________________/
- // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
- // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
- // ___________________/ ____________________/
- //
- // Note that we start with inp[2:3]*r^2. This is because it
- // doesn't depend on reduction in previous iteration.
- ////////////////////////////////////////////////////////////////
- // d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0
- // d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*5*r4
- // d2 = h0*r2 + h1*r1 + h2*r0 + h3*5*r4 + h4*5*r3
- // d1 = h0*r1 + h1*r0 + h2*5*r4 + h3*5*r3 + h4*5*r2
- // d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1
-
- subs x2,x2,#64
- umull v23.2d,v14.2s,v7.s[2]
- csel x16,x17,x16,lo
- umull v22.2d,v14.2s,v5.s[2]
- umull v21.2d,v14.2s,v3.s[2]
- ldp x8,x12,[x16],#16 // inp[2:3] (or zero)
- umull v20.2d,v14.2s,v1.s[2]
- ldp x9,x13,[x16],#48
- umull v19.2d,v14.2s,v0.s[2]
-#ifdef __AARCH64EB__
- rev x8,x8
- rev x12,x12
- rev x9,x9
- rev x13,x13
-#endif
-
- umlal v23.2d,v15.2s,v5.s[2]
- and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
- umlal v22.2d,v15.2s,v3.s[2]
- and x5,x9,#0x03ffffff
- umlal v21.2d,v15.2s,v1.s[2]
- ubfx x6,x8,#26,#26
- umlal v20.2d,v15.2s,v0.s[2]
- ubfx x7,x9,#26,#26
- umlal v19.2d,v15.2s,v8.s[2]
- add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
-
- umlal v23.2d,v16.2s,v3.s[2]
- extr x8,x12,x8,#52
- umlal v22.2d,v16.2s,v1.s[2]
- extr x9,x13,x9,#52
- umlal v21.2d,v16.2s,v0.s[2]
- add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
- umlal v20.2d,v16.2s,v8.s[2]
- fmov d14,x4
- umlal v19.2d,v16.2s,v6.s[2]
- and x8,x8,#0x03ffffff
-
- umlal v23.2d,v17.2s,v1.s[2]
- and x9,x9,#0x03ffffff
- umlal v22.2d,v17.2s,v0.s[2]
- ubfx x10,x12,#14,#26
- umlal v21.2d,v17.2s,v8.s[2]
- ubfx x11,x13,#14,#26
- umlal v20.2d,v17.2s,v6.s[2]
- add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
- umlal v19.2d,v17.2s,v4.s[2]
- fmov d15,x6
-
- add v11.2s,v11.2s,v26.2s
- add x12,x3,x12,lsr#40
- umlal v23.2d,v18.2s,v0.s[2]
- add x13,x3,x13,lsr#40
- umlal v22.2d,v18.2s,v8.s[2]
- add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
- umlal v21.2d,v18.2s,v6.s[2]
- add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
- umlal v20.2d,v18.2s,v4.s[2]
- fmov d16,x8
- umlal v19.2d,v18.2s,v2.s[2]
- fmov d17,x10
-
- ////////////////////////////////////////////////////////////////
- // (hash+inp[0:1])*r^4 and accumulate
-
- add v9.2s,v9.2s,v24.2s
- fmov d18,x12
- umlal v22.2d,v11.2s,v1.s[0]
- ldp x8,x12,[x1],#16 // inp[0:1]
- umlal v19.2d,v11.2s,v6.s[0]
- ldp x9,x13,[x1],#48
- umlal v23.2d,v11.2s,v3.s[0]
- umlal v20.2d,v11.2s,v8.s[0]
- umlal v21.2d,v11.2s,v0.s[0]
-#ifdef __AARCH64EB__
- rev x8,x8
- rev x12,x12
- rev x9,x9
- rev x13,x13
-#endif
-
- add v10.2s,v10.2s,v25.2s
- umlal v22.2d,v9.2s,v5.s[0]
- umlal v23.2d,v9.2s,v7.s[0]
- and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
- umlal v21.2d,v9.2s,v3.s[0]
- and x5,x9,#0x03ffffff
- umlal v19.2d,v9.2s,v0.s[0]
- ubfx x6,x8,#26,#26
- umlal v20.2d,v9.2s,v1.s[0]
- ubfx x7,x9,#26,#26
-
- add v12.2s,v12.2s,v27.2s
- add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
- umlal v22.2d,v10.2s,v3.s[0]
- extr x8,x12,x8,#52
- umlal v23.2d,v10.2s,v5.s[0]
- extr x9,x13,x9,#52
- umlal v19.2d,v10.2s,v8.s[0]
- add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
- umlal v21.2d,v10.2s,v1.s[0]
- fmov d9,x4
- umlal v20.2d,v10.2s,v0.s[0]
- and x8,x8,#0x03ffffff
-
- add v13.2s,v13.2s,v28.2s
- and x9,x9,#0x03ffffff
- umlal v22.2d,v12.2s,v0.s[0]
- ubfx x10,x12,#14,#26
- umlal v19.2d,v12.2s,v4.s[0]
- ubfx x11,x13,#14,#26
- umlal v23.2d,v12.2s,v1.s[0]
- add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
- umlal v20.2d,v12.2s,v6.s[0]
- fmov d10,x6
- umlal v21.2d,v12.2s,v8.s[0]
- add x12,x3,x12,lsr#40
-
- umlal v22.2d,v13.2s,v8.s[0]
- add x13,x3,x13,lsr#40
- umlal v19.2d,v13.2s,v2.s[0]
- add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
- umlal v23.2d,v13.2s,v0.s[0]
- add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
- umlal v20.2d,v13.2s,v4.s[0]
- fmov d11,x8
- umlal v21.2d,v13.2s,v6.s[0]
- fmov d12,x10
- fmov d13,x12
-
- /////////////////////////////////////////////////////////////////
- // lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
- // and P. Schwabe
- //
- // [see discussion in poly1305-armv4 module]
-
- ushr v29.2d,v22.2d,#26
- xtn v27.2s,v22.2d
- ushr v30.2d,v19.2d,#26
- and v19.16b,v19.16b,v31.16b
- add v23.2d,v23.2d,v29.2d // h3 -> h4
- bic v27.2s,#0xfc,lsl#24 // &=0x03ffffff
- add v20.2d,v20.2d,v30.2d // h0 -> h1
-
- ushr v29.2d,v23.2d,#26
- xtn v28.2s,v23.2d
- ushr v30.2d,v20.2d,#26
- xtn v25.2s,v20.2d
- bic v28.2s,#0xfc,lsl#24
- add v21.2d,v21.2d,v30.2d // h1 -> h2
-
- add v19.2d,v19.2d,v29.2d
- shl v29.2d,v29.2d,#2
- shrn v30.2s,v21.2d,#26
- xtn v26.2s,v21.2d
- add v19.2d,v19.2d,v29.2d // h4 -> h0
- bic v25.2s,#0xfc,lsl#24
- add v27.2s,v27.2s,v30.2s // h2 -> h3
- bic v26.2s,#0xfc,lsl#24
-
- shrn v29.2s,v19.2d,#26
- xtn v24.2s,v19.2d
- ushr v30.2s,v27.2s,#26
- bic v27.2s,#0xfc,lsl#24
- bic v24.2s,#0xfc,lsl#24
- add v25.2s,v25.2s,v29.2s // h0 -> h1
- add v28.2s,v28.2s,v30.2s // h3 -> h4
-
- b.hi .Loop_neon
-
-.Lskip_loop:
- dup v16.2d,v16.d[0]
- add v11.2s,v11.2s,v26.2s
-
- ////////////////////////////////////////////////////////////////
- // multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
-
- adds x2,x2,#32
- b.ne .Long_tail
-
- dup v16.2d,v11.d[0]
- add v14.2s,v9.2s,v24.2s
- add v17.2s,v12.2s,v27.2s
- add v15.2s,v10.2s,v25.2s
- add v18.2s,v13.2s,v28.2s
-
-.Long_tail:
- dup v14.2d,v14.d[0]
- umull2 v19.2d,v16.4s,v6.4s
- umull2 v22.2d,v16.4s,v1.4s
- umull2 v23.2d,v16.4s,v3.4s
- umull2 v21.2d,v16.4s,v0.4s
- umull2 v20.2d,v16.4s,v8.4s
-
- dup v15.2d,v15.d[0]
- umlal2 v19.2d,v14.4s,v0.4s
- umlal2 v21.2d,v14.4s,v3.4s
- umlal2 v22.2d,v14.4s,v5.4s
- umlal2 v23.2d,v14.4s,v7.4s
- umlal2 v20.2d,v14.4s,v1.4s
-
- dup v17.2d,v17.d[0]
- umlal2 v19.2d,v15.4s,v8.4s
- umlal2 v22.2d,v15.4s,v3.4s
- umlal2 v21.2d,v15.4s,v1.4s
- umlal2 v23.2d,v15.4s,v5.4s
- umlal2 v20.2d,v15.4s,v0.4s
-
- dup v18.2d,v18.d[0]
- umlal2 v22.2d,v17.4s,v0.4s
- umlal2 v23.2d,v17.4s,v1.4s
- umlal2 v19.2d,v17.4s,v4.4s
- umlal2 v20.2d,v17.4s,v6.4s
- umlal2 v21.2d,v17.4s,v8.4s
-
- umlal2 v22.2d,v18.4s,v8.4s
- umlal2 v19.2d,v18.4s,v2.4s
- umlal2 v23.2d,v18.4s,v0.4s
- umlal2 v20.2d,v18.4s,v4.4s
- umlal2 v21.2d,v18.4s,v6.4s
-
- b.eq .Lshort_tail
-
- ////////////////////////////////////////////////////////////////
- // (hash+inp[0:1])*r^4:r^3 and accumulate
-
- add v9.2s,v9.2s,v24.2s
- umlal v22.2d,v11.2s,v1.2s
- umlal v19.2d,v11.2s,v6.2s
- umlal v23.2d,v11.2s,v3.2s
- umlal v20.2d,v11.2s,v8.2s
- umlal v21.2d,v11.2s,v0.2s
-
- add v10.2s,v10.2s,v25.2s
- umlal v22.2d,v9.2s,v5.2s
- umlal v19.2d,v9.2s,v0.2s
- umlal v23.2d,v9.2s,v7.2s
- umlal v20.2d,v9.2s,v1.2s
- umlal v21.2d,v9.2s,v3.2s
-
- add v12.2s,v12.2s,v27.2s
- umlal v22.2d,v10.2s,v3.2s
- umlal v19.2d,v10.2s,v8.2s
- umlal v23.2d,v10.2s,v5.2s
- umlal v20.2d,v10.2s,v0.2s
- umlal v21.2d,v10.2s,v1.2s
-
- add v13.2s,v13.2s,v28.2s
- umlal v22.2d,v12.2s,v0.2s
- umlal v19.2d,v12.2s,v4.2s
- umlal v23.2d,v12.2s,v1.2s
- umlal v20.2d,v12.2s,v6.2s
- umlal v21.2d,v12.2s,v8.2s
-
- umlal v22.2d,v13.2s,v8.2s
- umlal v19.2d,v13.2s,v2.2s
- umlal v23.2d,v13.2s,v0.2s
- umlal v20.2d,v13.2s,v4.2s
- umlal v21.2d,v13.2s,v6.2s
-
-.Lshort_tail:
- ////////////////////////////////////////////////////////////////
- // horizontal add
-
- addp v22.2d,v22.2d,v22.2d
- ldp d8,d9,[sp,#16] // meet ABI requirements
- addp v19.2d,v19.2d,v19.2d
- ldp d10,d11,[sp,#32]
- addp v23.2d,v23.2d,v23.2d
- ldp d12,d13,[sp,#48]
- addp v20.2d,v20.2d,v20.2d
- ldp d14,d15,[sp,#64]
- addp v21.2d,v21.2d,v21.2d
-
- ////////////////////////////////////////////////////////////////
- // lazy reduction, but without narrowing
-
- ushr v29.2d,v22.2d,#26
- and v22.16b,v22.16b,v31.16b
- ushr v30.2d,v19.2d,#26
- and v19.16b,v19.16b,v31.16b
-
- add v23.2d,v23.2d,v29.2d // h3 -> h4
- add v20.2d,v20.2d,v30.2d // h0 -> h1
-
- ushr v29.2d,v23.2d,#26
- and v23.16b,v23.16b,v31.16b
- ushr v30.2d,v20.2d,#26
- and v20.16b,v20.16b,v31.16b
- add v21.2d,v21.2d,v30.2d // h1 -> h2
-
- add v19.2d,v19.2d,v29.2d
- shl v29.2d,v29.2d,#2
- ushr v30.2d,v21.2d,#26
- and v21.16b,v21.16b,v31.16b
- add v19.2d,v19.2d,v29.2d // h4 -> h0
- add v22.2d,v22.2d,v30.2d // h2 -> h3
-
- ushr v29.2d,v19.2d,#26
- and v19.16b,v19.16b,v31.16b
- ushr v30.2d,v22.2d,#26
- and v22.16b,v22.16b,v31.16b
- add v20.2d,v20.2d,v29.2d // h0 -> h1
- add v23.2d,v23.2d,v30.2d // h3 -> h4
-
- ////////////////////////////////////////////////////////////////
- // write the result, can be partially reduced
-
- st4 {v19.s,v20.s,v21.s,v22.s}[0],[x0],#16
- st1 {v23.s}[0],[x0]
-
-.Lno_data_neon:
- ldr x29,[sp],#80
- ret
-ENDPROC(poly1305_blocks_neon)
-
-.align 5
-ENTRY(poly1305_emit_neon)
- ldr x17,[x0,#24]
- cbz x17,poly1305_emit_arm
-
- ldp w10,w11,[x0] // load hash value base 2^26
- ldp w12,w13,[x0,#8]
- ldr w14,[x0,#16]
-
- add x4,x10,x11,lsl#26 // base 2^26 -> base 2^64
- lsr x5,x12,#12
- adds x4,x4,x12,lsl#52
- add x5,x5,x13,lsl#14
- adc x5,x5,xzr
- lsr x6,x14,#24
- adds x5,x5,x14,lsl#40
- adc x6,x6,xzr // can be partially reduced...
-
- ldp x10,x11,[x2] // load nonce
-
- and x12,x6,#-4 // ... so reduce
- add x12,x12,x6,lsr#2
- and x6,x6,#3
- adds x4,x4,x12
- adcs x5,x5,xzr
- adc x6,x6,xzr
-
- adds x12,x4,#5 // compare to modulus
- adcs x13,x5,xzr
- adc x14,x6,xzr
-
- tst x14,#-4 // see if it's carried/borrowed
-
- csel x4,x4,x12,eq
- csel x5,x5,x13,eq
-
-#ifdef __AARCH64EB__
- ror x10,x10,#32 // flip nonce words
- ror x11,x11,#32
-#endif
- adds x4,x4,x10 // accumulate nonce
- adc x5,x5,x11
-#ifdef __AARCH64EB__
- rev x4,x4 // flip output bytes
- rev x5,x5
-#endif
- stp x4,x5,[x1] // write result
-
- ret
-ENDPROC(poly1305_emit_neon)
-
-.align 5
-.Lzeros:
-.long 0,0,0,0,0,0,0,0
-#endif
diff --git a/src/crypto/zinc/poly1305/poly1305-arm64.pl b/src/crypto/zinc/poly1305/poly1305-arm64.pl
new file mode 100644
index 0000000..cf0ce9d
--- /dev/null
+++ b/src/crypto/zinc/poly1305/poly1305-arm64.pl
@@ -0,0 +1,972 @@
+#!/usr/bin/env perl
+# SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
+#
+# This code is taken from the OpenSSL project but the author, Andy Polyakov,
+# has relicensed it under the licenses specified in the SPDX header above.
+# The original headers, including the original license headers, are
+# included below for completeness.
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# This module implements Poly1305 hash for ARMv8.
+#
+# June 2015
+#
+# Numbers are cycles per processed byte with poly1305_blocks alone.
+#
+# IALU/gcc-4.9 NEON
+#
+# Apple A7 1.86/+5% 0.72
+# Cortex-A53 2.69/+58% 1.47
+# Cortex-A57 2.70/+7% 1.14
+# Denver 1.64/+50% 1.18(*)
+# X-Gene 2.13/+68% 2.27
+# Mongoose 1.77/+75% 1.12
+# Kryo 2.70/+55% 1.13
+#
+# (*) estimate based on resources availability is less than 1.0,
+# i.e. measured result is worse than expected, presumably binary
+# translator is not almighty;
+
+$flavour=shift;
+if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
+else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
+
+if ($flavour && $flavour ne "void") {
+ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+ ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+ ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+ die "can't locate arm-xlate.pl";
+
+ open STDOUT,"| \"$^X\" $xlate $flavour $output";
+} else {
+ open STDOUT,">$output";
+}
+
+my ($ctx,$inp,$len,$padbit) = map("x$_",(0..3));
+my ($mac,$nonce)=($inp,$len);
+
+my ($h0,$h1,$h2,$r0,$r1,$s1,$t0,$t1,$d0,$d1,$d2) = map("x$_",(4..14));
+
+$code.=<<___;
+#ifndef __KERNEL__
+# include "arm_arch.h"
+.extern OPENSSL_armcap_P
+#else
+# define poly1305_init poly1305_init_arm
+# define poly1305_blocks poly1305_blocks_arm
+# define poly1305_emit poly1305_emit_arm
+#endif
+
+.text
+
+// forward "declarations" are required for Apple
+.globl poly1305_blocks
+.globl poly1305_emit
+#ifdef __KERNEL__
+.globl poly1305_blocks_neon
+.globl poly1305_emit_neon
+#endif
+
+.globl poly1305_init
+.type poly1305_init,%function
+.align 5
+poly1305_init:
+ cmp $inp,xzr
+ stp xzr,xzr,[$ctx] // zero hash value
+ stp xzr,xzr,[$ctx,#16] // [along with is_base2_26]
+
+ csel x0,xzr,x0,eq
+ b.eq .Lno_key
+
+#ifndef __KERNEL__
+# ifdef __ILP32__
+ ldrsw $t1,.LOPENSSL_armcap_P
+# else
+ ldr $t1,.LOPENSSL_armcap_P
+# endif
+ adr $t0,.LOPENSSL_armcap_P
+ ldr w17,[$t0,$t1]
+#endif
+
+ ldp $r0,$r1,[$inp] // load key
+ mov $s1,#0xfffffffc0fffffff
+ movk $s1,#0x0fff,lsl#48
+#ifdef __AARCH64EB__
+ rev $r0,$r0 // flip bytes
+ rev $r1,$r1
+#endif
+ and $r0,$r0,$s1 // &=0ffffffc0fffffff
+ and $s1,$s1,#-4
+ and $r1,$r1,$s1 // &=0ffffffc0ffffffc
+ stp $r0,$r1,[$ctx,#32] // save key value
+
+#ifndef __KERNEL__
+ tst w17,#ARMV7_NEON
+
+ adr $d0,poly1305_blocks
+ adr $r0,poly1305_blocks_neon
+ adr $d1,poly1305_emit
+ adr $r1,poly1305_emit_neon
+
+ csel $d0,$d0,$r0,eq
+ csel $d1,$d1,$r1,eq
+
+# ifdef __ILP32__
+ stp w12,w13,[$len]
+# else
+ stp $d0,$d1,[$len]
+# endif
+
+ mov x0,#1
+#else
+ mov x0,#0
+#endif
+.Lno_key:
+ ret
+.size poly1305_init,.-poly1305_init
+
+.type poly1305_blocks,%function
+.align 5
+poly1305_blocks:
+ ands $len,$len,#-16
+ b.eq .Lno_data
+
+ ldp $h0,$h1,[$ctx] // load hash value
+ ldp $r0,$r1,[$ctx,#32] // load key value
+ ldr $h2,[$ctx,#16]
+ add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
+ b .Loop
+
+.align 5
+.Loop:
+ ldp $t0,$t1,[$inp],#16 // load input
+ sub $len,$len,#16
+#ifdef __AARCH64EB__
+ rev $t0,$t0
+ rev $t1,$t1
+#endif
+ adds $h0,$h0,$t0 // accumulate input
+ adcs $h1,$h1,$t1
+
+ mul $d0,$h0,$r0 // h0*r0
+ adc $h2,$h2,$padbit
+ umulh $d1,$h0,$r0
+
+ mul $t0,$h1,$s1 // h1*5*r1
+ umulh $t1,$h1,$s1
+
+ adds $d0,$d0,$t0
+ mul $t0,$h0,$r1 // h0*r1
+ adc $d1,$d1,$t1
+ umulh $d2,$h0,$r1
+
+ adds $d1,$d1,$t0
+ mul $t0,$h1,$r0 // h1*r0
+ adc $d2,$d2,xzr
+ umulh $t1,$h1,$r0
+
+ adds $d1,$d1,$t0
+ mul $t0,$h2,$s1 // h2*5*r1
+ adc $d2,$d2,$t1
+ mul $t1,$h2,$r0 // h2*r0
+
+ adds $d1,$d1,$t0
+ adc $d2,$d2,$t1
+
+ and $t0,$d2,#-4 // final reduction
+ and $h2,$d2,#3
+ add $t0,$t0,$d2,lsr#2
+ adds $h0,$d0,$t0
+ adcs $h1,$d1,xzr
+ adc $h2,$h2,xzr
+
+ cbnz $len,.Loop
+
+ stp $h0,$h1,[$ctx] // store hash value
+ str $h2,[$ctx,#16]
+
+.Lno_data:
+ ret
+.size poly1305_blocks,.-poly1305_blocks
+
+.type poly1305_emit,%function
+.align 5
+poly1305_emit:
+ ldp $h0,$h1,[$ctx] // load hash base 2^64
+ ldr $h2,[$ctx,#16]
+ ldp $t0,$t1,[$nonce] // load nonce
+
+ adds $d0,$h0,#5 // compare to modulus
+ adcs $d1,$h1,xzr
+ adc $d2,$h2,xzr
+
+ tst $d2,#-4 // see if it's carried/borrowed
+
+ csel $h0,$h0,$d0,eq
+ csel $h1,$h1,$d1,eq
+
+#ifdef __AARCH64EB__
+ ror $t0,$t0,#32 // flip nonce words
+ ror $t1,$t1,#32
+#endif
+ adds $h0,$h0,$t0 // accumulate nonce
+ adc $h1,$h1,$t1
+#ifdef __AARCH64EB__
+ rev $h0,$h0 // flip output bytes
+ rev $h1,$h1
+#endif
+ stp $h0,$h1,[$mac] // write result
+
+ ret
+.size poly1305_emit,.-poly1305_emit
+___
+my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("v$_.4s",(0..8));
+my ($IN01_0,$IN01_1,$IN01_2,$IN01_3,$IN01_4) = map("v$_.2s",(9..13));
+my ($IN23_0,$IN23_1,$IN23_2,$IN23_3,$IN23_4) = map("v$_.2s",(14..18));
+my ($ACC0,$ACC1,$ACC2,$ACC3,$ACC4) = map("v$_.2d",(19..23));
+my ($H0,$H1,$H2,$H3,$H4) = map("v$_.2s",(24..28));
+my ($T0,$T1,$MASK) = map("v$_",(29..31));
+
+my ($in2,$zeros)=("x16","x17");
+my $is_base2_26 = $zeros; # borrow
+
+$code.=<<___;
+.type __poly1305_mult,%function
+.align 5
+__poly1305_mult:
+ mul $d0,$h0,$r0 // h0*r0
+ umulh $d1,$h0,$r0
+
+ mul $t0,$h1,$s1 // h1*5*r1
+ umulh $t1,$h1,$s1
+
+ adds $d0,$d0,$t0
+ mul $t0,$h0,$r1 // h0*r1
+ adc $d1,$d1,$t1
+ umulh $d2,$h0,$r1
+
+ adds $d1,$d1,$t0
+ mul $t0,$h1,$r0 // h1*r0
+ adc $d2,$d2,xzr
+ umulh $t1,$h1,$r0
+
+ adds $d1,$d1,$t0
+ mul $t0,$h2,$s1 // h2*5*r1
+ adc $d2,$d2,$t1
+ mul $t1,$h2,$r0 // h2*r0
+
+ adds $d1,$d1,$t0
+ adc $d2,$d2,$t1
+
+ and $t0,$d2,#-4 // final reduction
+ and $h2,$d2,#3
+ add $t0,$t0,$d2,lsr#2
+ adds $h0,$d0,$t0
+ adcs $h1,$d1,xzr
+ adc $h2,$h2,xzr
+
+ ret
+.size __poly1305_mult,.-__poly1305_mult
+
+.type __poly1305_splat,%function
+.align 5
+__poly1305_splat:
+ and x12,$h0,#0x03ffffff // base 2^64 -> base 2^26
+ ubfx x13,$h0,#26,#26
+ extr x14,$h1,$h0,#52
+ and x14,x14,#0x03ffffff
+ ubfx x15,$h1,#14,#26
+ extr x16,$h2,$h1,#40
+
+ str w12,[$ctx,#16*0] // r0
+ add w12,w13,w13,lsl#2 // r1*5
+ str w13,[$ctx,#16*1] // r1
+ add w13,w14,w14,lsl#2 // r2*5
+ str w12,[$ctx,#16*2] // s1
+ str w14,[$ctx,#16*3] // r2
+ add w14,w15,w15,lsl#2 // r3*5
+ str w13,[$ctx,#16*4] // s2
+ str w15,[$ctx,#16*5] // r3
+ add w15,w16,w16,lsl#2 // r4*5
+ str w14,[$ctx,#16*6] // s3
+ str w16,[$ctx,#16*7] // r4
+ str w15,[$ctx,#16*8] // s4
+
+ ret
+.size __poly1305_splat,.-__poly1305_splat
+
+.type poly1305_blocks_neon,%function
+.align 5
+poly1305_blocks_neon:
+ ldr $is_base2_26,[$ctx,#24]
+ cmp $len,#128
+ b.hs .Lblocks_neon
+ cbz $is_base2_26,poly1305_blocks
+
+.Lblocks_neon:
+ stp x29,x30,[sp,#-80]!
+ add x29,sp,#0
+
+ ands $len,$len,#-16
+ b.eq .Lno_data_neon
+
+ cbz $is_base2_26,.Lbase2_64_neon
+
+ ldp w10,w11,[$ctx] // load hash value base 2^26
+ ldp w12,w13,[$ctx,#8]
+ ldr w14,[$ctx,#16]
+
+ tst $len,#31
+ b.eq .Leven_neon
+
+ ldp $r0,$r1,[$ctx,#32] // load key value
+
+ add $h0,x10,x11,lsl#26 // base 2^26 -> base 2^64
+ lsr $h1,x12,#12
+ adds $h0,$h0,x12,lsl#52
+ add $h1,$h1,x13,lsl#14
+ adc $h1,$h1,xzr
+ lsr $h2,x14,#24
+ adds $h1,$h1,x14,lsl#40
+ adc $d2,$h2,xzr // can be partially reduced...
+
+ ldp $d0,$d1,[$inp],#16 // load input
+ sub $len,$len,#16
+ add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
+
+ and $t0,$d2,#-4 // ... so reduce
+ and $h2,$d2,#3
+ add $t0,$t0,$d2,lsr#2
+ adds $h0,$h0,$t0
+ adcs $h1,$h1,xzr
+ adc $h2,$h2,xzr
+
+#ifdef __AARCH64EB__
+ rev $d0,$d0
+ rev $d1,$d1
+#endif
+ adds $h0,$h0,$d0 // accumulate input
+ adcs $h1,$h1,$d1
+ adc $h2,$h2,$padbit
+
+ bl __poly1305_mult
+ ldr x30,[sp,#8]
+
+ cbz $padbit,.Lstore_base2_64_neon
+
+ and x10,$h0,#0x03ffffff // base 2^64 -> base 2^26
+ ubfx x11,$h0,#26,#26
+ extr x12,$h1,$h0,#52
+ and x12,x12,#0x03ffffff
+ ubfx x13,$h1,#14,#26
+ extr x14,$h2,$h1,#40
+
+ cbnz $len,.Leven_neon
+
+ stp w10,w11,[$ctx] // store hash value base 2^26
+ stp w12,w13,[$ctx,#8]
+ str w14,[$ctx,#16]
+ b .Lno_data_neon
+
+.align 4
+.Lstore_base2_64_neon:
+ stp $h0,$h1,[$ctx] // store hash value base 2^64
+ stp $h2,xzr,[$ctx,#16] // note that is_base2_26 is zeroed
+ b .Lno_data_neon
+
+.align 4
+.Lbase2_64_neon:
+ ldp $r0,$r1,[$ctx,#32] // load key value
+
+ ldp $h0,$h1,[$ctx] // load hash value base 2^64
+ ldr $h2,[$ctx,#16]
+
+ tst $len,#31
+ b.eq .Linit_neon
+
+ ldp $d0,$d1,[$inp],#16 // load input
+ sub $len,$len,#16
+ add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
+#ifdef __AARCH64EB__
+ rev $d0,$d0
+ rev $d1,$d1
+#endif
+ adds $h0,$h0,$d0 // accumulate input
+ adcs $h1,$h1,$d1
+ adc $h2,$h2,$padbit
+
+ bl __poly1305_mult
+
+.Linit_neon:
+ and x10,$h0,#0x03ffffff // base 2^64 -> base 2^26
+ ubfx x11,$h0,#26,#26
+ extr x12,$h1,$h0,#52
+ and x12,x12,#0x03ffffff
+ ubfx x13,$h1,#14,#26
+ extr x14,$h2,$h1,#40
+
+ stp d8,d9,[sp,#16] // meet ABI requirements
+ stp d10,d11,[sp,#32]
+ stp d12,d13,[sp,#48]
+ stp d14,d15,[sp,#64]
+
+ fmov ${H0},x10
+ fmov ${H1},x11
+ fmov ${H2},x12
+ fmov ${H3},x13
+ fmov ${H4},x14
+
+ ////////////////////////////////// initialize r^n table
+ mov $h0,$r0 // r^1
+ add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
+ mov $h1,$r1
+ mov $h2,xzr
+ add $ctx,$ctx,#48+12
+ bl __poly1305_splat
+
+ bl __poly1305_mult // r^2
+ sub $ctx,$ctx,#4
+ bl __poly1305_splat
+
+ bl __poly1305_mult // r^3
+ sub $ctx,$ctx,#4
+ bl __poly1305_splat
+
+ bl __poly1305_mult // r^4
+ sub $ctx,$ctx,#4
+ bl __poly1305_splat
+ ldr x30,[sp,#8]
+
+ add $in2,$inp,#32
+ adr $zeros,.Lzeros
+ subs $len,$len,#64
+ csel $in2,$zeros,$in2,lo
+
+ mov x4,#1
+ str x4,[$ctx,#-24] // set is_base2_26
+ sub $ctx,$ctx,#48 // restore original $ctx
+ b .Ldo_neon
+
+.align 4
+.Leven_neon:
+ add $in2,$inp,#32
+ adr $zeros,.Lzeros
+ subs $len,$len,#64
+ csel $in2,$zeros,$in2,lo
+
+ stp d8,d9,[sp,#16] // meet ABI requirements
+ stp d10,d11,[sp,#32]
+ stp d12,d13,[sp,#48]
+ stp d14,d15,[sp,#64]
+
+ fmov ${H0},x10
+ fmov ${H1},x11
+ fmov ${H2},x12
+ fmov ${H3},x13
+ fmov ${H4},x14
+
+.Ldo_neon:
+ ldp x8,x12,[$in2],#16 // inp[2:3] (or zero)
+ ldp x9,x13,[$in2],#48
+
+ lsl $padbit,$padbit,#24
+ add x15,$ctx,#48
+
+#ifdef __AARCH64EB__
+ rev x8,x8
+ rev x12,x12
+ rev x9,x9
+ rev x13,x13
+#endif
+ and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
+ and x5,x9,#0x03ffffff
+ ubfx x6,x8,#26,#26
+ ubfx x7,x9,#26,#26
+ add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
+ extr x8,x12,x8,#52
+ extr x9,x13,x9,#52
+ add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
+ fmov $IN23_0,x4
+ and x8,x8,#0x03ffffff
+ and x9,x9,#0x03ffffff
+ ubfx x10,x12,#14,#26
+ ubfx x11,x13,#14,#26
+ add x12,$padbit,x12,lsr#40
+ add x13,$padbit,x13,lsr#40
+ add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
+ fmov $IN23_1,x6
+ add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
+ add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
+ fmov $IN23_2,x8
+ fmov $IN23_3,x10
+ fmov $IN23_4,x12
+
+ ldp x8,x12,[$inp],#16 // inp[0:1]
+ ldp x9,x13,[$inp],#48
+
+ ld1 {$R0,$R1,$S1,$R2},[x15],#64
+ ld1 {$S2,$R3,$S3,$R4},[x15],#64
+ ld1 {$S4},[x15]
+
+#ifdef __AARCH64EB__
+ rev x8,x8
+ rev x12,x12
+ rev x9,x9
+ rev x13,x13
+#endif
+ and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
+ and x5,x9,#0x03ffffff
+ ubfx x6,x8,#26,#26
+ ubfx x7,x9,#26,#26
+ add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
+ extr x8,x12,x8,#52
+ extr x9,x13,x9,#52
+ add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
+ fmov $IN01_0,x4
+ and x8,x8,#0x03ffffff
+ and x9,x9,#0x03ffffff
+ ubfx x10,x12,#14,#26
+ ubfx x11,x13,#14,#26
+ add x12,$padbit,x12,lsr#40
+ add x13,$padbit,x13,lsr#40
+ add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
+ fmov $IN01_1,x6
+ add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
+ add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
+ movi $MASK.2d,#-1
+ fmov $IN01_2,x8
+ fmov $IN01_3,x10
+ fmov $IN01_4,x12
+ ushr $MASK.2d,$MASK.2d,#38
+
+ b.ls .Lskip_loop
+
+.align 4
+.Loop_neon:
+ ////////////////////////////////////////////////////////////////
+ // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
+ // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
+ // \___________________/
+ // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
+ // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
+ // \___________________/ \____________________/
+ //
+ // Note that we start with inp[2:3]*r^2. This is because it
+ // doesn't depend on reduction in previous iteration.
+ ////////////////////////////////////////////////////////////////
+ // d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0
+ // d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*5*r4
+ // d2 = h0*r2 + h1*r1 + h2*r0 + h3*5*r4 + h4*5*r3
+ // d1 = h0*r1 + h1*r0 + h2*5*r4 + h3*5*r3 + h4*5*r2
+ // d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1
+
+ subs $len,$len,#64
+ umull $ACC4,$IN23_0,${R4}[2]
+ csel $in2,$zeros,$in2,lo
+ umull $ACC3,$IN23_0,${R3}[2]
+ umull $ACC2,$IN23_0,${R2}[2]
+ ldp x8,x12,[$in2],#16 // inp[2:3] (or zero)
+ umull $ACC1,$IN23_0,${R1}[2]
+ ldp x9,x13,[$in2],#48
+ umull $ACC0,$IN23_0,${R0}[2]
+#ifdef __AARCH64EB__
+ rev x8,x8
+ rev x12,x12
+ rev x9,x9
+ rev x13,x13
+#endif
+
+ umlal $ACC4,$IN23_1,${R3}[2]
+ and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
+ umlal $ACC3,$IN23_1,${R2}[2]
+ and x5,x9,#0x03ffffff
+ umlal $ACC2,$IN23_1,${R1}[2]
+ ubfx x6,x8,#26,#26
+ umlal $ACC1,$IN23_1,${R0}[2]
+ ubfx x7,x9,#26,#26
+ umlal $ACC0,$IN23_1,${S4}[2]
+ add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
+
+ umlal $ACC4,$IN23_2,${R2}[2]
+ extr x8,x12,x8,#52
+ umlal $ACC3,$IN23_2,${R1}[2]
+ extr x9,x13,x9,#52
+ umlal $ACC2,$IN23_2,${R0}[2]
+ add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
+ umlal $ACC1,$IN23_2,${S4}[2]
+ fmov $IN23_0,x4
+ umlal $ACC0,$IN23_2,${S3}[2]
+ and x8,x8,#0x03ffffff
+
+ umlal $ACC4,$IN23_3,${R1}[2]
+ and x9,x9,#0x03ffffff
+ umlal $ACC3,$IN23_3,${R0}[2]
+ ubfx x10,x12,#14,#26
+ umlal $ACC2,$IN23_3,${S4}[2]
+ ubfx x11,x13,#14,#26
+ umlal $ACC1,$IN23_3,${S3}[2]
+ add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
+ umlal $ACC0,$IN23_3,${S2}[2]
+ fmov $IN23_1,x6
+
+ add $IN01_2,$IN01_2,$H2
+ add x12,$padbit,x12,lsr#40
+ umlal $ACC4,$IN23_4,${R0}[2]
+ add x13,$padbit,x13,lsr#40
+ umlal $ACC3,$IN23_4,${S4}[2]
+ add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
+ umlal $ACC2,$IN23_4,${S3}[2]
+ add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
+ umlal $ACC1,$IN23_4,${S2}[2]
+ fmov $IN23_2,x8
+ umlal $ACC0,$IN23_4,${S1}[2]
+ fmov $IN23_3,x10
+
+ ////////////////////////////////////////////////////////////////
+ // (hash+inp[0:1])*r^4 and accumulate
+
+ add $IN01_0,$IN01_0,$H0
+ fmov $IN23_4,x12
+ umlal $ACC3,$IN01_2,${R1}[0]
+ ldp x8,x12,[$inp],#16 // inp[0:1]
+ umlal $ACC0,$IN01_2,${S3}[0]
+ ldp x9,x13,[$inp],#48
+ umlal $ACC4,$IN01_2,${R2}[0]
+ umlal $ACC1,$IN01_2,${S4}[0]
+ umlal $ACC2,$IN01_2,${R0}[0]
+#ifdef __AARCH64EB__
+ rev x8,x8
+ rev x12,x12
+ rev x9,x9
+ rev x13,x13
+#endif
+
+ add $IN01_1,$IN01_1,$H1
+ umlal $ACC3,$IN01_0,${R3}[0]
+ umlal $ACC4,$IN01_0,${R4}[0]
+ and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
+ umlal $ACC2,$IN01_0,${R2}[0]
+ and x5,x9,#0x03ffffff
+ umlal $ACC0,$IN01_0,${R0}[0]
+ ubfx x6,x8,#26,#26
+ umlal $ACC1,$IN01_0,${R1}[0]
+ ubfx x7,x9,#26,#26
+
+ add $IN01_3,$IN01_3,$H3
+ add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
+ umlal $ACC3,$IN01_1,${R2}[0]
+ extr x8,x12,x8,#52
+ umlal $ACC4,$IN01_1,${R3}[0]
+ extr x9,x13,x9,#52
+ umlal $ACC0,$IN01_1,${S4}[0]
+ add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
+ umlal $ACC2,$IN01_1,${R1}[0]
+ fmov $IN01_0,x4
+ umlal $ACC1,$IN01_1,${R0}[0]
+ and x8,x8,#0x03ffffff
+
+ add $IN01_4,$IN01_4,$H4
+ and x9,x9,#0x03ffffff
+ umlal $ACC3,$IN01_3,${R0}[0]
+ ubfx x10,x12,#14,#26
+ umlal $ACC0,$IN01_3,${S2}[0]
+ ubfx x11,x13,#14,#26
+ umlal $ACC4,$IN01_3,${R1}[0]
+ add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
+ umlal $ACC1,$IN01_3,${S3}[0]
+ fmov $IN01_1,x6
+ umlal $ACC2,$IN01_3,${S4}[0]
+ add x12,$padbit,x12,lsr#40
+
+ umlal $ACC3,$IN01_4,${S4}[0]
+ add x13,$padbit,x13,lsr#40
+ umlal $ACC0,$IN01_4,${S1}[0]
+ add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
+ umlal $ACC4,$IN01_4,${R0}[0]
+ add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
+ umlal $ACC1,$IN01_4,${S2}[0]
+ fmov $IN01_2,x8
+ umlal $ACC2,$IN01_4,${S3}[0]
+ fmov $IN01_3,x10
+ fmov $IN01_4,x12
+
+ /////////////////////////////////////////////////////////////////
+ // lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
+ // and P. Schwabe
+ //
+ // [see discussion in poly1305-armv4 module]
+
+ ushr $T0.2d,$ACC3,#26
+ xtn $H3,$ACC3
+ ushr $T1.2d,$ACC0,#26
+ and $ACC0,$ACC0,$MASK.2d
+ add $ACC4,$ACC4,$T0.2d // h3 -> h4
+ bic $H3,#0xfc,lsl#24 // &=0x03ffffff
+ add $ACC1,$ACC1,$T1.2d // h0 -> h1
+
+ ushr $T0.2d,$ACC4,#26
+ xtn $H4,$ACC4
+ ushr $T1.2d,$ACC1,#26
+ xtn $H1,$ACC1
+ bic $H4,#0xfc,lsl#24
+ add $ACC2,$ACC2,$T1.2d // h1 -> h2
+
+ add $ACC0,$ACC0,$T0.2d
+ shl $T0.2d,$T0.2d,#2
+ shrn $T1.2s,$ACC2,#26
+ xtn $H2,$ACC2
+ add $ACC0,$ACC0,$T0.2d // h4 -> h0
+ bic $H1,#0xfc,lsl#24
+ add $H3,$H3,$T1.2s // h2 -> h3
+ bic $H2,#0xfc,lsl#24
+
+ shrn $T0.2s,$ACC0,#26
+ xtn $H0,$ACC0
+ ushr $T1.2s,$H3,#26
+ bic $H3,#0xfc,lsl#24
+ bic $H0,#0xfc,lsl#24
+ add $H1,$H1,$T0.2s // h0 -> h1
+ add $H4,$H4,$T1.2s // h3 -> h4
+
+ b.hi .Loop_neon
+
+.Lskip_loop:
+ dup $IN23_2,${IN23_2}[0]
+ add $IN01_2,$IN01_2,$H2
+
+ ////////////////////////////////////////////////////////////////
+ // multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
+
+ adds $len,$len,#32
+ b.ne .Long_tail
+
+ dup $IN23_2,${IN01_2}[0]
+ add $IN23_0,$IN01_0,$H0
+ add $IN23_3,$IN01_3,$H3
+ add $IN23_1,$IN01_1,$H1
+ add $IN23_4,$IN01_4,$H4
+
+.Long_tail:
+ dup $IN23_0,${IN23_0}[0]
+ umull2 $ACC0,$IN23_2,${S3}
+ umull2 $ACC3,$IN23_2,${R1}
+ umull2 $ACC4,$IN23_2,${R2}
+ umull2 $ACC2,$IN23_2,${R0}
+ umull2 $ACC1,$IN23_2,${S4}
+
+ dup $IN23_1,${IN23_1}[0]
+ umlal2 $ACC0,$IN23_0,${R0}
+ umlal2 $ACC2,$IN23_0,${R2}
+ umlal2 $ACC3,$IN23_0,${R3}
+ umlal2 $ACC4,$IN23_0,${R4}
+ umlal2 $ACC1,$IN23_0,${R1}
+
+ dup $IN23_3,${IN23_3}[0]
+ umlal2 $ACC0,$IN23_1,${S4}
+ umlal2 $ACC3,$IN23_1,${R2}
+ umlal2 $ACC2,$IN23_1,${R1}
+ umlal2 $ACC4,$IN23_1,${R3}
+ umlal2 $ACC1,$IN23_1,${R0}
+
+ dup $IN23_4,${IN23_4}[0]
+ umlal2 $ACC3,$IN23_3,${R0}
+ umlal2 $ACC4,$IN23_3,${R1}
+ umlal2 $ACC0,$IN23_3,${S2}
+ umlal2 $ACC1,$IN23_3,${S3}
+ umlal2 $ACC2,$IN23_3,${S4}
+
+ umlal2 $ACC3,$IN23_4,${S4}
+ umlal2 $ACC0,$IN23_4,${S1}
+ umlal2 $ACC4,$IN23_4,${R0}
+ umlal2 $ACC1,$IN23_4,${S2}
+ umlal2 $ACC2,$IN23_4,${S3}
+
+ b.eq .Lshort_tail
+
+ ////////////////////////////////////////////////////////////////
+ // (hash+inp[0:1])*r^4:r^3 and accumulate
+
+ add $IN01_0,$IN01_0,$H0
+ umlal $ACC3,$IN01_2,${R1}
+ umlal $ACC0,$IN01_2,${S3}
+ umlal $ACC4,$IN01_2,${R2}
+ umlal $ACC1,$IN01_2,${S4}
+ umlal $ACC2,$IN01_2,${R0}
+
+ add $IN01_1,$IN01_1,$H1
+ umlal $ACC3,$IN01_0,${R3}
+ umlal $ACC0,$IN01_0,${R0}
+ umlal $ACC4,$IN01_0,${R4}
+ umlal $ACC1,$IN01_0,${R1}
+ umlal $ACC2,$IN01_0,${R2}
+
+ add $IN01_3,$IN01_3,$H3
+ umlal $ACC3,$IN01_1,${R2}
+ umlal $ACC0,$IN01_1,${S4}
+ umlal $ACC4,$IN01_1,${R3}
+ umlal $ACC1,$IN01_1,${R0}
+ umlal $ACC2,$IN01_1,${R1}
+
+ add $IN01_4,$IN01_4,$H4
+ umlal $ACC3,$IN01_3,${R0}
+ umlal $ACC0,$IN01_3,${S2}
+ umlal $ACC4,$IN01_3,${R1}
+ umlal $ACC1,$IN01_3,${S3}
+ umlal $ACC2,$IN01_3,${S4}
+
+ umlal $ACC3,$IN01_4,${S4}
+ umlal $ACC0,$IN01_4,${S1}
+ umlal $ACC4,$IN01_4,${R0}
+ umlal $ACC1,$IN01_4,${S2}
+ umlal $ACC2,$IN01_4,${S3}
+
+.Lshort_tail:
+ ////////////////////////////////////////////////////////////////
+ // horizontal add
+
+ addp $ACC3,$ACC3,$ACC3
+ ldp d8,d9,[sp,#16] // meet ABI requirements
+ addp $ACC0,$ACC0,$ACC0
+ ldp d10,d11,[sp,#32]
+ addp $ACC4,$ACC4,$ACC4
+ ldp d12,d13,[sp,#48]
+ addp $ACC1,$ACC1,$ACC1
+ ldp d14,d15,[sp,#64]
+ addp $ACC2,$ACC2,$ACC2
+
+ ////////////////////////////////////////////////////////////////
+ // lazy reduction, but without narrowing
+
+ ushr $T0.2d,$ACC3,#26
+ and $ACC3,$ACC3,$MASK.2d
+ ushr $T1.2d,$ACC0,#26
+ and $ACC0,$ACC0,$MASK.2d
+
+ add $ACC4,$ACC4,$T0.2d // h3 -> h4
+ add $ACC1,$ACC1,$T1.2d // h0 -> h1
+
+ ushr $T0.2d,$ACC4,#26
+ and $ACC4,$ACC4,$MASK.2d
+ ushr $T1.2d,$ACC1,#26
+ and $ACC1,$ACC1,$MASK.2d
+ add $ACC2,$ACC2,$T1.2d // h1 -> h2
+
+ add $ACC0,$ACC0,$T0.2d
+ shl $T0.2d,$T0.2d,#2
+ ushr $T1.2d,$ACC2,#26
+ and $ACC2,$ACC2,$MASK.2d
+ add $ACC0,$ACC0,$T0.2d // h4 -> h0
+ add $ACC3,$ACC3,$T1.2d // h2 -> h3
+
+ ushr $T0.2d,$ACC0,#26
+ and $ACC0,$ACC0,$MASK.2d
+ ushr $T1.2d,$ACC3,#26
+ and $ACC3,$ACC3,$MASK.2d
+ add $ACC1,$ACC1,$T0.2d // h0 -> h1
+ add $ACC4,$ACC4,$T1.2d // h3 -> h4
+
+ ////////////////////////////////////////////////////////////////
+ // write the result, can be partially reduced
+
+ st4 {$ACC0,$ACC1,$ACC2,$ACC3}[0],[$ctx],#16
+ st1 {$ACC4}[0],[$ctx]
+
+.Lno_data_neon:
+ ldr x29,[sp],#80
+ ret
+.size poly1305_blocks_neon,.-poly1305_blocks_neon
+
+.type poly1305_emit_neon,%function
+.align 5
+poly1305_emit_neon:
+ ldr $is_base2_26,[$ctx,#24]
+ cbz $is_base2_26,poly1305_emit
+
+ ldp w10,w11,[$ctx] // load hash value base 2^26
+ ldp w12,w13,[$ctx,#8]
+ ldr w14,[$ctx,#16]
+
+ add $h0,x10,x11,lsl#26 // base 2^26 -> base 2^64
+ lsr $h1,x12,#12
+ adds $h0,$h0,x12,lsl#52
+ add $h1,$h1,x13,lsl#14
+ adc $h1,$h1,xzr
+ lsr $h2,x14,#24
+ adds $h1,$h1,x14,lsl#40
+ adc $h2,$h2,xzr // can be partially reduced...
+
+ ldp $t0,$t1,[$nonce] // load nonce
+
+ and $d0,$h2,#-4 // ... so reduce
+ add $d0,$d0,$h2,lsr#2
+ and $h2,$h2,#3
+ adds $h0,$h0,$d0
+ adcs $h1,$h1,xzr
+ adc $h2,$h2,xzr
+
+ adds $d0,$h0,#5 // compare to modulus
+ adcs $d1,$h1,xzr
+ adc $d2,$h2,xzr
+
+ tst $d2,#-4 // see if it's carried/borrowed
+
+ csel $h0,$h0,$d0,eq
+ csel $h1,$h1,$d1,eq
+
+#ifdef __AARCH64EB__
+ ror $t0,$t0,#32 // flip nonce words
+ ror $t1,$t1,#32
+#endif
+ adds $h0,$h0,$t0 // accumulate nonce
+ adc $h1,$h1,$t1
+#ifdef __AARCH64EB__
+ rev $h0,$h0 // flip output bytes
+ rev $h1,$h1
+#endif
+ stp $h0,$h1,[$mac] // write result
+
+ ret
+.size poly1305_emit_neon,.-poly1305_emit_neon
+
+.align 5
+.Lzeros:
+.long 0,0,0,0,0,0,0,0
+#ifndef __KERNEL__
+.LOPENSSL_armcap_P:
+#ifdef __ILP32__
+.long OPENSSL_armcap_P-.
+#else
+.quad OPENSSL_armcap_P-.
+#endif
+#endif
+.align 2
+___
+
+open SELF,$0;
+while(<SELF>) {
+ next if (/^#!/);
+ last if (!s/^#/\/\// and !/^$/);
+ print;
+}
+close SELF;
+
+foreach (split("\n",$code)) {
+ s/\b(shrn\s+v[0-9]+)\.[24]d/$1.2s/ or
+ s/\b(fmov\s+)v([0-9]+)[^,]*,\s*x([0-9]+)/$1d$2,x$3/ or
+ (m/\bdup\b/ and (s/\.[24]s/.2d/g or 1)) or
+ (m/\b(eor|and)/ and (s/\.[248][sdh]/.16b/g or 1)) or
+ (m/\bum(ul|la)l\b/ and (s/\.4s/.2s/g or 1)) or
+ (m/\bum(ul|la)l2\b/ and (s/\.2s/.4s/g or 1)) or
+ (m/\bst[1-4]\s+{[^}]+}\[/ and (s/\.[24]d/.s/g or 1));
+
+ s/\.[124]([sd])\[/.$1\[/;
+
+ print $_,"\n";
+}
+close STDOUT;
diff --git a/src/crypto/zinc/poly1305/poly1305-mips64.S b/src/crypto/zinc/poly1305/poly1305-mips64.S
deleted file mode 100644
index 272a86c..0000000
--- a/src/crypto/zinc/poly1305/poly1305-mips64.S
+++ /dev/null
@@ -1,360 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */
-/*
- * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
- * Copyright (C) 2006-2017 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved.
- *
- * This is based in part on Andy Polyakov's implementation from CRYPTOGAMS.
- */
-
-#if (defined(_MIPS_ARCH_MIPS64R3) || defined(_MIPS_ARCH_MIPS64R5) || \
- defined(_MIPS_ARCH_MIPS64R6)) && !defined(_MIPS_ARCH_MIPS64R2)
-#define _MIPS_ARCH_MIPS64R2
-#endif
-
-#ifdef __MIPSEB__
-#define MSB 0
-#define LSB 7
-#else
-#define MSB 7
-#define LSB 0
-#endif
-
-#if defined(_MIPS_ARCH_MIPS64R6)
-#define dmultu(rs,rt)
-#define mflo(rd,rs,rt) dmulu rd,rs,rt
-#define mfhi(rd,rs,rt) dmuhu rd,rs,rt
-#else
-#define dmultu(rs,rt) dmultu rs,rt
-#define multu(rs,rt) multu rs,rt
-#define mflo(rd,rs,rt) mflo rd
-#define mfhi(rd,rs,rt) mfhi rd
-#endif
-
-.text
-.set noat
-.set noreorder
-
-/* While most of the assembly in the kernel prefers ENTRY() and ENDPROC(),
- * there is no existing MIPS assembly that uses it, and MIPS assembler seems
- * to like its own .ent/.end notation, which the MIPS include files don't
- * provide in a MIPS-specific ENTRY/ENDPROC definition. So, we skip these
- * for now, until somebody complains. */
-
-.align 5
-.globl poly1305_init_mips
-.ent poly1305_init_mips
-poly1305_init_mips:
- .frame $29,0,$31
- .set reorder
-
- sd $0,0($4)
- sd $0,8($4)
- sd $0,16($4)
-
- beqz $5,.Lno_key
-
-#if defined(_MIPS_ARCH_MIPS64R6)
- ld $8,0($5)
- ld $9,8($5)
-#else
- ldl $8,0+MSB($5)
- ldl $9,8+MSB($5)
- ldr $8,0+LSB($5)
- ldr $9,8+LSB($5)
-#endif
-#ifdef __MIPSEB__
-#if defined(_MIPS_ARCH_MIPS64R2)
- dsbh $8,$8 # byte swap
- dsbh $9,$9
- dshd $8,$8
- dshd $9,$9
-#else
- ori $10,$0,0xFF
- dsll $1,$10,32
- or $10,$1 # 0x000000FF000000FF
-
- and $11,$8,$10 # byte swap
- and $2,$9,$10
- dsrl $1,$8,24
- dsrl $24,$9,24
- dsll $11,24
- dsll $2,24
- and $1,$10
- and $24,$10
- dsll $10,8 # 0x0000FF000000FF00
- or $11,$1
- or $2,$24
- and $1,$8,$10
- and $24,$9,$10
- dsrl $8,8
- dsrl $9,8
- dsll $1,8
- dsll $24,8
- and $8,$10
- and $9,$10
- or $11,$1
- or $2,$24
- or $8,$11
- or $9,$2
- dsrl $11,$8,32
- dsrl $2,$9,32
- dsll $8,32
- dsll $9,32
- or $8,$11
- or $9,$2
-#endif
-#endif
- li $10,1
- dsll $10,32
- daddiu $10,-63
- dsll $10,28
- daddiu $10,-1 # 0ffffffc0fffffff
-
- and $8,$10
- daddiu $10,-3 # 0ffffffc0ffffffc
- and $9,$10
-
- sd $8,24($4)
- dsrl $10,$9,2
- sd $9,32($4)
- daddu $10,$9 # s1 = r1 + (r1 >> 2)
- sd $10,40($4)
-
-.Lno_key:
- li $2,0 # return 0
- jr $31
-.end poly1305_init_mips
-
-.align 5
-.globl poly1305_blocks_mips
-.ent poly1305_blocks_mips
-poly1305_blocks_mips:
- .set noreorder
- dsrl $6,4 # number of complete blocks
- bnez $6,poly1305_blocks_internal
- nop
- jr $31
- nop
-.end poly1305_blocks_mips
-
-.align 5
-.ent poly1305_blocks_internal
-poly1305_blocks_internal:
- .frame $29,6*8,$31
- .mask 0x00030000,-8
- .set noreorder
- dsubu $29,6*8
- sd $17,40($29)
- sd $16,32($29)
- .set reorder
-
- ld $12,0($4) # load hash value
- ld $13,8($4)
- ld $14,16($4)
-
- ld $15,24($4) # load key
- ld $16,32($4)
- ld $17,40($4)
-
-.Loop:
-#if defined(_MIPS_ARCH_MIPS64R6)
- ld $8,0($5) # load input
- ld $9,8($5)
-#else
- ldl $8,0+MSB($5) # load input
- ldl $9,8+MSB($5)
- ldr $8,0+LSB($5)
- ldr $9,8+LSB($5)
-#endif
- daddiu $6,-1
- daddiu $5,16
-#ifdef __MIPSEB__
-#if defined(_MIPS_ARCH_MIPS64R2)
- dsbh $8,$8 # byte swap
- dsbh $9,$9
- dshd $8,$8
- dshd $9,$9
-#else
- ori $10,$0,0xFF
- dsll $1,$10,32
- or $10,$1 # 0x000000FF000000FF
-
- and $11,$8,$10 # byte swap
- and $2,$9,$10
- dsrl $1,$8,24
- dsrl $24,$9,24
- dsll $11,24
- dsll $2,24
- and $1,$10
- and $24,$10
- dsll $10,8 # 0x0000FF000000FF00
- or $11,$1
- or $2,$24
- and $1,$8,$10
- and $24,$9,$10
- dsrl $8,8
- dsrl $9,8
- dsll $1,8
- dsll $24,8
- and $8,$10
- and $9,$10
- or $11,$1
- or $2,$24
- or $8,$11
- or $9,$2
- dsrl $11,$8,32
- dsrl $2,$9,32
- dsll $8,32
- dsll $9,32
- or $8,$11
- or $9,$2
-#endif
-#endif
- daddu $12,$8 # accumulate input
- daddu $13,$9
- sltu $10,$12,$8
- sltu $11,$13,$9
- daddu $13,$10
-
- dmultu ($15,$12) # h0*r0
- daddu $14,$7
- sltu $10,$13,$10
- mflo ($8,$15,$12)
- mfhi ($9,$15,$12)
-
- dmultu ($17,$13) # h1*5*r1
- daddu $10,$11
- daddu $14,$10
- mflo ($10,$17,$13)
- mfhi ($11,$17,$13)
-
- dmultu ($16,$12) # h0*r1
- daddu $8,$10
- daddu $9,$11
- mflo ($1,$16,$12)
- mfhi ($25,$16,$12)
- sltu $10,$8,$10
- daddu $9,$10
-
- dmultu ($15,$13) # h1*r0
- daddu $9,$1
- sltu $1,$9,$1
- mflo ($10,$15,$13)
- mfhi ($11,$15,$13)
- daddu $25,$1
-
- dmultu ($17,$14) # h2*5*r1
- daddu $9,$10
- daddu $25,$11
- mflo ($1,$17,$14)
-
- dmultu ($15,$14) # h2*r0
- sltu $10,$9,$10
- daddu $25,$10
- mflo ($2,$15,$14)
-
- daddu $9,$1
- daddu $25,$2
- sltu $1,$9,$1
- daddu $25,$1
-
- li $10,-4 # final reduction
- and $10,$25
- dsrl $11,$25,2
- andi $14,$25,3
- daddu $10,$11
- daddu $12,$8,$10
- sltu $10,$12,$10
- daddu $13,$9,$10
- sltu $10,$13,$10
- daddu $14,$14,$10
-
- bnez $6,.Loop
-
- sd $12,0($4) # store hash value
- sd $13,8($4)
- sd $14,16($4)
-
- .set noreorder
- ld $17,40($29) # epilogue
- ld $16,32($29)
- jr $31
- daddu $29,6*8
-.end poly1305_blocks_internal
-
-.align 5
-.globl poly1305_emit_mips
-.ent poly1305_emit_mips
-poly1305_emit_mips:
- .frame $29,0,$31
- .set reorder
-
- ld $10,0($4)
- ld $11,8($4)
- ld $1,16($4)
-
- daddiu $8,$10,5 # compare to modulus
- sltiu $2,$8,5
- daddu $9,$11,$2
- sltu $2,$9,$2
- daddu $1,$1,$2
-
- dsrl $1,2 # see if it carried/borrowed
- dsubu $1,$0,$1
- nor $2,$0,$1
-
- and $8,$1
- and $10,$2
- and $9,$1
- and $11,$2
- or $8,$10
- or $9,$11
-
- lwu $10,0($6) # load nonce
- lwu $11,4($6)
- lwu $1,8($6)
- lwu $2,12($6)
- dsll $11,32
- dsll $2,32
- or $10,$11
- or $1,$2
-
- daddu $8,$10 # accumulate nonce
- daddu $9,$1
- sltu $10,$8,$10
- daddu $9,$10
-
- dsrl $10,$8,8 # write mac value
- dsrl $11,$8,16
- dsrl $1,$8,24
- sb $8,0($5)
- dsrl $2,$8,32
- sb $10,1($5)
- dsrl $10,$8,40
- sb $11,2($5)
- dsrl $11,$8,48
- sb $1,3($5)
- dsrl $1,$8,56
- sb $2,4($5)
- dsrl $2,$9,8
- sb $10,5($5)
- dsrl $10,$9,16
- sb $11,6($5)
- dsrl $11,$9,24
- sb $1,7($5)
-
- sb $9,8($5)
- dsrl $1,$9,32
- sb $2,9($5)
- dsrl $2,$9,40
- sb $10,10($5)
- dsrl $10,$9,48
- sb $11,11($5)
- dsrl $11,$9,56
- sb $1,12($5)
- sb $2,13($5)
- sb $10,14($5)
- sb $11,15($5)
-
- jr $31
-.end poly1305_emit_mips
diff --git a/src/crypto/zinc/poly1305/poly1305-mips64.pl b/src/crypto/zinc/poly1305/poly1305-mips64.pl
new file mode 100644
index 0000000..d30a03d
--- /dev/null
+++ b/src/crypto/zinc/poly1305/poly1305-mips64.pl
@@ -0,0 +1,467 @@
+#!/usr/bin/env perl
+# SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
+#
+# This code is taken from the OpenSSL project but the author, Andy Polyakov,
+# has relicensed it under the licenses specified in the SPDX header above.
+# The original headers, including the original license headers, are
+# included below for completeness.
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# Poly1305 hash for MIPS64.
+#
+# May 2016
+#
+# Numbers are cycles per processed byte with poly1305_blocks alone.
+#
+# IALU/gcc
+# R1x000 5.64/+120% (big-endian)
+# Octeon II 3.80/+280% (little-endian)
+
+######################################################################
+# There is a number of MIPS ABI in use, O32 and N32/64 are most
+# widely used. Then there is a new contender: NUBI. It appears that if
+# one picks the latter, it's possible to arrange code in ABI neutral
+# manner. Therefore let's stick to NUBI register layout:
+#
+($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25));
+($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
+($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23));
+($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31));
+#
+# The return value is placed in $a0. Following coding rules facilitate
+# interoperability:
+#
+# - never ever touch $tp, "thread pointer", former $gp [o32 can be
+# excluded from the rule, because it's specified volatile];
+# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
+# old code];
+# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
+#
+# For reference here is register layout for N32/64 MIPS ABIs:
+#
+# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
+# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
+# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
+# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
+# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
+#
+# <appro@openssl.org>
+#
+######################################################################
+
+$flavour = shift || "64"; # supported flavours are o32,n32,64,nubi32,nubi64
+
+die "MIPS64 only" unless ($flavour =~ /64|n32/i);
+
+$v0 = ($flavour =~ /nubi/i) ? $a0 : $t0;
+$SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0x0003f000" : "0x00030000";
+
+($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3);
+($in0,$in1,$tmp0,$tmp1,$tmp2,$tmp3,$tmp4) = ($a4,$a5,$a6,$a7,$at,$t0,$t1);
+
+$code.=<<___;
+#if (defined(_MIPS_ARCH_MIPS64R3) || defined(_MIPS_ARCH_MIPS64R5) || \\
+ defined(_MIPS_ARCH_MIPS64R6)) \\
+ && !defined(_MIPS_ARCH_MIPS64R2)
+# define _MIPS_ARCH_MIPS64R2
+#endif
+
+#if defined(_MIPS_ARCH_MIPS64R6)
+# define dmultu(rs,rt)
+# define mflo(rd,rs,rt) dmulu rd,rs,rt
+# define mfhi(rd,rs,rt) dmuhu rd,rs,rt
+#else
+# define dmultu(rs,rt) dmultu rs,rt
+# define mflo(rd,rs,rt) mflo rd
+# define mfhi(rd,rs,rt) mfhi rd
+#endif
+
+#ifdef __KERNEL__
+# define poly1305_init poly1305_init_mips
+# define poly1305_blocks poly1305_blocks_mips
+# define poly1305_emit poly1305_emit_mips
+#endif
+
+#if defined(__MIPSEB__) && !defined(MIPSEB)
+# define MIPSEB
+#endif
+
+#ifdef MIPSEB
+# define MSB 0
+# define LSB 7
+#else
+# define MSB 7
+# define LSB 0
+#endif
+
+.text
+.set noat
+.set noreorder
+
+.align 5
+.globl poly1305_init
+.ent poly1305_init
+poly1305_init:
+ .frame $sp,0,$ra
+ .set reorder
+
+ sd $zero,0($ctx)
+ sd $zero,8($ctx)
+ sd $zero,16($ctx)
+
+ beqz $inp,.Lno_key
+
+#if defined(_MIPS_ARCH_MIPS64R6)
+ ld $in0,0($inp)
+ ld $in1,8($inp)
+#else
+ ldl $in0,0+MSB($inp)
+ ldl $in1,8+MSB($inp)
+ ldr $in0,0+LSB($inp)
+ ldr $in1,8+LSB($inp)
+#endif
+#ifdef MIPSEB
+# if defined(_MIPS_ARCH_MIPS64R2)
+ dsbh $in0,$in0 # byte swap
+ dsbh $in1,$in1
+ dshd $in0,$in0
+ dshd $in1,$in1
+# else
+ ori $tmp0,$zero,0xFF
+ dsll $tmp2,$tmp0,32
+ or $tmp0,$tmp2 # 0x000000FF000000FF
+
+ and $tmp1,$in0,$tmp0 # byte swap
+ and $tmp3,$in1,$tmp0
+ dsrl $tmp2,$in0,24
+ dsrl $tmp4,$in1,24
+ dsll $tmp1,24
+ dsll $tmp3,24
+ and $tmp2,$tmp0
+ and $tmp4,$tmp0
+ dsll $tmp0,8 # 0x0000FF000000FF00
+ or $tmp1,$tmp2
+ or $tmp3,$tmp4
+ and $tmp2,$in0,$tmp0
+ and $tmp4,$in1,$tmp0
+ dsrl $in0,8
+ dsrl $in1,8
+ dsll $tmp2,8
+ dsll $tmp4,8
+ and $in0,$tmp0
+ and $in1,$tmp0
+ or $tmp1,$tmp2
+ or $tmp3,$tmp4
+ or $in0,$tmp1
+ or $in1,$tmp3
+ dsrl $tmp1,$in0,32
+ dsrl $tmp3,$in1,32
+ dsll $in0,32
+ dsll $in1,32
+ or $in0,$tmp1
+ or $in1,$tmp3
+# endif
+#endif
+ li $tmp0,1
+ dsll $tmp0,32
+ daddiu $tmp0,-63
+ dsll $tmp0,28
+ daddiu $tmp0,-1 # 0ffffffc0fffffff
+
+ and $in0,$tmp0
+ daddiu $tmp0,-3 # 0ffffffc0ffffffc
+ and $in1,$tmp0
+
+ sd $in0,24($ctx)
+ dsrl $tmp0,$in1,2
+ sd $in1,32($ctx)
+ daddu $tmp0,$in1 # s1 = r1 + (r1 >> 2)
+ sd $tmp0,40($ctx)
+
+.Lno_key:
+ li $v0,0 # return 0
+ jr $ra
+.end poly1305_init
+___
+{
+my ($h0,$h1,$h2,$r0,$r1,$s1,$d0,$d1,$d2) =
+ ($s0,$s1,$s2,$s3,$s4,$s5,$in0,$in1,$t2);
+
+$code.=<<___;
+.align 5
+.globl poly1305_blocks
+.ent poly1305_blocks
+poly1305_blocks:
+ .set noreorder
+ dsrl $len,4 # number of complete blocks
+ bnez $len,poly1305_blocks_internal
+ nop
+ jr $ra
+ nop
+.end poly1305_blocks
+
+.align 5
+.ent poly1305_blocks_internal
+poly1305_blocks_internal:
+ .frame $sp,6*8,$ra
+ .mask $SAVED_REGS_MASK,-8
+ .set noreorder
+ dsubu $sp,6*8
+ sd $s5,40($sp)
+ sd $s4,32($sp)
+___
+$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue
+ sd $s3,24($sp)
+ sd $s2,16($sp)
+ sd $s1,8($sp)
+ sd $s0,0($sp)
+___
+$code.=<<___;
+ .set reorder
+
+ ld $h0,0($ctx) # load hash value
+ ld $h1,8($ctx)
+ ld $h2,16($ctx)
+
+ ld $r0,24($ctx) # load key
+ ld $r1,32($ctx)
+ ld $s1,40($ctx)
+
+.Loop:
+#if defined(_MIPS_ARCH_MIPS64R6)
+ ld $in0,0($inp) # load input
+ ld $in1,8($inp)
+#else
+ ldl $in0,0+MSB($inp) # load input
+ ldl $in1,8+MSB($inp)
+ ldr $in0,0+LSB($inp)
+ ldr $in1,8+LSB($inp)
+#endif
+ daddiu $len,-1
+ daddiu $inp,16
+#ifdef MIPSEB
+# if defined(_MIPS_ARCH_MIPS64R2)
+ dsbh $in0,$in0 # byte swap
+ dsbh $in1,$in1
+ dshd $in0,$in0
+ dshd $in1,$in1
+# else
+ ori $tmp0,$zero,0xFF
+ dsll $tmp2,$tmp0,32
+ or $tmp0,$tmp2 # 0x000000FF000000FF
+
+ and $tmp1,$in0,$tmp0 # byte swap
+ and $tmp3,$in1,$tmp0
+ dsrl $tmp2,$in0,24
+ dsrl $tmp4,$in1,24
+ dsll $tmp1,24
+ dsll $tmp3,24
+ and $tmp2,$tmp0
+ and $tmp4,$tmp0
+ dsll $tmp0,8 # 0x0000FF000000FF00
+ or $tmp1,$tmp2
+ or $tmp3,$tmp4
+ and $tmp2,$in0,$tmp0
+ and $tmp4,$in1,$tmp0
+ dsrl $in0,8
+ dsrl $in1,8
+ dsll $tmp2,8
+ dsll $tmp4,8
+ and $in0,$tmp0
+ and $in1,$tmp0
+ or $tmp1,$tmp2
+ or $tmp3,$tmp4
+ or $in0,$tmp1
+ or $in1,$tmp3
+ dsrl $tmp1,$in0,32
+ dsrl $tmp3,$in1,32
+ dsll $in0,32
+ dsll $in1,32
+ or $in0,$tmp1
+ or $in1,$tmp3
+# endif
+#endif
+ daddu $h0,$in0 # accumulate input
+ daddu $h1,$in1
+ sltu $tmp0,$h0,$in0
+ sltu $tmp1,$h1,$in1
+ daddu $h1,$tmp0
+
+ dmultu ($r0,$h0) # h0*r0
+ daddu $h2,$padbit
+ sltu $tmp0,$h1,$tmp0
+ mflo ($d0,$r0,$h0)
+ mfhi ($d1,$r0,$h0)
+
+ dmultu ($s1,$h1) # h1*5*r1
+ daddu $tmp0,$tmp1
+ daddu $h2,$tmp0
+ mflo ($tmp0,$s1,$h1)
+ mfhi ($tmp1,$s1,$h1)
+
+ dmultu ($r1,$h0) # h0*r1
+ daddu $d0,$tmp0
+ daddu $d1,$tmp1
+ mflo ($tmp2,$r1,$h0)
+ mfhi ($d2,$r1,$h0)
+ sltu $tmp0,$d0,$tmp0
+ daddu $d1,$tmp0
+
+ dmultu ($r0,$h1) # h1*r0
+ daddu $d1,$tmp2
+ sltu $tmp2,$d1,$tmp2
+ mflo ($tmp0,$r0,$h1)
+ mfhi ($tmp1,$r0,$h1)
+ daddu $d2,$tmp2
+
+ dmultu ($s1,$h2) # h2*5*r1
+ daddu $d1,$tmp0
+ daddu $d2,$tmp1
+ mflo ($tmp2,$s1,$h2)
+
+ dmultu ($r0,$h2) # h2*r0
+ sltu $tmp0,$d1,$tmp0
+ daddu $d2,$tmp0
+ mflo ($tmp3,$r0,$h2)
+
+ daddu $d1,$tmp2
+ daddu $d2,$tmp3
+ sltu $tmp2,$d1,$tmp2
+ daddu $d2,$tmp2
+
+ li $tmp0,-4 # final reduction
+ and $tmp0,$d2
+ dsrl $tmp1,$d2,2
+ andi $h2,$d2,3
+ daddu $tmp0,$tmp1
+ daddu $h0,$d0,$tmp0
+ sltu $tmp0,$h0,$tmp0
+ daddu $h1,$d1,$tmp0
+ sltu $tmp0,$h1,$tmp0
+ daddu $h2,$h2,$tmp0
+
+ bnez $len,.Loop
+
+ sd $h0,0($ctx) # store hash value
+ sd $h1,8($ctx)
+ sd $h2,16($ctx)
+
+ .set noreorder
+ ld $s5,40($sp) # epilogue
+ ld $s4,32($sp)
+___
+$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi epilogue
+ ld $s3,24($sp)
+ ld $s2,16($sp)
+ ld $s1,8($sp)
+ ld $s0,0($sp)
+___
+$code.=<<___;
+ jr $ra
+ daddu $sp,6*8
+.end poly1305_blocks_internal
+___
+}
+{
+my ($ctx,$mac,$nonce) = ($a0,$a1,$a2);
+
+$code.=<<___;
+.align 5
+.globl poly1305_emit
+.ent poly1305_emit
+poly1305_emit:
+ .frame $sp,0,$ra
+ .set reorder
+
+ ld $tmp0,0($ctx)
+ ld $tmp1,8($ctx)
+ ld $tmp2,16($ctx)
+
+ daddiu $in0,$tmp0,5 # compare to modulus
+ sltiu $tmp3,$in0,5
+ daddu $in1,$tmp1,$tmp3
+ sltu $tmp3,$in1,$tmp3
+ daddu $tmp2,$tmp2,$tmp3
+
+ dsrl $tmp2,2 # see if it carried/borrowed
+ dsubu $tmp2,$zero,$tmp2
+ nor $tmp3,$zero,$tmp2
+
+ and $in0,$tmp2
+ and $tmp0,$tmp3
+ and $in1,$tmp2
+ and $tmp1,$tmp3
+ or $in0,$tmp0
+ or $in1,$tmp1
+
+ lwu $tmp0,0($nonce) # load nonce
+ lwu $tmp1,4($nonce)
+ lwu $tmp2,8($nonce)
+ lwu $tmp3,12($nonce)
+ dsll $tmp1,32
+ dsll $tmp3,32
+ or $tmp0,$tmp1
+ or $tmp2,$tmp3
+
+ daddu $in0,$tmp0 # accumulate nonce
+ daddu $in1,$tmp2
+ sltu $tmp0,$in0,$tmp0
+ daddu $in1,$tmp0
+
+ dsrl $tmp0,$in0,8 # write mac value
+ dsrl $tmp1,$in0,16
+ dsrl $tmp2,$in0,24
+ sb $in0,0($mac)
+ dsrl $tmp3,$in0,32
+ sb $tmp0,1($mac)
+ dsrl $tmp0,$in0,40
+ sb $tmp1,2($mac)
+ dsrl $tmp1,$in0,48
+ sb $tmp2,3($mac)
+ dsrl $tmp2,$in0,56
+ sb $tmp3,4($mac)
+ dsrl $tmp3,$in1,8
+ sb $tmp0,5($mac)
+ dsrl $tmp0,$in1,16
+ sb $tmp1,6($mac)
+ dsrl $tmp1,$in1,24
+ sb $tmp2,7($mac)
+
+ sb $in1,8($mac)
+ dsrl $tmp2,$in1,32
+ sb $tmp3,9($mac)
+ dsrl $tmp3,$in1,40
+ sb $tmp0,10($mac)
+ dsrl $tmp0,$in1,48
+ sb $tmp1,11($mac)
+ dsrl $tmp1,$in1,56
+ sb $tmp2,12($mac)
+ sb $tmp3,13($mac)
+ sb $tmp0,14($mac)
+ sb $tmp1,15($mac)
+
+ jr $ra
+.end poly1305_emit
+.rdata
+.align 2
+___
+}
+
+open SELF,$0;
+while(<SELF>) {
+ next if (/^#!/);
+ last if (!s/^#/\/\// and !/^$/);
+ print;
+}
+close SELF;
+
+$output=pop and open STDOUT,">$output";
+print $code;
+close STDOUT;
+
diff --git a/src/tests/qemu/Makefile b/src/tests/qemu/Makefile
index e511b69..e20509f 100644
--- a/src/tests/qemu/Makefile
+++ b/src/tests/qemu/Makefile
@@ -23,7 +23,7 @@ NR_CPUS ?= 4
MIRROR := https://download.wireguard.com/qemu-test/distfiles/
rwildcard=$(foreach d,$(wildcard $1*),$(call rwildcard,$d/,$2) $(filter $(subst *,%,$2),$d))
-WIREGUARD_SOURCES := ../../Kbuild ../../Kconfig $(filter-out ../../tools/% ../../tests/%,$(call rwildcard,../../,*.c *.h *.S *.include))
+WIREGUARD_SOURCES := ../../Kbuild ../../Kconfig $(filter-out ../../tools/% ../../tests/%,$(call rwildcard,../../,*.c *.h *.S *.pl *.include))
TOOLS_SOURCES := $(wildcard ../../tools/*.c ../../tools/*.h ../../uapi/*.h ../../crypto/zinc/curve25519/curve25519-hacl64.c ../../crypto/zinc/curve25519/curve25519-fiat32.c)
default: qemu