diff options
author | Jason A. Donenfeld <Jason@zx2c4.com> | 2018-11-08 17:08:22 +0100 |
---|---|---|
committer | Jason A. Donenfeld <Jason@zx2c4.com> | 2018-11-14 23:59:05 -0800 |
commit | cc36bde00d67f15d8657c2fa6f450dccf4fb76b7 (patch) | |
tree | feca77f876b7552bb4a83883c4e193e7f35020cc /src/crypto/zinc/chacha20/chacha20-arm64.S | |
parent | 8813c7d8d2608aca8d451b3cb4d7f3285c043691 (diff) |
chacha20,poly1305: switch to perlasm originals on mips and arm
We also separate out Eric Biggers' Cortex A7 implementation into its own
file.
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Diffstat (limited to 'src/crypto/zinc/chacha20/chacha20-arm64.S')
-rw-r--r-- | src/crypto/zinc/chacha20/chacha20-arm64.S | 1942 |
1 files changed, 0 insertions, 1942 deletions
diff --git a/src/crypto/zinc/chacha20/chacha20-arm64.S b/src/crypto/zinc/chacha20/chacha20-arm64.S deleted file mode 100644 index 1ae11a5..0000000 --- a/src/crypto/zinc/chacha20/chacha20-arm64.S +++ /dev/null @@ -1,1942 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */ -/* - * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. - * Copyright (C) 2006-2017 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved. - * - * This is based in part on Andy Polyakov's implementation from CRYPTOGAMS. - */ - -#include <linux/linkage.h> - -.text -.align 5 -.Lsigma: -.quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral -.Lone: -.long 1,0,0,0 - -.align 5 -ENTRY(chacha20_arm) - cbz x2,.Labort - - stp x29,x30,[sp,#-96]! - add x29,sp,#0 - - adr x5,.Lsigma - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - sub sp,sp,#64 - - ldp x22,x23,[x5] // load sigma - ldp x24,x25,[x3] // load key - ldp x26,x27,[x3,#16] - ldp x28,x30,[x4] // load counter -#ifdef __AARCH64EB__ - ror x24,x24,#32 - ror x25,x25,#32 - ror x26,x26,#32 - ror x27,x27,#32 - ror x28,x28,#32 - ror x30,x30,#32 -#endif - -.Loop_outer: - mov w5,w22 // unpack key block - lsr x6,x22,#32 - mov w7,w23 - lsr x8,x23,#32 - mov w9,w24 - lsr x10,x24,#32 - mov w11,w25 - lsr x12,x25,#32 - mov w13,w26 - lsr x14,x26,#32 - mov w15,w27 - lsr x16,x27,#32 - mov w17,w28 - lsr x19,x28,#32 - mov w20,w30 - lsr x21,x30,#32 - - mov x4,#10 - subs x2,x2,#64 -.Loop: - sub x4,x4,#1 - add w5,w5,w9 - add w6,w6,w10 - add w7,w7,w11 - add w8,w8,w12 - eor w17,w17,w5 - eor w19,w19,w6 - eor w20,w20,w7 - eor w21,w21,w8 - ror w17,w17,#16 - ror w19,w19,#16 - ror w20,w20,#16 - ror w21,w21,#16 - add w13,w13,w17 - add w14,w14,w19 - add w15,w15,w20 - add w16,w16,w21 - eor w9,w9,w13 - eor w10,w10,w14 - eor w11,w11,w15 - eor w12,w12,w16 - ror w9,w9,#20 - ror w10,w10,#20 - ror w11,w11,#20 - ror w12,w12,#20 - add w5,w5,w9 - add w6,w6,w10 - add w7,w7,w11 - add w8,w8,w12 - eor w17,w17,w5 - eor w19,w19,w6 - eor w20,w20,w7 - eor w21,w21,w8 - ror w17,w17,#24 - ror w19,w19,#24 - ror w20,w20,#24 - ror w21,w21,#24 - add w13,w13,w17 - add w14,w14,w19 - add w15,w15,w20 - add w16,w16,w21 - eor w9,w9,w13 - eor w10,w10,w14 - eor w11,w11,w15 - eor w12,w12,w16 - ror w9,w9,#25 - ror w10,w10,#25 - ror w11,w11,#25 - ror w12,w12,#25 - add w5,w5,w10 - add w6,w6,w11 - add w7,w7,w12 - add w8,w8,w9 - eor w21,w21,w5 - eor w17,w17,w6 - eor w19,w19,w7 - eor w20,w20,w8 - ror w21,w21,#16 - ror w17,w17,#16 - ror w19,w19,#16 - ror w20,w20,#16 - add w15,w15,w21 - add w16,w16,w17 - add w13,w13,w19 - add w14,w14,w20 - eor w10,w10,w15 - eor w11,w11,w16 - eor w12,w12,w13 - eor w9,w9,w14 - ror w10,w10,#20 - ror w11,w11,#20 - ror w12,w12,#20 - ror w9,w9,#20 - add w5,w5,w10 - add w6,w6,w11 - add w7,w7,w12 - add w8,w8,w9 - eor w21,w21,w5 - eor w17,w17,w6 - eor w19,w19,w7 - eor w20,w20,w8 - ror w21,w21,#24 - ror w17,w17,#24 - ror w19,w19,#24 - ror w20,w20,#24 - add w15,w15,w21 - add w16,w16,w17 - add w13,w13,w19 - add w14,w14,w20 - eor w10,w10,w15 - eor w11,w11,w16 - eor w12,w12,w13 - eor w9,w9,w14 - ror w10,w10,#25 - ror w11,w11,#25 - ror w12,w12,#25 - ror w9,w9,#25 - cbnz x4,.Loop - - add w5,w5,w22 // accumulate key block - add x6,x6,x22,lsr#32 - add w7,w7,w23 - add x8,x8,x23,lsr#32 - add w9,w9,w24 - add x10,x10,x24,lsr#32 - add w11,w11,w25 - add x12,x12,x25,lsr#32 - add w13,w13,w26 - add x14,x14,x26,lsr#32 - add w15,w15,w27 - add x16,x16,x27,lsr#32 - add w17,w17,w28 - add x19,x19,x28,lsr#32 - add w20,w20,w30 - add x21,x21,x30,lsr#32 - - b.lo .Ltail - - add x5,x5,x6,lsl#32 // pack - add x7,x7,x8,lsl#32 - ldp x6,x8,[x1,#0] // load input - add x9,x9,x10,lsl#32 - add x11,x11,x12,lsl#32 - ldp x10,x12,[x1,#16] - add x13,x13,x14,lsl#32 - add x15,x15,x16,lsl#32 - ldp x14,x16,[x1,#32] - add x17,x17,x19,lsl#32 - add x20,x20,x21,lsl#32 - ldp x19,x21,[x1,#48] - add x1,x1,#64 -#ifdef __AARCH64EB__ - rev x5,x5 - rev x7,x7 - rev x9,x9 - rev x11,x11 - rev x13,x13 - rev x15,x15 - rev x17,x17 - rev x20,x20 -#endif - eor x5,x5,x6 - eor x7,x7,x8 - eor x9,x9,x10 - eor x11,x11,x12 - eor x13,x13,x14 - eor x15,x15,x16 - eor x17,x17,x19 - eor x20,x20,x21 - - stp x5,x7,[x0,#0] // store output - add x28,x28,#1 // increment counter - stp x9,x11,[x0,#16] - stp x13,x15,[x0,#32] - stp x17,x20,[x0,#48] - add x0,x0,#64 - - b.hi .Loop_outer - - ldp x19,x20,[x29,#16] - add sp,sp,#64 - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldp x29,x30,[sp],#96 -.Labort: - ret - -.align 4 -.Ltail: - add x2,x2,#64 -.Less_than_64: - sub x0,x0,#1 - add x1,x1,x2 - add x0,x0,x2 - add x4,sp,x2 - neg x2,x2 - - add x5,x5,x6,lsl#32 // pack - add x7,x7,x8,lsl#32 - add x9,x9,x10,lsl#32 - add x11,x11,x12,lsl#32 - add x13,x13,x14,lsl#32 - add x15,x15,x16,lsl#32 - add x17,x17,x19,lsl#32 - add x20,x20,x21,lsl#32 -#ifdef __AARCH64EB__ - rev x5,x5 - rev x7,x7 - rev x9,x9 - rev x11,x11 - rev x13,x13 - rev x15,x15 - rev x17,x17 - rev x20,x20 -#endif - stp x5,x7,[sp,#0] - stp x9,x11,[sp,#16] - stp x13,x15,[sp,#32] - stp x17,x20,[sp,#48] - -.Loop_tail: - ldrb w10,[x1,x2] - ldrb w11,[x4,x2] - add x2,x2,#1 - eor w10,w10,w11 - strb w10,[x0,x2] - cbnz x2,.Loop_tail - - stp xzr,xzr,[sp,#0] - stp xzr,xzr,[sp,#16] - stp xzr,xzr,[sp,#32] - stp xzr,xzr,[sp,#48] - - ldp x19,x20,[x29,#16] - add sp,sp,#64 - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldp x29,x30,[sp],#96 - ret -ENDPROC(chacha20_arm) - -#ifdef CONFIG_KERNEL_MODE_NEON -.align 5 -ENTRY(chacha20_neon) - cbz x2,.Labort_neon - - stp x29,x30,[sp,#-96]! - add x29,sp,#0 - - adr x5,.Lsigma - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - cmp x2,#512 - b.hs .L512_or_more_neon - - sub sp,sp,#64 - - ldp x22,x23,[x5] // load sigma - ld1 {v24.4s},[x5],#16 - ldp x24,x25,[x3] // load key - ldp x26,x27,[x3,#16] - ld1 {v25.4s,v26.4s},[x3] - ldp x28,x30,[x4] // load counter - ld1 {v27.4s},[x4] - ld1 {v31.4s},[x5] -#ifdef __AARCH64EB__ - rev64 v24.4s,v24.4s - ror x24,x24,#32 - ror x25,x25,#32 - ror x26,x26,#32 - ror x27,x27,#32 - ror x28,x28,#32 - ror x30,x30,#32 -#endif - add v27.4s,v27.4s,v31.4s // += 1 - add v28.4s,v27.4s,v31.4s - add v29.4s,v28.4s,v31.4s - shl v31.4s,v31.4s,#2 // 1 -> 4 - -.Loop_outer_neon: - mov w5,w22 // unpack key block - lsr x6,x22,#32 - mov v0.16b,v24.16b - mov w7,w23 - lsr x8,x23,#32 - mov v4.16b,v24.16b - mov w9,w24 - lsr x10,x24,#32 - mov v16.16b,v24.16b - mov w11,w25 - mov v1.16b,v25.16b - lsr x12,x25,#32 - mov v5.16b,v25.16b - mov w13,w26 - mov v17.16b,v25.16b - lsr x14,x26,#32 - mov v3.16b,v27.16b - mov w15,w27 - mov v7.16b,v28.16b - lsr x16,x27,#32 - mov v19.16b,v29.16b - mov w17,w28 - mov v2.16b,v26.16b - lsr x19,x28,#32 - mov v6.16b,v26.16b - mov w20,w30 - mov v18.16b,v26.16b - lsr x21,x30,#32 - - mov x4,#10 - subs x2,x2,#256 -.Loop_neon: - sub x4,x4,#1 - add v0.4s,v0.4s,v1.4s - add w5,w5,w9 - add v4.4s,v4.4s,v5.4s - add w6,w6,w10 - add v16.4s,v16.4s,v17.4s - add w7,w7,w11 - eor v3.16b,v3.16b,v0.16b - add w8,w8,w12 - eor v7.16b,v7.16b,v4.16b - eor w17,w17,w5 - eor v19.16b,v19.16b,v16.16b - eor w19,w19,w6 - rev32 v3.8h,v3.8h - eor w20,w20,w7 - rev32 v7.8h,v7.8h - eor w21,w21,w8 - rev32 v19.8h,v19.8h - ror w17,w17,#16 - add v2.4s,v2.4s,v3.4s - ror w19,w19,#16 - add v6.4s,v6.4s,v7.4s - ror w20,w20,#16 - add v18.4s,v18.4s,v19.4s - ror w21,w21,#16 - eor v20.16b,v1.16b,v2.16b - add w13,w13,w17 - eor v21.16b,v5.16b,v6.16b - add w14,w14,w19 - eor v22.16b,v17.16b,v18.16b - add w15,w15,w20 - ushr v1.4s,v20.4s,#20 - add w16,w16,w21 - ushr v5.4s,v21.4s,#20 - eor w9,w9,w13 - ushr v17.4s,v22.4s,#20 - eor w10,w10,w14 - sli v1.4s,v20.4s,#12 - eor w11,w11,w15 - sli v5.4s,v21.4s,#12 - eor w12,w12,w16 - sli v17.4s,v22.4s,#12 - ror w9,w9,#20 - add v0.4s,v0.4s,v1.4s - ror w10,w10,#20 - add v4.4s,v4.4s,v5.4s - ror w11,w11,#20 - add v16.4s,v16.4s,v17.4s - ror w12,w12,#20 - eor v20.16b,v3.16b,v0.16b - add w5,w5,w9 - eor v21.16b,v7.16b,v4.16b - add w6,w6,w10 - eor v22.16b,v19.16b,v16.16b - add w7,w7,w11 - ushr v3.4s,v20.4s,#24 - add w8,w8,w12 - ushr v7.4s,v21.4s,#24 - eor w17,w17,w5 - ushr v19.4s,v22.4s,#24 - eor w19,w19,w6 - sli v3.4s,v20.4s,#8 - eor w20,w20,w7 - sli v7.4s,v21.4s,#8 - eor w21,w21,w8 - sli v19.4s,v22.4s,#8 - ror w17,w17,#24 - add v2.4s,v2.4s,v3.4s - ror w19,w19,#24 - add v6.4s,v6.4s,v7.4s - ror w20,w20,#24 - add v18.4s,v18.4s,v19.4s - ror w21,w21,#24 - eor v20.16b,v1.16b,v2.16b - add w13,w13,w17 - eor v21.16b,v5.16b,v6.16b - add w14,w14,w19 - eor v22.16b,v17.16b,v18.16b - add w15,w15,w20 - ushr v1.4s,v20.4s,#25 - add w16,w16,w21 - ushr v5.4s,v21.4s,#25 - eor w9,w9,w13 - ushr v17.4s,v22.4s,#25 - eor w10,w10,w14 - sli v1.4s,v20.4s,#7 - eor w11,w11,w15 - sli v5.4s,v21.4s,#7 - eor w12,w12,w16 - sli v17.4s,v22.4s,#7 - ror w9,w9,#25 - ext v2.16b,v2.16b,v2.16b,#8 - ror w10,w10,#25 - ext v6.16b,v6.16b,v6.16b,#8 - ror w11,w11,#25 - ext v18.16b,v18.16b,v18.16b,#8 - ror w12,w12,#25 - ext v3.16b,v3.16b,v3.16b,#12 - ext v7.16b,v7.16b,v7.16b,#12 - ext v19.16b,v19.16b,v19.16b,#12 - ext v1.16b,v1.16b,v1.16b,#4 - ext v5.16b,v5.16b,v5.16b,#4 - ext v17.16b,v17.16b,v17.16b,#4 - add v0.4s,v0.4s,v1.4s - add w5,w5,w10 - add v4.4s,v4.4s,v5.4s - add w6,w6,w11 - add v16.4s,v16.4s,v17.4s - add w7,w7,w12 - eor v3.16b,v3.16b,v0.16b - add w8,w8,w9 - eor v7.16b,v7.16b,v4.16b - eor w21,w21,w5 - eor v19.16b,v19.16b,v16.16b - eor w17,w17,w6 - rev32 v3.8h,v3.8h - eor w19,w19,w7 - rev32 v7.8h,v7.8h - eor w20,w20,w8 - rev32 v19.8h,v19.8h - ror w21,w21,#16 - add v2.4s,v2.4s,v3.4s - ror w17,w17,#16 - add v6.4s,v6.4s,v7.4s - ror w19,w19,#16 - add v18.4s,v18.4s,v19.4s - ror w20,w20,#16 - eor v20.16b,v1.16b,v2.16b - add w15,w15,w21 - eor v21.16b,v5.16b,v6.16b - add w16,w16,w17 - eor v22.16b,v17.16b,v18.16b - add w13,w13,w19 - ushr v1.4s,v20.4s,#20 - add w14,w14,w20 - ushr v5.4s,v21.4s,#20 - eor w10,w10,w15 - ushr v17.4s,v22.4s,#20 - eor w11,w11,w16 - sli v1.4s,v20.4s,#12 - eor w12,w12,w13 - sli v5.4s,v21.4s,#12 - eor w9,w9,w14 - sli v17.4s,v22.4s,#12 - ror w10,w10,#20 - add v0.4s,v0.4s,v1.4s - ror w11,w11,#20 - add v4.4s,v4.4s,v5.4s - ror w12,w12,#20 - add v16.4s,v16.4s,v17.4s - ror w9,w9,#20 - eor v20.16b,v3.16b,v0.16b - add w5,w5,w10 - eor v21.16b,v7.16b,v4.16b - add w6,w6,w11 - eor v22.16b,v19.16b,v16.16b - add w7,w7,w12 - ushr v3.4s,v20.4s,#24 - add w8,w8,w9 - ushr v7.4s,v21.4s,#24 - eor w21,w21,w5 - ushr v19.4s,v22.4s,#24 - eor w17,w17,w6 - sli v3.4s,v20.4s,#8 - eor w19,w19,w7 - sli v7.4s,v21.4s,#8 - eor w20,w20,w8 - sli v19.4s,v22.4s,#8 - ror w21,w21,#24 - add v2.4s,v2.4s,v3.4s - ror w17,w17,#24 - add v6.4s,v6.4s,v7.4s - ror w19,w19,#24 - add v18.4s,v18.4s,v19.4s - ror w20,w20,#24 - eor v20.16b,v1.16b,v2.16b - add w15,w15,w21 - eor v21.16b,v5.16b,v6.16b - add w16,w16,w17 - eor v22.16b,v17.16b,v18.16b - add w13,w13,w19 - ushr v1.4s,v20.4s,#25 - add w14,w14,w20 - ushr v5.4s,v21.4s,#25 - eor w10,w10,w15 - ushr v17.4s,v22.4s,#25 - eor w11,w11,w16 - sli v1.4s,v20.4s,#7 - eor w12,w12,w13 - sli v5.4s,v21.4s,#7 - eor w9,w9,w14 - sli v17.4s,v22.4s,#7 - ror w10,w10,#25 - ext v2.16b,v2.16b,v2.16b,#8 - ror w11,w11,#25 - ext v6.16b,v6.16b,v6.16b,#8 - ror w12,w12,#25 - ext v18.16b,v18.16b,v18.16b,#8 - ror w9,w9,#25 - ext v3.16b,v3.16b,v3.16b,#4 - ext v7.16b,v7.16b,v7.16b,#4 - ext v19.16b,v19.16b,v19.16b,#4 - ext v1.16b,v1.16b,v1.16b,#12 - ext v5.16b,v5.16b,v5.16b,#12 - ext v17.16b,v17.16b,v17.16b,#12 - cbnz x4,.Loop_neon - - add w5,w5,w22 // accumulate key block - add v0.4s,v0.4s,v24.4s - add x6,x6,x22,lsr#32 - add v4.4s,v4.4s,v24.4s - add w7,w7,w23 - add v16.4s,v16.4s,v24.4s - add x8,x8,x23,lsr#32 - add v2.4s,v2.4s,v26.4s - add w9,w9,w24 - add v6.4s,v6.4s,v26.4s - add x10,x10,x24,lsr#32 - add v18.4s,v18.4s,v26.4s - add w11,w11,w25 - add v3.4s,v3.4s,v27.4s - add x12,x12,x25,lsr#32 - add w13,w13,w26 - add v7.4s,v7.4s,v28.4s - add x14,x14,x26,lsr#32 - add w15,w15,w27 - add v19.4s,v19.4s,v29.4s - add x16,x16,x27,lsr#32 - add w17,w17,w28 - add v1.4s,v1.4s,v25.4s - add x19,x19,x28,lsr#32 - add w20,w20,w30 - add v5.4s,v5.4s,v25.4s - add x21,x21,x30,lsr#32 - add v17.4s,v17.4s,v25.4s - - b.lo .Ltail_neon - - add x5,x5,x6,lsl#32 // pack - add x7,x7,x8,lsl#32 - ldp x6,x8,[x1,#0] // load input - add x9,x9,x10,lsl#32 - add x11,x11,x12,lsl#32 - ldp x10,x12,[x1,#16] - add x13,x13,x14,lsl#32 - add x15,x15,x16,lsl#32 - ldp x14,x16,[x1,#32] - add x17,x17,x19,lsl#32 - add x20,x20,x21,lsl#32 - ldp x19,x21,[x1,#48] - add x1,x1,#64 -#ifdef __AARCH64EB__ - rev x5,x5 - rev x7,x7 - rev x9,x9 - rev x11,x11 - rev x13,x13 - rev x15,x15 - rev x17,x17 - rev x20,x20 -#endif - ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 - eor x5,x5,x6 - eor x7,x7,x8 - eor x9,x9,x10 - eor x11,x11,x12 - eor x13,x13,x14 - eor v0.16b,v0.16b,v20.16b - eor x15,x15,x16 - eor v1.16b,v1.16b,v21.16b - eor x17,x17,x19 - eor v2.16b,v2.16b,v22.16b - eor x20,x20,x21 - eor v3.16b,v3.16b,v23.16b - ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 - - stp x5,x7,[x0,#0] // store output - add x28,x28,#4 // increment counter - stp x9,x11,[x0,#16] - add v27.4s,v27.4s,v31.4s // += 4 - stp x13,x15,[x0,#32] - add v28.4s,v28.4s,v31.4s - stp x17,x20,[x0,#48] - add v29.4s,v29.4s,v31.4s - add x0,x0,#64 - - st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 - ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64 - - eor v4.16b,v4.16b,v20.16b - eor v5.16b,v5.16b,v21.16b - eor v6.16b,v6.16b,v22.16b - eor v7.16b,v7.16b,v23.16b - st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 - - eor v16.16b,v16.16b,v0.16b - eor v17.16b,v17.16b,v1.16b - eor v18.16b,v18.16b,v2.16b - eor v19.16b,v19.16b,v3.16b - st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64 - - b.hi .Loop_outer_neon - - ldp x19,x20,[x29,#16] - add sp,sp,#64 - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldp x29,x30,[sp],#96 - ret - -.Ltail_neon: - add x2,x2,#256 - cmp x2,#64 - b.lo .Less_than_64 - - add x5,x5,x6,lsl#32 // pack - add x7,x7,x8,lsl#32 - ldp x6,x8,[x1,#0] // load input - add x9,x9,x10,lsl#32 - add x11,x11,x12,lsl#32 - ldp x10,x12,[x1,#16] - add x13,x13,x14,lsl#32 - add x15,x15,x16,lsl#32 - ldp x14,x16,[x1,#32] - add x17,x17,x19,lsl#32 - add x20,x20,x21,lsl#32 - ldp x19,x21,[x1,#48] - add x1,x1,#64 -#ifdef __AARCH64EB__ - rev x5,x5 - rev x7,x7 - rev x9,x9 - rev x11,x11 - rev x13,x13 - rev x15,x15 - rev x17,x17 - rev x20,x20 -#endif - eor x5,x5,x6 - eor x7,x7,x8 - eor x9,x9,x10 - eor x11,x11,x12 - eor x13,x13,x14 - eor x15,x15,x16 - eor x17,x17,x19 - eor x20,x20,x21 - - stp x5,x7,[x0,#0] // store output - add x28,x28,#4 // increment counter - stp x9,x11,[x0,#16] - stp x13,x15,[x0,#32] - stp x17,x20,[x0,#48] - add x0,x0,#64 - b.eq .Ldone_neon - sub x2,x2,#64 - cmp x2,#64 - b.lo .Less_than_128 - - ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 - eor v0.16b,v0.16b,v20.16b - eor v1.16b,v1.16b,v21.16b - eor v2.16b,v2.16b,v22.16b - eor v3.16b,v3.16b,v23.16b - st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 - b.eq .Ldone_neon - sub x2,x2,#64 - cmp x2,#64 - b.lo .Less_than_192 - - ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 - eor v4.16b,v4.16b,v20.16b - eor v5.16b,v5.16b,v21.16b - eor v6.16b,v6.16b,v22.16b - eor v7.16b,v7.16b,v23.16b - st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 - b.eq .Ldone_neon - sub x2,x2,#64 - - st1 {v16.16b,v17.16b,v18.16b,v19.16b},[sp] - b .Last_neon - -.Less_than_128: - st1 {v0.16b,v1.16b,v2.16b,v3.16b},[sp] - b .Last_neon -.Less_than_192: - st1 {v4.16b,v5.16b,v6.16b,v7.16b},[sp] - b .Last_neon - -.align 4 -.Last_neon: - sub x0,x0,#1 - add x1,x1,x2 - add x0,x0,x2 - add x4,sp,x2 - neg x2,x2 - -.Loop_tail_neon: - ldrb w10,[x1,x2] - ldrb w11,[x4,x2] - add x2,x2,#1 - eor w10,w10,w11 - strb w10,[x0,x2] - cbnz x2,.Loop_tail_neon - - stp xzr,xzr,[sp,#0] - stp xzr,xzr,[sp,#16] - stp xzr,xzr,[sp,#32] - stp xzr,xzr,[sp,#48] - -.Ldone_neon: - ldp x19,x20,[x29,#16] - add sp,sp,#64 - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldp x29,x30,[sp],#96 - ret - -.L512_or_more_neon: - sub sp,sp,#128+64 - - ldp x22,x23,[x5] // load sigma - ld1 {v24.4s},[x5],#16 - ldp x24,x25,[x3] // load key - ldp x26,x27,[x3,#16] - ld1 {v25.4s,v26.4s},[x3] - ldp x28,x30,[x4] // load counter - ld1 {v27.4s},[x4] - ld1 {v31.4s},[x5] -#ifdef __AARCH64EB__ - rev64 v24.4s,v24.4s - ror x24,x24,#32 - ror x25,x25,#32 - ror x26,x26,#32 - ror x27,x27,#32 - ror x28,x28,#32 - ror x30,x30,#32 -#endif - add v27.4s,v27.4s,v31.4s // += 1 - stp q24,q25,[sp,#0] // off-load key block, invariant part - add v27.4s,v27.4s,v31.4s // not typo - str q26,[sp,#32] - add v28.4s,v27.4s,v31.4s - add v29.4s,v28.4s,v31.4s - add v30.4s,v29.4s,v31.4s - shl v31.4s,v31.4s,#2 // 1 -> 4 - - stp d8,d9,[sp,#128+0] // meet ABI requirements - stp d10,d11,[sp,#128+16] - stp d12,d13,[sp,#128+32] - stp d14,d15,[sp,#128+48] - - sub x2,x2,#512 // not typo - -.Loop_outer_512_neon: - mov v0.16b,v24.16b - mov v4.16b,v24.16b - mov v8.16b,v24.16b - mov v12.16b,v24.16b - mov v16.16b,v24.16b - mov v20.16b,v24.16b - mov v1.16b,v25.16b - mov w5,w22 // unpack key block - mov v5.16b,v25.16b - lsr x6,x22,#32 - mov v9.16b,v25.16b - mov w7,w23 - mov v13.16b,v25.16b - lsr x8,x23,#32 - mov v17.16b,v25.16b - mov w9,w24 - mov v21.16b,v25.16b - lsr x10,x24,#32 - mov v3.16b,v27.16b - mov w11,w25 - mov v7.16b,v28.16b - lsr x12,x25,#32 - mov v11.16b,v29.16b - mov w13,w26 - mov v15.16b,v30.16b - lsr x14,x26,#32 - mov v2.16b,v26.16b - mov w15,w27 - mov v6.16b,v26.16b - lsr x16,x27,#32 - add v19.4s,v3.4s,v31.4s // +4 - mov w17,w28 - add v23.4s,v7.4s,v31.4s // +4 - lsr x19,x28,#32 - mov v10.16b,v26.16b - mov w20,w30 - mov v14.16b,v26.16b - lsr x21,x30,#32 - mov v18.16b,v26.16b - stp q27,q28,[sp,#48] // off-load key block, variable part - mov v22.16b,v26.16b - str q29,[sp,#80] - - mov x4,#5 - subs x2,x2,#512 -.Loop_upper_neon: - sub x4,x4,#1 - add v0.4s,v0.4s,v1.4s - add w5,w5,w9 - add v4.4s,v4.4s,v5.4s - add w6,w6,w10 - add v8.4s,v8.4s,v9.4s - add w7,w7,w11 - add v12.4s,v12.4s,v13.4s - add w8,w8,w12 - add v16.4s,v16.4s,v17.4s - eor w17,w17,w5 - add v20.4s,v20.4s,v21.4s - eor w19,w19,w6 - eor v3.16b,v3.16b,v0.16b - eor w20,w20,w7 - eor v7.16b,v7.16b,v4.16b - eor w21,w21,w8 - eor v11.16b,v11.16b,v8.16b - ror w17,w17,#16 - eor v15.16b,v15.16b,v12.16b - ror w19,w19,#16 - eor v19.16b,v19.16b,v16.16b - ror w20,w20,#16 - eor v23.16b,v23.16b,v20.16b - ror w21,w21,#16 - rev32 v3.8h,v3.8h - add w13,w13,w17 - rev32 v7.8h,v7.8h - add w14,w14,w19 - rev32 v11.8h,v11.8h - add w15,w15,w20 - rev32 v15.8h,v15.8h - add w16,w16,w21 - rev32 v19.8h,v19.8h - eor w9,w9,w13 - rev32 v23.8h,v23.8h - eor w10,w10,w14 - add v2.4s,v2.4s,v3.4s - eor w11,w11,w15 - add v6.4s,v6.4s,v7.4s - eor w12,w12,w16 - add v10.4s,v10.4s,v11.4s - ror w9,w9,#20 - add v14.4s,v14.4s,v15.4s - ror w10,w10,#20 - add v18.4s,v18.4s,v19.4s - ror w11,w11,#20 - add v22.4s,v22.4s,v23.4s - ror w12,w12,#20 - eor v24.16b,v1.16b,v2.16b - add w5,w5,w9 - eor v25.16b,v5.16b,v6.16b - add w6,w6,w10 - eor v26.16b,v9.16b,v10.16b - add w7,w7,w11 - eor v27.16b,v13.16b,v14.16b - add w8,w8,w12 - eor v28.16b,v17.16b,v18.16b - eor w17,w17,w5 - eor v29.16b,v21.16b,v22.16b - eor w19,w19,w6 - ushr v1.4s,v24.4s,#20 - eor w20,w20,w7 - ushr v5.4s,v25.4s,#20 - eor w21,w21,w8 - ushr v9.4s,v26.4s,#20 - ror w17,w17,#24 - ushr v13.4s,v27.4s,#20 - ror w19,w19,#24 - ushr v17.4s,v28.4s,#20 - ror w20,w20,#24 - ushr v21.4s,v29.4s,#20 - ror w21,w21,#24 - sli v1.4s,v24.4s,#12 - add w13,w13,w17 - sli v5.4s,v25.4s,#12 - add w14,w14,w19 - sli v9.4s,v26.4s,#12 - add w15,w15,w20 - sli v13.4s,v27.4s,#12 - add w16,w16,w21 - sli v17.4s,v28.4s,#12 - eor w9,w9,w13 - sli v21.4s,v29.4s,#12 - eor w10,w10,w14 - add v0.4s,v0.4s,v1.4s - eor w11,w11,w15 - add v4.4s,v4.4s,v5.4s - eor w12,w12,w16 - add v8.4s,v8.4s,v9.4s - ror w9,w9,#25 - add v12.4s,v12.4s,v13.4s - ror w10,w10,#25 - add v16.4s,v16.4s,v17.4s - ror w11,w11,#25 - add v20.4s,v20.4s,v21.4s - ror w12,w12,#25 - eor v24.16b,v3.16b,v0.16b - add w5,w5,w10 - eor v25.16b,v7.16b,v4.16b - add w6,w6,w11 - eor v26.16b,v11.16b,v8.16b - add w7,w7,w12 - eor v27.16b,v15.16b,v12.16b - add w8,w8,w9 - eor v28.16b,v19.16b,v16.16b - eor w21,w21,w5 - eor v29.16b,v23.16b,v20.16b - eor w17,w17,w6 - ushr v3.4s,v24.4s,#24 - eor w19,w19,w7 - ushr v7.4s,v25.4s,#24 - eor w20,w20,w8 - ushr v11.4s,v26.4s,#24 - ror w21,w21,#16 - ushr v15.4s,v27.4s,#24 - ror w17,w17,#16 - ushr v19.4s,v28.4s,#24 - ror w19,w19,#16 - ushr v23.4s,v29.4s,#24 - ror w20,w20,#16 - sli v3.4s,v24.4s,#8 - add w15,w15,w21 - sli v7.4s,v25.4s,#8 - add w16,w16,w17 - sli v11.4s,v26.4s,#8 - add w13,w13,w19 - sli v15.4s,v27.4s,#8 - add w14,w14,w20 - sli v19.4s,v28.4s,#8 - eor w10,w10,w15 - sli v23.4s,v29.4s,#8 - eor w11,w11,w16 - add v2.4s,v2.4s,v3.4s - eor w12,w12,w13 - add v6.4s,v6.4s,v7.4s - eor w9,w9,w14 - add v10.4s,v10.4s,v11.4s - ror w10,w10,#20 - add v14.4s,v14.4s,v15.4s - ror w11,w11,#20 - add v18.4s,v18.4s,v19.4s - ror w12,w12,#20 - add v22.4s,v22.4s,v23.4s - ror w9,w9,#20 - eor v24.16b,v1.16b,v2.16b - add w5,w5,w10 - eor v25.16b,v5.16b,v6.16b - add w6,w6,w11 - eor v26.16b,v9.16b,v10.16b - add w7,w7,w12 - eor v27.16b,v13.16b,v14.16b - add w8,w8,w9 - eor v28.16b,v17.16b,v18.16b - eor w21,w21,w5 - eor v29.16b,v21.16b,v22.16b - eor w17,w17,w6 - ushr v1.4s,v24.4s,#25 - eor w19,w19,w7 - ushr v5.4s,v25.4s,#25 - eor w20,w20,w8 - ushr v9.4s,v26.4s,#25 - ror w21,w21,#24 - ushr v13.4s,v27.4s,#25 - ror w17,w17,#24 - ushr v17.4s,v28.4s,#25 - ror w19,w19,#24 - ushr v21.4s,v29.4s,#25 - ror w20,w20,#24 - sli v1.4s,v24.4s,#7 - add w15,w15,w21 - sli v5.4s,v25.4s,#7 - add w16,w16,w17 - sli v9.4s,v26.4s,#7 - add w13,w13,w19 - sli v13.4s,v27.4s,#7 - add w14,w14,w20 - sli v17.4s,v28.4s,#7 - eor w10,w10,w15 - sli v21.4s,v29.4s,#7 - eor w11,w11,w16 - ext v2.16b,v2.16b,v2.16b,#8 - eor w12,w12,w13 - ext v6.16b,v6.16b,v6.16b,#8 - eor w9,w9,w14 - ext v10.16b,v10.16b,v10.16b,#8 - ror w10,w10,#25 - ext v14.16b,v14.16b,v14.16b,#8 - ror w11,w11,#25 - ext v18.16b,v18.16b,v18.16b,#8 - ror w12,w12,#25 - ext v22.16b,v22.16b,v22.16b,#8 - ror w9,w9,#25 - ext v3.16b,v3.16b,v3.16b,#12 - ext v7.16b,v7.16b,v7.16b,#12 - ext v11.16b,v11.16b,v11.16b,#12 - ext v15.16b,v15.16b,v15.16b,#12 - ext v19.16b,v19.16b,v19.16b,#12 - ext v23.16b,v23.16b,v23.16b,#12 - ext v1.16b,v1.16b,v1.16b,#4 - ext v5.16b,v5.16b,v5.16b,#4 - ext v9.16b,v9.16b,v9.16b,#4 - ext v13.16b,v13.16b,v13.16b,#4 - ext v17.16b,v17.16b,v17.16b,#4 - ext v21.16b,v21.16b,v21.16b,#4 - add v0.4s,v0.4s,v1.4s - add w5,w5,w9 - add v4.4s,v4.4s,v5.4s - add w6,w6,w10 - add v8.4s,v8.4s,v9.4s - add w7,w7,w11 - add v12.4s,v12.4s,v13.4s - add w8,w8,w12 - add v16.4s,v16.4s,v17.4s - eor w17,w17,w5 - add v20.4s,v20.4s,v21.4s - eor w19,w19,w6 - eor v3.16b,v3.16b,v0.16b - eor w20,w20,w7 - eor v7.16b,v7.16b,v4.16b - eor w21,w21,w8 - eor v11.16b,v11.16b,v8.16b - ror w17,w17,#16 - eor v15.16b,v15.16b,v12.16b - ror w19,w19,#16 - eor v19.16b,v19.16b,v16.16b - ror w20,w20,#16 - eor v23.16b,v23.16b,v20.16b - ror w21,w21,#16 - rev32 v3.8h,v3.8h - add w13,w13,w17 - rev32 v7.8h,v7.8h - add w14,w14,w19 - rev32 v11.8h,v11.8h - add w15,w15,w20 - rev32 v15.8h,v15.8h - add w16,w16,w21 - rev32 v19.8h,v19.8h - eor w9,w9,w13 - rev32 v23.8h,v23.8h - eor w10,w10,w14 - add v2.4s,v2.4s,v3.4s - eor w11,w11,w15 - add v6.4s,v6.4s,v7.4s - eor w12,w12,w16 - add v10.4s,v10.4s,v11.4s - ror w9,w9,#20 - add v14.4s,v14.4s,v15.4s - ror w10,w10,#20 - add v18.4s,v18.4s,v19.4s - ror w11,w11,#20 - add v22.4s,v22.4s,v23.4s - ror w12,w12,#20 - eor v24.16b,v1.16b,v2.16b - add w5,w5,w9 - eor v25.16b,v5.16b,v6.16b - add w6,w6,w10 - eor v26.16b,v9.16b,v10.16b - add w7,w7,w11 - eor v27.16b,v13.16b,v14.16b - add w8,w8,w12 - eor v28.16b,v17.16b,v18.16b - eor w17,w17,w5 - eor v29.16b,v21.16b,v22.16b - eor w19,w19,w6 - ushr v1.4s,v24.4s,#20 - eor w20,w20,w7 - ushr v5.4s,v25.4s,#20 - eor w21,w21,w8 - ushr v9.4s,v26.4s,#20 - ror w17,w17,#24 - ushr v13.4s,v27.4s,#20 - ror w19,w19,#24 - ushr v17.4s,v28.4s,#20 - ror w20,w20,#24 - ushr v21.4s,v29.4s,#20 - ror w21,w21,#24 - sli v1.4s,v24.4s,#12 - add w13,w13,w17 - sli v5.4s,v25.4s,#12 - add w14,w14,w19 - sli v9.4s,v26.4s,#12 - add w15,w15,w20 - sli v13.4s,v27.4s,#12 - add w16,w16,w21 - sli v17.4s,v28.4s,#12 - eor w9,w9,w13 - sli v21.4s,v29.4s,#12 - eor w10,w10,w14 - add v0.4s,v0.4s,v1.4s - eor w11,w11,w15 - add v4.4s,v4.4s,v5.4s - eor w12,w12,w16 - add v8.4s,v8.4s,v9.4s - ror w9,w9,#25 - add v12.4s,v12.4s,v13.4s - ror w10,w10,#25 - add v16.4s,v16.4s,v17.4s - ror w11,w11,#25 - add v20.4s,v20.4s,v21.4s - ror w12,w12,#25 - eor v24.16b,v3.16b,v0.16b - add w5,w5,w10 - eor v25.16b,v7.16b,v4.16b - add w6,w6,w11 - eor v26.16b,v11.16b,v8.16b - add w7,w7,w12 - eor v27.16b,v15.16b,v12.16b - add w8,w8,w9 - eor v28.16b,v19.16b,v16.16b - eor w21,w21,w5 - eor v29.16b,v23.16b,v20.16b - eor w17,w17,w6 - ushr v3.4s,v24.4s,#24 - eor w19,w19,w7 - ushr v7.4s,v25.4s,#24 - eor w20,w20,w8 - ushr v11.4s,v26.4s,#24 - ror w21,w21,#16 - ushr v15.4s,v27.4s,#24 - ror w17,w17,#16 - ushr v19.4s,v28.4s,#24 - ror w19,w19,#16 - ushr v23.4s,v29.4s,#24 - ror w20,w20,#16 - sli v3.4s,v24.4s,#8 - add w15,w15,w21 - sli v7.4s,v25.4s,#8 - add w16,w16,w17 - sli v11.4s,v26.4s,#8 - add w13,w13,w19 - sli v15.4s,v27.4s,#8 - add w14,w14,w20 - sli v19.4s,v28.4s,#8 - eor w10,w10,w15 - sli v23.4s,v29.4s,#8 - eor w11,w11,w16 - add v2.4s,v2.4s,v3.4s - eor w12,w12,w13 - add v6.4s,v6.4s,v7.4s - eor w9,w9,w14 - add v10.4s,v10.4s,v11.4s - ror w10,w10,#20 - add v14.4s,v14.4s,v15.4s - ror w11,w11,#20 - add v18.4s,v18.4s,v19.4s - ror w12,w12,#20 - add v22.4s,v22.4s,v23.4s - ror w9,w9,#20 - eor v24.16b,v1.16b,v2.16b - add w5,w5,w10 - eor v25.16b,v5.16b,v6.16b - add w6,w6,w11 - eor v26.16b,v9.16b,v10.16b - add w7,w7,w12 - eor v27.16b,v13.16b,v14.16b - add w8,w8,w9 - eor v28.16b,v17.16b,v18.16b - eor w21,w21,w5 - eor v29.16b,v21.16b,v22.16b - eor w17,w17,w6 - ushr v1.4s,v24.4s,#25 - eor w19,w19,w7 - ushr v5.4s,v25.4s,#25 - eor w20,w20,w8 - ushr v9.4s,v26.4s,#25 - ror w21,w21,#24 - ushr v13.4s,v27.4s,#25 - ror w17,w17,#24 - ushr v17.4s,v28.4s,#25 - ror w19,w19,#24 - ushr v21.4s,v29.4s,#25 - ror w20,w20,#24 - sli v1.4s,v24.4s,#7 - add w15,w15,w21 - sli v5.4s,v25.4s,#7 - add w16,w16,w17 - sli v9.4s,v26.4s,#7 - add w13,w13,w19 - sli v13.4s,v27.4s,#7 - add w14,w14,w20 - sli v17.4s,v28.4s,#7 - eor w10,w10,w15 - sli v21.4s,v29.4s,#7 - eor w11,w11,w16 - ext v2.16b,v2.16b,v2.16b,#8 - eor w12,w12,w13 - ext v6.16b,v6.16b,v6.16b,#8 - eor w9,w9,w14 - ext v10.16b,v10.16b,v10.16b,#8 - ror w10,w10,#25 - ext v14.16b,v14.16b,v14.16b,#8 - ror w11,w11,#25 - ext v18.16b,v18.16b,v18.16b,#8 - ror w12,w12,#25 - ext v22.16b,v22.16b,v22.16b,#8 - ror w9,w9,#25 - ext v3.16b,v3.16b,v3.16b,#4 - ext v7.16b,v7.16b,v7.16b,#4 - ext v11.16b,v11.16b,v11.16b,#4 - ext v15.16b,v15.16b,v15.16b,#4 - ext v19.16b,v19.16b,v19.16b,#4 - ext v23.16b,v23.16b,v23.16b,#4 - ext v1.16b,v1.16b,v1.16b,#12 - ext v5.16b,v5.16b,v5.16b,#12 - ext v9.16b,v9.16b,v9.16b,#12 - ext v13.16b,v13.16b,v13.16b,#12 - ext v17.16b,v17.16b,v17.16b,#12 - ext v21.16b,v21.16b,v21.16b,#12 - cbnz x4,.Loop_upper_neon - - add w5,w5,w22 // accumulate key block - add x6,x6,x22,lsr#32 - add w7,w7,w23 - add x8,x8,x23,lsr#32 - add w9,w9,w24 - add x10,x10,x24,lsr#32 - add w11,w11,w25 - add x12,x12,x25,lsr#32 - add w13,w13,w26 - add x14,x14,x26,lsr#32 - add w15,w15,w27 - add x16,x16,x27,lsr#32 - add w17,w17,w28 - add x19,x19,x28,lsr#32 - add w20,w20,w30 - add x21,x21,x30,lsr#32 - - add x5,x5,x6,lsl#32 // pack - add x7,x7,x8,lsl#32 - ldp x6,x8,[x1,#0] // load input - add x9,x9,x10,lsl#32 - add x11,x11,x12,lsl#32 - ldp x10,x12,[x1,#16] - add x13,x13,x14,lsl#32 - add x15,x15,x16,lsl#32 - ldp x14,x16,[x1,#32] - add x17,x17,x19,lsl#32 - add x20,x20,x21,lsl#32 - ldp x19,x21,[x1,#48] - add x1,x1,#64 -#ifdef __AARCH64EB__ - rev x5,x5 - rev x7,x7 - rev x9,x9 - rev x11,x11 - rev x13,x13 - rev x15,x15 - rev x17,x17 - rev x20,x20 -#endif - eor x5,x5,x6 - eor x7,x7,x8 - eor x9,x9,x10 - eor x11,x11,x12 - eor x13,x13,x14 - eor x15,x15,x16 - eor x17,x17,x19 - eor x20,x20,x21 - - stp x5,x7,[x0,#0] // store output - add x28,x28,#1 // increment counter - mov w5,w22 // unpack key block - lsr x6,x22,#32 - stp x9,x11,[x0,#16] - mov w7,w23 - lsr x8,x23,#32 - stp x13,x15,[x0,#32] - mov w9,w24 - lsr x10,x24,#32 - stp x17,x20,[x0,#48] - add x0,x0,#64 - mov w11,w25 - lsr x12,x25,#32 - mov w13,w26 - lsr x14,x26,#32 - mov w15,w27 - lsr x16,x27,#32 - mov w17,w28 - lsr x19,x28,#32 - mov w20,w30 - lsr x21,x30,#32 - - mov x4,#5 -.Loop_lower_neon: - sub x4,x4,#1 - add v0.4s,v0.4s,v1.4s - add w5,w5,w9 - add v4.4s,v4.4s,v5.4s - add w6,w6,w10 - add v8.4s,v8.4s,v9.4s - add w7,w7,w11 - add v12.4s,v12.4s,v13.4s - add w8,w8,w12 - add v16.4s,v16.4s,v17.4s - eor w17,w17,w5 - add v20.4s,v20.4s,v21.4s - eor w19,w19,w6 - eor v3.16b,v3.16b,v0.16b - eor w20,w20,w7 - eor v7.16b,v7.16b,v4.16b - eor w21,w21,w8 - eor v11.16b,v11.16b,v8.16b - ror w17,w17,#16 - eor v15.16b,v15.16b,v12.16b - ror w19,w19,#16 - eor v19.16b,v19.16b,v16.16b - ror w20,w20,#16 - eor v23.16b,v23.16b,v20.16b - ror w21,w21,#16 - rev32 v3.8h,v3.8h - add w13,w13,w17 - rev32 v7.8h,v7.8h - add w14,w14,w19 - rev32 v11.8h,v11.8h - add w15,w15,w20 - rev32 v15.8h,v15.8h - add w16,w16,w21 - rev32 v19.8h,v19.8h - eor w9,w9,w13 - rev32 v23.8h,v23.8h - eor w10,w10,w14 - add v2.4s,v2.4s,v3.4s - eor w11,w11,w15 - add v6.4s,v6.4s,v7.4s - eor w12,w12,w16 - add v10.4s,v10.4s,v11.4s - ror w9,w9,#20 - add v14.4s,v14.4s,v15.4s - ror w10,w10,#20 - add v18.4s,v18.4s,v19.4s - ror w11,w11,#20 - add v22.4s,v22.4s,v23.4s - ror w12,w12,#20 - eor v24.16b,v1.16b,v2.16b - add w5,w5,w9 - eor v25.16b,v5.16b,v6.16b - add w6,w6,w10 - eor v26.16b,v9.16b,v10.16b - add w7,w7,w11 - eor v27.16b,v13.16b,v14.16b - add w8,w8,w12 - eor v28.16b,v17.16b,v18.16b - eor w17,w17,w5 - eor v29.16b,v21.16b,v22.16b - eor w19,w19,w6 - ushr v1.4s,v24.4s,#20 - eor w20,w20,w7 - ushr v5.4s,v25.4s,#20 - eor w21,w21,w8 - ushr v9.4s,v26.4s,#20 - ror w17,w17,#24 - ushr v13.4s,v27.4s,#20 - ror w19,w19,#24 - ushr v17.4s,v28.4s,#20 - ror w20,w20,#24 - ushr v21.4s,v29.4s,#20 - ror w21,w21,#24 - sli v1.4s,v24.4s,#12 - add w13,w13,w17 - sli v5.4s,v25.4s,#12 - add w14,w14,w19 - sli v9.4s,v26.4s,#12 - add w15,w15,w20 - sli v13.4s,v27.4s,#12 - add w16,w16,w21 - sli v17.4s,v28.4s,#12 - eor w9,w9,w13 - sli v21.4s,v29.4s,#12 - eor w10,w10,w14 - add v0.4s,v0.4s,v1.4s - eor w11,w11,w15 - add v4.4s,v4.4s,v5.4s - eor w12,w12,w16 - add v8.4s,v8.4s,v9.4s - ror w9,w9,#25 - add v12.4s,v12.4s,v13.4s - ror w10,w10,#25 - add v16.4s,v16.4s,v17.4s - ror w11,w11,#25 - add v20.4s,v20.4s,v21.4s - ror w12,w12,#25 - eor v24.16b,v3.16b,v0.16b - add w5,w5,w10 - eor v25.16b,v7.16b,v4.16b - add w6,w6,w11 - eor v26.16b,v11.16b,v8.16b - add w7,w7,w12 - eor v27.16b,v15.16b,v12.16b - add w8,w8,w9 - eor v28.16b,v19.16b,v16.16b - eor w21,w21,w5 - eor v29.16b,v23.16b,v20.16b - eor w17,w17,w6 - ushr v3.4s,v24.4s,#24 - eor w19,w19,w7 - ushr v7.4s,v25.4s,#24 - eor w20,w20,w8 - ushr v11.4s,v26.4s,#24 - ror w21,w21,#16 - ushr v15.4s,v27.4s,#24 - ror w17,w17,#16 - ushr v19.4s,v28.4s,#24 - ror w19,w19,#16 - ushr v23.4s,v29.4s,#24 - ror w20,w20,#16 - sli v3.4s,v24.4s,#8 - add w15,w15,w21 - sli v7.4s,v25.4s,#8 - add w16,w16,w17 - sli v11.4s,v26.4s,#8 - add w13,w13,w19 - sli v15.4s,v27.4s,#8 - add w14,w14,w20 - sli v19.4s,v28.4s,#8 - eor w10,w10,w15 - sli v23.4s,v29.4s,#8 - eor w11,w11,w16 - add v2.4s,v2.4s,v3.4s - eor w12,w12,w13 - add v6.4s,v6.4s,v7.4s - eor w9,w9,w14 - add v10.4s,v10.4s,v11.4s - ror w10,w10,#20 - add v14.4s,v14.4s,v15.4s - ror w11,w11,#20 - add v18.4s,v18.4s,v19.4s - ror w12,w12,#20 - add v22.4s,v22.4s,v23.4s - ror w9,w9,#20 - eor v24.16b,v1.16b,v2.16b - add w5,w5,w10 - eor v25.16b,v5.16b,v6.16b - add w6,w6,w11 - eor v26.16b,v9.16b,v10.16b - add w7,w7,w12 - eor v27.16b,v13.16b,v14.16b - add w8,w8,w9 - eor v28.16b,v17.16b,v18.16b - eor w21,w21,w5 - eor v29.16b,v21.16b,v22.16b - eor w17,w17,w6 - ushr v1.4s,v24.4s,#25 - eor w19,w19,w7 - ushr v5.4s,v25.4s,#25 - eor w20,w20,w8 - ushr v9.4s,v26.4s,#25 - ror w21,w21,#24 - ushr v13.4s,v27.4s,#25 - ror w17,w17,#24 - ushr v17.4s,v28.4s,#25 - ror w19,w19,#24 - ushr v21.4s,v29.4s,#25 - ror w20,w20,#24 - sli v1.4s,v24.4s,#7 - add w15,w15,w21 - sli v5.4s,v25.4s,#7 - add w16,w16,w17 - sli v9.4s,v26.4s,#7 - add w13,w13,w19 - sli v13.4s,v27.4s,#7 - add w14,w14,w20 - sli v17.4s,v28.4s,#7 - eor w10,w10,w15 - sli v21.4s,v29.4s,#7 - eor w11,w11,w16 - ext v2.16b,v2.16b,v2.16b,#8 - eor w12,w12,w13 - ext v6.16b,v6.16b,v6.16b,#8 - eor w9,w9,w14 - ext v10.16b,v10.16b,v10.16b,#8 - ror w10,w10,#25 - ext v14.16b,v14.16b,v14.16b,#8 - ror w11,w11,#25 - ext v18.16b,v18.16b,v18.16b,#8 - ror w12,w12,#25 - ext v22.16b,v22.16b,v22.16b,#8 - ror w9,w9,#25 - ext v3.16b,v3.16b,v3.16b,#12 - ext v7.16b,v7.16b,v7.16b,#12 - ext v11.16b,v11.16b,v11.16b,#12 - ext v15.16b,v15.16b,v15.16b,#12 - ext v19.16b,v19.16b,v19.16b,#12 - ext v23.16b,v23.16b,v23.16b,#12 - ext v1.16b,v1.16b,v1.16b,#4 - ext v5.16b,v5.16b,v5.16b,#4 - ext v9.16b,v9.16b,v9.16b,#4 - ext v13.16b,v13.16b,v13.16b,#4 - ext v17.16b,v17.16b,v17.16b,#4 - ext v21.16b,v21.16b,v21.16b,#4 - add v0.4s,v0.4s,v1.4s - add w5,w5,w9 - add v4.4s,v4.4s,v5.4s - add w6,w6,w10 - add v8.4s,v8.4s,v9.4s - add w7,w7,w11 - add v12.4s,v12.4s,v13.4s - add w8,w8,w12 - add v16.4s,v16.4s,v17.4s - eor w17,w17,w5 - add v20.4s,v20.4s,v21.4s - eor w19,w19,w6 - eor v3.16b,v3.16b,v0.16b - eor w20,w20,w7 - eor v7.16b,v7.16b,v4.16b - eor w21,w21,w8 - eor v11.16b,v11.16b,v8.16b - ror w17,w17,#16 - eor v15.16b,v15.16b,v12.16b - ror w19,w19,#16 - eor v19.16b,v19.16b,v16.16b - ror w20,w20,#16 - eor v23.16b,v23.16b,v20.16b - ror w21,w21,#16 - rev32 v3.8h,v3.8h - add w13,w13,w17 - rev32 v7.8h,v7.8h - add w14,w14,w19 - rev32 v11.8h,v11.8h - add w15,w15,w20 - rev32 v15.8h,v15.8h - add w16,w16,w21 - rev32 v19.8h,v19.8h - eor w9,w9,w13 - rev32 v23.8h,v23.8h - eor w10,w10,w14 - add v2.4s,v2.4s,v3.4s - eor w11,w11,w15 - add v6.4s,v6.4s,v7.4s - eor w12,w12,w16 - add v10.4s,v10.4s,v11.4s - ror w9,w9,#20 - add v14.4s,v14.4s,v15.4s - ror w10,w10,#20 - add v18.4s,v18.4s,v19.4s - ror w11,w11,#20 - add v22.4s,v22.4s,v23.4s - ror w12,w12,#20 - eor v24.16b,v1.16b,v2.16b - add w5,w5,w9 - eor v25.16b,v5.16b,v6.16b - add w6,w6,w10 - eor v26.16b,v9.16b,v10.16b - add w7,w7,w11 - eor v27.16b,v13.16b,v14.16b - add w8,w8,w12 - eor v28.16b,v17.16b,v18.16b - eor w17,w17,w5 - eor v29.16b,v21.16b,v22.16b - eor w19,w19,w6 - ushr v1.4s,v24.4s,#20 - eor w20,w20,w7 - ushr v5.4s,v25.4s,#20 - eor w21,w21,w8 - ushr v9.4s,v26.4s,#20 - ror w17,w17,#24 - ushr v13.4s,v27.4s,#20 - ror w19,w19,#24 - ushr v17.4s,v28.4s,#20 - ror w20,w20,#24 - ushr v21.4s,v29.4s,#20 - ror w21,w21,#24 - sli v1.4s,v24.4s,#12 - add w13,w13,w17 - sli v5.4s,v25.4s,#12 - add w14,w14,w19 - sli v9.4s,v26.4s,#12 - add w15,w15,w20 - sli v13.4s,v27.4s,#12 - add w16,w16,w21 - sli v17.4s,v28.4s,#12 - eor w9,w9,w13 - sli v21.4s,v29.4s,#12 - eor w10,w10,w14 - add v0.4s,v0.4s,v1.4s - eor w11,w11,w15 - add v4.4s,v4.4s,v5.4s - eor w12,w12,w16 - add v8.4s,v8.4s,v9.4s - ror w9,w9,#25 - add v12.4s,v12.4s,v13.4s - ror w10,w10,#25 - add v16.4s,v16.4s,v17.4s - ror w11,w11,#25 - add v20.4s,v20.4s,v21.4s - ror w12,w12,#25 - eor v24.16b,v3.16b,v0.16b - add w5,w5,w10 - eor v25.16b,v7.16b,v4.16b - add w6,w6,w11 - eor v26.16b,v11.16b,v8.16b - add w7,w7,w12 - eor v27.16b,v15.16b,v12.16b - add w8,w8,w9 - eor v28.16b,v19.16b,v16.16b - eor w21,w21,w5 - eor v29.16b,v23.16b,v20.16b - eor w17,w17,w6 - ushr v3.4s,v24.4s,#24 - eor w19,w19,w7 - ushr v7.4s,v25.4s,#24 - eor w20,w20,w8 - ushr v11.4s,v26.4s,#24 - ror w21,w21,#16 - ushr v15.4s,v27.4s,#24 - ror w17,w17,#16 - ushr v19.4s,v28.4s,#24 - ror w19,w19,#16 - ushr v23.4s,v29.4s,#24 - ror w20,w20,#16 - sli v3.4s,v24.4s,#8 - add w15,w15,w21 - sli v7.4s,v25.4s,#8 - add w16,w16,w17 - sli v11.4s,v26.4s,#8 - add w13,w13,w19 - sli v15.4s,v27.4s,#8 - add w14,w14,w20 - sli v19.4s,v28.4s,#8 - eor w10,w10,w15 - sli v23.4s,v29.4s,#8 - eor w11,w11,w16 - add v2.4s,v2.4s,v3.4s - eor w12,w12,w13 - add v6.4s,v6.4s,v7.4s - eor w9,w9,w14 - add v10.4s,v10.4s,v11.4s - ror w10,w10,#20 - add v14.4s,v14.4s,v15.4s - ror w11,w11,#20 - add v18.4s,v18.4s,v19.4s - ror w12,w12,#20 - add v22.4s,v22.4s,v23.4s - ror w9,w9,#20 - eor v24.16b,v1.16b,v2.16b - add w5,w5,w10 - eor v25.16b,v5.16b,v6.16b - add w6,w6,w11 - eor v26.16b,v9.16b,v10.16b - add w7,w7,w12 - eor v27.16b,v13.16b,v14.16b - add w8,w8,w9 - eor v28.16b,v17.16b,v18.16b - eor w21,w21,w5 - eor v29.16b,v21.16b,v22.16b - eor w17,w17,w6 - ushr v1.4s,v24.4s,#25 - eor w19,w19,w7 - ushr v5.4s,v25.4s,#25 - eor w20,w20,w8 - ushr v9.4s,v26.4s,#25 - ror w21,w21,#24 - ushr v13.4s,v27.4s,#25 - ror w17,w17,#24 - ushr v17.4s,v28.4s,#25 - ror w19,w19,#24 - ushr v21.4s,v29.4s,#25 - ror w20,w20,#24 - sli v1.4s,v24.4s,#7 - add w15,w15,w21 - sli v5.4s,v25.4s,#7 - add w16,w16,w17 - sli v9.4s,v26.4s,#7 - add w13,w13,w19 - sli v13.4s,v27.4s,#7 - add w14,w14,w20 - sli v17.4s,v28.4s,#7 - eor w10,w10,w15 - sli v21.4s,v29.4s,#7 - eor w11,w11,w16 - ext v2.16b,v2.16b,v2.16b,#8 - eor w12,w12,w13 - ext v6.16b,v6.16b,v6.16b,#8 - eor w9,w9,w14 - ext v10.16b,v10.16b,v10.16b,#8 - ror w10,w10,#25 - ext v14.16b,v14.16b,v14.16b,#8 - ror w11,w11,#25 - ext v18.16b,v18.16b,v18.16b,#8 - ror w12,w12,#25 - ext v22.16b,v22.16b,v22.16b,#8 - ror w9,w9,#25 - ext v3.16b,v3.16b,v3.16b,#4 - ext v7.16b,v7.16b,v7.16b,#4 - ext v11.16b,v11.16b,v11.16b,#4 - ext v15.16b,v15.16b,v15.16b,#4 - ext v19.16b,v19.16b,v19.16b,#4 - ext v23.16b,v23.16b,v23.16b,#4 - ext v1.16b,v1.16b,v1.16b,#12 - ext v5.16b,v5.16b,v5.16b,#12 - ext v9.16b,v9.16b,v9.16b,#12 - ext v13.16b,v13.16b,v13.16b,#12 - ext v17.16b,v17.16b,v17.16b,#12 - ext v21.16b,v21.16b,v21.16b,#12 - cbnz x4,.Loop_lower_neon - - add w5,w5,w22 // accumulate key block - ldp q24,q25,[sp,#0] - add x6,x6,x22,lsr#32 - ldp q26,q27,[sp,#32] - add w7,w7,w23 - ldp q28,q29,[sp,#64] - add x8,x8,x23,lsr#32 - add v0.4s,v0.4s,v24.4s - add w9,w9,w24 - add v4.4s,v4.4s,v24.4s - add x10,x10,x24,lsr#32 - add v8.4s,v8.4s,v24.4s - add w11,w11,w25 - add v12.4s,v12.4s,v24.4s - add x12,x12,x25,lsr#32 - add v16.4s,v16.4s,v24.4s - add w13,w13,w26 - add v20.4s,v20.4s,v24.4s - add x14,x14,x26,lsr#32 - add v2.4s,v2.4s,v26.4s - add w15,w15,w27 - add v6.4s,v6.4s,v26.4s - add x16,x16,x27,lsr#32 - add v10.4s,v10.4s,v26.4s - add w17,w17,w28 - add v14.4s,v14.4s,v26.4s - add x19,x19,x28,lsr#32 - add v18.4s,v18.4s,v26.4s - add w20,w20,w30 - add v22.4s,v22.4s,v26.4s - add x21,x21,x30,lsr#32 - add v19.4s,v19.4s,v31.4s // +4 - add x5,x5,x6,lsl#32 // pack - add v23.4s,v23.4s,v31.4s // +4 - add x7,x7,x8,lsl#32 - add v3.4s,v3.4s,v27.4s - ldp x6,x8,[x1,#0] // load input - add v7.4s,v7.4s,v28.4s - add x9,x9,x10,lsl#32 - add v11.4s,v11.4s,v29.4s - add x11,x11,x12,lsl#32 - add v15.4s,v15.4s,v30.4s - ldp x10,x12,[x1,#16] - add v19.4s,v19.4s,v27.4s - add x13,x13,x14,lsl#32 - add v23.4s,v23.4s,v28.4s - add x15,x15,x16,lsl#32 - add v1.4s,v1.4s,v25.4s - ldp x14,x16,[x1,#32] - add v5.4s,v5.4s,v25.4s - add x17,x17,x19,lsl#32 - add v9.4s,v9.4s,v25.4s - add x20,x20,x21,lsl#32 - add v13.4s,v13.4s,v25.4s - ldp x19,x21,[x1,#48] - add v17.4s,v17.4s,v25.4s - add x1,x1,#64 - add v21.4s,v21.4s,v25.4s - -#ifdef __AARCH64EB__ - rev x5,x5 - rev x7,x7 - rev x9,x9 - rev x11,x11 - rev x13,x13 - rev x15,x15 - rev x17,x17 - rev x20,x20 -#endif - ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64 - eor x5,x5,x6 - eor x7,x7,x8 - eor x9,x9,x10 - eor x11,x11,x12 - eor x13,x13,x14 - eor v0.16b,v0.16b,v24.16b - eor x15,x15,x16 - eor v1.16b,v1.16b,v25.16b - eor x17,x17,x19 - eor v2.16b,v2.16b,v26.16b - eor x20,x20,x21 - eor v3.16b,v3.16b,v27.16b - ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64 - - stp x5,x7,[x0,#0] // store output - add x28,x28,#7 // increment counter - stp x9,x11,[x0,#16] - stp x13,x15,[x0,#32] - stp x17,x20,[x0,#48] - add x0,x0,#64 - st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 - - ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64 - eor v4.16b,v4.16b,v24.16b - eor v5.16b,v5.16b,v25.16b - eor v6.16b,v6.16b,v26.16b - eor v7.16b,v7.16b,v27.16b - st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 - - ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 - eor v8.16b,v8.16b,v0.16b - ldp q24,q25,[sp,#0] - eor v9.16b,v9.16b,v1.16b - ldp q26,q27,[sp,#32] - eor v10.16b,v10.16b,v2.16b - eor v11.16b,v11.16b,v3.16b - st1 {v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64 - - ld1 {v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64 - eor v12.16b,v12.16b,v4.16b - eor v13.16b,v13.16b,v5.16b - eor v14.16b,v14.16b,v6.16b - eor v15.16b,v15.16b,v7.16b - st1 {v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64 - - ld1 {v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64 - eor v16.16b,v16.16b,v8.16b - eor v17.16b,v17.16b,v9.16b - eor v18.16b,v18.16b,v10.16b - eor v19.16b,v19.16b,v11.16b - st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64 - - shl v0.4s,v31.4s,#1 // 4 -> 8 - eor v20.16b,v20.16b,v12.16b - eor v21.16b,v21.16b,v13.16b - eor v22.16b,v22.16b,v14.16b - eor v23.16b,v23.16b,v15.16b - st1 {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64 - - add v27.4s,v27.4s,v0.4s // += 8 - add v28.4s,v28.4s,v0.4s - add v29.4s,v29.4s,v0.4s - add v30.4s,v30.4s,v0.4s - - b.hs .Loop_outer_512_neon - - adds x2,x2,#512 - ushr v0.4s,v31.4s,#2 // 4 -> 1 - - ldp d8,d9,[sp,#128+0] // meet ABI requirements - ldp d10,d11,[sp,#128+16] - ldp d12,d13,[sp,#128+32] - ldp d14,d15,[sp,#128+48] - - stp q24,q31,[sp,#0] // wipe off-load area - stp q24,q31,[sp,#32] - stp q24,q31,[sp,#64] - - b.eq .Ldone_512_neon - - cmp x2,#192 - sub v27.4s,v27.4s,v0.4s // -= 1 - sub v28.4s,v28.4s,v0.4s - sub v29.4s,v29.4s,v0.4s - add sp,sp,#128 - b.hs .Loop_outer_neon - - eor v25.16b,v25.16b,v25.16b - eor v26.16b,v26.16b,v26.16b - eor v27.16b,v27.16b,v27.16b - eor v28.16b,v28.16b,v28.16b - eor v29.16b,v29.16b,v29.16b - eor v30.16b,v30.16b,v30.16b - b .Loop_outer - -.Ldone_512_neon: - ldp x19,x20,[x29,#16] - add sp,sp,#128+64 - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldp x29,x30,[sp],#96 -.Labort_neon: - ret -ENDPROC(chacha20_neon) -#endif |