/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */ /* * Copyright (C) 2015-2018 Jason A. Donenfeld . All Rights Reserved. * Copyright (C) 2006-2017 CRYPTOGAMS by . All Rights Reserved. * * This is based in part on Andy Polyakov's implementation from CRYPTOGAMS. */ #include .text .align 5 .Lsigma: .quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral .Lone: .long 1,0,0,0 .align 5 ENTRY(chacha20_arm) cbz x2,.Labort stp x29,x30,[sp,#-96]! add x29,sp,#0 adr x5,.Lsigma stp x19,x20,[sp,#16] stp x21,x22,[sp,#32] stp x23,x24,[sp,#48] stp x25,x26,[sp,#64] stp x27,x28,[sp,#80] sub sp,sp,#64 ldp x22,x23,[x5] // load sigma ldp x24,x25,[x3] // load key ldp x26,x27,[x3,#16] ldp x28,x30,[x4] // load counter #ifdef __ARMEB__ ror x24,x24,#32 ror x25,x25,#32 ror x26,x26,#32 ror x27,x27,#32 ror x28,x28,#32 ror x30,x30,#32 #endif .Loop_outer: mov w5,w22 // unpack key block lsr x6,x22,#32 mov w7,w23 lsr x8,x23,#32 mov w9,w24 lsr x10,x24,#32 mov w11,w25 lsr x12,x25,#32 mov w13,w26 lsr x14,x26,#32 mov w15,w27 lsr x16,x27,#32 mov w17,w28 lsr x19,x28,#32 mov w20,w30 lsr x21,x30,#32 mov x4,#10 subs x2,x2,#64 .Loop: sub x4,x4,#1 add w5,w5,w9 add w6,w6,w10 add w7,w7,w11 add w8,w8,w12 eor w17,w17,w5 eor w19,w19,w6 eor w20,w20,w7 eor w21,w21,w8 ror w17,w17,#16 ror w19,w19,#16 ror w20,w20,#16 ror w21,w21,#16 add w13,w13,w17 add w14,w14,w19 add w15,w15,w20 add w16,w16,w21 eor w9,w9,w13 eor w10,w10,w14 eor w11,w11,w15 eor w12,w12,w16 ror w9,w9,#20 ror w10,w10,#20 ror w11,w11,#20 ror w12,w12,#20 add w5,w5,w9 add w6,w6,w10 add w7,w7,w11 add w8,w8,w12 eor w17,w17,w5 eor w19,w19,w6 eor w20,w20,w7 eor w21,w21,w8 ror w17,w17,#24 ror w19,w19,#24 ror w20,w20,#24 ror w21,w21,#24 add w13,w13,w17 add w14,w14,w19 add w15,w15,w20 add w16,w16,w21 eor w9,w9,w13 eor w10,w10,w14 eor w11,w11,w15 eor w12,w12,w16 ror w9,w9,#25 ror w10,w10,#25 ror w11,w11,#25 ror w12,w12,#25 add w5,w5,w10 add w6,w6,w11 add w7,w7,w12 add w8,w8,w9 eor w21,w21,w5 eor w17,w17,w6 eor w19,w19,w7 eor w20,w20,w8 ror w21,w21,#16 ror w17,w17,#16 ror w19,w19,#16 ror w20,w20,#16 add w15,w15,w21 add w16,w16,w17 add w13,w13,w19 add w14,w14,w20 eor w10,w10,w15 eor w11,w11,w16 eor w12,w12,w13 eor w9,w9,w14 ror w10,w10,#20 ror w11,w11,#20 ror w12,w12,#20 ror w9,w9,#20 add w5,w5,w10 add w6,w6,w11 add w7,w7,w12 add w8,w8,w9 eor w21,w21,w5 eor w17,w17,w6 eor w19,w19,w7 eor w20,w20,w8 ror w21,w21,#24 ror w17,w17,#24 ror w19,w19,#24 ror w20,w20,#24 add w15,w15,w21 add w16,w16,w17 add w13,w13,w19 add w14,w14,w20 eor w10,w10,w15 eor w11,w11,w16 eor w12,w12,w13 eor w9,w9,w14 ror w10,w10,#25 ror w11,w11,#25 ror w12,w12,#25 ror w9,w9,#25 cbnz x4,.Loop add w5,w5,w22 // accumulate key block add x6,x6,x22,lsr#32 add w7,w7,w23 add x8,x8,x23,lsr#32 add w9,w9,w24 add x10,x10,x24,lsr#32 add w11,w11,w25 add x12,x12,x25,lsr#32 add w13,w13,w26 add x14,x14,x26,lsr#32 add w15,w15,w27 add x16,x16,x27,lsr#32 add w17,w17,w28 add x19,x19,x28,lsr#32 add w20,w20,w30 add x21,x21,x30,lsr#32 b.lo .Ltail add x5,x5,x6,lsl#32 // pack add x7,x7,x8,lsl#32 ldp x6,x8,[x1,#0] // load input add x9,x9,x10,lsl#32 add x11,x11,x12,lsl#32 ldp x10,x12,[x1,#16] add x13,x13,x14,lsl#32 add x15,x15,x16,lsl#32 ldp x14,x16,[x1,#32] add x17,x17,x19,lsl#32 add x20,x20,x21,lsl#32 ldp x19,x21,[x1,#48] add x1,x1,#64 #ifdef __ARMEB__ rev x5,x5 rev x7,x7 rev x9,x9 rev x11,x11 rev x13,x13 rev x15,x15 rev x17,x17 rev x20,x20 #endif eor x5,x5,x6 eor x7,x7,x8 eor x9,x9,x10 eor x11,x11,x12 eor x13,x13,x14 eor x15,x15,x16 eor x17,x17,x19 eor x20,x20,x21 stp x5,x7,[x0,#0] // store output add x28,x28,#1 // increment counter stp x9,x11,[x0,#16] stp x13,x15,[x0,#32] stp x17,x20,[x0,#48] add x0,x0,#64 b.hi .Loop_outer ldp x19,x20,[x29,#16] add sp,sp,#64 ldp x21,x22,[x29,#32] ldp x23,x24,[x29,#48] ldp x25,x26,[x29,#64] ldp x27,x28,[x29,#80] ldp x29,x30,[sp],#96 .Labort: ret .align 4 .Ltail: add x2,x2,#64 .Less_than_64: sub x0,x0,#1 add x1,x1,x2 add x0,x0,x2 add x4,sp,x2 neg x2,x2 add x5,x5,x6,lsl#32 // pack add x7,x7,x8,lsl#32 add x9,x9,x10,lsl#32 add x11,x11,x12,lsl#32 add x13,x13,x14,lsl#32 add x15,x15,x16,lsl#32 add x17,x17,x19,lsl#32 add x20,x20,x21,lsl#32 #ifdef __ARMEB__ rev x5,x5 rev x7,x7 rev x9,x9 rev x11,x11 rev x13,x13 rev x15,x15 rev x17,x17 rev x20,x20 #endif stp x5,x7,[sp,#0] stp x9,x11,[sp,#16] stp x13,x15,[sp,#32] stp x17,x20,[sp,#48] .Loop_tail: ldrb w10,[x1,x2] ldrb w11,[x4,x2] add x2,x2,#1 eor w10,w10,w11 strb w10,[x0,x2] cbnz x2,.Loop_tail stp xzr,xzr,[sp,#0] stp xzr,xzr,[sp,#16] stp xzr,xzr,[sp,#32] stp xzr,xzr,[sp,#48] ldp x19,x20,[x29,#16] add sp,sp,#64 ldp x21,x22,[x29,#32] ldp x23,x24,[x29,#48] ldp x25,x26,[x29,#64] ldp x27,x28,[x29,#80] ldp x29,x30,[sp],#96 ret ENDPROC(chacha20_arm) #ifdef CONFIG_KERNEL_MODE_NEON .align 5 ENTRY(chacha20_neon) cbz x2,.Labort_neon stp x29,x30,[sp,#-96]! add x29,sp,#0 adr x5,.Lsigma stp x19,x20,[sp,#16] stp x21,x22,[sp,#32] stp x23,x24,[sp,#48] stp x25,x26,[sp,#64] stp x27,x28,[sp,#80] cmp x2,#512 b.hs .L512_or_more_neon sub sp,sp,#64 ldp x22,x23,[x5] // load sigma ld1 {v24.4s},[x5],#16 ldp x24,x25,[x3] // load key ldp x26,x27,[x3,#16] ld1 {v25.4s,v26.4s},[x3] ldp x28,x30,[x4] // load counter ld1 {v27.4s},[x4] ld1 {v31.4s},[x5] #ifdef __ARMEB__ rev64 v24.4s,v24.4s ror x24,x24,#32 ror x25,x25,#32 ror x26,x26,#32 ror x27,x27,#32 ror x28,x28,#32 ror x30,x30,#32 #endif add v27.4s,v27.4s,v31.4s // += 1 add v28.4s,v27.4s,v31.4s add v29.4s,v28.4s,v31.4s shl v31.4s,v31.4s,#2 // 1 -> 4 .Loop_outer_neon: mov w5,w22 // unpack key block lsr x6,x22,#32 mov v0.16b,v24.16b mov w7,w23 lsr x8,x23,#32 mov v4.16b,v24.16b mov w9,w24 lsr x10,x24,#32 mov v16.16b,v24.16b mov w11,w25 mov v1.16b,v25.16b lsr x12,x25,#32 mov v5.16b,v25.16b mov w13,w26 mov v17.16b,v25.16b lsr x14,x26,#32 mov v3.16b,v27.16b mov w15,w27 mov v7.16b,v28.16b lsr x16,x27,#32 mov v19.16b,v29.16b mov w17,w28 mov v2.16b,v26.16b lsr x19,x28,#32 mov v6.16b,v26.16b mov w20,w30 mov v18.16b,v26.16b lsr x21,x30,#32 mov x4,#10 subs x2,x2,#256 .Loop_neon: sub x4,x4,#1 add v0.4s,v0.4s,v1.4s add w5,w5,w9 add v4.4s,v4.4s,v5.4s add w6,w6,w10 add v16.4s,v16.4s,v17.4s add w7,w7,w11 eor v3.16b,v3.16b,v0.16b add w8,w8,w12 eor v7.16b,v7.16b,v4.16b eor w17,w17,w5 eor v19.16b,v19.16b,v16.16b eor w19,w19,w6 rev32 v3.8h,v3.8h eor w20,w20,w7 rev32 v7.8h,v7.8h eor w21,w21,w8 rev32 v19.8h,v19.8h ror w17,w17,#16 add v2.4s,v2.4s,v3.4s ror w19,w19,#16 add v6.4s,v6.4s,v7.4s ror w20,w20,#16 add v18.4s,v18.4s,v19.4s ror w21,w21,#16 eor v20.16b,v1.16b,v2.16b add w13,w13,w17 eor v21.16b,v5.16b,v6.16b add w14,w14,w19 eor v22.16b,v17.16b,v18.16b add w15,w15,w20 ushr v1.4s,v20.4s,#20 add w16,w16,w21 ushr v5.4s,v21.4s,#20 eor w9,w9,w13 ushr v17.4s,v22.4s,#20 eor w10,w10,w14 sli v1.4s,v20.4s,#12 eor w11,w11,w15 sli v5.4s,v21.4s,#12 eor w12,w12,w16 sli v17.4s,v22.4s,#12 ror w9,w9,#20 add v0.4s,v0.4s,v1.4s ror w10,w10,#20 add v4.4s,v4.4s,v5.4s ror w11,w11,#20 add v16.4s,v16.4s,v17.4s ror w12,w12,#20 eor v20.16b,v3.16b,v0.16b add w5,w5,w9 eor v21.16b,v7.16b,v4.16b add w6,w6,w10 eor v22.16b,v19.16b,v16.16b add w7,w7,w11 ushr v3.4s,v20.4s,#24 add w8,w8,w12 ushr v7.4s,v21.4s,#24 eor w17,w17,w5 ushr v19.4s,v22.4s,#24 eor w19,w19,w6 sli v3.4s,v20.4s,#8 eor w20,w20,w7 sli v7.4s,v21.4s,#8 eor w21,w21,w8 sli v19.4s,v22.4s,#8 ror w17,w17,#24 add v2.4s,v2.4s,v3.4s ror w19,w19,#24 add v6.4s,v6.4s,v7.4s ror w20,w20,#24 add v18.4s,v18.4s,v19.4s ror w21,w21,#24 eor v20.16b,v1.16b,v2.16b add w13,w13,w17 eor v21.16b,v5.16b,v6.16b add w14,w14,w19 eor v22.16b,v17.16b,v18.16b add w15,w15,w20 ushr v1.4s,v20.4s,#25 add w16,w16,w21 ushr v5.4s,v21.4s,#25 eor w9,w9,w13 ushr v17.4s,v22.4s,#25 eor w10,w10,w14 sli v1.4s,v20.4s,#7 eor w11,w11,w15 sli v5.4s,v21.4s,#7 eor w12,w12,w16 sli v17.4s,v22.4s,#7 ror w9,w9,#25 ext v2.16b,v2.16b,v2.16b,#8 ror w10,w10,#25 ext v6.16b,v6.16b,v6.16b,#8 ror w11,w11,#25 ext v18.16b,v18.16b,v18.16b,#8 ror w12,w12,#25 ext v3.16b,v3.16b,v3.16b,#12 ext v7.16b,v7.16b,v7.16b,#12 ext v19.16b,v19.16b,v19.16b,#12 ext v1.16b,v1.16b,v1.16b,#4 ext v5.16b,v5.16b,v5.16b,#4 ext v17.16b,v17.16b,v17.16b,#4 add v0.4s,v0.4s,v1.4s add w5,w5,w10 add v4.4s,v4.4s,v5.4s add w6,w6,w11 add v16.4s,v16.4s,v17.4s add w7,w7,w12 eor v3.16b,v3.16b,v0.16b add w8,w8,w9 eor v7.16b,v7.16b,v4.16b eor w21,w21,w5 eor v19.16b,v19.16b,v16.16b eor w17,w17,w6 rev32 v3.8h,v3.8h eor w19,w19,w7 rev32 v7.8h,v7.8h eor w20,w20,w8 rev32 v19.8h,v19.8h ror w21,w21,#16 add v2.4s,v2.4s,v3.4s ror w17,w17,#16 add v6.4s,v6.4s,v7.4s ror w19,w19,#16 add v18.4s,v18.4s,v19.4s ror w20,w20,#16 eor v20.16b,v1.16b,v2.16b add w15,w15,w21 eor v21.16b,v5.16b,v6.16b add w16,w16,w17 eor v22.16b,v17.16b,v18.16b add w13,w13,w19 ushr v1.4s,v20.4s,#20 add w14,w14,w20 ushr v5.4s,v21.4s,#20 eor w10,w10,w15 ushr v17.4s,v22.4s,#20 eor w11,w11,w16 sli v1.4s,v20.4s,#12 eor w12,w12,w13 sli v5.4s,v21.4s,#12 eor w9,w9,w14 sli v17.4s,v22.4s,#12 ror w10,w10,#20 add v0.4s,v0.4s,v1.4s ror w11,w11,#20 add v4.4s,v4.4s,v5.4s ror w12,w12,#20 add v16.4s,v16.4s,v17.4s ror w9,w9,#20 eor v20.16b,v3.16b,v0.16b add w5,w5,w10 eor v21.16b,v7.16b,v4.16b add w6,w6,w11 eor v22.16b,v19.16b,v16.16b add w7,w7,w12 ushr v3.4s,v20.4s,#24 add w8,w8,w9 ushr v7.4s,v21.4s,#24 eor w21,w21,w5 ushr v19.4s,v22.4s,#24 eor w17,w17,w6 sli v3.4s,v20.4s,#8 eor w19,w19,w7 sli v7.4s,v21.4s,#8 eor w20,w20,w8 sli v19.4s,v22.4s,#8 ror w21,w21,#24 add v2.4s,v2.4s,v3.4s ror w17,w17,#24 add v6.4s,v6.4s,v7.4s ror w19,w19,#24 add v18.4s,v18.4s,v19.4s ror w20,w20,#24 eor v20.16b,v1.16b,v2.16b add w15,w15,w21 eor v21.16b,v5.16b,v6.16b add w16,w16,w17 eor v22.16b,v17.16b,v18.16b add w13,w13,w19 ushr v1.4s,v20.4s,#25 add w14,w14,w20 ushr v5.4s,v21.4s,#25 eor w10,w10,w15 ushr v17.4s,v22.4s,#25 eor w11,w11,w16 sli v1.4s,v20.4s,#7 eor w12,w12,w13 sli v5.4s,v21.4s,#7 eor w9,w9,w14 sli v17.4s,v22.4s,#7 ror w10,w10,#25 ext v2.16b,v2.16b,v2.16b,#8 ror w11,w11,#25 ext v6.16b,v6.16b,v6.16b,#8 ror w12,w12,#25 ext v18.16b,v18.16b,v18.16b,#8 ror w9,w9,#25 ext v3.16b,v3.16b,v3.16b,#4 ext v7.16b,v7.16b,v7.16b,#4 ext v19.16b,v19.16b,v19.16b,#4 ext v1.16b,v1.16b,v1.16b,#12 ext v5.16b,v5.16b,v5.16b,#12 ext v17.16b,v17.16b,v17.16b,#12 cbnz x4,.Loop_neon add w5,w5,w22 // accumulate key block add v0.4s,v0.4s,v24.4s add x6,x6,x22,lsr#32 add v4.4s,v4.4s,v24.4s add w7,w7,w23 add v16.4s,v16.4s,v24.4s add x8,x8,x23,lsr#32 add v2.4s,v2.4s,v26.4s add w9,w9,w24 add v6.4s,v6.4s,v26.4s add x10,x10,x24,lsr#32 add v18.4s,v18.4s,v26.4s add w11,w11,w25 add v3.4s,v3.4s,v27.4s add x12,x12,x25,lsr#32 add w13,w13,w26 add v7.4s,v7.4s,v28.4s add x14,x14,x26,lsr#32 add w15,w15,w27 add v19.4s,v19.4s,v29.4s add x16,x16,x27,lsr#32 add w17,w17,w28 add v1.4s,v1.4s,v25.4s add x19,x19,x28,lsr#32 add w20,w20,w30 add v5.4s,v5.4s,v25.4s add x21,x21,x30,lsr#32 add v17.4s,v17.4s,v25.4s b.lo .Ltail_neon add x5,x5,x6,lsl#32 // pack add x7,x7,x8,lsl#32 ldp x6,x8,[x1,#0] // load input add x9,x9,x10,lsl#32 add x11,x11,x12,lsl#32 ldp x10,x12,[x1,#16] add x13,x13,x14,lsl#32 add x15,x15,x16,lsl#32 ldp x14,x16,[x1,#32] add x17,x17,x19,lsl#32 add x20,x20,x21,lsl#32 ldp x19,x21,[x1,#48] add x1,x1,#64 #ifdef __ARMEB__ rev x5,x5 rev x7,x7 rev x9,x9 rev x11,x11 rev x13,x13 rev x15,x15 rev x17,x17 rev x20,x20 #endif ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 eor x5,x5,x6 eor x7,x7,x8 eor x9,x9,x10 eor x11,x11,x12 eor x13,x13,x14 eor v0.16b,v0.16b,v20.16b eor x15,x15,x16 eor v1.16b,v1.16b,v21.16b eor x17,x17,x19 eor v2.16b,v2.16b,v22.16b eor x20,x20,x21 eor v3.16b,v3.16b,v23.16b ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 stp x5,x7,[x0,#0] // store output add x28,x28,#4 // increment counter stp x9,x11,[x0,#16] add v27.4s,v27.4s,v31.4s // += 4 stp x13,x15,[x0,#32] add v28.4s,v28.4s,v31.4s stp x17,x20,[x0,#48] add v29.4s,v29.4s,v31.4s add x0,x0,#64 st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64 eor v4.16b,v4.16b,v20.16b eor v5.16b,v5.16b,v21.16b eor v6.16b,v6.16b,v22.16b eor v7.16b,v7.16b,v23.16b st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 eor v16.16b,v16.16b,v0.16b eor v17.16b,v17.16b,v1.16b eor v18.16b,v18.16b,v2.16b eor v19.16b,v19.16b,v3.16b st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64 b.hi .Loop_outer_neon ldp x19,x20,[x29,#16] add sp,sp,#64 ldp x21,x22,[x29,#32] ldp x23,x24,[x29,#48] ldp x25,x26,[x29,#64] ldp x27,x28,[x29,#80] ldp x29,x30,[sp],#96 ret .Ltail_neon: add x2,x2,#256 cmp x2,#64 b.lo .Less_than_64 add x5,x5,x6,lsl#32 // pack add x7,x7,x8,lsl#32 ldp x6,x8,[x1,#0] // load input add x9,x9,x10,lsl#32 add x11,x11,x12,lsl#32 ldp x10,x12,[x1,#16] add x13,x13,x14,lsl#32 add x15,x15,x16,lsl#32 ldp x14,x16,[x1,#32] add x17,x17,x19,lsl#32 add x20,x20,x21,lsl#32 ldp x19,x21,[x1,#48] add x1,x1,#64 #ifdef __ARMEB__ rev x5,x5 rev x7,x7 rev x9,x9 rev x11,x11 rev x13,x13 rev x15,x15 rev x17,x17 rev x20,x20 #endif eor x5,x5,x6 eor x7,x7,x8 eor x9,x9,x10 eor x11,x11,x12 eor x13,x13,x14 eor x15,x15,x16 eor x17,x17,x19 eor x20,x20,x21 stp x5,x7,[x0,#0] // store output add x28,x28,#4 // increment counter stp x9,x11,[x0,#16] stp x13,x15,[x0,#32] stp x17,x20,[x0,#48] add x0,x0,#64 b.eq .Ldone_neon sub x2,x2,#64 cmp x2,#64 b.lo .Less_than_128 ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 eor v0.16b,v0.16b,v20.16b eor v1.16b,v1.16b,v21.16b eor v2.16b,v2.16b,v22.16b eor v3.16b,v3.16b,v23.16b st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 b.eq .Ldone_neon sub x2,x2,#64 cmp x2,#64 b.lo .Less_than_192 ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 eor v4.16b,v4.16b,v20.16b eor v5.16b,v5.16b,v21.16b eor v6.16b,v6.16b,v22.16b eor v7.16b,v7.16b,v23.16b st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 b.eq .Ldone_neon sub x2,x2,#64 st1 {v16.16b,v17.16b,v18.16b,v19.16b},[sp] b .Last_neon .Less_than_128: st1 {v0.16b,v1.16b,v2.16b,v3.16b},[sp] b .Last_neon .Less_than_192: st1 {v4.16b,v5.16b,v6.16b,v7.16b},[sp] b .Last_neon .align 4 .Last_neon: sub x0,x0,#1 add x1,x1,x2 add x0,x0,x2 add x4,sp,x2 neg x2,x2 .Loop_tail_neon: ldrb w10,[x1,x2] ldrb w11,[x4,x2] add x2,x2,#1 eor w10,w10,w11 strb w10,[x0,x2] cbnz x2,.Loop_tail_neon stp xzr,xzr,[sp,#0] stp xzr,xzr,[sp,#16] stp xzr,xzr,[sp,#32] stp xzr,xzr,[sp,#48] .Ldone_neon: ldp x19,x20,[x29,#16] add sp,sp,#64 ldp x21,x22,[x29,#32] ldp x23,x24,[x29,#48] ldp x25,x26,[x29,#64] ldp x27,x28,[x29,#80] ldp x29,x30,[sp],#96 ret .L512_or_more_neon: sub sp,sp,#128+64 ldp x22,x23,[x5] // load sigma ld1 {v24.4s},[x5],#16 ldp x24,x25,[x3] // load key ldp x26,x27,[x3,#16] ld1 {v25.4s,v26.4s},[x3] ldp x28,x30,[x4] // load counter ld1 {v27.4s},[x4] ld1 {v31.4s},[x5] #ifdef __ARMEB__ rev64 v24.4s,v24.4s ror x24,x24,#32 ror x25,x25,#32 ror x26,x26,#32 ror x27,x27,#32 ror x28,x28,#32 ror x30,x30,#32 #endif add v27.4s,v27.4s,v31.4s // += 1 stp q24,q25,[sp,#0] // off-load key block, invariant part add v27.4s,v27.4s,v31.4s // not typo str q26,[sp,#32] add v28.4s,v27.4s,v31.4s add v29.4s,v28.4s,v31.4s add v30.4s,v29.4s,v31.4s shl v31.4s,v31.4s,#2 // 1 -> 4 stp d8,d9,[sp,#128+0] // meet ABI requirements stp d10,d11,[sp,#128+16] stp d12,d13,[sp,#128+32] stp d14,d15,[sp,#128+48] sub x2,x2,#512 // not typo .Loop_outer_512_neon: mov v0.16b,v24.16b mov v4.16b,v24.16b mov v8.16b,v24.16b mov v12.16b,v24.16b mov v16.16b,v24.16b mov v20.16b,v24.16b mov v1.16b,v25.16b mov w5,w22 // unpack key block mov v5.16b,v25.16b lsr x6,x22,#32 mov v9.16b,v25.16b mov w7,w23 mov v13.16b,v25.16b lsr x8,x23,#32 mov v17.16b,v25.16b mov w9,w24 mov v21.16b,v25.16b lsr x10,x24,#32 mov v3.16b,v27.16b mov w11,w25 mov v7.16b,v28.16b lsr x12,x25,#32 mov v11.16b,v29.16b mov w13,w26 mov v15.16b,v30.16b lsr x14,x26,#32 mov v2.16b,v26.16b mov w15,w27 mov v6.16b,v26.16b lsr x16,x27,#32 add v19.4s,v3.4s,v31.4s // +4 mov w17,w28 add v23.4s,v7.4s,v31.4s // +4 lsr x19,x28,#32 mov v10.16b,v26.16b mov w20,w30 mov v14.16b,v26.16b lsr x21,x30,#32 mov v18.16b,v26.16b stp q27,q28,[sp,#48] // off-load key block, variable part mov v22.16b,v26.16b str q29,[sp,#80] mov x4,#5 subs x2,x2,#512 .Loop_upper_neon: sub x4,x4,#1 add v0.4s,v0.4s,v1.4s add w5,w5,w9 add v4.4s,v4.4s,v5.4s add w6,w6,w10 add v8.4s,v8.4s,v9.4s add w7,w7,w11 add v12.4s,v12.4s,v13.4s add w8,w8,w12 add v16.4s,v16.4s,v17.4s eor w17,w17,w5 add v20.4s,v20.4s,v21.4s eor w19,w19,w6 eor v3.16b,v3.16b,v0.16b eor w20,w20,w7 eor v7.16b,v7.16b,v4.16b eor w21,w21,w8 eor v11.16b,v11.16b,v8.16b ror w17,w17,#16 eor v15.16b,v15.16b,v12.16b ror w19,w19,#16 eor v19.16b,v19.16b,v16.16b ror w20,w20,#16 eor v23.16b,v23.16b,v20.16b ror w21,w21,#16 rev32 v3.8h,v3.8h add w13,w13,w17 rev32 v7.8h,v7.8h add w14,w14,w19 rev32 v11.8h,v11.8h add w15,w15,w20 rev32 v15.8h,v15.8h add w16,w16,w21 rev32 v19.8h,v19.8h eor w9,w9,w13 rev32 v23.8h,v23.8h eor w10,w10,w14 add v2.4s,v2.4s,v3.4s eor w11,w11,w15 add v6.4s,v6.4s,v7.4s eor w12,w12,w16 add v10.4s,v10.4s,v11.4s ror w9,w9,#20 add v14.4s,v14.4s,v15.4s ror w10,w10,#20 add v18.4s,v18.4s,v19.4s ror w11,w11,#20 add v22.4s,v22.4s,v23.4s ror w12,w12,#20 eor v24.16b,v1.16b,v2.16b add w5,w5,w9 eor v25.16b,v5.16b,v6.16b add w6,w6,w10 eor v26.16b,v9.16b,v10.16b add w7,w7,w11 eor v27.16b,v13.16b,v14.16b add w8,w8,w12 eor v28.16b,v17.16b,v18.16b eor w17,w17,w5 eor v29.16b,v21.16b,v22.16b eor w19,w19,w6 ushr v1.4s,v24.4s,#20 eor w20,w20,w7 ushr v5.4s,v25.4s,#20 eor w21,w21,w8 ushr v9.4s,v26.4s,#20 ror w17,w17,#24 ushr v13.4s,v27.4s,#20 ror w19,w19,#24 ushr v17.4s,v28.4s,#20 ror w20,w20,#24 ushr v21.4s,v29.4s,#20 ror w21,w21,#24 sli v1.4s,v24.4s,#12 add w13,w13,w17 sli v5.4s,v25.4s,#12 add w14,w14,w19 sli v9.4s,v26.4s,#12 add w15,w15,w20 sli v13.4s,v27.4s,#12 add w16,w16,w21 sli v17.4s,v28.4s,#12 eor w9,w9,w13 sli v21.4s,v29.4s,#12 eor w10,w10,w14 add v0.4s,v0.4s,v1.4s eor w11,w11,w15 add v4.4s,v4.4s,v5.4s eor w12,w12,w16 add v8.4s,v8.4s,v9.4s ror w9,w9,#25 add v12.4s,v12.4s,v13.4s ror w10,w10,#25 add v16.4s,v16.4s,v17.4s ror w11,w11,#25 add v20.4s,v20.4s,v21.4s ror w12,w12,#25 eor v24.16b,v3.16b,v0.16b add w5,w5,w10 eor v25.16b,v7.16b,v4.16b add w6,w6,w11 eor v26.16b,v11.16b,v8.16b add w7,w7,w12 eor v27.16b,v15.16b,v12.16b add w8,w8,w9 eor v28.16b,v19.16b,v16.16b eor w21,w21,w5 eor v29.16b,v23.16b,v20.16b eor w17,w17,w6 ushr v3.4s,v24.4s,#24 eor w19,w19,w7 ushr v7.4s,v25.4s,#24 eor w20,w20,w8 ushr v11.4s,v26.4s,#24 ror w21,w21,#16 ushr v15.4s,v27.4s,#24 ror w17,w17,#16 ushr v19.4s,v28.4s,#24 ror w19,w19,#16 ushr v23.4s,v29.4s,#24 ror w20,w20,#16 sli v3.4s,v24.4s,#8 add w15,w15,w21 sli v7.4s,v25.4s,#8 add w16,w16,w17 sli v11.4s,v26.4s,#8 add w13,w13,w19 sli v15.4s,v27.4s,#8 add w14,w14,w20 sli v19.4s,v28.4s,#8 eor w10,w10,w15 sli v23.4s,v29.4s,#8 eor w11,w11,w16 add v2.4s,v2.4s,v3.4s eor w12,w12,w13 add v6.4s,v6.4s,v7.4s eor w9,w9,w14 add v10.4s,v10.4s,v11.4s ror w10,w10,#20 add v14.4s,v14.4s,v15.4s ror w11,w11,#20 add v18.4s,v18.4s,v19.4s ror w12,w12,#20 add v22.4s,v22.4s,v23.4s ror w9,w9,#20 eor v24.16b,v1.16b,v2.16b add w5,w5,w10 eor v25.16b,v5.16b,v6.16b add w6,w6,w11 eor v26.16b,v9.16b,v10.16b add w7,w7,w12 eor v27.16b,v13.16b,v14.16b add w8,w8,w9 eor v28.16b,v17.16b,v18.16b eor w21,w21,w5 eor v29.16b,v21.16b,v22.16b eor w17,w17,w6 ushr v1.4s,v24.4s,#25 eor w19,w19,w7 ushr v5.4s,v25.4s,#25 eor w20,w20,w8 ushr v9.4s,v26.4s,#25 ror w21,w21,#24 ushr v13.4s,v27.4s,#25 ror w17,w17,#24 ushr v17.4s,v28.4s,#25 ror w19,w19,#24 ushr v21.4s,v29.4s,#25 ror w20,w20,#24 sli v1.4s,v24.4s,#7 add w15,w15,w21 sli v5.4s,v25.4s,#7 add w16,w16,w17 sli v9.4s,v26.4s,#7 add w13,w13,w19 sli v13.4s,v27.4s,#7 add w14,w14,w20 sli v17.4s,v28.4s,#7 eor w10,w10,w15 sli v21.4s,v29.4s,#7 eor w11,w11,w16 ext v2.16b,v2.16b,v2.16b,#8 eor w12,w12,w13 ext v6.16b,v6.16b,v6.16b,#8 eor w9,w9,w14 ext v10.16b,v10.16b,v10.16b,#8 ror w10,w10,#25 ext v14.16b,v14.16b,v14.16b,#8 ror w11,w11,#25 ext v18.16b,v18.16b,v18.16b,#8 ror w12,w12,#25 ext v22.16b,v22.16b,v22.16b,#8 ror w9,w9,#25 ext v3.16b,v3.16b,v3.16b,#12 ext v7.16b,v7.16b,v7.16b,#12 ext v11.16b,v11.16b,v11.16b,#12 ext v15.16b,v15.16b,v15.16b,#12 ext v19.16b,v19.16b,v19.16b,#12 ext v23.16b,v23.16b,v23.16b,#12 ext v1.16b,v1.16b,v1.16b,#4 ext v5.16b,v5.16b,v5.16b,#4 ext v9.16b,v9.16b,v9.16b,#4 ext v13.16b,v13.16b,v13.16b,#4 ext v17.16b,v17.16b,v17.16b,#4 ext v21.16b,v21.16b,v21.16b,#4 add v0.4s,v0.4s,v1.4s add w5,w5,w9 add v4.4s,v4.4s,v5.4s add w6,w6,w10 add v8.4s,v8.4s,v9.4s add w7,w7,w11 add v12.4s,v12.4s,v13.4s add w8,w8,w12 add v16.4s,v16.4s,v17.4s eor w17,w17,w5 add v20.4s,v20.4s,v21.4s eor w19,w19,w6 eor v3.16b,v3.16b,v0.16b eor w20,w20,w7 eor v7.16b,v7.16b,v4.16b eor w21,w21,w8 eor v11.16b,v11.16b,v8.16b ror w17,w17,#16 eor v15.16b,v15.16b,v12.16b ror w19,w19,#16 eor v19.16b,v19.16b,v16.16b ror w20,w20,#16 eor v23.16b,v23.16b,v20.16b ror w21,w21,#16 rev32 v3.8h,v3.8h add w13,w13,w17 rev32 v7.8h,v7.8h add w14,w14,w19 rev32 v11.8h,v11.8h add w15,w15,w20 rev32 v15.8h,v15.8h add w16,w16,w21 rev32 v19.8h,v19.8h eor w9,w9,w13 rev32 v23.8h,v23.8h eor w10,w10,w14 add v2.4s,v2.4s,v3.4s eor w11,w11,w15 add v6.4s,v6.4s,v7.4s eor w12,w12,w16 add v10.4s,v10.4s,v11.4s ror w9,w9,#20 add v14.4s,v14.4s,v15.4s ror w10,w10,#20 add v18.4s,v18.4s,v19.4s ror w11,w11,#20 add v22.4s,v22.4s,v23.4s ror w12,w12,#20 eor v24.16b,v1.16b,v2.16b add w5,w5,w9 eor v25.16b,v5.16b,v6.16b add w6,w6,w10 eor v26.16b,v9.16b,v10.16b add w7,w7,w11 eor v27.16b,v13.16b,v14.16b add w8,w8,w12 eor v28.16b,v17.16b,v18.16b eor w17,w17,w5 eor v29.16b,v21.16b,v22.16b eor w19,w19,w6 ushr v1.4s,v24.4s,#20 eor w20,w20,w7 ushr v5.4s,v25.4s,#20 eor w21,w21,w8 ushr v9.4s,v26.4s,#20 ror w17,w17,#24 ushr v13.4s,v27.4s,#20 ror w19,w19,#24 ushr v17.4s,v28.4s,#20 ror w20,w20,#24 ushr v21.4s,v29.4s,#20 ror w21,w21,#24 sli v1.4s,v24.4s,#12 add w13,w13,w17 sli v5.4s,v25.4s,#12 add w14,w14,w19 sli v9.4s,v26.4s,#12 add w15,w15,w20 sli v13.4s,v27.4s,#12 add w16,w16,w21 sli v17.4s,v28.4s,#12 eor w9,w9,w13 sli v21.4s,v29.4s,#12 eor w10,w10,w14 add v0.4s,v0.4s,v1.4s eor w11,w11,w15 add v4.4s,v4.4s,v5.4s eor w12,w12,w16 add v8.4s,v8.4s,v9.4s ror w9,w9,#25 add v12.4s,v12.4s,v13.4s ror w10,w10,#25 add v16.4s,v16.4s,v17.4s ror w11,w11,#25 add v20.4s,v20.4s,v21.4s ror w12,w12,#25 eor v24.16b,v3.16b,v0.16b add w5,w5,w10 eor v25.16b,v7.16b,v4.16b add w6,w6,w11 eor v26.16b,v11.16b,v8.16b add w7,w7,w12 eor v27.16b,v15.16b,v12.16b add w8,w8,w9 eor v28.16b,v19.16b,v16.16b eor w21,w21,w5 eor v29.16b,v23.16b,v20.16b eor w17,w17,w6 ushr v3.4s,v24.4s,#24 eor w19,w19,w7 ushr v7.4s,v25.4s,#24 eor w20,w20,w8 ushr v11.4s,v26.4s,#24 ror w21,w21,#16 ushr v15.4s,v27.4s,#24 ror w17,w17,#16 ushr v19.4s,v28.4s,#24 ror w19,w19,#16 ushr v23.4s,v29.4s,#24 ror w20,w20,#16 sli v3.4s,v24.4s,#8 add w15,w15,w21 sli v7.4s,v25.4s,#8 add w16,w16,w17 sli v11.4s,v26.4s,#8 add w13,w13,w19 sli v15.4s,v27.4s,#8 add w14,w14,w20 sli v19.4s,v28.4s,#8 eor w10,w10,w15 sli v23.4s,v29.4s,#8 eor w11,w11,w16 add v2.4s,v2.4s,v3.4s eor w12,w12,w13 add v6.4s,v6.4s,v7.4s eor w9,w9,w14 add v10.4s,v10.4s,v11.4s ror w10,w10,#20 add v14.4s,v14.4s,v15.4s ror w11,w11,#20 add v18.4s,v18.4s,v19.4s ror w12,w12,#20 add v22.4s,v22.4s,v23.4s ror w9,w9,#20 eor v24.16b,v1.16b,v2.16b add w5,w5,w10 eor v25.16b,v5.16b,v6.16b add w6,w6,w11 eor v26.16b,v9.16b,v10.16b add w7,w7,w12 eor v27.16b,v13.16b,v14.16b add w8,w8,w9 eor v28.16b,v17.16b,v18.16b eor w21,w21,w5 eor v29.16b,v21.16b,v22.16b eor w17,w17,w6 ushr v1.4s,v24.4s,#25 eor w19,w19,w7 ushr v5.4s,v25.4s,#25 eor w20,w20,w8 ushr v9.4s,v26.4s,#25 ror w21,w21,#24 ushr v13.4s,v27.4s,#25 ror w17,w17,#24 ushr v17.4s,v28.4s,#25 ror w19,w19,#24 ushr v21.4s,v29.4s,#25 ror w20,w20,#24 sli v1.4s,v24.4s,#7 add w15,w15,w21 sli v5.4s,v25.4s,#7 add w16,w16,w17 sli v9.4s,v26.4s,#7 add w13,w13,w19 sli v13.4s,v27.4s,#7 add w14,w14,w20 sli v17.4s,v28.4s,#7 eor w10,w10,w15 sli v21.4s,v29.4s,#7 eor w11,w11,w16 ext v2.16b,v2.16b,v2.16b,#8 eor w12,w12,w13 ext v6.16b,v6.16b,v6.16b,#8 eor w9,w9,w14 ext v10.16b,v10.16b,v10.16b,#8 ror w10,w10,#25 ext v14.16b,v14.16b,v14.16b,#8 ror w11,w11,#25 ext v18.16b,v18.16b,v18.16b,#8 ror w12,w12,#25 ext v22.16b,v22.16b,v22.16b,#8 ror w9,w9,#25 ext v3.16b,v3.16b,v3.16b,#4 ext v7.16b,v7.16b,v7.16b,#4 ext v11.16b,v11.16b,v11.16b,#4 ext v15.16b,v15.16b,v15.16b,#4 ext v19.16b,v19.16b,v19.16b,#4 ext v23.16b,v23.16b,v23.16b,#4 ext v1.16b,v1.16b,v1.16b,#12 ext v5.16b,v5.16b,v5.16b,#12 ext v9.16b,v9.16b,v9.16b,#12 ext v13.16b,v13.16b,v13.16b,#12 ext v17.16b,v17.16b,v17.16b,#12 ext v21.16b,v21.16b,v21.16b,#12 cbnz x4,.Loop_upper_neon add w5,w5,w22 // accumulate key block add x6,x6,x22,lsr#32 add w7,w7,w23 add x8,x8,x23,lsr#32 add w9,w9,w24 add x10,x10,x24,lsr#32 add w11,w11,w25 add x12,x12,x25,lsr#32 add w13,w13,w26 add x14,x14,x26,lsr#32 add w15,w15,w27 add x16,x16,x27,lsr#32 add w17,w17,w28 add x19,x19,x28,lsr#32 add w20,w20,w30 add x21,x21,x30,lsr#32 add x5,x5,x6,lsl#32 // pack add x7,x7,x8,lsl#32 ldp x6,x8,[x1,#0] // load input add x9,x9,x10,lsl#32 add x11,x11,x12,lsl#32 ldp x10,x12,[x1,#16] add x13,x13,x14,lsl#32 add x15,x15,x16,lsl#32 ldp x14,x16,[x1,#32] add x17,x17,x19,lsl#32 add x20,x20,x21,lsl#32 ldp x19,x21,[x1,#48] add x1,x1,#64 #ifdef __ARMEB__ rev x5,x5 rev x7,x7 rev x9,x9 rev x11,x11 rev x13,x13 rev x15,x15 rev x17,x17 rev x20,x20 #endif eor x5,x5,x6 eor x7,x7,x8 eor x9,x9,x10 eor x11,x11,x12 eor x13,x13,x14 eor x15,x15,x16 eor x17,x17,x19 eor x20,x20,x21 stp x5,x7,[x0,#0] // store output add x28,x28,#1 // increment counter mov w5,w22 // unpack key block lsr x6,x22,#32 stp x9,x11,[x0,#16] mov w7,w23 lsr x8,x23,#32 stp x13,x15,[x0,#32] mov w9,w24 lsr x10,x24,#32 stp x17,x20,[x0,#48] add x0,x0,#64 mov w11,w25 lsr x12,x25,#32 mov w13,w26 lsr x14,x26,#32 mov w15,w27 lsr x16,x27,#32 mov w17,w28 lsr x19,x28,#32 mov w20,w30 lsr x21,x30,#32 mov x4,#5 .Loop_lower_neon: sub x4,x4,#1 add v0.4s,v0.4s,v1.4s add w5,w5,w9 add v4.4s,v4.4s,v5.4s add w6,w6,w10 add v8.4s,v8.4s,v9.4s add w7,w7,w11 add v12.4s,v12.4s,v13.4s add w8,w8,w12 add v16.4s,v16.4s,v17.4s eor w17,w17,w5 add v20.4s,v20.4s,v21.4s eor w19,w19,w6 eor v3.16b,v3.16b,v0.16b eor w20,w20,w7 eor v7.16b,v7.16b,v4.16b eor w21,w21,w8 eor v11.16b,v11.16b,v8.16b ror w17,w17,#16 eor v15.16b,v15.16b,v12.16b ror w19,w19,#16 eor v19.16b,v19.16b,v16.16b ror w20,w20,#16 eor v23.16b,v23.16b,v20.16b ror w21,w21,#16 rev32 v3.8h,v3.8h add w13,w13,w17 rev32 v7.8h,v7.8h add w14,w14,w19 rev32 v11.8h,v11.8h add w15,w15,w20 rev32 v15.8h,v15.8h add w16,w16,w21 rev32 v19.8h,v19.8h eor w9,w9,w13 rev32 v23.8h,v23.8h eor w10,w10,w14 add v2.4s,v2.4s,v3.4s eor w11,w11,w15 add v6.4s,v6.4s,v7.4s eor w12,w12,w16 add v10.4s,v10.4s,v11.4s ror w9,w9,#20 add v14.4s,v14.4s,v15.4s ror w10,w10,#20 add v18.4s,v18.4s,v19.4s ror w11,w11,#20 add v22.4s,v22.4s,v23.4s ror w12,w12,#20 eor v24.16b,v1.16b,v2.16b add w5,w5,w9 eor v25.16b,v5.16b,v6.16b add w6,w6,w10 eor v26.16b,v9.16b,v10.16b add w7,w7,w11 eor v27.16b,v13.16b,v14.16b add w8,w8,w12 eor v28.16b,v17.16b,v18.16b eor w17,w17,w5 eor v29.16b,v21.16b,v22.16b eor w19,w19,w6 ushr v1.4s,v24.4s,#20 eor w20,w20,w7 ushr v5.4s,v25.4s,#20 eor w21,w21,w8 ushr v9.4s,v26.4s,#20 ror w17,w17,#24 ushr v13.4s,v27.4s,#20 ror w19,w19,#24 ushr v17.4s,v28.4s,#20 ror w20,w20,#24 ushr v21.4s,v29.4s,#20 ror w21,w21,#24 sli v1.4s,v24.4s,#12 add w13,w13,w17 sli v5.4s,v25.4s,#12 add w14,w14,w19 sli v9.4s,v26.4s,#12 add w15,w15,w20 sli v13.4s,v27.4s,#12 add w16,w16,w21 sli v17.4s,v28.4s,#12 eor w9,w9,w13 sli v21.4s,v29.4s,#12 eor w10,w10,w14 add v0.4s,v0.4s,v1.4s eor w11,w11,w15 add v4.4s,v4.4s,v5.4s eor w12,w12,w16 add v8.4s,v8.4s,v9.4s ror w9,w9,#25 add v12.4s,v12.4s,v13.4s ror w10,w10,#25 add v16.4s,v16.4s,v17.4s ror w11,w11,#25 add v20.4s,v20.4s,v21.4s ror w12,w12,#25 eor v24.16b,v3.16b,v0.16b add w5,w5,w10 eor v25.16b,v7.16b,v4.16b add w6,w6,w11 eor v26.16b,v11.16b,v8.16b add w7,w7,w12 eor v27.16b,v15.16b,v12.16b add w8,w8,w9 eor v28.16b,v19.16b,v16.16b eor w21,w21,w5 eor v29.16b,v23.16b,v20.16b eor w17,w17,w6 ushr v3.4s,v24.4s,#24 eor w19,w19,w7 ushr v7.4s,v25.4s,#24 eor w20,w20,w8 ushr v11.4s,v26.4s,#24 ror w21,w21,#16 ushr v15.4s,v27.4s,#24 ror w17,w17,#16 ushr v19.4s,v28.4s,#24 ror w19,w19,#16 ushr v23.4s,v29.4s,#24 ror w20,w20,#16 sli v3.4s,v24.4s,#8 add w15,w15,w21 sli v7.4s,v25.4s,#8 add w16,w16,w17 sli v11.4s,v26.4s,#8 add w13,w13,w19 sli v15.4s,v27.4s,#8 add w14,w14,w20 sli v19.4s,v28.4s,#8 eor w10,w10,w15 sli v23.4s,v29.4s,#8 eor w11,w11,w16 add v2.4s,v2.4s,v3.4s eor w12,w12,w13 add v6.4s,v6.4s,v7.4s eor w9,w9,w14 add v10.4s,v10.4s,v11.4s ror w10,w10,#20 add v14.4s,v14.4s,v15.4s ror w11,w11,#20 add v18.4s,v18.4s,v19.4s ror w12,w12,#20 add v22.4s,v22.4s,v23.4s ror w9,w9,#20 eor v24.16b,v1.16b,v2.16b add w5,w5,w10 eor v25.16b,v5.16b,v6.16b add w6,w6,w11 eor v26.16b,v9.16b,v10.16b add w7,w7,w12 eor v27.16b,v13.16b,v14.16b add w8,w8,w9 eor v28.16b,v17.16b,v18.16b eor w21,w21,w5 eor v29.16b,v21.16b,v22.16b eor w17,w17,w6 ushr v1.4s,v24.4s,#25 eor w19,w19,w7 ushr v5.4s,v25.4s,#25 eor w20,w20,w8 ushr v9.4s,v26.4s,#25 ror w21,w21,#24 ushr v13.4s,v27.4s,#25 ror w17,w17,#24 ushr v17.4s,v28.4s,#25 ror w19,w19,#24 ushr v21.4s,v29.4s,#25 ror w20,w20,#24 sli v1.4s,v24.4s,#7 add w15,w15,w21 sli v5.4s,v25.4s,#7 add w16,w16,w17 sli v9.4s,v26.4s,#7 add w13,w13,w19 sli v13.4s,v27.4s,#7 add w14,w14,w20 sli v17.4s,v28.4s,#7 eor w10,w10,w15 sli v21.4s,v29.4s,#7 eor w11,w11,w16 ext v2.16b,v2.16b,v2.16b,#8 eor w12,w12,w13 ext v6.16b,v6.16b,v6.16b,#8 eor w9,w9,w14 ext v10.16b,v10.16b,v10.16b,#8 ror w10,w10,#25 ext v14.16b,v14.16b,v14.16b,#8 ror w11,w11,#25 ext v18.16b,v18.16b,v18.16b,#8 ror w12,w12,#25 ext v22.16b,v22.16b,v22.16b,#8 ror w9,w9,#25 ext v3.16b,v3.16b,v3.16b,#12 ext v7.16b,v7.16b,v7.16b,#12 ext v11.16b,v11.16b,v11.16b,#12 ext v15.16b,v15.16b,v15.16b,#12 ext v19.16b,v19.16b,v19.16b,#12 ext v23.16b,v23.16b,v23.16b,#12 ext v1.16b,v1.16b,v1.16b,#4 ext v5.16b,v5.16b,v5.16b,#4 ext v9.16b,v9.16b,v9.16b,#4 ext v13.16b,v13.16b,v13.16b,#4 ext v17.16b,v17.16b,v17.16b,#4 ext v21.16b,v21.16b,v21.16b,#4 add v0.4s,v0.4s,v1.4s add w5,w5,w9 add v4.4s,v4.4s,v5.4s add w6,w6,w10 add v8.4s,v8.4s,v9.4s add w7,w7,w11 add v12.4s,v12.4s,v13.4s add w8,w8,w12 add v16.4s,v16.4s,v17.4s eor w17,w17,w5 add v20.4s,v20.4s,v21.4s eor w19,w19,w6 eor v3.16b,v3.16b,v0.16b eor w20,w20,w7 eor v7.16b,v7.16b,v4.16b eor w21,w21,w8 eor v11.16b,v11.16b,v8.16b ror w17,w17,#16 eor v15.16b,v15.16b,v12.16b ror w19,w19,#16 eor v19.16b,v19.16b,v16.16b ror w20,w20,#16 eor v23.16b,v23.16b,v20.16b ror w21,w21,#16 rev32 v3.8h,v3.8h add w13,w13,w17 rev32 v7.8h,v7.8h add w14,w14,w19 rev32 v11.8h,v11.8h add w15,w15,w20 rev32 v15.8h,v15.8h add w16,w16,w21 rev32 v19.8h,v19.8h eor w9,w9,w13 rev32 v23.8h,v23.8h eor w10,w10,w14 add v2.4s,v2.4s,v3.4s eor w11,w11,w15 add v6.4s,v6.4s,v7.4s eor w12,w12,w16 add v10.4s,v10.4s,v11.4s ror w9,w9,#20 add v14.4s,v14.4s,v15.4s ror w10,w10,#20 add v18.4s,v18.4s,v19.4s ror w11,w11,#20 add v22.4s,v22.4s,v23.4s ror w12,w12,#20 eor v24.16b,v1.16b,v2.16b add w5,w5,w9 eor v25.16b,v5.16b,v6.16b add w6,w6,w10 eor v26.16b,v9.16b,v10.16b add w7,w7,w11 eor v27.16b,v13.16b,v14.16b add w8,w8,w12 eor v28.16b,v17.16b,v18.16b eor w17,w17,w5 eor v29.16b,v21.16b,v22.16b eor w19,w19,w6 ushr v1.4s,v24.4s,#20 eor w20,w20,w7 ushr v5.4s,v25.4s,#20 eor w21,w21,w8 ushr v9.4s,v26.4s,#20 ror w17,w17,#24 ushr v13.4s,v27.4s,#20 ror w19,w19,#24 ushr v17.4s,v28.4s,#20 ror w20,w20,#24 ushr v21.4s,v29.4s,#20 ror w21,w21,#24 sli v1.4s,v24.4s,#12 add w13,w13,w17 sli v5.4s,v25.4s,#12 add w14,w14,w19 sli v9.4s,v26.4s,#12 add w15,w15,w20 sli v13.4s,v27.4s,#12 add w16,w16,w21 sli v17.4s,v28.4s,#12 eor w9,w9,w13 sli v21.4s,v29.4s,#12 eor w10,w10,w14 add v0.4s,v0.4s,v1.4s eor w11,w11,w15 add v4.4s,v4.4s,v5.4s eor w12,w12,w16 add v8.4s,v8.4s,v9.4s ror w9,w9,#25 add v12.4s,v12.4s,v13.4s ror w10,w10,#25 add v16.4s,v16.4s,v17.4s ror w11,w11,#25 add v20.4s,v20.4s,v21.4s ror w12,w12,#25 eor v24.16b,v3.16b,v0.16b add w5,w5,w10 eor v25.16b,v7.16b,v4.16b add w6,w6,w11 eor v26.16b,v11.16b,v8.16b add w7,w7,w12 eor v27.16b,v15.16b,v12.16b add w8,w8,w9 eor v28.16b,v19.16b,v16.16b eor w21,w21,w5 eor v29.16b,v23.16b,v20.16b eor w17,w17,w6 ushr v3.4s,v24.4s,#24 eor w19,w19,w7 ushr v7.4s,v25.4s,#24 eor w20,w20,w8 ushr v11.4s,v26.4s,#24 ror w21,w21,#16 ushr v15.4s,v27.4s,#24 ror w17,w17,#16 ushr v19.4s,v28.4s,#24 ror w19,w19,#16 ushr v23.4s,v29.4s,#24 ror w20,w20,#16 sli v3.4s,v24.4s,#8 add w15,w15,w21 sli v7.4s,v25.4s,#8 add w16,w16,w17 sli v11.4s,v26.4s,#8 add w13,w13,w19 sli v15.4s,v27.4s,#8 add w14,w14,w20 sli v19.4s,v28.4s,#8 eor w10,w10,w15 sli v23.4s,v29.4s,#8 eor w11,w11,w16 add v2.4s,v2.4s,v3.4s eor w12,w12,w13 add v6.4s,v6.4s,v7.4s eor w9,w9,w14 add v10.4s,v10.4s,v11.4s ror w10,w10,#20 add v14.4s,v14.4s,v15.4s ror w11,w11,#20 add v18.4s,v18.4s,v19.4s ror w12,w12,#20 add v22.4s,v22.4s,v23.4s ror w9,w9,#20 eor v24.16b,v1.16b,v2.16b add w5,w5,w10 eor v25.16b,v5.16b,v6.16b add w6,w6,w11 eor v26.16b,v9.16b,v10.16b add w7,w7,w12 eor v27.16b,v13.16b,v14.16b add w8,w8,w9 eor v28.16b,v17.16b,v18.16b eor w21,w21,w5 eor v29.16b,v21.16b,v22.16b eor w17,w17,w6 ushr v1.4s,v24.4s,#25 eor w19,w19,w7 ushr v5.4s,v25.4s,#25 eor w20,w20,w8 ushr v9.4s,v26.4s,#25 ror w21,w21,#24 ushr v13.4s,v27.4s,#25 ror w17,w17,#24 ushr v17.4s,v28.4s,#25 ror w19,w19,#24 ushr v21.4s,v29.4s,#25 ror w20,w20,#24 sli v1.4s,v24.4s,#7 add w15,w15,w21 sli v5.4s,v25.4s,#7 add w16,w16,w17 sli v9.4s,v26.4s,#7 add w13,w13,w19 sli v13.4s,v27.4s,#7 add w14,w14,w20 sli v17.4s,v28.4s,#7 eor w10,w10,w15 sli v21.4s,v29.4s,#7 eor w11,w11,w16 ext v2.16b,v2.16b,v2.16b,#8 eor w12,w12,w13 ext v6.16b,v6.16b,v6.16b,#8 eor w9,w9,w14 ext v10.16b,v10.16b,v10.16b,#8 ror w10,w10,#25 ext v14.16b,v14.16b,v14.16b,#8 ror w11,w11,#25 ext v18.16b,v18.16b,v18.16b,#8 ror w12,w12,#25 ext v22.16b,v22.16b,v22.16b,#8 ror w9,w9,#25 ext v3.16b,v3.16b,v3.16b,#4 ext v7.16b,v7.16b,v7.16b,#4 ext v11.16b,v11.16b,v11.16b,#4 ext v15.16b,v15.16b,v15.16b,#4 ext v19.16b,v19.16b,v19.16b,#4 ext v23.16b,v23.16b,v23.16b,#4 ext v1.16b,v1.16b,v1.16b,#12 ext v5.16b,v5.16b,v5.16b,#12 ext v9.16b,v9.16b,v9.16b,#12 ext v13.16b,v13.16b,v13.16b,#12 ext v17.16b,v17.16b,v17.16b,#12 ext v21.16b,v21.16b,v21.16b,#12 cbnz x4,.Loop_lower_neon add w5,w5,w22 // accumulate key block ldp q24,q25,[sp,#0] add x6,x6,x22,lsr#32 ldp q26,q27,[sp,#32] add w7,w7,w23 ldp q28,q29,[sp,#64] add x8,x8,x23,lsr#32 add v0.4s,v0.4s,v24.4s add w9,w9,w24 add v4.4s,v4.4s,v24.4s add x10,x10,x24,lsr#32 add v8.4s,v8.4s,v24.4s add w11,w11,w25 add v12.4s,v12.4s,v24.4s add x12,x12,x25,lsr#32 add v16.4s,v16.4s,v24.4s add w13,w13,w26 add v20.4s,v20.4s,v24.4s add x14,x14,x26,lsr#32 add v2.4s,v2.4s,v26.4s add w15,w15,w27 add v6.4s,v6.4s,v26.4s add x16,x16,x27,lsr#32 add v10.4s,v10.4s,v26.4s add w17,w17,w28 add v14.4s,v14.4s,v26.4s add x19,x19,x28,lsr#32 add v18.4s,v18.4s,v26.4s add w20,w20,w30 add v22.4s,v22.4s,v26.4s add x21,x21,x30,lsr#32 add v19.4s,v19.4s,v31.4s // +4 add x5,x5,x6,lsl#32 // pack add v23.4s,v23.4s,v31.4s // +4 add x7,x7,x8,lsl#32 add v3.4s,v3.4s,v27.4s ldp x6,x8,[x1,#0] // load input add v7.4s,v7.4s,v28.4s add x9,x9,x10,lsl#32 add v11.4s,v11.4s,v29.4s add x11,x11,x12,lsl#32 add v15.4s,v15.4s,v30.4s ldp x10,x12,[x1,#16] add v19.4s,v19.4s,v27.4s add x13,x13,x14,lsl#32 add v23.4s,v23.4s,v28.4s add x15,x15,x16,lsl#32 add v1.4s,v1.4s,v25.4s ldp x14,x16,[x1,#32] add v5.4s,v5.4s,v25.4s add x17,x17,x19,lsl#32 add v9.4s,v9.4s,v25.4s add x20,x20,x21,lsl#32 add v13.4s,v13.4s,v25.4s ldp x19,x21,[x1,#48] add v17.4s,v17.4s,v25.4s add x1,x1,#64 add v21.4s,v21.4s,v25.4s #ifdef __ARMEB__ rev x5,x5 rev x7,x7 rev x9,x9 rev x11,x11 rev x13,x13 rev x15,x15 rev x17,x17 rev x20,x20 #endif ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64 eor x5,x5,x6 eor x7,x7,x8 eor x9,x9,x10 eor x11,x11,x12 eor x13,x13,x14 eor v0.16b,v0.16b,v24.16b eor x15,x15,x16 eor v1.16b,v1.16b,v25.16b eor x17,x17,x19 eor v2.16b,v2.16b,v26.16b eor x20,x20,x21 eor v3.16b,v3.16b,v27.16b ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64 stp x5,x7,[x0,#0] // store output add x28,x28,#7 // increment counter stp x9,x11,[x0,#16] stp x13,x15,[x0,#32] stp x17,x20,[x0,#48] add x0,x0,#64 st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64 eor v4.16b,v4.16b,v24.16b eor v5.16b,v5.16b,v25.16b eor v6.16b,v6.16b,v26.16b eor v7.16b,v7.16b,v27.16b st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 eor v8.16b,v8.16b,v0.16b ldp q24,q25,[sp,#0] eor v9.16b,v9.16b,v1.16b ldp q26,q27,[sp,#32] eor v10.16b,v10.16b,v2.16b eor v11.16b,v11.16b,v3.16b st1 {v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64 ld1 {v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64 eor v12.16b,v12.16b,v4.16b eor v13.16b,v13.16b,v5.16b eor v14.16b,v14.16b,v6.16b eor v15.16b,v15.16b,v7.16b st1 {v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64 ld1 {v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64 eor v16.16b,v16.16b,v8.16b eor v17.16b,v17.16b,v9.16b eor v18.16b,v18.16b,v10.16b eor v19.16b,v19.16b,v11.16b st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64 shl v0.4s,v31.4s,#1 // 4 -> 8 eor v20.16b,v20.16b,v12.16b eor v21.16b,v21.16b,v13.16b eor v22.16b,v22.16b,v14.16b eor v23.16b,v23.16b,v15.16b st1 {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64 add v27.4s,v27.4s,v0.4s // += 8 add v28.4s,v28.4s,v0.4s add v29.4s,v29.4s,v0.4s add v30.4s,v30.4s,v0.4s b.hs .Loop_outer_512_neon adds x2,x2,#512 ushr v0.4s,v31.4s,#2 // 4 -> 1 ldp d8,d9,[sp,#128+0] // meet ABI requirements ldp d10,d11,[sp,#128+16] ldp d12,d13,[sp,#128+32] ldp d14,d15,[sp,#128+48] stp q24,q31,[sp,#0] // wipe off-load area stp q24,q31,[sp,#32] stp q24,q31,[sp,#64] b.eq .Ldone_512_neon cmp x2,#192 sub v27.4s,v27.4s,v0.4s // -= 1 sub v28.4s,v28.4s,v0.4s sub v29.4s,v29.4s,v0.4s add sp,sp,#128 b.hs .Loop_outer_neon eor v25.16b,v25.16b,v25.16b eor v26.16b,v26.16b,v26.16b eor v27.16b,v27.16b,v27.16b eor v28.16b,v28.16b,v28.16b eor v29.16b,v29.16b,v29.16b eor v30.16b,v30.16b,v30.16b b .Loop_outer .Ldone_512_neon: ldp x19,x20,[x29,#16] add sp,sp,#128+64 ldp x21,x22,[x29,#32] ldp x23,x24,[x29,#48] ldp x25,x26,[x29,#64] ldp x27,x28,[x29,#80] ldp x29,x30,[sp],#96 .Labort_neon: ret ENDPROC(chacha20_neon) #endif