/* SPDX-License-Identifier: OpenSSL OR (BSD-3-Clause OR GPL-2.0)
 *
 * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
 * Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
 */

#include <linux/linkage.h>

.text
.align	5
/*
 * ChaCha constant (state words 0..3).  Laid out in memory on a
 * little-endian build, the bytes of the two quads spell the ASCII string
 * "expand 32-byte k".  Kept as two 64-bit quads so that the scalar code can
 * fetch it with a single ldp and get the same register values on either
 * endianness.
 */
.Lsigma:
.quad	0x3320646e61707865,0x6b20657479622d32		// endian-neutral
/*
 * Vector {1,0,0,0}.  The NEON code in this file loads it into v31 and adds
 * it to the counter vectors, so only 32-bit lane 0 is incremented.  It is
 * reached by post-incrementing the .Lsigma pointer by 16 bytes, so it must
 * stay immediately after .Lsigma.
 */
.Lone:
.long	1,0,0,0

.align	5
/*
 * chacha20_arm - scalar (general-purpose register) ChaCha20 XOR routine.
 *
 * Register arguments, as used by the code below:
 *   x0  destination buffer (written)
 *   x1  source buffer (read, XORed with the keystream)
 *   x2  length in bytes; the routine returns at once when it is zero
 *   x3  32 bytes of key material (state words 4..11)
 *   x4  16 bytes of counter block (state words 12..15)
 *
 * Register allocation inside the routine:
 *   x22,x23       sigma, two 32-bit state words packed per register
 *   x24..x27      key, packed the same way
 *   x28,x30       counter block, packed the same way (x30 is the link
 *                 register; it is saved in the frame and restored on exit)
 *   w5..w17,
 *   w19..w21      the sixteen working state words, one per register
 *   x4            reused as the round-loop counter once the counter block
 *                 has been loaded, so the block at the original x4 is not
 *                 written back by this routine
 *
 * Stack: 96-byte frame holding x29,x30 and x19..x28, plus 64 bytes of
 * scratch at sp that the tail path uses to hold one keystream block.
 *
 * The internal labels .Lshort and .Less_than_64 are also branch targets
 * of chacha20_neon, so the frame layout and the register allocation above
 * must stay in sync with that routine.
 */
ENTRY(chacha20_arm)
	cbz	x2,.Labort
	// Also entered from chacha20_neon for lengths below 192 bytes.
.Lshort:
	// Prologue: build the 96-byte frame and save callee-saved x19..x28.
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0

	adr	x5,.Lsigma
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	// 64 bytes of scratch below the frame, used only by the tail path.
	sub	sp,sp,#64

	ldp	x22,x23,[x5]		// load sigma
	ldp	x24,x25,[x3]		// load key
	ldp	x26,x27,[x3,#16]
	ldp	x28,x30,[x4]		// load counter
#ifdef	__ARMEB__
	// Big-endian: swap the two 32-bit halves of each register so that the
	// lower-addressed word sits in the low half, as the unpack code expects.
	ror	x24,x24,#32
	ror	x25,x25,#32
	ror	x26,x26,#32
	ror	x27,x27,#32
	ror	x28,x28,#32
	ror	x30,x30,#32
#endif

	// One pass of the outer loop produces one 64-byte keystream block.
.Loop_outer:
	// Split each packed register into two 32-bit working words:
	// mov wN takes the low half, lsr #32 takes the high half.
	mov	w5,w22			// unpack key block
	lsr	x6,x22,#32
	mov	w7,w23
	lsr	x8,x23,#32
	mov	w9,w24
	lsr	x10,x24,#32
	mov	w11,w25
	lsr	x12,x25,#32
	mov	w13,w26
	lsr	x14,x26,#32
	mov	w15,w27
	lsr	x16,x27,#32
	mov	w17,w28
	lsr	x19,x28,#32
	mov	w20,w30
	lsr	x21,x30,#32

	// 10 iterations, each a column round plus a diagonal round.
	mov	x4,#10
	// Flags set here are consumed by b.lo .Ltail and b.hi .Loop_outer
	// further down; no instruction in between writes the flags.
	subs	x2,x2,#64
.Loop:
	sub	x4,x4,#1
	// Column round: four quarter-rounds run in lock step on
	// (w5,w9,w13,w17) (w6,w10,w14,w19) (w7,w11,w15,w20) (w8,w12,w16,w21).
	add	w5,w5,w9
	add	w6,w6,w10
	add	w7,w7,w11
	add	w8,w8,w12
	eor	w17,w17,w5
	eor	w19,w19,w6
	eor	w20,w20,w7
	eor	w21,w21,w8
	// ror by 16 is the same as rotate-left by 16
	ror	w17,w17,#16
	ror	w19,w19,#16
	ror	w20,w20,#16
	ror	w21,w21,#16
	add	w13,w13,w17
	add	w14,w14,w19
	add	w15,w15,w20
	add	w16,w16,w21
	eor	w9,w9,w13
	eor	w10,w10,w14
	eor	w11,w11,w15
	eor	w12,w12,w16
	// ror by 20 is rotate-left by 12
	ror	w9,w9,#20
	ror	w10,w10,#20
	ror	w11,w11,#20
	ror	w12,w12,#20
	add	w5,w5,w9
	add	w6,w6,w10
	add	w7,w7,w11
	add	w8,w8,w12
	eor	w17,w17,w5
	eor	w19,w19,w6
	eor	w20,w20,w7
	eor	w21,w21,w8
	// ror by 24 is rotate-left by 8
	ror	w17,w17,#24
	ror	w19,w19,#24
	ror	w20,w20,#24
	ror	w21,w21,#24
	add	w13,w13,w17
	add	w14,w14,w19
	add	w15,w15,w20
	add	w16,w16,w21
	eor	w9,w9,w13
	eor	w10,w10,w14
	eor	w11,w11,w15
	eor	w12,w12,w16
	// ror by 25 is rotate-left by 7
	ror	w9,w9,#25
	ror	w10,w10,#25
	ror	w11,w11,#25
	ror	w12,w12,#25
	// Diagonal round: same quarter-round, now on
	// (w5,w10,w15,w21) (w6,w11,w16,w17) (w7,w12,w13,w19) (w8,w9,w14,w20).
	add	w5,w5,w10
	add	w6,w6,w11
	add	w7,w7,w12
	add	w8,w8,w9
	eor	w21,w21,w5
	eor	w17,w17,w6
	eor	w19,w19,w7
	eor	w20,w20,w8
	ror	w21,w21,#16
	ror	w17,w17,#16
	ror	w19,w19,#16
	ror	w20,w20,#16
	add	w15,w15,w21
	add	w16,w16,w17
	add	w13,w13,w19
	add	w14,w14,w20
	eor	w10,w10,w15
	eor	w11,w11,w16
	eor	w12,w12,w13
	eor	w9,w9,w14
	ror	w10,w10,#20
	ror	w11,w11,#20
	ror	w12,w12,#20
	ror	w9,w9,#20
	add	w5,w5,w10
	add	w6,w6,w11
	add	w7,w7,w12
	add	w8,w8,w9
	eor	w21,w21,w5
	eor	w17,w17,w6
	eor	w19,w19,w7
	eor	w20,w20,w8
	ror	w21,w21,#24
	ror	w17,w17,#24
	ror	w19,w19,#24
	ror	w20,w20,#24
	add	w15,w15,w21
	add	w16,w16,w17
	add	w13,w13,w19
	add	w14,w14,w20
	eor	w10,w10,w15
	eor	w11,w11,w16
	eor	w12,w12,w13
	eor	w9,w9,w14
	ror	w10,w10,#25
	ror	w11,w11,#25
	ror	w12,w12,#25
	ror	w9,w9,#25
	cbnz	x4,.Loop

	// Add the input state back in.  The odd words use a 64-bit add of the
	// high half of the packed register; any carry lands in bit 32 and is
	// shifted out by the lsl #32 in the pack step.
	add	w5,w5,w22		// accumulate key block
	add	x6,x6,x22,lsr#32
	add	w7,w7,w23
	add	x8,x8,x23,lsr#32
	add	w9,w9,w24
	add	x10,x10,x24,lsr#32
	add	w11,w11,w25
	add	x12,x12,x25,lsr#32
	add	w13,w13,w26
	add	x14,x14,x26,lsr#32
	add	w15,w15,w27
	add	x16,x16,x27,lsr#32
	add	w17,w17,w28
	add	x19,x19,x28,lsr#32
	add	w20,w20,w30
	add	x21,x21,x30,lsr#32

	// Fewer than 64 bytes were left before the subs above: partial block.
	b.lo	.Ltail

	// Full block: repack pairs of words into 64-bit registers while the
	// 64 input bytes are loaded into the freed odd registers.
	add	x5,x5,x6,lsl#32	// pack
	add	x7,x7,x8,lsl#32
	ldp	x6,x8,[x1,#0]		// load input
	add	x9,x9,x10,lsl#32
	add	x11,x11,x12,lsl#32
	ldp	x10,x12,[x1,#16]
	add	x13,x13,x14,lsl#32
	add	x15,x15,x16,lsl#32
	ldp	x14,x16,[x1,#32]
	add	x17,x17,x19,lsl#32
	add	x20,x20,x21,lsl#32
	ldp	x19,x21,[x1,#48]
	add	x1,x1,#64
#ifdef	__ARMEB__
	// Big-endian: byte-reverse the packed keystream before it is XORed
	// with the input loaded above.
	rev	x5,x5
	rev	x7,x7
	rev	x9,x9
	rev	x11,x11
	rev	x13,x13
	rev	x15,x15
	rev	x17,x17
	rev	x20,x20
#endif
	eor	x5,x5,x6
	eor	x7,x7,x8
	eor	x9,x9,x10
	eor	x11,x11,x12
	eor	x13,x13,x14
	eor	x15,x15,x16
	eor	x17,x17,x19
	eor	x20,x20,x21

	stp	x5,x7,[x0,#0]		// store output
	// 64-bit add on the packed register: the low half is the word that
	// w17 is unpacked from, and a carry out of it reaches the high half.
	add	x28,x28,#1			// increment counter
	stp	x9,x11,[x0,#16]
	stp	x13,x15,[x0,#32]
	stp	x17,x20,[x0,#48]
	add	x0,x0,#64

	// More than 64 bytes were left before this block: go round again.
	b.hi	.Loop_outer

	// Epilogue: drop the scratch area and restore the saved registers.
	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
.Labort:
	ret

.align	4
.Ltail:
	// Undo the subs so that x2 is the number of bytes left (1..63).
	add	x2,x2,#64
	// Also entered from the chacha20_neon tail with x2 below 64 and the
	// accumulated, still unpacked block in w5..w21.
.Less_than_64:
	// Set up end-relative addressing: x1 and x4 point one past the end of
	// the input and of the keystream bytes in use, x2 becomes a negative
	// index counting up to zero.  x0 is biased by -1 because the store in
	// the loop uses the index after it has been incremented.
	sub	x0,x0,#1
	add	x1,x1,x2
	add	x0,x0,x2
	add	x4,sp,x2
	neg	x2,x2

	add	x5,x5,x6,lsl#32	// pack
	add	x7,x7,x8,lsl#32
	add	x9,x9,x10,lsl#32
	add	x11,x11,x12,lsl#32
	add	x13,x13,x14,lsl#32
	add	x15,x15,x16,lsl#32
	add	x17,x17,x19,lsl#32
	add	x20,x20,x21,lsl#32
#ifdef	__ARMEB__
	rev	x5,x5
	rev	x7,x7
	rev	x9,x9
	rev	x11,x11
	rev	x13,x13
	rev	x15,x15
	rev	x17,x17
	rev	x20,x20
#endif
	// Spill the whole keystream block to the stack scratch area so that it
	// can be consumed one byte at a time.
	stp	x5,x7,[sp,#0]
	stp	x9,x11,[sp,#16]
	stp	x13,x15,[sp,#32]
	stp	x17,x20,[sp,#48]

	// Byte-wise XOR of the remaining input with the spilled keystream.
.Loop_tail:
	ldrb	w10,[x1,x2]
	ldrb	w11,[x4,x2]
	add	x2,x2,#1
	eor	w10,w10,w11
	strb	w10,[x0,x2]
	cbnz	x2,.Loop_tail

	// Overwrite the spilled keystream block with zeros before returning.
	stp	xzr,xzr,[sp,#0]
	stp	xzr,xzr,[sp,#16]
	stp	xzr,xzr,[sp,#32]
	stp	xzr,xzr,[sp,#48]

	// Epilogue, same as on the full-block exit path.
	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	ret
ENDPROC(chacha20_arm)

.align	5
ENTRY(chacha20_neon)
	cbz	x2,.Labort_neon
	cmp	x2,#192
	b.lo	.Lshort

	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0

	adr	x5,.Lsigma
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	cmp	x2,#512
	b.hs	.L512_or_more_neon

	sub	sp,sp,#64

	ldp	x22,x23,[x5]		// load sigma
	ld1	{v24.4s},[x5],#16
	ldp	x24,x25,[x3]		// load key
	ldp	x26,x27,[x3,#16]
	ld1	{v25.4s,v26.4s},[x3]
	ldp	x28,x30,[x4]		// load counter
	ld1	{v27.4s},[x4]
	ld1	{v31.4s},[x5]
#ifdef	__ARMEB__
	rev64	v24.4s,v24.4s
	ror	x24,x24,#32
	ror	x25,x25,#32
	ror	x26,x26,#32
	ror	x27,x27,#32
	ror	x28,x28,#32
	ror	x30,x30,#32
#endif
	add	v27.4s,v27.4s,v31.4s		// += 1
	add	v28.4s,v27.4s,v31.4s
	add	v29.4s,v28.4s,v31.4s
	shl	v31.4s,v31.4s,#2			// 1 -> 4

.Loop_outer_neon:
	mov	w5,w22			// unpack key block
	lsr	x6,x22,#32
	mov	v0.16b,v24.16b
	mov	w7,w23
	lsr	x8,x23,#32
	mov	v4.16b,v24.16b
	mov	w9,w24
	lsr	x10,x24,#32
	mov	v16.16b,v24.16b
	mov	w11,w25
	mov	v1.16b,v25.16b
	lsr	x12,x25,#32
	mov	v5.16b,v25.16b
	mov	w13,w26
	mov	v17.16b,v25.16b
	lsr	x14,x26,#32
	mov	v3.16b,v27.16b
	mov	w15,w27
	mov	v7.16b,v28.16b
	lsr	x16,x27,#32
	mov	v19.16b,v29.16b
	mov	w17,w28
	mov	v2.16b,v26.16b
	lsr	x19,x28,#32
	mov	v6.16b,v26.16b
	mov	w20,w30
	mov	v18.16b,v26.16b
	lsr	x21,x30,#32

	mov	x4,#10
	subs	x2,x2,#256
.Loop_neon:
	sub	x4,x4,#1
	add	v0.4s,v0.4s,v1.4s
	add	w5,w5,w9
	add	v4.4s,v4.4s,v5.4s
	add	w6,w6,w10
	add	v16.4s,v16.4s,v17.4s
	add	w7,w7,w11
	eor	v3.16b,v3.16b,v0.16b
	add	w8,w8,w12
	eor	v7.16b,v7.16b,v4.16b
	eor	w17,w17,w5
	eor	v19.16b,v19.16b,v16.16b
	eor	w19,w19,w6
	rev32	v3.8h,v3.8h
	eor	w20,w20,w7
	rev32	v7.8h,v7.8h
	eor	w21,w21,w8
	rev32	v19.8h,v19.8h
	ror	w17,w17,#16
	add	v2.4s,v2.4s,v3.4s
	ror	w19,w19,#16
	add	v6.4s,v6.4s,v7.4s
	ror	w20,w20,#16
	add	v18.4s,v18.4s,v19.4s
	ror	w21,w21,#16
	eor	v20.16b,v1.16b,v2.16b
	add	w13,w13,w17
	eor	v21.16b,v5.16b,v6.16b
	add	w14,w14,w19
	eor	v22.16b,v17.16b,v18.16b
	add	w15,w15,w20
	ushr	v1.4s,v20.4s,#20
	add	w16,w16,w21
	ushr	v5.4s,v21.4s,#20
	eor	w9,w9,w13
	ushr	v17.4s,v22.4s,#20
	eor	w10,w10,w14
	sli	v1.4s,v20.4s,#12
	eor	w11,w11,w15
	sli	v5.4s,v21.4s,#12
	eor	w12,w12,w16
	sli	v17.4s,v22.4s,#12
	ror	w9,w9,#20
	add	v0.4s,v0.4s,v1.4s
	ror	w10,w10,#20
	add	v4.4s,v4.4s,v5.4s
	ror	w11,w11,#20
	add	v16.4s,v16.4s,v17.4s
	ror	w12,w12,#20
	eor	v20.16b,v3.16b,v0.16b
	add	w5,w5,w9
	eor	v21.16b,v7.16b,v4.16b
	add	w6,w6,w10
	eor	v22.16b,v19.16b,v16.16b
	add	w7,w7,w11
	ushr	v3.4s,v20.4s,#24
	add	w8,w8,w12
	ushr	v7.4s,v21.4s,#24
	eor	w17,w17,w5
	ushr	v19.4s,v22.4s,#24
	eor	w19,w19,w6
	sli	v3.4s,v20.4s,#8
	eor	w20,w20,w7
	sli	v7.4s,v21.4s,#8
	eor	w21,w21,w8
	sli	v19.4s,v22.4s,#8
	ror	w17,w17,#24
	add	v2.4s,v2.4s,v3.4s
	ror	w19,w19,#24
	add	v6.4s,v6.4s,v7.4s
	ror	w20,w20,#24
	add	v18.4s,v18.4s,v19.4s
	ror	w21,w21,#24
	eor	v20.16b,v1.16b,v2.16b
	add	w13,w13,w17
	eor	v21.16b,v5.16b,v6.16b
	add	w14,w14,w19
	eor	v22.16b,v17.16b,v18.16b
	add	w15,w15,w20
	ushr	v1.4s,v20.4s,#25
	add	w16,w16,w21
	ushr	v5.4s,v21.4s,#25
	eor	w9,w9,w13
	ushr	v17.4s,v22.4s,#25
	eor	w10,w10,w14
	sli	v1.4s,v20.4s,#7
	eor	w11,w11,w15
	sli	v5.4s,v21.4s,#7
	eor	w12,w12,w16
	sli	v17.4s,v22.4s,#7
	ror	w9,w9,#25
	ext	v2.16b,v2.16b,v2.16b,#8
	ror	w10,w10,#25
	ext	v6.16b,v6.16b,v6.16b,#8
	ror	w11,w11,#25
	ext	v18.16b,v18.16b,v18.16b,#8
	ror	w12,w12,#25
	ext	v3.16b,v3.16b,v3.16b,#12
	ext	v7.16b,v7.16b,v7.16b,#12
	ext	v19.16b,v19.16b,v19.16b,#12
	ext	v1.16b,v1.16b,v1.16b,#4
	ext	v5.16b,v5.16b,v5.16b,#4
	ext	v17.16b,v17.16b,v17.16b,#4
	add	v0.4s,v0.4s,v1.4s
	add	w5,w5,w10
	add	v4.4s,v4.4s,v5.4s
	add	w6,w6,w11
	add	v16.4s,v16.4s,v17.4s
	add	w7,w7,w12
	eor	v3.16b,v3.16b,v0.16b
	add	w8,w8,w9
	eor	v7.16b,v7.16b,v4.16b
	eor	w21,w21,w5
	eor	v19.16b,v19.16b,v16.16b
	eor	w17,w17,w6
	rev32	v3.8h,v3.8h
	eor	w19,w19,w7
	rev32	v7.8h,v7.8h
	eor	w20,w20,w8
	rev32	v19.8h,v19.8h
	ror	w21,w21,#16
	add	v2.4s,v2.4s,v3.4s
	ror	w17,w17,#16
	add	v6.4s,v6.4s,v7.4s
	ror	w19,w19,#16
	add	v18.4s,v18.4s,v19.4s
	ror	w20,w20,#16
	eor	v20.16b,v1.16b,v2.16b
	add	w15,w15,w21
	eor	v21.16b,v5.16b,v6.16b
	add	w16,w16,w17
	eor	v22.16b,v17.16b,v18.16b
	add	w13,w13,w19
	ushr	v1.4s,v20.4s,#20
	add	w14,w14,w20
	ushr	v5.4s,v21.4s,#20
	eor	w10,w10,w15
	ushr	v17.4s,v22.4s,#20
	eor	w11,w11,w16
	sli	v1.4s,v20.4s,#12
	eor	w12,w12,w13
	sli	v5.4s,v21.4s,#12
	eor	w9,w9,w14
	sli	v17.4s,v22.4s,#12
	ror	w10,w10,#20
	add	v0.4s,v0.4s,v1.4s
	ror	w11,w11,#20
	add	v4.4s,v4.4s,v5.4s
	ror	w12,w12,#20
	add	v16.4s,v16.4s,v17.4s
	ror	w9,w9,#20
	eor	v20.16b,v3.16b,v0.16b
	add	w5,w5,w10
	eor	v21.16b,v7.16b,v4.16b
	add	w6,w6,w11
	eor	v22.16b,v19.16b,v16.16b
	add	w7,w7,w12
	ushr	v3.4s,v20.4s,#24
	add	w8,w8,w9
	ushr	v7.4s,v21.4s,#24
	eor	w21,w21,w5
	ushr	v19.4s,v22.4s,#24
	eor	w17,w17,w6
	sli	v3.4s,v20.4s,#8
	eor	w19,w19,w7
	sli	v7.4s,v21.4s,#8
	eor	w20,w20,w8
	sli	v19.4s,v22.4s,#8
	ror	w21,w21,#24
	add	v2.4s,v2.4s,v3.4s
	ror	w17,w17,#24
	add	v6.4s,v6.4s,v7.4s
	ror	w19,w19,#24
	add	v18.4s,v18.4s,v19.4s
	ror	w20,w20,#24
	eor	v20.16b,v1.16b,v2.16b
	add	w15,w15,w21
	eor	v21.16b,v5.16b,v6.16b
	add	w16,w16,w17
	eor	v22.16b,v17.16b,v18.16b
	add	w13,w13,w19
	ushr	v1.4s,v20.4s,#25
	add	w14,w14,w20
	ushr	v5.4s,v21.4s,#25
	eor	w10,w10,w15
	ushr	v17.4s,v22.4s,#25
	eor	w11,w11,w16
	sli	v1.4s,v20.4s,#7
	eor	w12,w12,w13
	sli	v5.4s,v21.4s,#7
	eor	w9,w9,w14
	sli	v17.4s,v22.4s,#7
	ror	w10,w10,#25
	ext	v2.16b,v2.16b,v2.16b,#8
	ror	w11,w11,#25
	ext	v6.16b,v6.16b,v6.16b,#8
	ror	w12,w12,#25
	ext	v18.16b,v18.16b,v18.16b,#8
	ror	w9,w9,#25
	ext	v3.16b,v3.16b,v3.16b,#4
	ext	v7.16b,v7.16b,v7.16b,#4
	ext	v19.16b,v19.16b,v19.16b,#4
	ext	v1.16b,v1.16b,v1.16b,#12
	ext	v5.16b,v5.16b,v5.16b,#12
	ext	v17.16b,v17.16b,v17.16b,#12
	cbnz	x4,.Loop_neon

	add	w5,w5,w22		// accumulate key block
	add	v0.4s,v0.4s,v24.4s
	add	x6,x6,x22,lsr#32
	add	v4.4s,v4.4s,v24.4s
	add	w7,w7,w23
	add	v16.4s,v16.4s,v24.4s
	add	x8,x8,x23,lsr#32
	add	v2.4s,v2.4s,v26.4s
	add	w9,w9,w24
	add	v6.4s,v6.4s,v26.4s
	add	x10,x10,x24,lsr#32
	add	v18.4s,v18.4s,v26.4s
	add	w11,w11,w25
	add	v3.4s,v3.4s,v27.4s
	add	x12,x12,x25,lsr#32
	add	w13,w13,w26
	add	v7.4s,v7.4s,v28.4s
	add	x14,x14,x26,lsr#32
	add	w15,w15,w27
	add	v19.4s,v19.4s,v29.4s
	add	x16,x16,x27,lsr#32
	add	w17,w17,w28
	add	v1.4s,v1.4s,v25.4s
	add	x19,x19,x28,lsr#32
	add	w20,w20,w30
	add	v5.4s,v5.4s,v25.4s
	add	x21,x21,x30,lsr#32
	add	v17.4s,v17.4s,v25.4s

	b.lo	.Ltail_neon

	add	x5,x5,x6,lsl#32	// pack
	add	x7,x7,x8,lsl#32
	ldp	x6,x8,[x1,#0]		// load input
	add	x9,x9,x10,lsl#32
	add	x11,x11,x12,lsl#32
	ldp	x10,x12,[x1,#16]
	add	x13,x13,x14,lsl#32
	add	x15,x15,x16,lsl#32
	ldp	x14,x16,[x1,#32]
	add	x17,x17,x19,lsl#32
	add	x20,x20,x21,lsl#32
	ldp	x19,x21,[x1,#48]
	add	x1,x1,#64
#ifdef	__ARMEB__
	rev	x5,x5
	rev	x7,x7
	rev	x9,x9
	rev	x11,x11
	rev	x13,x13
	rev	x15,x15
	rev	x17,x17
	rev	x20,x20
#endif
	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
	eor	x5,x5,x6
	eor	x7,x7,x8
	eor	x9,x9,x10
	eor	x11,x11,x12
	eor	x13,x13,x14
	eor	v0.16b,v0.16b,v20.16b
	eor	x15,x15,x16
	eor	v1.16b,v1.16b,v21.16b
	eor	x17,x17,x19
	eor	v2.16b,v2.16b,v22.16b
	eor	x20,x20,x21
	eor	v3.16b,v3.16b,v23.16b
	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64

	stp	x5,x7,[x0,#0]		// store output
	add	x28,x28,#4			// increment counter
	stp	x9,x11,[x0,#16]
	add	v27.4s,v27.4s,v31.4s		// += 4
	stp	x13,x15,[x0,#32]
	add	v28.4s,v28.4s,v31.4s
	stp	x17,x20,[x0,#48]
	add	v29.4s,v29.4s,v31.4s
	add	x0,x0,#64

	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
	ld1	{v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64

	eor	v4.16b,v4.16b,v20.16b
	eor	v5.16b,v5.16b,v21.16b
	eor	v6.16b,v6.16b,v22.16b
	eor	v7.16b,v7.16b,v23.16b
	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64

	eor	v16.16b,v16.16b,v0.16b
	eor	v17.16b,v17.16b,v1.16b
	eor	v18.16b,v18.16b,v2.16b
	eor	v19.16b,v19.16b,v3.16b
	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64

	b.hi	.Loop_outer_neon

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	ret

.Ltail_neon:
	add	x2,x2,#256
	cmp	x2,#64
	b.lo	.Less_than_64

	add	x5,x5,x6,lsl#32	// pack
	add	x7,x7,x8,lsl#32
	ldp	x6,x8,[x1,#0]		// load input
	add	x9,x9,x10,lsl#32
	add	x11,x11,x12,lsl#32
	ldp	x10,x12,[x1,#16]
	add	x13,x13,x14,lsl#32
	add	x15,x15,x16,lsl#32
	ldp	x14,x16,[x1,#32]
	add	x17,x17,x19,lsl#32
	add	x20,x20,x21,lsl#32
	ldp	x19,x21,[x1,#48]
	add	x1,x1,#64
#ifdef	__ARMEB__
	rev	x5,x5
	rev	x7,x7
	rev	x9,x9
	rev	x11,x11
	rev	x13,x13
	rev	x15,x15
	rev	x17,x17
	rev	x20,x20
#endif
	eor	x5,x5,x6
	eor	x7,x7,x8
	eor	x9,x9,x10
	eor	x11,x11,x12
	eor	x13,x13,x14
	eor	x15,x15,x16
	eor	x17,x17,x19
	eor	x20,x20,x21

	stp	x5,x7,[x0,#0]		// store output
	add	x28,x28,#4			// increment counter
	stp	x9,x11,[x0,#16]
	stp	x13,x15,[x0,#32]
	stp	x17,x20,[x0,#48]
	add	x0,x0,#64
	b.eq	.Ldone_neon
	sub	x2,x2,#64
	cmp	x2,#64
	b.lo	.Less_than_128

	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
	eor	v0.16b,v0.16b,v20.16b
	eor	v1.16b,v1.16b,v21.16b
	eor	v2.16b,v2.16b,v22.16b
	eor	v3.16b,v3.16b,v23.16b
	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
	b.eq	.Ldone_neon
	sub	x2,x2,#64
	cmp	x2,#64
	b.lo	.Less_than_192

	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
	eor	v4.16b,v4.16b,v20.16b
	eor	v5.16b,v5.16b,v21.16b
	eor	v6.16b,v6.16b,v22.16b
	eor	v7.16b,v7.16b,v23.16b
	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
	b.eq	.Ldone_neon
	sub	x2,x2,#64

	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[sp]
	b	.Last_neon

.Less_than_128:
	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[sp]
	b	.Last_neon
.Less_than_192:
	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[sp]
	b	.Last_neon

.align	4
.Last_neon:
	sub	x0,x0,#1
	add	x1,x1,x2
	add	x0,x0,x2
	add	x4,sp,x2
	neg	x2,x2

.Loop_tail_neon:
	ldrb	w10,[x1,x2]
	ldrb	w11,[x4,x2]
	add	x2,x2,#1
	eor	w10,w10,w11
	strb	w10,[x0,x2]
	cbnz	x2,.Loop_tail_neon

	stp	xzr,xzr,[sp,#0]
	stp	xzr,xzr,[sp,#16]
	stp	xzr,xzr,[sp,#32]
	stp	xzr,xzr,[sp,#48]

.Ldone_neon:
	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	ret

.L512_or_more_neon:
	sub	sp,sp,#128+64

	ldp	x22,x23,[x5]		// load sigma
	ld1	{v24.4s},[x5],#16
	ldp	x24,x25,[x3]		// load key
	ldp	x26,x27,[x3,#16]
	ld1	{v25.4s,v26.4s},[x3]
	ldp	x28,x30,[x4]		// load counter
	ld1	{v27.4s},[x4]
	ld1	{v31.4s},[x5]
#ifdef	__ARMEB__
	rev64	v24.4s,v24.4s
	ror	x24,x24,#32
	ror	x25,x25,#32
	ror	x26,x26,#32
	ror	x27,x27,#32
	ror	x28,x28,#32
	ror	x30,x30,#32
#endif
	add	v27.4s,v27.4s,v31.4s		// += 1
	stp	q24,q25,[sp,#0]		// off-load key block, invariant part
	add	v27.4s,v27.4s,v31.4s		// not typo
	str	q26,[sp,#32]
	add	v28.4s,v27.4s,v31.4s
	add	v29.4s,v28.4s,v31.4s
	add	v30.4s,v29.4s,v31.4s
	shl	v31.4s,v31.4s,#2			// 1 -> 4

	stp	d8,d9,[sp,#128+0]		// meet ABI requirements
	stp	d10,d11,[sp,#128+16]
	stp	d12,d13,[sp,#128+32]
	stp	d14,d15,[sp,#128+48]

	sub	x2,x2,#512			// not typo

.Loop_outer_512_neon:
	mov	v0.16b,v24.16b
	mov	v4.16b,v24.16b
	mov	v8.16b,v24.16b
	mov	v12.16b,v24.16b
	mov	v16.16b,v24.16b
	mov	v20.16b,v24.16b
	mov	v1.16b,v25.16b
	mov	w5,w22			// unpack key block
	mov	v5.16b,v25.16b
	lsr	x6,x22,#32
	mov	v9.16b,v25.16b
	mov	w7,w23
	mov	v13.16b,v25.16b
	lsr	x8,x23,#32
	mov	v17.16b,v25.16b
	mov	w9,w24
	mov	v21.16b,v25.16b
	lsr	x10,x24,#32
	mov	v3.16b,v27.16b
	mov	w11,w25
	mov	v7.16b,v28.16b
	lsr	x12,x25,#32
	mov	v11.16b,v29.16b
	mov	w13,w26
	mov	v15.16b,v30.16b
	lsr	x14,x26,#32
	mov	v2.16b,v26.16b
	mov	w15,w27
	mov	v6.16b,v26.16b
	lsr	x16,x27,#32
	add	v19.4s,v3.4s,v31.4s			// +4
	mov	w17,w28
	add	v23.4s,v7.4s,v31.4s			// +4
	lsr	x19,x28,#32
	mov	v10.16b,v26.16b
	mov	w20,w30
	mov	v14.16b,v26.16b
	lsr	x21,x30,#32
	mov	v18.16b,v26.16b
	stp	q27,q28,[sp,#48]		// off-load key block, variable part
	mov	v22.16b,v26.16b
	str	q29,[sp,#80]

	mov	x4,#5
	subs	x2,x2,#512
.Loop_upper_neon:
	sub	x4,x4,#1
	add	v0.4s,v0.4s,v1.4s
	add	w5,w5,w9
	add	v4.4s,v4.4s,v5.4s
	add	w6,w6,w10
	add	v8.4s,v8.4s,v9.4s
	add	w7,w7,w11
	add	v12.4s,v12.4s,v13.4s
	add	w8,w8,w12
	add	v16.4s,v16.4s,v17.4s
	eor	w17,w17,w5
	add	v20.4s,v20.4s,v21.4s
	eor	w19,w19,w6
	eor	v3.16b,v3.16b,v0.16b
	eor	w20,w20,w7
	eor	v7.16b,v7.16b,v4.16b
	eor	w21,w21,w8
	eor	v11.16b,v11.16b,v8.16b
	ror	w17,w17,#16
	eor	v15.16b,v15.16b,v12.16b
	ror	w19,w19,#16
	eor	v19.16b,v19.16b,v16.16b
	ror	w20,w20,#16
	eor	v23.16b,v23.16b,v20.16b
	ror	w21,w21,#16
	rev32	v3.8h,v3.8h
	add	w13,w13,w17
	rev32	v7.8h,v7.8h
	add	w14,w14,w19
	rev32	v11.8h,v11.8h
	add	w15,w15,w20
	rev32	v15.8h,v15.8h
	add	w16,w16,w21
	rev32	v19.8h,v19.8h
	eor	w9,w9,w13
	rev32	v23.8h,v23.8h
	eor	w10,w10,w14
	add	v2.4s,v2.4s,v3.4s
	eor	w11,w11,w15
	add	v6.4s,v6.4s,v7.4s
	eor	w12,w12,w16
	add	v10.4s,v10.4s,v11.4s
	ror	w9,w9,#20
	add	v14.4s,v14.4s,v15.4s
	ror	w10,w10,#20
	add	v18.4s,v18.4s,v19.4s
	ror	w11,w11,#20
	add	v22.4s,v22.4s,v23.4s
	ror	w12,w12,#20
	eor	v24.16b,v1.16b,v2.16b
	add	w5,w5,w9
	eor	v25.16b,v5.16b,v6.16b
	add	w6,w6,w10
	eor	v26.16b,v9.16b,v10.16b
	add	w7,w7,w11
	eor	v27.16b,v13.16b,v14.16b
	add	w8,w8,w12
	eor	v28.16b,v17.16b,v18.16b
	eor	w17,w17,w5
	eor	v29.16b,v21.16b,v22.16b
	eor	w19,w19,w6
	ushr	v1.4s,v24.4s,#20
	eor	w20,w20,w7
	ushr	v5.4s,v25.4s,#20
	eor	w21,w21,w8
	ushr	v9.4s,v26.4s,#20
	ror	w17,w17,#24
	ushr	v13.4s,v27.4s,#20
	ror	w19,w19,#24
	ushr	v17.4s,v28.4s,#20
	ror	w20,w20,#24
	ushr	v21.4s,v29.4s,#20
	ror	w21,w21,#24
	sli	v1.4s,v24.4s,#12
	add	w13,w13,w17
	sli	v5.4s,v25.4s,#12
	add	w14,w14,w19
	sli	v9.4s,v26.4s,#12
	add	w15,w15,w20
	sli	v13.4s,v27.4s,#12
	add	w16,w16,w21
	sli	v17.4s,v28.4s,#12
	eor	w9,w9,w13
	sli	v21.4s,v29.4s,#12
	eor	w10,w10,w14
	add	v0.4s,v0.4s,v1.4s
	eor	w11,w11,w15
	add	v4.4s,v4.4s,v5.4s
	eor	w12,w12,w16
	add	v8.4s,v8.4s,v9.4s
	ror	w9,w9,#25
	add	v12.4s,v12.4s,v13.4s
	ror	w10,w10,#25
	add	v16.4s,v16.4s,v17.4s
	ror	w11,w11,#25
	add	v20.4s,v20.4s,v21.4s
	ror	w12,w12,#25
	eor	v24.16b,v3.16b,v0.16b
	add	w5,w5,w10
	eor	v25.16b,v7.16b,v4.16b
	add	w6,w6,w11
	eor	v26.16b,v11.16b,v8.16b
	add	w7,w7,w12
	eor	v27.16b,v15.16b,v12.16b
	add	w8,w8,w9
	eor	v28.16b,v19.16b,v16.16b
	eor	w21,w21,w5
	eor	v29.16b,v23.16b,v20.16b
	eor	w17,w17,w6
	ushr	v3.4s,v24.4s,#24
	eor	w19,w19,w7
	ushr	v7.4s,v25.4s,#24
	eor	w20,w20,w8
	ushr	v11.4s,v26.4s,#24
	ror	w21,w21,#16
	ushr	v15.4s,v27.4s,#24
	ror	w17,w17,#16
	ushr	v19.4s,v28.4s,#24
	ror	w19,w19,#16
	ushr	v23.4s,v29.4s,#24
	ror	w20,w20,#16
	sli	v3.4s,v24.4s,#8
	add	w15,w15,w21
	sli	v7.4s,v25.4s,#8
	add	w16,w16,w17
	sli	v11.4s,v26.4s,#8
	add	w13,w13,w19
	sli	v15.4s,v27.4s,#8
	add	w14,w14,w20
	sli	v19.4s,v28.4s,#8
	eor	w10,w10,w15
	sli	v23.4s,v29.4s,#8
	eor	w11,w11,w16
	add	v2.4s,v2.4s,v3.4s
	eor	w12,w12,w13
	add	v6.4s,v6.4s,v7.4s
	eor	w9,w9,w14
	add	v10.4s,v10.4s,v11.4s
	ror	w10,w10,#20
	add	v14.4s,v14.4s,v15.4s
	ror	w11,w11,#20
	add	v18.4s,v18.4s,v19.4s
	ror	w12,w12,#20
	add	v22.4s,v22.4s,v23.4s
	ror	w9,w9,#20
	eor	v24.16b,v1.16b,v2.16b
	add	w5,w5,w10
	eor	v25.16b,v5.16b,v6.16b
	add	w6,w6,w11
	eor	v26.16b,v9.16b,v10.16b
	add	w7,w7,w12
	eor	v27.16b,v13.16b,v14.16b
	add	w8,w8,w9
	eor	v28.16b,v17.16b,v18.16b
	eor	w21,w21,w5
	eor	v29.16b,v21.16b,v22.16b
	eor	w17,w17,w6
	ushr	v1.4s,v24.4s,#25
	eor	w19,w19,w7
	ushr	v5.4s,v25.4s,#25
	eor	w20,w20,w8
	ushr	v9.4s,v26.4s,#25
	ror	w21,w21,#24
	ushr	v13.4s,v27.4s,#25
	ror	w17,w17,#24
	ushr	v17.4s,v28.4s,#25
	ror	w19,w19,#24
	ushr	v21.4s,v29.4s,#25
	ror	w20,w20,#24
	sli	v1.4s,v24.4s,#7
	add	w15,w15,w21
	sli	v5.4s,v25.4s,#7
	add	w16,w16,w17
	sli	v9.4s,v26.4s,#7
	add	w13,w13,w19
	sli	v13.4s,v27.4s,#7
	add	w14,w14,w20
	sli	v17.4s,v28.4s,#7
	eor	w10,w10,w15
	sli	v21.4s,v29.4s,#7
	eor	w11,w11,w16
	ext	v2.16b,v2.16b,v2.16b,#8
	eor	w12,w12,w13
	ext	v6.16b,v6.16b,v6.16b,#8
	eor	w9,w9,w14
	ext	v10.16b,v10.16b,v10.16b,#8
	ror	w10,w10,#25
	ext	v14.16b,v14.16b,v14.16b,#8
	ror	w11,w11,#25
	ext	v18.16b,v18.16b,v18.16b,#8
	ror	w12,w12,#25
	ext	v22.16b,v22.16b,v22.16b,#8
	ror	w9,w9,#25
	ext	v3.16b,v3.16b,v3.16b,#12
	ext	v7.16b,v7.16b,v7.16b,#12
	ext	v11.16b,v11.16b,v11.16b,#12
	ext	v15.16b,v15.16b,v15.16b,#12
	ext	v19.16b,v19.16b,v19.16b,#12
	ext	v23.16b,v23.16b,v23.16b,#12
	ext	v1.16b,v1.16b,v1.16b,#4
	ext	v5.16b,v5.16b,v5.16b,#4
	ext	v9.16b,v9.16b,v9.16b,#4
	ext	v13.16b,v13.16b,v13.16b,#4
	ext	v17.16b,v17.16b,v17.16b,#4
	ext	v21.16b,v21.16b,v21.16b,#4
	add	v0.4s,v0.4s,v1.4s
	add	w5,w5,w9
	add	v4.4s,v4.4s,v5.4s
	add	w6,w6,w10
	add	v8.4s,v8.4s,v9.4s
	add	w7,w7,w11
	add	v12.4s,v12.4s,v13.4s
	add	w8,w8,w12
	add	v16.4s,v16.4s,v17.4s
	eor	w17,w17,w5
	add	v20.4s,v20.4s,v21.4s
	eor	w19,w19,w6
	eor	v3.16b,v3.16b,v0.16b
	eor	w20,w20,w7
	eor	v7.16b,v7.16b,v4.16b
	eor	w21,w21,w8
	eor	v11.16b,v11.16b,v8.16b
	ror	w17,w17,#16
	eor	v15.16b,v15.16b,v12.16b
	ror	w19,w19,#16
	eor	v19.16b,v19.16b,v16.16b
	ror	w20,w20,#16
	eor	v23.16b,v23.16b,v20.16b
	ror	w21,w21,#16
	rev32	v3.8h,v3.8h
	add	w13,w13,w17
	rev32	v7.8h,v7.8h
	add	w14,w14,w19
	rev32	v11.8h,v11.8h
	add	w15,w15,w20
	rev32	v15.8h,v15.8h
	add	w16,w16,w21
	rev32	v19.8h,v19.8h
	eor	w9,w9,w13
	rev32	v23.8h,v23.8h
	eor	w10,w10,w14
	add	v2.4s,v2.4s,v3.4s
	eor	w11,w11,w15
	add	v6.4s,v6.4s,v7.4s
	eor	w12,w12,w16
	add	v10.4s,v10.4s,v11.4s
	ror	w9,w9,#20
	add	v14.4s,v14.4s,v15.4s
	ror	w10,w10,#20
	add	v18.4s,v18.4s,v19.4s
	ror	w11,w11,#20
	add	v22.4s,v22.4s,v23.4s
	ror	w12,w12,#20
	eor	v24.16b,v1.16b,v2.16b
	add	w5,w5,w9
	eor	v25.16b,v5.16b,v6.16b
	add	w6,w6,w10
	eor	v26.16b,v9.16b,v10.16b
	add	w7,w7,w11
	eor	v27.16b,v13.16b,v14.16b
	add	w8,w8,w12
	eor	v28.16b,v17.16b,v18.16b
	eor	w17,w17,w5
	eor	v29.16b,v21.16b,v22.16b
	eor	w19,w19,w6
	ushr	v1.4s,v24.4s,#20
	eor	w20,w20,w7
	ushr	v5.4s,v25.4s,#20
	eor	w21,w21,w8
	ushr	v9.4s,v26.4s,#20
	ror	w17,w17,#24
	ushr	v13.4s,v27.4s,#20
	ror	w19,w19,#24
	ushr	v17.4s,v28.4s,#20
	ror	w20,w20,#24
	ushr	v21.4s,v29.4s,#20
	ror	w21,w21,#24
	sli	v1.4s,v24.4s,#12
	add	w13,w13,w17
	sli	v5.4s,v25.4s,#12
	add	w14,w14,w19
	sli	v9.4s,v26.4s,#12
	add	w15,w15,w20
	sli	v13.4s,v27.4s,#12
	add	w16,w16,w21
	sli	v17.4s,v28.4s,#12
	eor	w9,w9,w13
	sli	v21.4s,v29.4s,#12
	eor	w10,w10,w14
	add	v0.4s,v0.4s,v1.4s
	eor	w11,w11,w15
	add	v4.4s,v4.4s,v5.4s
	eor	w12,w12,w16
	add	v8.4s,v8.4s,v9.4s
	ror	w9,w9,#25
	add	v12.4s,v12.4s,v13.4s
	ror	w10,w10,#25
	add	v16.4s,v16.4s,v17.4s
	ror	w11,w11,#25
	add	v20.4s,v20.4s,v21.4s
	ror	w12,w12,#25
	eor	v24.16b,v3.16b,v0.16b
	add	w5,w5,w10
	eor	v25.16b,v7.16b,v4.16b
	add	w6,w6,w11
	eor	v26.16b,v11.16b,v8.16b
	add	w7,w7,w12
	eor	v27.16b,v15.16b,v12.16b
	add	w8,w8,w9
	eor	v28.16b,v19.16b,v16.16b
	eor	w21,w21,w5
	eor	v29.16b,v23.16b,v20.16b
	eor	w17,w17,w6
	ushr	v3.4s,v24.4s,#24
	eor	w19,w19,w7
	ushr	v7.4s,v25.4s,#24
	eor	w20,w20,w8
	ushr	v11.4s,v26.4s,#24
	ror	w21,w21,#16
	ushr	v15.4s,v27.4s,#24
	ror	w17,w17,#16
	ushr	v19.4s,v28.4s,#24
	ror	w19,w19,#16
	ushr	v23.4s,v29.4s,#24
	ror	w20,w20,#16
	sli	v3.4s,v24.4s,#8
	add	w15,w15,w21
	sli	v7.4s,v25.4s,#8
	add	w16,w16,w17
	sli	v11.4s,v26.4s,#8
	add	w13,w13,w19
	sli	v15.4s,v27.4s,#8
	add	w14,w14,w20
	sli	v19.4s,v28.4s,#8
	eor	w10,w10,w15
	sli	v23.4s,v29.4s,#8
	eor	w11,w11,w16
	add	v2.4s,v2.4s,v3.4s
	eor	w12,w12,w13
	add	v6.4s,v6.4s,v7.4s
	eor	w9,w9,w14
	add	v10.4s,v10.4s,v11.4s
	ror	w10,w10,#20
	add	v14.4s,v14.4s,v15.4s
	ror	w11,w11,#20
	add	v18.4s,v18.4s,v19.4s
	ror	w12,w12,#20
	add	v22.4s,v22.4s,v23.4s
	ror	w9,w9,#20
	eor	v24.16b,v1.16b,v2.16b
	add	w5,w5,w10
	eor	v25.16b,v5.16b,v6.16b
	add	w6,w6,w11
	eor	v26.16b,v9.16b,v10.16b
	add	w7,w7,w12
	eor	v27.16b,v13.16b,v14.16b
	add	w8,w8,w9
	eor	v28.16b,v17.16b,v18.16b
	eor	w21,w21,w5
	eor	v29.16b,v21.16b,v22.16b
	eor	w17,w17,w6
	ushr	v1.4s,v24.4s,#25
	eor	w19,w19,w7
	ushr	v5.4s,v25.4s,#25
	eor	w20,w20,w8
	ushr	v9.4s,v26.4s,#25
	ror	w21,w21,#24
	ushr	v13.4s,v27.4s,#25
	ror	w17,w17,#24
	ushr	v17.4s,v28.4s,#25
	ror	w19,w19,#24
	ushr	v21.4s,v29.4s,#25
	ror	w20,w20,#24
	sli	v1.4s,v24.4s,#7
	add	w15,w15,w21
	sli	v5.4s,v25.4s,#7
	add	w16,w16,w17
	sli	v9.4s,v26.4s,#7
	add	w13,w13,w19
	sli	v13.4s,v27.4s,#7
	add	w14,w14,w20
	sli	v17.4s,v28.4s,#7
	eor	w10,w10,w15
	sli	v21.4s,v29.4s,#7
	eor	w11,w11,w16
	ext	v2.16b,v2.16b,v2.16b,#8
	eor	w12,w12,w13
	ext	v6.16b,v6.16b,v6.16b,#8
	eor	w9,w9,w14
	ext	v10.16b,v10.16b,v10.16b,#8
	ror	w10,w10,#25
	ext	v14.16b,v14.16b,v14.16b,#8
	ror	w11,w11,#25
	ext	v18.16b,v18.16b,v18.16b,#8
	ror	w12,w12,#25
	ext	v22.16b,v22.16b,v22.16b,#8
	ror	w9,w9,#25
	ext	v3.16b,v3.16b,v3.16b,#4
	ext	v7.16b,v7.16b,v7.16b,#4
	ext	v11.16b,v11.16b,v11.16b,#4
	ext	v15.16b,v15.16b,v15.16b,#4
	ext	v19.16b,v19.16b,v19.16b,#4
	ext	v23.16b,v23.16b,v23.16b,#4
	ext	v1.16b,v1.16b,v1.16b,#12
	ext	v5.16b,v5.16b,v5.16b,#12
	ext	v9.16b,v9.16b,v9.16b,#12
	ext	v13.16b,v13.16b,v13.16b,#12
	ext	v17.16b,v17.16b,v17.16b,#12
	ext	v21.16b,v21.16b,v21.16b,#12
	cbnz	x4,.Loop_upper_neon

	add	w5,w5,w22		// accumulate key block
	add	x6,x6,x22,lsr#32
	add	w7,w7,w23
	add	x8,x8,x23,lsr#32
	add	w9,w9,w24
	add	x10,x10,x24,lsr#32
	add	w11,w11,w25
	add	x12,x12,x25,lsr#32
	add	w13,w13,w26
	add	x14,x14,x26,lsr#32
	add	w15,w15,w27
	add	x16,x16,x27,lsr#32
	add	w17,w17,w28
	add	x19,x19,x28,lsr#32
	add	w20,w20,w30
	add	x21,x21,x30,lsr#32

	add	x5,x5,x6,lsl#32	// pack
	add	x7,x7,x8,lsl#32
	ldp	x6,x8,[x1,#0]		// load input
	add	x9,x9,x10,lsl#32
	add	x11,x11,x12,lsl#32
	ldp	x10,x12,[x1,#16]
	add	x13,x13,x14,lsl#32
	add	x15,x15,x16,lsl#32
	ldp	x14,x16,[x1,#32]
	add	x17,x17,x19,lsl#32
	add	x20,x20,x21,lsl#32
	ldp	x19,x21,[x1,#48]
	add	x1,x1,#64
#ifdef	__ARMEB__
	rev	x5,x5
	rev	x7,x7
	rev	x9,x9
	rev	x11,x11
	rev	x13,x13
	rev	x15,x15
	rev	x17,x17
	rev	x20,x20
#endif
	eor	x5,x5,x6
	eor	x7,x7,x8
	eor	x9,x9,x10
	eor	x11,x11,x12
	eor	x13,x13,x14
	eor	x15,x15,x16
	eor	x17,x17,x19
	eor	x20,x20,x21

	stp	x5,x7,[x0,#0]		// store output
	add	x28,x28,#1			// increment counter
	mov	w5,w22			// unpack key block
	lsr	x6,x22,#32
	stp	x9,x11,[x0,#16]
	mov	w7,w23
	lsr	x8,x23,#32
	stp	x13,x15,[x0,#32]
	mov	w9,w24
	lsr	x10,x24,#32
	stp	x17,x20,[x0,#48]
	add	x0,x0,#64
	mov	w11,w25
	lsr	x12,x25,#32
	mov	w13,w26
	lsr	x14,x26,#32
	mov	w15,w27
	lsr	x16,x27,#32
	mov	w17,w28
	lsr	x19,x28,#32
	mov	w20,w30
	lsr	x21,x30,#32

	mov	x4,#5
.Loop_lower_neon:
	sub	x4,x4,#1
	add	v0.4s,v0.4s,v1.4s
	add	w5,w5,w9
	add	v4.4s,v4.4s,v5.4s
	add	w6,w6,w10
	add	v8.4s,v8.4s,v9.4s
	add	w7,w7,w11
	add	v12.4s,v12.4s,v13.4s
	add	w8,w8,w12
	add	v16.4s,v16.4s,v17.4s
	eor	w17,w17,w5
	add	v20.4s,v20.4s,v21.4s
	eor	w19,w19,w6
	eor	v3.16b,v3.16b,v0.16b
	eor	w20,w20,w7
	eor	v7.16b,v7.16b,v4.16b
	eor	w21,w21,w8
	eor	v11.16b,v11.16b,v8.16b
	ror	w17,w17,#16
	eor	v15.16b,v15.16b,v12.16b
	ror	w19,w19,#16
	eor	v19.16b,v19.16b,v16.16b
	ror	w20,w20,#16
	eor	v23.16b,v23.16b,v20.16b
	ror	w21,w21,#16
	rev32	v3.8h,v3.8h
	add	w13,w13,w17
	rev32	v7.8h,v7.8h
	add	w14,w14,w19
	rev32	v11.8h,v11.8h
	add	w15,w15,w20
	rev32	v15.8h,v15.8h
	add	w16,w16,w21
	rev32	v19.8h,v19.8h
	eor	w9,w9,w13
	rev32	v23.8h,v23.8h
	eor	w10,w10,w14
	add	v2.4s,v2.4s,v3.4s
	eor	w11,w11,w15
	add	v6.4s,v6.4s,v7.4s
	eor	w12,w12,w16
	add	v10.4s,v10.4s,v11.4s
	ror	w9,w9,#20
	add	v14.4s,v14.4s,v15.4s
	ror	w10,w10,#20
	add	v18.4s,v18.4s,v19.4s
	ror	w11,w11,#20
	add	v22.4s,v22.4s,v23.4s
	ror	w12,w12,#20
	eor	v24.16b,v1.16b,v2.16b
	add	w5,w5,w9
	eor	v25.16b,v5.16b,v6.16b
	add	w6,w6,w10
	eor	v26.16b,v9.16b,v10.16b
	add	w7,w7,w11
	eor	v27.16b,v13.16b,v14.16b
	add	w8,w8,w12
	eor	v28.16b,v17.16b,v18.16b
	eor	w17,w17,w5
	eor	v29.16b,v21.16b,v22.16b
	eor	w19,w19,w6
	ushr	v1.4s,v24.4s,#20
	eor	w20,w20,w7
	ushr	v5.4s,v25.4s,#20
	eor	w21,w21,w8
	ushr	v9.4s,v26.4s,#20
	ror	w17,w17,#24
	ushr	v13.4s,v27.4s,#20
	ror	w19,w19,#24
	ushr	v17.4s,v28.4s,#20
	ror	w20,w20,#24
	ushr	v21.4s,v29.4s,#20
	ror	w21,w21,#24
	sli	v1.4s,v24.4s,#12
	add	w13,w13,w17
	sli	v5.4s,v25.4s,#12
	add	w14,w14,w19
	sli	v9.4s,v26.4s,#12
	add	w15,w15,w20
	sli	v13.4s,v27.4s,#12
	add	w16,w16,w21
	sli	v17.4s,v28.4s,#12
	eor	w9,w9,w13
	sli	v21.4s,v29.4s,#12
	eor	w10,w10,w14
	add	v0.4s,v0.4s,v1.4s
	eor	w11,w11,w15
	add	v4.4s,v4.4s,v5.4s
	eor	w12,w12,w16
	add	v8.4s,v8.4s,v9.4s
	ror	w9,w9,#25
	add	v12.4s,v12.4s,v13.4s
	ror	w10,w10,#25
	add	v16.4s,v16.4s,v17.4s
	ror	w11,w11,#25
	add	v20.4s,v20.4s,v21.4s
	ror	w12,w12,#25
	eor	v24.16b,v3.16b,v0.16b
	add	w5,w5,w10
	eor	v25.16b,v7.16b,v4.16b
	add	w6,w6,w11
	eor	v26.16b,v11.16b,v8.16b
	add	w7,w7,w12
	eor	v27.16b,v15.16b,v12.16b
	add	w8,w8,w9
	eor	v28.16b,v19.16b,v16.16b
	eor	w21,w21,w5
	eor	v29.16b,v23.16b,v20.16b
	eor	w17,w17,w6
	ushr	v3.4s,v24.4s,#24
	eor	w19,w19,w7
	ushr	v7.4s,v25.4s,#24
	eor	w20,w20,w8
	ushr	v11.4s,v26.4s,#24
	ror	w21,w21,#16
	ushr	v15.4s,v27.4s,#24
	ror	w17,w17,#16
	ushr	v19.4s,v28.4s,#24
	ror	w19,w19,#16
	ushr	v23.4s,v29.4s,#24
	ror	w20,w20,#16
	sli	v3.4s,v24.4s,#8
	add	w15,w15,w21
	sli	v7.4s,v25.4s,#8
	add	w16,w16,w17
	sli	v11.4s,v26.4s,#8
	add	w13,w13,w19
	sli	v15.4s,v27.4s,#8
	add	w14,w14,w20
	sli	v19.4s,v28.4s,#8
	eor	w10,w10,w15
	sli	v23.4s,v29.4s,#8
	eor	w11,w11,w16
	add	v2.4s,v2.4s,v3.4s
	eor	w12,w12,w13
	add	v6.4s,v6.4s,v7.4s
	eor	w9,w9,w14
	add	v10.4s,v10.4s,v11.4s
	ror	w10,w10,#20
	add	v14.4s,v14.4s,v15.4s
	ror	w11,w11,#20
	add	v18.4s,v18.4s,v19.4s
	ror	w12,w12,#20
	add	v22.4s,v22.4s,v23.4s
	ror	w9,w9,#20
	eor	v24.16b,v1.16b,v2.16b
	add	w5,w5,w10
	eor	v25.16b,v5.16b,v6.16b
	add	w6,w6,w11
	eor	v26.16b,v9.16b,v10.16b
	add	w7,w7,w12
	eor	v27.16b,v13.16b,v14.16b
	add	w8,w8,w9
	eor	v28.16b,v17.16b,v18.16b
	eor	w21,w21,w5
	eor	v29.16b,v21.16b,v22.16b
	eor	w17,w17,w6
	ushr	v1.4s,v24.4s,#25
	eor	w19,w19,w7
	ushr	v5.4s,v25.4s,#25
	eor	w20,w20,w8
	ushr	v9.4s,v26.4s,#25
	ror	w21,w21,#24
	ushr	v13.4s,v27.4s,#25
	ror	w17,w17,#24
	ushr	v17.4s,v28.4s,#25
	ror	w19,w19,#24
	ushr	v21.4s,v29.4s,#25
	ror	w20,w20,#24
	sli	v1.4s,v24.4s,#7
	add	w15,w15,w21
	sli	v5.4s,v25.4s,#7
	add	w16,w16,w17
	sli	v9.4s,v26.4s,#7
	add	w13,w13,w19
	sli	v13.4s,v27.4s,#7
	add	w14,w14,w20
	sli	v17.4s,v28.4s,#7
	eor	w10,w10,w15
	sli	v21.4s,v29.4s,#7
	eor	w11,w11,w16
	ext	v2.16b,v2.16b,v2.16b,#8
	eor	w12,w12,w13
	ext	v6.16b,v6.16b,v6.16b,#8
	eor	w9,w9,w14
	ext	v10.16b,v10.16b,v10.16b,#8
	ror	w10,w10,#25
	ext	v14.16b,v14.16b,v14.16b,#8
	ror	w11,w11,#25
	ext	v18.16b,v18.16b,v18.16b,#8
	ror	w12,w12,#25
	ext	v22.16b,v22.16b,v22.16b,#8
	ror	w9,w9,#25
	ext	v3.16b,v3.16b,v3.16b,#12
	ext	v7.16b,v7.16b,v7.16b,#12
	ext	v11.16b,v11.16b,v11.16b,#12
	ext	v15.16b,v15.16b,v15.16b,#12
	ext	v19.16b,v19.16b,v19.16b,#12
	ext	v23.16b,v23.16b,v23.16b,#12
	ext	v1.16b,v1.16b,v1.16b,#4
	ext	v5.16b,v5.16b,v5.16b,#4
	ext	v9.16b,v9.16b,v9.16b,#4
	ext	v13.16b,v13.16b,v13.16b,#4
	ext	v17.16b,v17.16b,v17.16b,#4
	ext	v21.16b,v21.16b,v21.16b,#4
	add	v0.4s,v0.4s,v1.4s
	add	w5,w5,w9
	add	v4.4s,v4.4s,v5.4s
	add	w6,w6,w10
	add	v8.4s,v8.4s,v9.4s
	add	w7,w7,w11
	add	v12.4s,v12.4s,v13.4s
	add	w8,w8,w12
	add	v16.4s,v16.4s,v17.4s
	eor	w17,w17,w5
	add	v20.4s,v20.4s,v21.4s
	eor	w19,w19,w6
	eor	v3.16b,v3.16b,v0.16b
	eor	w20,w20,w7
	eor	v7.16b,v7.16b,v4.16b
	eor	w21,w21,w8
	eor	v11.16b,v11.16b,v8.16b
	ror	w17,w17,#16
	eor	v15.16b,v15.16b,v12.16b
	ror	w19,w19,#16
	eor	v19.16b,v19.16b,v16.16b
	ror	w20,w20,#16
	eor	v23.16b,v23.16b,v20.16b
	ror	w21,w21,#16
	rev32	v3.8h,v3.8h
	add	w13,w13,w17
	rev32	v7.8h,v7.8h
	add	w14,w14,w19
	rev32	v11.8h,v11.8h
	add	w15,w15,w20
	rev32	v15.8h,v15.8h
	add	w16,w16,w21
	rev32	v19.8h,v19.8h
	eor	w9,w9,w13
	rev32	v23.8h,v23.8h
	eor	w10,w10,w14
	add	v2.4s,v2.4s,v3.4s
	eor	w11,w11,w15
	add	v6.4s,v6.4s,v7.4s
	eor	w12,w12,w16
	add	v10.4s,v10.4s,v11.4s
	ror	w9,w9,#20
	add	v14.4s,v14.4s,v15.4s
	ror	w10,w10,#20
	add	v18.4s,v18.4s,v19.4s
	ror	w11,w11,#20
	add	v22.4s,v22.4s,v23.4s
	ror	w12,w12,#20
	eor	v24.16b,v1.16b,v2.16b
	add	w5,w5,w9
	eor	v25.16b,v5.16b,v6.16b
	add	w6,w6,w10
	eor	v26.16b,v9.16b,v10.16b
	add	w7,w7,w11
	eor	v27.16b,v13.16b,v14.16b
	add	w8,w8,w12
	eor	v28.16b,v17.16b,v18.16b
	eor	w17,w17,w5
	eor	v29.16b,v21.16b,v22.16b
	eor	w19,w19,w6
	ushr	v1.4s,v24.4s,#20
	eor	w20,w20,w7
	ushr	v5.4s,v25.4s,#20
	eor	w21,w21,w8
	ushr	v9.4s,v26.4s,#20
	ror	w17,w17,#24
	ushr	v13.4s,v27.4s,#20
	ror	w19,w19,#24
	ushr	v17.4s,v28.4s,#20
	ror	w20,w20,#24
	ushr	v21.4s,v29.4s,#20
	ror	w21,w21,#24
	sli	v1.4s,v24.4s,#12
	add	w13,w13,w17
	sli	v5.4s,v25.4s,#12
	add	w14,w14,w19
	sli	v9.4s,v26.4s,#12
	add	w15,w15,w20
	sli	v13.4s,v27.4s,#12
	add	w16,w16,w21
	sli	v17.4s,v28.4s,#12
	eor	w9,w9,w13
	sli	v21.4s,v29.4s,#12
	eor	w10,w10,w14
	add	v0.4s,v0.4s,v1.4s
	eor	w11,w11,w15
	add	v4.4s,v4.4s,v5.4s
	eor	w12,w12,w16
	add	v8.4s,v8.4s,v9.4s
	ror	w9,w9,#25
	add	v12.4s,v12.4s,v13.4s
	ror	w10,w10,#25
	add	v16.4s,v16.4s,v17.4s
	ror	w11,w11,#25
	add	v20.4s,v20.4s,v21.4s
	ror	w12,w12,#25
	eor	v24.16b,v3.16b,v0.16b
	add	w5,w5,w10
	eor	v25.16b,v7.16b,v4.16b
	add	w6,w6,w11
	eor	v26.16b,v11.16b,v8.16b
	add	w7,w7,w12
	eor	v27.16b,v15.16b,v12.16b
	add	w8,w8,w9
	eor	v28.16b,v19.16b,v16.16b
	eor	w21,w21,w5
	eor	v29.16b,v23.16b,v20.16b
	eor	w17,w17,w6
	ushr	v3.4s,v24.4s,#24
	eor	w19,w19,w7
	ushr	v7.4s,v25.4s,#24
	eor	w20,w20,w8
	ushr	v11.4s,v26.4s,#24
	ror	w21,w21,#16
	ushr	v15.4s,v27.4s,#24
	ror	w17,w17,#16
	ushr	v19.4s,v28.4s,#24
	ror	w19,w19,#16
	ushr	v23.4s,v29.4s,#24
	ror	w20,w20,#16
	sli	v3.4s,v24.4s,#8
	add	w15,w15,w21
	sli	v7.4s,v25.4s,#8
	add	w16,w16,w17
	sli	v11.4s,v26.4s,#8
	add	w13,w13,w19
	sli	v15.4s,v27.4s,#8
	add	w14,w14,w20
	sli	v19.4s,v28.4s,#8
	eor	w10,w10,w15
	sli	v23.4s,v29.4s,#8
	eor	w11,w11,w16
	add	v2.4s,v2.4s,v3.4s
	eor	w12,w12,w13
	add	v6.4s,v6.4s,v7.4s
	eor	w9,w9,w14
	add	v10.4s,v10.4s,v11.4s
	ror	w10,w10,#20
	add	v14.4s,v14.4s,v15.4s
	ror	w11,w11,#20
	add	v18.4s,v18.4s,v19.4s
	ror	w12,w12,#20
	add	v22.4s,v22.4s,v23.4s
	ror	w9,w9,#20
	eor	v24.16b,v1.16b,v2.16b
	add	w5,w5,w10
	eor	v25.16b,v5.16b,v6.16b
	add	w6,w6,w11
	eor	v26.16b,v9.16b,v10.16b
	add	w7,w7,w12
	eor	v27.16b,v13.16b,v14.16b
	add	w8,w8,w9
	eor	v28.16b,v17.16b,v18.16b
	eor	w21,w21,w5
	eor	v29.16b,v21.16b,v22.16b
	eor	w17,w17,w6
	ushr	v1.4s,v24.4s,#25
	eor	w19,w19,w7
	ushr	v5.4s,v25.4s,#25
	eor	w20,w20,w8
	ushr	v9.4s,v26.4s,#25
	ror	w21,w21,#24
	ushr	v13.4s,v27.4s,#25
	ror	w17,w17,#24
	ushr	v17.4s,v28.4s,#25
	ror	w19,w19,#24
	ushr	v21.4s,v29.4s,#25
	ror	w20,w20,#24
	sli	v1.4s,v24.4s,#7
	add	w15,w15,w21
	sli	v5.4s,v25.4s,#7
	add	w16,w16,w17
	sli	v9.4s,v26.4s,#7
	add	w13,w13,w19
	sli	v13.4s,v27.4s,#7
	add	w14,w14,w20
	sli	v17.4s,v28.4s,#7
	eor	w10,w10,w15
	sli	v21.4s,v29.4s,#7
	eor	w11,w11,w16
	ext	v2.16b,v2.16b,v2.16b,#8
	eor	w12,w12,w13
	ext	v6.16b,v6.16b,v6.16b,#8
	eor	w9,w9,w14
	ext	v10.16b,v10.16b,v10.16b,#8
	ror	w10,w10,#25
	ext	v14.16b,v14.16b,v14.16b,#8
	ror	w11,w11,#25
	ext	v18.16b,v18.16b,v18.16b,#8
	ror	w12,w12,#25
	ext	v22.16b,v22.16b,v22.16b,#8
	ror	w9,w9,#25
	ext	v3.16b,v3.16b,v3.16b,#4
	ext	v7.16b,v7.16b,v7.16b,#4
	ext	v11.16b,v11.16b,v11.16b,#4
	ext	v15.16b,v15.16b,v15.16b,#4
	ext	v19.16b,v19.16b,v19.16b,#4
	ext	v23.16b,v23.16b,v23.16b,#4
	ext	v1.16b,v1.16b,v1.16b,#12
	ext	v5.16b,v5.16b,v5.16b,#12
	ext	v9.16b,v9.16b,v9.16b,#12
	ext	v13.16b,v13.16b,v13.16b,#12
	ext	v17.16b,v17.16b,v17.16b,#12
	ext	v21.16b,v21.16b,v21.16b,#12
	cbnz	x4,.Loop_lower_neon

	// The round loop has finished. This stage adds the saved key block
	// back into the working state of the scalar block (w5..w21) and of the
	// six NEON blocks (v0..v23), packs the scalar words pairwise into
	// 64-bit registers, and fetches the first 64 bytes of input.
	// Scalar and NEON instructions alternate line by line; they use
	// separate register files, so the two streams do not depend on each
	// other (the interleaving looks like deliberate hand scheduling).
	add	w5,w5,w22		// accumulate key block
	ldp	q24,q25,[sp,#0]		// load q24-q29 from sp+0..95 (the off-load area)
	add	x6,x6,x22,lsr#32	// odd word: add the high half of x22 (64-bit add)
	ldp	q26,q27,[sp,#32]
	add	w7,w7,w23
	ldp	q28,q29,[sp,#64]
	add	x8,x8,x23,lsr#32
	add	v0.4s,v0.4s,v24.4s	// v24 is added to v0/v4/v8/v12/v16/v20
	add	w9,w9,w24
	add	v4.4s,v4.4s,v24.4s
	add	x10,x10,x24,lsr#32
	add	v8.4s,v8.4s,v24.4s
	add	w11,w11,w25
	add	v12.4s,v12.4s,v24.4s
	add	x12,x12,x25,lsr#32
	add	v16.4s,v16.4s,v24.4s
	add	w13,w13,w26
	add	v20.4s,v20.4s,v24.4s
	add	x14,x14,x26,lsr#32
	add	v2.4s,v2.4s,v26.4s	// v26 is added to v2/v6/v10/v14/v18/v22
	add	w15,w15,w27
	add	v6.4s,v6.4s,v26.4s
	add	x16,x16,x27,lsr#32
	add	v10.4s,v10.4s,v26.4s
	add	w17,w17,w28
	add	v14.4s,v14.4s,v26.4s
	add	x19,x19,x28,lsr#32
	add	v18.4s,v18.4s,v26.4s
	add	w20,w20,w30
	add	v22.4s,v22.4s,v26.4s
	add	x21,x21,x30,lsr#32
	// v19 and v23 reuse v27 and v28 as their addend further down, so they
	// first receive an extra v31.
	add	v19.4s,v19.4s,v31.4s			// +4
	// Packing: the even register was written through its w view, so its
	// upper half is zero; the odd register is shifted left by 32, which
	// also discards any carry its 64-bit add produced above bit 31.
	add	x5,x5,x6,lsl#32	// pack
	add	v23.4s,v23.4s,v31.4s			// +4
	add	x7,x7,x8,lsl#32
	// Last vector of each NEON block: v3/v7/v11/v15 take v27/v28/v29/v30.
	// NOTE(review): presumably these are the per-block counter rows;
	// confirm against the setup at the loop head.
	add	v3.4s,v3.4s,v27.4s
	// The odd-word registers are free once packed, so they are reused to
	// hold the input words.
	ldp	x6,x8,[x1,#0]		// load input
	add	v7.4s,v7.4s,v28.4s
	add	x9,x9,x10,lsl#32
	add	v11.4s,v11.4s,v29.4s
	add	x11,x11,x12,lsl#32
	add	v15.4s,v15.4s,v30.4s
	ldp	x10,x12,[x1,#16]
	add	v19.4s,v19.4s,v27.4s
	add	x13,x13,x14,lsl#32
	add	v23.4s,v23.4s,v28.4s
	add	x15,x15,x16,lsl#32
	add	v1.4s,v1.4s,v25.4s	// v25 is added to v1/v5/v9/v13/v17/v21
	ldp	x14,x16,[x1,#32]
	add	v5.4s,v5.4s,v25.4s
	add	x17,x17,x19,lsl#32
	add	v9.4s,v9.4s,v25.4s
	add	x20,x20,x21,lsl#32
	add	v13.4s,v13.4s,v25.4s
	ldp	x19,x21,[x1,#48]
	add	v17.4s,v17.4s,v25.4s
	add	x1,x1,#64		// 64 input bytes are now in x6,x8,x10,x12,x14,x16,x19,x21
	add	v21.4s,v21.4s,v25.4s

#ifdef	__ARMEB__
	// Big-endian builds only: byte-reverse the eight packed keystream
	// registers before they are XORed with the input words.
	rev	x5,x5
	rev	x7,x7
	rev	x9,x9
	rev	x11,x11
	rev	x13,x13
	rev	x15,x15
	rev	x17,x17
	rev	x20,x20
#endif
	// XOR the keystream with the input and write the result to x0.
	// The scalar block is written with stp and the six NEON blocks with
	// st1: 64 + 6*64 = 448 bytes in this stage. Input is staged through
	// v24-v27 twice, then through v0-v15 as each group of four has been
	// stored and is free again. Every ld1/st1 post-increments its pointer
	// by 64.
	ld1	{v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64
	eor	x5,x5,x6
	eor	x7,x7,x8
	eor	x9,x9,x10
	eor	x11,x11,x12
	eor	x13,x13,x14
	eor	v0.16b,v0.16b,v24.16b
	eor	x15,x15,x16
	eor	v1.16b,v1.16b,v25.16b
	eor	x17,x17,x19
	eor	v2.16b,v2.16b,v26.16b
	eor	x20,x20,x21
	eor	v3.16b,v3.16b,v27.16b
	ld1	{v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64	// input for v4-v7

	stp	x5,x7,[x0,#0]		// store output
	// NOTE(review): the step is 7 here; presumably the remaining step for
	// this iteration is applied earlier in the loop body, confirm there.
	add	x28,x28,#7			// increment counter
	stp	x9,x11,[x0,#16]
	stp	x13,x15,[x0,#32]
	stp	x17,x20,[x0,#48]
	add	x0,x0,#64
	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64

	ld1	{v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64		// input for v8-v11
	eor	v4.16b,v4.16b,v24.16b
	eor	v5.16b,v5.16b,v25.16b
	eor	v6.16b,v6.16b,v26.16b
	eor	v7.16b,v7.16b,v27.16b
	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64

	ld1	{v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64		// input for v12-v15
	eor	v8.16b,v8.16b,v0.16b
	ldp	q24,q25,[sp,#0]		// v24/v25 were overwritten by ld1; fetch them from the stack again
	eor	v9.16b,v9.16b,v1.16b
	ldp	q26,q27,[sp,#32]	// same for v26/v27
	eor	v10.16b,v10.16b,v2.16b
	eor	v11.16b,v11.16b,v3.16b
	st1	{v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64

	ld1	{v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64	// input for v16-v19
	eor	v12.16b,v12.16b,v4.16b
	eor	v13.16b,v13.16b,v5.16b
	eor	v14.16b,v14.16b,v6.16b
	eor	v15.16b,v15.16b,v7.16b
	st1	{v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64

	ld1	{v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64	// input for v20-v23
	eor	v16.16b,v16.16b,v8.16b
	eor	v17.16b,v17.16b,v9.16b
	eor	v18.16b,v18.16b,v10.16b
	eor	v19.16b,v19.16b,v11.16b
	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64

	// v0 has been stored and its staged input consumed, so it is free to
	// hold the step constant.
	shl	v0.4s,v31.4s,#1			// 4 -> 8
	eor	v20.16b,v20.16b,v12.16b
	eor	v21.16b,v21.16b,v13.16b
	eor	v22.16b,v22.16b,v14.16b
	eor	v23.16b,v23.16b,v15.16b
	st1	{v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64

	// Advance v27-v30 by v0 for the next iteration.
	add	v27.4s,v27.4s,v0.4s			// += 8
	add	v28.4s,v28.4s,v0.4s
	add	v29.4s,v29.4s,v0.4s
	add	v30.4s,v30.4s,v0.4s

	// No instruction in this stage writes the condition flags, so the
	// branch tests flags set earlier in the iteration.
	// NOTE(review): presumably from a subs on x2 at the loop head, which
	// is not visible here; confirm.
	b.hs	.Loop_outer_512_neon

	// Reached when the b.hs back to .Loop_outer_512_neon falls through.
	// Add 512 back to x2 and record in Z whether the result is zero;
	// the b.eq below consumes that flag (ushr/ldp/stp in between do not
	// write flags).
	adds	x2,x2,#512
	ushr	v0.4s,v31.4s,#2			// 4 -> 1

	// d8-d15 are the low halves of v8-v15, which AAPCS64 requires a
	// callee to preserve; they are reloaded from sp+128..191.
	ldp	d8,d9,[sp,#128+0]		// meet ABI requirements
	ldp	d10,d11,[sp,#128+16]
	ldp	d12,d13,[sp,#128+32]
	ldp	d14,d15,[sp,#128+48]

	// Overwrite the 96 bytes at sp+0..95 with the current q24/q31 values
	// so the previous contents of the area do not remain on the stack.
	stp	q24,q31,[sp,#0]		// wipe off-load area
	stp	q24,q31,[sp,#32]
	stp	q24,q31,[sp,#64]

	b.eq	.Ldone_512_neon		// x2 == 0: nothing left, go to the epilogue

	// Input remains: pick the path for the rest by comparing with 192.
	cmp	x2,#192
	// v27-v29 are stepped back by v0; v30 is left as is.
	sub	v27.4s,v27.4s,v0.4s			// -= 1
	sub	v28.4s,v28.4s,v0.4s
	sub	v29.4s,v29.4s,v0.4s
	add	sp,sp,#128		// release the lower 128 bytes of stack used by this path
	b.hs	.Loop_outer_neon	// x2 >= 192 (unsigned)

	// x2 < 192: clear v25-v30 and continue in the scalar loop.
	// NOTE(review): presumably the clearing keeps key-derived data out of
	// the vector registers once NEON is no longer used; confirm intent.
	eor	v25.16b,v25.16b,v25.16b
	eor	v26.16b,v26.16b,v26.16b
	eor	v27.16b,v27.16b,v27.16b
	eor	v28.16b,v28.16b,v28.16b
	eor	v29.16b,v29.16b,v29.16b
	eor	v30.16b,v30.16b,v30.16b
	b	.Loop_outer

.Ldone_512_neon:
	// Epilogue taken when the 512-byte loop consumed all of the input.
	// On this path sp has not yet been raised by 128, so the full
	// 128+64 bytes below the register frame are released here.
	// NOTE(review): the 64-byte part presumably matches a scratch area
	// allocated in the prologue, which is outside this view; confirm.
	ldp	x19,x20,[x29,#16]	// restore callee-saved x19-x28 from the frame at x29
	add	sp,sp,#128+64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96	// reload frame pointer and link register, pop the 96-byte frame
	// NOTE(review): .Labort_neon is a bare return with no frame teardown,
	// so it is presumably branched to before the frame is set up; confirm
	// at the function entry.
.Labort_neon:
	ret
ENDPROC(chacha20_neon)