summaryrefslogtreecommitdiffhomepage
path: root/src/crypto/zinc/poly1305/poly1305-x86_64.S
diff options
context:
space:
mode:
authorSamuel Neves <sneves@dei.uc.pt>2018-11-12 08:14:51 +0000
committerJason A. Donenfeld <Jason@zx2c4.com>2018-11-14 23:59:05 -0800
commit44e2ef7b23f4b68008ed5c910a7cb881f9c0939f (patch)
treef0f2e43c46a819eed4f055321ecf34a95b8e504f /src/crypto/zinc/poly1305/poly1305-x86_64.S
parent5c67177dcc6a23ceccaf8e69daf92a8a12212732 (diff)
chacha20,poly1305: switch to perlasm originals on x86_64
Signed-off-by: Samuel Neves <sneves@dei.uc.pt> Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Diffstat (limited to 'src/crypto/zinc/poly1305/poly1305-x86_64.S')
-rw-r--r--src/crypto/zinc/poly1305/poly1305-x86_64.S2792
1 files changed, 0 insertions, 2792 deletions
diff --git a/src/crypto/zinc/poly1305/poly1305-x86_64.S b/src/crypto/zinc/poly1305/poly1305-x86_64.S
deleted file mode 100644
index 3c3f2b4..0000000
--- a/src/crypto/zinc/poly1305/poly1305-x86_64.S
+++ /dev/null
@@ -1,2792 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */
-/*
- * Copyright (C) 2017 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
- * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
- * Copyright (C) 2006-2017 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved.
- *
- * This is based in part on Andy Polyakov's implementation from CRYPTOGAMS.
- */
-
-#include <linux/linkage.h>
-
-.section .rodata.cst192.Lconst, "aM", @progbits, 192
-.align 64
-.Lconst:
-.long 0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0
-.long 16777216,0,16777216,0,16777216,0,16777216,0
-.long 0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
-.long 2,2,2,3,2,0,2,1
-.long 0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7
-
-.text
-
-.align 32
-ENTRY(poly1305_init_x86_64)
- xorq %rax,%rax
- movq %rax,0(%rdi)
- movq %rax,8(%rdi)
- movq %rax,16(%rdi)
-
- cmpq $0,%rsi
- je .Lno_key
-
- movq $0x0ffffffc0fffffff,%rax
- movq $0x0ffffffc0ffffffc,%rcx
- andq 0(%rsi),%rax
- andq 8(%rsi),%rcx
- movq %rax,24(%rdi)
- movq %rcx,32(%rdi)
- movl $1,%eax
-.Lno_key:
- ret
-ENDPROC(poly1305_init_x86_64)
-
-.align 32
-ENTRY(poly1305_blocks_x86_64)
-.Lblocks:
- shrq $4,%rdx
- jz .Lno_data
-
- pushq %rbx
- pushq %r12
- pushq %r13
- pushq %r14
- pushq %r15
- pushq %rdi
-
-.Lblocks_body:
-
- movq %rdx,%r15
-
- movq 24(%rdi),%r11
- movq 32(%rdi),%r13
-
- movq 0(%rdi),%r14
- movq 8(%rdi),%rbx
- movq 16(%rdi),%r10
-
- movq %r13,%r12
- shrq $2,%r13
- movq %r12,%rax
- addq %r12,%r13
- jmp .Loop
-
-.align 32
-.Loop:
-
- addq 0(%rsi),%r14
- adcq 8(%rsi),%rbx
- leaq 16(%rsi),%rsi
- adcq %rcx,%r10
- mulq %r14
- movq %rax,%r9
- movq %r11,%rax
- movq %rdx,%rdi
-
- mulq %r14
- movq %rax,%r14
- movq %r11,%rax
- movq %rdx,%r8
-
- mulq %rbx
- addq %rax,%r9
- movq %r13,%rax
- adcq %rdx,%rdi
-
- mulq %rbx
- movq %r10,%rbx
- addq %rax,%r14
- adcq %rdx,%r8
-
- imulq %r13,%rbx
- addq %rbx,%r9
- movq %r8,%rbx
- adcq $0,%rdi
-
- imulq %r11,%r10
- addq %r9,%rbx
- movq $-4,%rax
- adcq %r10,%rdi
-
- andq %rdi,%rax
- movq %rdi,%r10
- shrq $2,%rdi
- andq $3,%r10
- addq %rdi,%rax
- addq %rax,%r14
- adcq $0,%rbx
- adcq $0,%r10
-
- movq %r12,%rax
- decq %r15
- jnz .Loop
-
- movq 0(%rsp),%rdi
-
- movq %r14,0(%rdi)
- movq %rbx,8(%rdi)
- movq %r10,16(%rdi)
-
- movq 8(%rsp),%r15
- movq 16(%rsp),%r14
- movq 24(%rsp),%r13
- movq 32(%rsp),%r12
- movq 40(%rsp),%rbx
- leaq 48(%rsp),%rsp
-.Lno_data:
-.Lblocks_epilogue:
- ret
-ENDPROC(poly1305_blocks_x86_64)
-
-.align 32
-ENTRY(poly1305_emit_x86_64)
-.Lemit:
- movq 0(%rdi),%r8
- movq 8(%rdi),%r9
- movq 16(%rdi),%r10
-
- movq %r8,%rax
- addq $5,%r8
- movq %r9,%rcx
- adcq $0,%r9
- adcq $0,%r10
- shrq $2,%r10
- cmovnzq %r8,%rax
- cmovnzq %r9,%rcx
-
- addq 0(%rdx),%rax
- adcq 8(%rdx),%rcx
- movq %rax,0(%rsi)
- movq %rcx,8(%rsi)
-
- ret
-ENDPROC(poly1305_emit_x86_64)
-
-.macro __poly1305_block
- mulq %r14
- movq %rax,%r9
- movq %r11,%rax
- movq %rdx,%rdi
-
- mulq %r14
- movq %rax,%r14
- movq %r11,%rax
- movq %rdx,%r8
-
- mulq %rbx
- addq %rax,%r9
- movq %r13,%rax
- adcq %rdx,%rdi
-
- mulq %rbx
- movq %r10,%rbx
- addq %rax,%r14
- adcq %rdx,%r8
-
- imulq %r13,%rbx
- addq %rbx,%r9
- movq %r8,%rbx
- adcq $0,%rdi
-
- imulq %r11,%r10
- addq %r9,%rbx
- movq $-4,%rax
- adcq %r10,%rdi
-
- andq %rdi,%rax
- movq %rdi,%r10
- shrq $2,%rdi
- andq $3,%r10
- addq %rdi,%rax
- addq %rax,%r14
- adcq $0,%rbx
- adcq $0,%r10
-.endm
-
-.macro __poly1305_init_avx
- movq %r11,%r14
- movq %r12,%rbx
- xorq %r10,%r10
-
- leaq 48+64(%rdi),%rdi
-
- movq %r12,%rax
- movq %rdi,0(%rsp)
- __poly1305_block
- movq 0(%rsp),%rdi
-
- movl $0x3ffffff,%eax
- movl $0x3ffffff,%edx
- movq %r14,%r8
- andl %r14d,%eax
- movq %r11,%r9
- andl %r11d,%edx
- movl %eax,-64(%rdi)
- shrq $26,%r8
- movl %edx,-60(%rdi)
- shrq $26,%r9
-
- movl $0x3ffffff,%eax
- movl $0x3ffffff,%edx
- andl %r8d,%eax
- andl %r9d,%edx
- movl %eax,-48(%rdi)
- leal (%rax,%rax,4),%eax
- movl %edx,-44(%rdi)
- leal (%rdx,%rdx,4),%edx
- movl %eax,-32(%rdi)
- shrq $26,%r8
- movl %edx,-28(%rdi)
- shrq $26,%r9
-
- movq %rbx,%rax
- movq %r12,%rdx
- shlq $12,%rax
- shlq $12,%rdx
- orq %r8,%rax
- orq %r9,%rdx
- andl $0x3ffffff,%eax
- andl $0x3ffffff,%edx
- movl %eax,-16(%rdi)
- leal (%rax,%rax,4),%eax
- movl %edx,-12(%rdi)
- leal (%rdx,%rdx,4),%edx
- movl %eax,0(%rdi)
- movq %rbx,%r8
- movl %edx,4(%rdi)
- movq %r12,%r9
-
- movl $0x3ffffff,%eax
- movl $0x3ffffff,%edx
- shrq $14,%r8
- shrq $14,%r9
- andl %r8d,%eax
- andl %r9d,%edx
- movl %eax,16(%rdi)
- leal (%rax,%rax,4),%eax
- movl %edx,20(%rdi)
- leal (%rdx,%rdx,4),%edx
- movl %eax,32(%rdi)
- shrq $26,%r8
- movl %edx,36(%rdi)
- shrq $26,%r9
-
- movq %r10,%rax
- shlq $24,%rax
- orq %rax,%r8
- movl %r8d,48(%rdi)
- leaq (%r8,%r8,4),%r8
- movl %r9d,52(%rdi)
- leaq (%r9,%r9,4),%r9
- movl %r8d,64(%rdi)
- movl %r9d,68(%rdi)
-
- movq %r12,%rax
- movq %rdi,0(%rsp)
- __poly1305_block
- movq 0(%rsp),%rdi
-
- movl $0x3ffffff,%eax
- movq %r14,%r8
- andl %r14d,%eax
- shrq $26,%r8
- movl %eax,-52(%rdi)
-
- movl $0x3ffffff,%edx
- andl %r8d,%edx
- movl %edx,-36(%rdi)
- leal (%rdx,%rdx,4),%edx
- shrq $26,%r8
- movl %edx,-20(%rdi)
-
- movq %rbx,%rax
- shlq $12,%rax
- orq %r8,%rax
- andl $0x3ffffff,%eax
- movl %eax,-4(%rdi)
- leal (%rax,%rax,4),%eax
- movq %rbx,%r8
- movl %eax,12(%rdi)
-
- movl $0x3ffffff,%edx
- shrq $14,%r8
- andl %r8d,%edx
- movl %edx,28(%rdi)
- leal (%rdx,%rdx,4),%edx
- shrq $26,%r8
- movl %edx,44(%rdi)
-
- movq %r10,%rax
- shlq $24,%rax
- orq %rax,%r8
- movl %r8d,60(%rdi)
- leaq (%r8,%r8,4),%r8
- movl %r8d,76(%rdi)
-
- movq %r12,%rax
- movq %rdi,0(%rsp)
- __poly1305_block
- movq 0(%rsp),%rdi
-
- movl $0x3ffffff,%eax
- movq %r14,%r8
- andl %r14d,%eax
- shrq $26,%r8
- movl %eax,-56(%rdi)
-
- movl $0x3ffffff,%edx
- andl %r8d,%edx
- movl %edx,-40(%rdi)
- leal (%rdx,%rdx,4),%edx
- shrq $26,%r8
- movl %edx,-24(%rdi)
-
- movq %rbx,%rax
- shlq $12,%rax
- orq %r8,%rax
- andl $0x3ffffff,%eax
- movl %eax,-8(%rdi)
- leal (%rax,%rax,4),%eax
- movq %rbx,%r8
- movl %eax,8(%rdi)
-
- movl $0x3ffffff,%edx
- shrq $14,%r8
- andl %r8d,%edx
- movl %edx,24(%rdi)
- leal (%rdx,%rdx,4),%edx
- shrq $26,%r8
- movl %edx,40(%rdi)
-
- movq %r10,%rax
- shlq $24,%rax
- orq %rax,%r8
- movl %r8d,56(%rdi)
- leaq (%r8,%r8,4),%r8
- movl %r8d,72(%rdi)
-
- leaq -48-64(%rdi),%rdi
-.endm
-
-#ifdef CONFIG_AS_AVX
-.align 32
-ENTRY(poly1305_blocks_avx)
-
- movl 20(%rdi),%r8d
- cmpq $128,%rdx
- jae .Lblocks_avx
- testl %r8d,%r8d
- jz .Lblocks
-
-.Lblocks_avx:
- andq $-16,%rdx
- jz .Lno_data_avx
-
- vzeroupper
-
- testl %r8d,%r8d
- jz .Lbase2_64_avx
-
- testq $31,%rdx
- jz .Leven_avx
-
- pushq %rbx
- pushq %r12
- pushq %r13
- pushq %r14
- pushq %r15
- pushq %rdi
-
-.Lblocks_avx_body:
-
- movq %rdx,%r15
-
- movq 0(%rdi),%r8
- movq 8(%rdi),%r9
- movl 16(%rdi),%r10d
-
- movq 24(%rdi),%r11
- movq 32(%rdi),%r13
-
-
- movl %r8d,%r14d
- andq $-2147483648,%r8
- movq %r9,%r12
- movl %r9d,%ebx
- andq $-2147483648,%r9
-
- shrq $6,%r8
- shlq $52,%r12
- addq %r8,%r14
- shrq $12,%rbx
- shrq $18,%r9
- addq %r12,%r14
- adcq %r9,%rbx
-
- movq %r10,%r8
- shlq $40,%r8
- shrq $24,%r10
- addq %r8,%rbx
- adcq $0,%r10
-
- movq $-4,%r9
- movq %r10,%r8
- andq %r10,%r9
- shrq $2,%r8
- andq $3,%r10
- addq %r9,%r8
- addq %r8,%r14
- adcq $0,%rbx
- adcq $0,%r10
-
- movq %r13,%r12
- movq %r13,%rax
- shrq $2,%r13
- addq %r12,%r13
-
- addq 0(%rsi),%r14
- adcq 8(%rsi),%rbx
- leaq 16(%rsi),%rsi
- adcq %rcx,%r10
-
- movq %rdi,0(%rsp)
- __poly1305_block
- movq 0(%rsp),%rdi
-
- testq %rcx,%rcx
- jz .Lstore_base2_64_avx
-
-
- movq %r14,%rax
- movq %r14,%rdx
- shrq $52,%r14
- movq %rbx,%r11
- movq %rbx,%r12
- shrq $26,%rdx
- andq $0x3ffffff,%rax
- shlq $12,%r11
- andq $0x3ffffff,%rdx
- shrq $14,%rbx
- orq %r11,%r14
- shlq $24,%r10
- andq $0x3ffffff,%r14
- shrq $40,%r12
- andq $0x3ffffff,%rbx
- orq %r12,%r10
-
- subq $16,%r15
- jz .Lstore_base2_26_avx
-
- vmovd %eax,%xmm0
- vmovd %edx,%xmm1
- vmovd %r14d,%xmm2
- vmovd %ebx,%xmm3
- vmovd %r10d,%xmm4
- jmp .Lproceed_avx
-
-.align 32
-.Lstore_base2_64_avx:
- movq %r14,0(%rdi)
- movq %rbx,8(%rdi)
- movq %r10,16(%rdi)
- jmp .Ldone_avx
-
-.align 16
-.Lstore_base2_26_avx:
- movl %eax,0(%rdi)
- movl %edx,4(%rdi)
- movl %r14d,8(%rdi)
- movl %ebx,12(%rdi)
- movl %r10d,16(%rdi)
-.align 16
-.Ldone_avx:
- movq 8(%rsp),%r15
- movq 16(%rsp),%r14
- movq 24(%rsp),%r13
- movq 32(%rsp),%r12
- movq 40(%rsp),%rbx
- leaq 48(%rsp),%rsp
-
-.Lno_data_avx:
-.Lblocks_avx_epilogue:
- ret
-
-.align 32
-.Lbase2_64_avx:
-
- pushq %rbx
- pushq %r12
- pushq %r13
- pushq %r14
- pushq %r15
- pushq %rdi
-
-.Lbase2_64_avx_body:
-
- movq %rdx,%r15
-
- movq 24(%rdi),%r11
- movq 32(%rdi),%r13
-
- movq 0(%rdi),%r14
- movq 8(%rdi),%rbx
- movl 16(%rdi),%r10d
-
- movq %r13,%r12
- movq %r13,%rax
- shrq $2,%r13
- addq %r12,%r13
-
- testq $31,%rdx
- jz .Linit_avx
-
- addq 0(%rsi),%r14
- adcq 8(%rsi),%rbx
- leaq 16(%rsi),%rsi
- adcq %rcx,%r10
- subq $16,%r15
-
- movq %rdi,0(%rsp)
- __poly1305_block
- movq 0(%rsp),%rdi
-
-.Linit_avx:
-
- movq %r14,%rax
- movq %r14,%rdx
- shrq $52,%r14
- movq %rbx,%r8
- movq %rbx,%r9
- shrq $26,%rdx
- andq $0x3ffffff,%rax
- shlq $12,%r8
- andq $0x3ffffff,%rdx
- shrq $14,%rbx
- orq %r8,%r14
- shlq $24,%r10
- andq $0x3ffffff,%r14
- shrq $40,%r9
- andq $0x3ffffff,%rbx
- orq %r9,%r10
-
- vmovd %eax,%xmm0
- vmovd %edx,%xmm1
- vmovd %r14d,%xmm2
- vmovd %ebx,%xmm3
- vmovd %r10d,%xmm4
- movl $1,20(%rdi)
-
- __poly1305_init_avx
-
-.Lproceed_avx:
- movq %r15,%rdx
-
- movq 8(%rsp),%r15
- movq 16(%rsp),%r14
- movq 24(%rsp),%r13
- movq 32(%rsp),%r12
- movq 40(%rsp),%rbx
- leaq 48(%rsp),%rax
- leaq 48(%rsp),%rsp
-
-.Lbase2_64_avx_epilogue:
- jmp .Ldo_avx
-
-
-.align 32
-.Leven_avx:
- vmovd 0(%rdi),%xmm0
- vmovd 4(%rdi),%xmm1
- vmovd 8(%rdi),%xmm2
- vmovd 12(%rdi),%xmm3
- vmovd 16(%rdi),%xmm4
-
-.Ldo_avx:
- leaq 8(%rsp),%r10
- andq $-32,%rsp
- subq $8,%rsp
- leaq -88(%rsp),%r11
- subq $0x178,%rsp
- subq $64,%rdx
- leaq -32(%rsi),%rax
- cmovcq %rax,%rsi
-
- vmovdqu 48(%rdi),%xmm14
- leaq 112(%rdi),%rdi
- leaq .Lconst(%rip),%rcx
-
- vmovdqu 32(%rsi),%xmm5
- vmovdqu 48(%rsi),%xmm6
- vmovdqa 64(%rcx),%xmm15
-
- vpsrldq $6,%xmm5,%xmm7
- vpsrldq $6,%xmm6,%xmm8
- vpunpckhqdq %xmm6,%xmm5,%xmm9
- vpunpcklqdq %xmm6,%xmm5,%xmm5
- vpunpcklqdq %xmm8,%xmm7,%xmm8
-
- vpsrlq $40,%xmm9,%xmm9
- vpsrlq $26,%xmm5,%xmm6
- vpand %xmm15,%xmm5,%xmm5
- vpsrlq $4,%xmm8,%xmm7
- vpand %xmm15,%xmm6,%xmm6
- vpsrlq $30,%xmm8,%xmm8
- vpand %xmm15,%xmm7,%xmm7
- vpand %xmm15,%xmm8,%xmm8
- vpor 32(%rcx),%xmm9,%xmm9
-
- jbe .Lskip_loop_avx
-
-
- vmovdqu -48(%rdi),%xmm11
- vmovdqu -32(%rdi),%xmm12
- vpshufd $0xEE,%xmm14,%xmm13
- vpshufd $0x44,%xmm14,%xmm10
- vmovdqa %xmm13,-144(%r11)
- vmovdqa %xmm10,0(%rsp)
- vpshufd $0xEE,%xmm11,%xmm14
- vmovdqu -16(%rdi),%xmm10
- vpshufd $0x44,%xmm11,%xmm11
- vmovdqa %xmm14,-128(%r11)
- vmovdqa %xmm11,16(%rsp)
- vpshufd $0xEE,%xmm12,%xmm13
- vmovdqu 0(%rdi),%xmm11
- vpshufd $0x44,%xmm12,%xmm12
- vmovdqa %xmm13,-112(%r11)
- vmovdqa %xmm12,32(%rsp)
- vpshufd $0xEE,%xmm10,%xmm14
- vmovdqu 16(%rdi),%xmm12
- vpshufd $0x44,%xmm10,%xmm10
- vmovdqa %xmm14,-96(%r11)
- vmovdqa %xmm10,48(%rsp)
- vpshufd $0xEE,%xmm11,%xmm13
- vmovdqu 32(%rdi),%xmm10
- vpshufd $0x44,%xmm11,%xmm11
- vmovdqa %xmm13,-80(%r11)
- vmovdqa %xmm11,64(%rsp)
- vpshufd $0xEE,%xmm12,%xmm14
- vmovdqu 48(%rdi),%xmm11
- vpshufd $0x44,%xmm12,%xmm12
- vmovdqa %xmm14,-64(%r11)
- vmovdqa %xmm12,80(%rsp)
- vpshufd $0xEE,%xmm10,%xmm13
- vmovdqu 64(%rdi),%xmm12
- vpshufd $0x44,%xmm10,%xmm10
- vmovdqa %xmm13,-48(%r11)
- vmovdqa %xmm10,96(%rsp)
- vpshufd $0xEE,%xmm11,%xmm14
- vpshufd $0x44,%xmm11,%xmm11
- vmovdqa %xmm14,-32(%r11)
- vmovdqa %xmm11,112(%rsp)
- vpshufd $0xEE,%xmm12,%xmm13
- vmovdqa 0(%rsp),%xmm14
- vpshufd $0x44,%xmm12,%xmm12
- vmovdqa %xmm13,-16(%r11)
- vmovdqa %xmm12,128(%rsp)
-
- jmp .Loop_avx
-
-.align 32
-.Loop_avx:
-
- vpmuludq %xmm5,%xmm14,%xmm10
- vpmuludq %xmm6,%xmm14,%xmm11
- vmovdqa %xmm2,32(%r11)
- vpmuludq %xmm7,%xmm14,%xmm12
- vmovdqa 16(%rsp),%xmm2
- vpmuludq %xmm8,%xmm14,%xmm13
- vpmuludq %xmm9,%xmm14,%xmm14
-
- vmovdqa %xmm0,0(%r11)
- vpmuludq 32(%rsp),%xmm9,%xmm0
- vmovdqa %xmm1,16(%r11)
- vpmuludq %xmm8,%xmm2,%xmm1
- vpaddq %xmm0,%xmm10,%xmm10
- vpaddq %xmm1,%xmm14,%xmm14
- vmovdqa %xmm3,48(%r11)
- vpmuludq %xmm7,%xmm2,%xmm0
- vpmuludq %xmm6,%xmm2,%xmm1
- vpaddq %xmm0,%xmm13,%xmm13
- vmovdqa 48(%rsp),%xmm3
- vpaddq %xmm1,%xmm12,%xmm12
- vmovdqa %xmm4,64(%r11)
- vpmuludq %xmm5,%xmm2,%xmm2
- vpmuludq %xmm7,%xmm3,%xmm0
- vpaddq %xmm2,%xmm11,%xmm11
-
- vmovdqa 64(%rsp),%xmm4
- vpaddq %xmm0,%xmm14,%xmm14
- vpmuludq %xmm6,%xmm3,%xmm1
- vpmuludq %xmm5,%xmm3,%xmm3
- vpaddq %xmm1,%xmm13,%xmm13
- vmovdqa 80(%rsp),%xmm2
- vpaddq %xmm3,%xmm12,%xmm12
- vpmuludq %xmm9,%xmm4,%xmm0
- vpmuludq %xmm8,%xmm4,%xmm4
- vpaddq %xmm0,%xmm11,%xmm11
- vmovdqa 96(%rsp),%xmm3
- vpaddq %xmm4,%xmm10,%xmm10
-
- vmovdqa 128(%rsp),%xmm4
- vpmuludq %xmm6,%xmm2,%xmm1
- vpmuludq %xmm5,%xmm2,%xmm2
- vpaddq %xmm1,%xmm14,%xmm14
- vpaddq %xmm2,%xmm13,%xmm13
- vpmuludq %xmm9,%xmm3,%xmm0
- vpmuludq %xmm8,%xmm3,%xmm1
- vpaddq %xmm0,%xmm12,%xmm12
- vmovdqu 0(%rsi),%xmm0
- vpaddq %xmm1,%xmm11,%xmm11
- vpmuludq %xmm7,%xmm3,%xmm3
- vpmuludq %xmm7,%xmm4,%xmm7
- vpaddq %xmm3,%xmm10,%xmm10
-
- vmovdqu 16(%rsi),%xmm1
- vpaddq %xmm7,%xmm11,%xmm11
- vpmuludq %xmm8,%xmm4,%xmm8
- vpmuludq %xmm9,%xmm4,%xmm9
- vpsrldq $6,%xmm0,%xmm2
- vpaddq %xmm8,%xmm12,%xmm12
- vpaddq %xmm9,%xmm13,%xmm13
- vpsrldq $6,%xmm1,%xmm3
- vpmuludq 112(%rsp),%xmm5,%xmm9
- vpmuludq %xmm6,%xmm4,%xmm5
- vpunpckhqdq %xmm1,%xmm0,%xmm4
- vpaddq %xmm9,%xmm14,%xmm14
- vmovdqa -144(%r11),%xmm9
- vpaddq %xmm5,%xmm10,%xmm10
-
- vpunpcklqdq %xmm1,%xmm0,%xmm0
- vpunpcklqdq %xmm3,%xmm2,%xmm3
-
-
- vpsrldq $5,%xmm4,%xmm4
- vpsrlq $26,%xmm0,%xmm1
- vpand %xmm15,%xmm0,%xmm0
- vpsrlq $4,%xmm3,%xmm2
- vpand %xmm15,%xmm1,%xmm1
- vpand 0(%rcx),%xmm4,%xmm4
- vpsrlq $30,%xmm3,%xmm3
- vpand %xmm15,%xmm2,%xmm2
- vpand %xmm15,%xmm3,%xmm3
- vpor 32(%rcx),%xmm4,%xmm4
-
- vpaddq 0(%r11),%xmm0,%xmm0
- vpaddq 16(%r11),%xmm1,%xmm1
- vpaddq 32(%r11),%xmm2,%xmm2
- vpaddq 48(%r11),%xmm3,%xmm3
- vpaddq 64(%r11),%xmm4,%xmm4
-
- leaq 32(%rsi),%rax
- leaq 64(%rsi),%rsi
- subq $64,%rdx
- cmovcq %rax,%rsi
-
- vpmuludq %xmm0,%xmm9,%xmm5
- vpmuludq %xmm1,%xmm9,%xmm6
- vpaddq %xmm5,%xmm10,%xmm10
- vpaddq %xmm6,%xmm11,%xmm11
- vmovdqa -128(%r11),%xmm7
- vpmuludq %xmm2,%xmm9,%xmm5
- vpmuludq %xmm3,%xmm9,%xmm6
- vpaddq %xmm5,%xmm12,%xmm12
- vpaddq %xmm6,%xmm13,%xmm13
- vpmuludq %xmm4,%xmm9,%xmm9
- vpmuludq -112(%r11),%xmm4,%xmm5
- vpaddq %xmm9,%xmm14,%xmm14
-
- vpaddq %xmm5,%xmm10,%xmm10
- vpmuludq %xmm2,%xmm7,%xmm6
- vpmuludq %xmm3,%xmm7,%xmm5
- vpaddq %xmm6,%xmm13,%xmm13
- vmovdqa -96(%r11),%xmm8
- vpaddq %xmm5,%xmm14,%xmm14
- vpmuludq %xmm1,%xmm7,%xmm6
- vpmuludq %xmm0,%xmm7,%xmm7
- vpaddq %xmm6,%xmm12,%xmm12
- vpaddq %xmm7,%xmm11,%xmm11
-
- vmovdqa -80(%r11),%xmm9
- vpmuludq %xmm2,%xmm8,%xmm5
- vpmuludq %xmm1,%xmm8,%xmm6
- vpaddq %xmm5,%xmm14,%xmm14
- vpaddq %xmm6,%xmm13,%xmm13
- vmovdqa -64(%r11),%xmm7
- vpmuludq %xmm0,%xmm8,%xmm8
- vpmuludq %xmm4,%xmm9,%xmm5
- vpaddq %xmm8,%xmm12,%xmm12
- vpaddq %xmm5,%xmm11,%xmm11
- vmovdqa -48(%r11),%xmm8
- vpmuludq %xmm3,%xmm9,%xmm9
- vpmuludq %xmm1,%xmm7,%xmm6
- vpaddq %xmm9,%xmm10,%xmm10
-
- vmovdqa -16(%r11),%xmm9
- vpaddq %xmm6,%xmm14,%xmm14
- vpmuludq %xmm0,%xmm7,%xmm7
- vpmuludq %xmm4,%xmm8,%xmm5
- vpaddq %xmm7,%xmm13,%xmm13
- vpaddq %xmm5,%xmm12,%xmm12
- vmovdqu 32(%rsi),%xmm5
- vpmuludq %xmm3,%xmm8,%xmm7
- vpmuludq %xmm2,%xmm8,%xmm8
- vpaddq %xmm7,%xmm11,%xmm11
- vmovdqu 48(%rsi),%xmm6
- vpaddq %xmm8,%xmm10,%xmm10
-
- vpmuludq %xmm2,%xmm9,%xmm2
- vpmuludq %xmm3,%xmm9,%xmm3
- vpsrldq $6,%xmm5,%xmm7
- vpaddq %xmm2,%xmm11,%xmm11
- vpmuludq %xmm4,%xmm9,%xmm4
- vpsrldq $6,%xmm6,%xmm8
- vpaddq %xmm3,%xmm12,%xmm2
- vpaddq %xmm4,%xmm13,%xmm3
- vpmuludq -32(%r11),%xmm0,%xmm4
- vpmuludq %xmm1,%xmm9,%xmm0
- vpunpckhqdq %xmm6,%xmm5,%xmm9
- vpaddq %xmm4,%xmm14,%xmm4
- vpaddq %xmm0,%xmm10,%xmm0
-
- vpunpcklqdq %xmm6,%xmm5,%xmm5
- vpunpcklqdq %xmm8,%xmm7,%xmm8
-
-
- vpsrldq $5,%xmm9,%xmm9
- vpsrlq $26,%xmm5,%xmm6
- vmovdqa 0(%rsp),%xmm14
- vpand %xmm15,%xmm5,%xmm5
- vpsrlq $4,%xmm8,%xmm7
- vpand %xmm15,%xmm6,%xmm6
- vpand 0(%rcx),%xmm9,%xmm9
- vpsrlq $30,%xmm8,%xmm8
- vpand %xmm15,%xmm7,%xmm7
- vpand %xmm15,%xmm8,%xmm8
- vpor 32(%rcx),%xmm9,%xmm9
-
- vpsrlq $26,%xmm3,%xmm13
- vpand %xmm15,%xmm3,%xmm3
- vpaddq %xmm13,%xmm4,%xmm4
-
- vpsrlq $26,%xmm0,%xmm10
- vpand %xmm15,%xmm0,%xmm0
- vpaddq %xmm10,%xmm11,%xmm1
-
- vpsrlq $26,%xmm4,%xmm10
- vpand %xmm15,%xmm4,%xmm4
-
- vpsrlq $26,%xmm1,%xmm11
- vpand %xmm15,%xmm1,%xmm1
- vpaddq %xmm11,%xmm2,%xmm2
-
- vpaddq %xmm10,%xmm0,%xmm0
- vpsllq $2,%xmm10,%xmm10
- vpaddq %xmm10,%xmm0,%xmm0
-
- vpsrlq $26,%xmm2,%xmm12
- vpand %xmm15,%xmm2,%xmm2
- vpaddq %xmm12,%xmm3,%xmm3
-
- vpsrlq $26,%xmm0,%xmm10
- vpand %xmm15,%xmm0,%xmm0
- vpaddq %xmm10,%xmm1,%xmm1
-
- vpsrlq $26,%xmm3,%xmm13
- vpand %xmm15,%xmm3,%xmm3
- vpaddq %xmm13,%xmm4,%xmm4
-
- ja .Loop_avx
-
-.Lskip_loop_avx:
- vpshufd $0x10,%xmm14,%xmm14
- addq $32,%rdx
- jnz .Long_tail_avx
-
- vpaddq %xmm2,%xmm7,%xmm7
- vpaddq %xmm0,%xmm5,%xmm5
- vpaddq %xmm1,%xmm6,%xmm6
- vpaddq %xmm3,%xmm8,%xmm8
- vpaddq %xmm4,%xmm9,%xmm9
-
-.Long_tail_avx:
- vmovdqa %xmm2,32(%r11)
- vmovdqa %xmm0,0(%r11)
- vmovdqa %xmm1,16(%r11)
- vmovdqa %xmm3,48(%r11)
- vmovdqa %xmm4,64(%r11)
-
- vpmuludq %xmm7,%xmm14,%xmm12
- vpmuludq %xmm5,%xmm14,%xmm10
- vpshufd $0x10,-48(%rdi),%xmm2
- vpmuludq %xmm6,%xmm14,%xmm11
- vpmuludq %xmm8,%xmm14,%xmm13
- vpmuludq %xmm9,%xmm14,%xmm14
-
- vpmuludq %xmm8,%xmm2,%xmm0
- vpaddq %xmm0,%xmm14,%xmm14
- vpshufd $0x10,-32(%rdi),%xmm3
- vpmuludq %xmm7,%xmm2,%xmm1
- vpaddq %xmm1,%xmm13,%xmm13
- vpshufd $0x10,-16(%rdi),%xmm4
- vpmuludq %xmm6,%xmm2,%xmm0
- vpaddq %xmm0,%xmm12,%xmm12
- vpmuludq %xmm5,%xmm2,%xmm2
- vpaddq %xmm2,%xmm11,%xmm11
- vpmuludq %xmm9,%xmm3,%xmm3
- vpaddq %xmm3,%xmm10,%xmm10
-
- vpshufd $0x10,0(%rdi),%xmm2
- vpmuludq %xmm7,%xmm4,%xmm1
- vpaddq %xmm1,%xmm14,%xmm14
- vpmuludq %xmm6,%xmm4,%xmm0
- vpaddq %xmm0,%xmm13,%xmm13
- vpshufd $0x10,16(%rdi),%xmm3
- vpmuludq %xmm5,%xmm4,%xmm4
- vpaddq %xmm4,%xmm12,%xmm12
- vpmuludq %xmm9,%xmm2,%xmm1
- vpaddq %xmm1,%xmm11,%xmm11
- vpshufd $0x10,32(%rdi),%xmm4
- vpmuludq %xmm8,%xmm2,%xmm2
- vpaddq %xmm2,%xmm10,%xmm10
-
- vpmuludq %xmm6,%xmm3,%xmm0
- vpaddq %xmm0,%xmm14,%xmm14
- vpmuludq %xmm5,%xmm3,%xmm3
- vpaddq %xmm3,%xmm13,%xmm13
- vpshufd $0x10,48(%rdi),%xmm2
- vpmuludq %xmm9,%xmm4,%xmm1
- vpaddq %xmm1,%xmm12,%xmm12
- vpshufd $0x10,64(%rdi),%xmm3
- vpmuludq %xmm8,%xmm4,%xmm0
- vpaddq %xmm0,%xmm11,%xmm11
- vpmuludq %xmm7,%xmm4,%xmm4
- vpaddq %xmm4,%xmm10,%xmm10
-
- vpmuludq %xmm5,%xmm2,%xmm2
- vpaddq %xmm2,%xmm14,%xmm14
- vpmuludq %xmm9,%xmm3,%xmm1
- vpaddq %xmm1,%xmm13,%xmm13
- vpmuludq %xmm8,%xmm3,%xmm0
- vpaddq %xmm0,%xmm12,%xmm12
- vpmuludq %xmm7,%xmm3,%xmm1
- vpaddq %xmm1,%xmm11,%xmm11
- vpmuludq %xmm6,%xmm3,%xmm3
- vpaddq %xmm3,%xmm10,%xmm10
-
- jz .Lshort_tail_avx
-
- vmovdqu 0(%rsi),%xmm0
- vmovdqu 16(%rsi),%xmm1
-
- vpsrldq $6,%xmm0,%xmm2
- vpsrldq $6,%xmm1,%xmm3
- vpunpckhqdq %xmm1,%xmm0,%xmm4
- vpunpcklqdq %xmm1,%xmm0,%xmm0
- vpunpcklqdq %xmm3,%xmm2,%xmm3
-
- vpsrlq $40,%xmm4,%xmm4
- vpsrlq $26,%xmm0,%xmm1
- vpand %xmm15,%xmm0,%xmm0
- vpsrlq $4,%xmm3,%xmm2
- vpand %xmm15,%xmm1,%xmm1
- vpsrlq $30,%xmm3,%xmm3
- vpand %xmm15,%xmm2,%xmm2
- vpand %xmm15,%xmm3,%xmm3
- vpor 32(%rcx),%xmm4,%xmm4
-
- vpshufd $0x32,-64(%rdi),%xmm9
- vpaddq 0(%r11),%xmm0,%xmm0
- vpaddq 16(%r11),%xmm1,%xmm1
- vpaddq 32(%r11),%xmm2,%xmm2
- vpaddq 48(%r11),%xmm3,%xmm3
- vpaddq 64(%r11),%xmm4,%xmm4
-
- vpmuludq %xmm0,%xmm9,%xmm5
- vpaddq %xmm5,%xmm10,%xmm10
- vpmuludq %xmm1,%xmm9,%xmm6
- vpaddq %xmm6,%xmm11,%xmm11
- vpmuludq %xmm2,%xmm9,%xmm5
- vpaddq %xmm5,%xmm12,%xmm12
- vpshufd $0x32,-48(%rdi),%xmm7
- vpmuludq %xmm3,%xmm9,%xmm6
- vpaddq %xmm6,%xmm13,%xmm13
- vpmuludq %xmm4,%xmm9,%xmm9
- vpaddq %xmm9,%xmm14,%xmm14
-
- vpmuludq %xmm3,%xmm7,%xmm5
- vpaddq %xmm5,%xmm14,%xmm14
- vpshufd $0x32,-32(%rdi),%xmm8
- vpmuludq %xmm2,%xmm7,%xmm6
- vpaddq %xmm6,%xmm13,%xmm13
- vpshufd $0x32,-16(%rdi),%xmm9
- vpmuludq %xmm1,%xmm7,%xmm5
- vpaddq %xmm5,%xmm12,%xmm12
- vpmuludq %xmm0,%xmm7,%xmm7
- vpaddq %xmm7,%xmm11,%xmm11
- vpmuludq %xmm4,%xmm8,%xmm8
- vpaddq %xmm8,%xmm10,%xmm10
-
- vpshufd $0x32,0(%rdi),%xmm7
- vpmuludq %xmm2,%xmm9,%xmm6
- vpaddq %xmm6,%xmm14,%xmm14
- vpmuludq %xmm1,%xmm9,%xmm5
- vpaddq %xmm5,%xmm13,%xmm13
- vpshufd $0x32,16(%rdi),%xmm8
- vpmuludq %xmm0,%xmm9,%xmm9
- vpaddq %xmm9,%xmm12,%xmm12
- vpmuludq %xmm4,%xmm7,%xmm6
- vpaddq %xmm6,%xmm11,%xmm11
- vpshufd $0x32,32(%rdi),%xmm9
- vpmuludq %xmm3,%xmm7,%xmm7
- vpaddq %xmm7,%xmm10,%xmm10
-
- vpmuludq %xmm1,%xmm8,%xmm5
- vpaddq %xmm5,%xmm14,%xmm14
- vpmuludq %xmm0,%xmm8,%xmm8
- vpaddq %xmm8,%xmm13,%xmm13
- vpshufd $0x32,48(%rdi),%xmm7
- vpmuludq %xmm4,%xmm9,%xmm6
- vpaddq %xmm6,%xmm12,%xmm12
- vpshufd $0x32,64(%rdi),%xmm8
- vpmuludq %xmm3,%xmm9,%xmm5
- vpaddq %xmm5,%xmm11,%xmm11
- vpmuludq %xmm2,%xmm9,%xmm9
- vpaddq %xmm9,%xmm10,%xmm10
-
- vpmuludq %xmm0,%xmm7,%xmm7
- vpaddq %xmm7,%xmm14,%xmm14
- vpmuludq %xmm4,%xmm8,%xmm6
- vpaddq %xmm6,%xmm13,%xmm13
- vpmuludq %xmm3,%xmm8,%xmm5
- vpaddq %xmm5,%xmm12,%xmm12
- vpmuludq %xmm2,%xmm8,%xmm6
- vpaddq %xmm6,%xmm11,%xmm11
- vpmuludq %xmm1,%xmm8,%xmm8
- vpaddq %xmm8,%xmm10,%xmm10
-
-.Lshort_tail_avx:
-
- vpsrldq $8,%xmm14,%xmm9
- vpsrldq $8,%xmm13,%xmm8
- vpsrldq $8,%xmm11,%xmm6
- vpsrldq $8,%xmm10,%xmm5
- vpsrldq $8,%xmm12,%xmm7
- vpaddq %xmm8,%xmm13,%xmm13
- vpaddq %xmm9,%xmm14,%xmm14
- vpaddq %xmm5,%xmm10,%xmm10
- vpaddq %xmm6,%xmm11,%xmm11
- vpaddq %xmm7,%xmm12,%xmm12
-
- vpsrlq $26,%xmm13,%xmm3
- vpand %xmm15,%xmm13,%xmm13
- vpaddq %xmm3,%xmm14,%xmm14
-
- vpsrlq $26,%xmm10,%xmm0
- vpand %xmm15,%xmm10,%xmm10
- vpaddq %xmm0,%xmm11,%xmm11
-
- vpsrlq $26,%xmm14,%xmm4
- vpand %xmm15,%xmm14,%xmm14
-
- vpsrlq $26,%xmm11,%xmm1
- vpand %xmm15,%xmm11,%xmm11
- vpaddq %xmm1,%xmm12,%xmm12
-
- vpaddq %xmm4,%xmm10,%xmm10
- vpsllq $2,%xmm4,%xmm4
- vpaddq %xmm4,%xmm10,%xmm10
-
- vpsrlq $26,%xmm12,%xmm2
- vpand %xmm15,%xmm12,%xmm12
- vpaddq %xmm2,%xmm13,%xmm13
-
- vpsrlq $26,%xmm10,%xmm0
- vpand %xmm15,%xmm10,%xmm10
- vpaddq %xmm0,%xmm11,%xmm11
-
- vpsrlq $26,%xmm13,%xmm3
- vpand %xmm15,%xmm13,%xmm13
- vpaddq %xmm3,%xmm14,%xmm14
-
- vmovd %xmm10,-112(%rdi)
- vmovd %xmm11,-108(%rdi)
- vmovd %xmm12,-104(%rdi)
- vmovd %xmm13,-100(%rdi)
- vmovd %xmm14,-96(%rdi)
- leaq -8(%r10),%rsp
-
- vzeroupper
- ret
-ENDPROC(poly1305_blocks_avx)
-
-.align 32
-ENTRY(poly1305_emit_avx)
- cmpl $0,20(%rdi)
- je .Lemit
-
- movl 0(%rdi),%eax
- movl 4(%rdi),%ecx
- movl 8(%rdi),%r8d
- movl 12(%rdi),%r11d
- movl 16(%rdi),%r10d
-
- shlq $26,%rcx
- movq %r8,%r9
- shlq $52,%r8
- addq %rcx,%rax
- shrq $12,%r9
- addq %rax,%r8
- adcq $0,%r9
-
- shlq $14,%r11
- movq %r10,%rax
- shrq $24,%r10
- addq %r11,%r9
- shlq $40,%rax
- addq %rax,%r9
- adcq $0,%r10
-
- movq %r10,%rax
- movq %r10,%rcx
- andq $3,%r10
- shrq $2,%rax
- andq $-4,%rcx
- addq %rcx,%rax
- addq %rax,%r8
- adcq $0,%r9
- adcq $0,%r10
-
- movq %r8,%rax
- addq $5,%r8
- movq %r9,%rcx
- adcq $0,%r9
- adcq $0,%r10
- shrq $2,%r10
- cmovnzq %r8,%rax
- cmovnzq %r9,%rcx
-
- addq 0(%rdx),%rax
- adcq 8(%rdx),%rcx
- movq %rax,0(%rsi)
- movq %rcx,8(%rsi)
-
- ret
-ENDPROC(poly1305_emit_avx)
-#endif /* CONFIG_AS_AVX */
-
-#ifdef CONFIG_AS_AVX2
-.align 32
-ENTRY(poly1305_blocks_avx2)
-
- movl 20(%rdi),%r8d
- cmpq $128,%rdx
- jae .Lblocks_avx2
- testl %r8d,%r8d
- jz .Lblocks
-
-.Lblocks_avx2:
- andq $-16,%rdx
- jz .Lno_data_avx2
-
- vzeroupper
-
- testl %r8d,%r8d
- jz .Lbase2_64_avx2
-
- testq $63,%rdx
- jz .Leven_avx2
-
- pushq %rbx
- pushq %r12
- pushq %r13
- pushq %r14
- pushq %r15
- pushq %rdi
-
-.Lblocks_avx2_body:
-
- movq %rdx,%r15
-
- movq 0(%rdi),%r8
- movq 8(%rdi),%r9
- movl 16(%rdi),%r10d
-
- movq 24(%rdi),%r11
- movq 32(%rdi),%r13
-
-
- movl %r8d,%r14d
- andq $-2147483648,%r8
- movq %r9,%r12
- movl %r9d,%ebx
- andq $-2147483648,%r9
-
- shrq $6,%r8
- shlq $52,%r12
- addq %r8,%r14
- shrq $12,%rbx
- shrq $18,%r9
- addq %r12,%r14
- adcq %r9,%rbx
-
- movq %r10,%r8
- shlq $40,%r8
- shrq $24,%r10
- addq %r8,%rbx
- adcq $0,%r10
-
- movq $-4,%r9
- movq %r10,%r8
- andq %r10,%r9
- shrq $2,%r8
- andq $3,%r10
- addq %r9,%r8
- addq %r8,%r14
- adcq $0,%rbx
- adcq $0,%r10
-
- movq %r13,%r12
- movq %r13,%rax
- shrq $2,%r13
- addq %r12,%r13
-
-.Lbase2_26_pre_avx2:
- addq 0(%rsi),%r14
- adcq 8(%rsi),%rbx
- leaq 16(%rsi),%rsi
- adcq %rcx,%r10
- subq $16,%r15
-
- movq %rdi,0(%rsp)
- __poly1305_block
- movq 0(%rsp),%rdi
- movq %r12,%rax
-
- testq $63,%r15
- jnz .Lbase2_26_pre_avx2
-
- testq %rcx,%rcx
- jz .Lstore_base2_64_avx2
-
-
- movq %r14,%rax
- movq %r14,%rdx
- shrq $52,%r14
- movq %rbx,%r11
- movq %rbx,%r12
- shrq $26,%rdx
- andq $0x3ffffff,%rax
- shlq $12,%r11
- andq $0x3ffffff,%rdx
- shrq $14,%rbx
- orq %r11,%r14
- shlq $24,%r10
- andq $0x3ffffff,%r14
- shrq $40,%r12
- andq $0x3ffffff,%rbx
- orq %r12,%r10
-
- testq %r15,%r15
- jz .Lstore_base2_26_avx2
-
- vmovd %eax,%xmm0
- vmovd %edx,%xmm1
- vmovd %r14d,%xmm2
- vmovd %ebx,%xmm3
- vmovd %r10d,%xmm4
- jmp .Lproceed_avx2
-
-.align 32
-.Lstore_base2_64_avx2:
- movq %r14,0(%rdi)
- movq %rbx,8(%rdi)
- movq %r10,16(%rdi)
- jmp .Ldone_avx2
-
-.align 16
-.Lstore_base2_26_avx2:
- movl %eax,0(%rdi)
- movl %edx,4(%rdi)
- movl %r14d,8(%rdi)
- movl %ebx,12(%rdi)
- movl %r10d,16(%rdi)
-.align 16
-.Ldone_avx2:
- movq 8(%rsp),%r15
- movq 16(%rsp),%r14
- movq 24(%rsp),%r13
- movq 32(%rsp),%r12
- movq 40(%rsp),%rbx
- leaq 48(%rsp),%rsp
-
-.Lno_data_avx2:
-.Lblocks_avx2_epilogue:
- ret
-
-
-.align 32
-.Lbase2_64_avx2:
-
-
- pushq %rbx
- pushq %r12
- pushq %r13
- pushq %r14
- pushq %r15
- pushq %rdi
-
-.Lbase2_64_avx2_body:
-
- movq %rdx,%r15
-
- movq 24(%rdi),%r11
- movq 32(%rdi),%r13
-
- movq 0(%rdi),%r14
- movq 8(%rdi),%rbx
- movl 16(%rdi),%r10d
-
- movq %r13,%r12
- movq %r13,%rax
- shrq $2,%r13
- addq %r12,%r13
-
- testq $63,%rdx
- jz .Linit_avx2
-
-.Lbase2_64_pre_avx2:
- addq 0(%rsi),%r14
- adcq 8(%rsi),%rbx
- leaq 16(%rsi),%rsi
- adcq %rcx,%r10
- subq $16,%r15
-
- movq %rdi,0(%rsp)
- __poly1305_block
- movq 0(%rsp),%rdi
- movq %r12,%rax
-
- testq $63,%r15
- jnz .Lbase2_64_pre_avx2
-
-.Linit_avx2:
-
- movq %r14,%rax
- movq %r14,%rdx
- shrq $52,%r14
- movq %rbx,%r8
- movq %rbx,%r9
- shrq $26,%rdx
- andq $0x3ffffff,%rax
- shlq $12,%r8
- andq $0x3ffffff,%rdx
- shrq $14,%rbx
- orq %r8,%r14
- shlq $24,%r10
- andq $0x3ffffff,%r14
- shrq $40,%r9
- andq $0x3ffffff,%rbx
- orq %r9,%r10
-
- vmovd %eax,%xmm0
- vmovd %edx,%xmm1
- vmovd %r14d,%xmm2
- vmovd %ebx,%xmm3
- vmovd %r10d,%xmm4
- movl $1,20(%rdi)
-
- __poly1305_init_avx
-
-.Lproceed_avx2:
- movq %r15,%rdx
-
- movq 8(%rsp),%r15
- movq 16(%rsp),%r14
- movq 24(%rsp),%r13
- movq 32(%rsp),%r12
- movq 40(%rsp),%rbx
- leaq 48(%rsp),%rax
- leaq 48(%rsp),%rsp
-
-.Lbase2_64_avx2_epilogue:
- jmp .Ldo_avx2
-
-
-.align 32
-.Leven_avx2:
-
- vmovd 0(%rdi),%xmm0
- vmovd 4(%rdi),%xmm1
- vmovd 8(%rdi),%xmm2
- vmovd 12(%rdi),%xmm3
- vmovd 16(%rdi),%xmm4
-
-.Ldo_avx2:
- leaq 8(%rsp),%r10
- subq $0x128,%rsp
- leaq .Lconst(%rip),%rcx
- leaq 48+64(%rdi),%rdi
- vmovdqa 96(%rcx),%ymm7
-
-
- vmovdqu -64(%rdi),%xmm9
- andq $-512,%rsp
- vmovdqu -48(%rdi),%xmm10
- vmovdqu -32(%rdi),%xmm6
- vmovdqu -16(%rdi),%xmm11
- vmovdqu 0(%rdi),%xmm12
- vmovdqu 16(%rdi),%xmm13
- leaq 144(%rsp),%rax
- vmovdqu 32(%rdi),%xmm14
- vpermd %ymm9,%ymm7,%ymm9
- vmovdqu 48(%rdi),%xmm15
- vpermd %ymm10,%ymm7,%ymm10
- vmovdqu 64(%rdi),%xmm5
- vpermd %ymm6,%ymm7,%ymm6
- vmovdqa %ymm9,0(%rsp)
- vpermd %ymm11,%ymm7,%ymm11
- vmovdqa %ymm10,32-144(%rax)
- vpermd %ymm12,%ymm7,%ymm12
- vmovdqa %ymm6,64-144(%rax)
- vpermd %ymm13,%ymm7,%ymm13
- vmovdqa %ymm11,96-144(%rax)
- vpermd %ymm14,%ymm7,%ymm14
- vmovdqa %ymm12,128-144(%rax)
- vpermd %ymm15,%ymm7,%ymm15
- vmovdqa %ymm13,160-144(%rax)
- vpermd %ymm5,%ymm7,%ymm5
- vmovdqa %ymm14,192-144(%rax)
- vmovdqa %ymm15,224-144(%rax)
- vmovdqa %ymm5,256-144(%rax)
- vmovdqa 64(%rcx),%ymm5
-
-
-
- vmovdqu 0(%rsi),%xmm7
- vmovdqu 16(%rsi),%xmm8
- vinserti128 $1,32(%rsi),%ymm7,%ymm7
- vinserti128 $1,48(%rsi),%ymm8,%ymm8
- leaq 64(%rsi),%rsi
-
- vpsrldq $6,%ymm7,%ymm9
- vpsrldq $6,%ymm8,%ymm10
- vpunpckhqdq %ymm8,%ymm7,%ymm6
- vpunpcklqdq %ymm10,%ymm9,%ymm9
- vpunpcklqdq %ymm8,%ymm7,%ymm7
-
- vpsrlq $30,%ymm9,%ymm10
- vpsrlq $4,%ymm9,%ymm9
- vpsrlq $26,%ymm7,%ymm8
- vpsrlq $40,%ymm6,%ymm6
- vpand %ymm5,%ymm9,%ymm9
- vpand %ymm5,%ymm7,%ymm7
- vpand %ymm5,%ymm8,%ymm8
- vpand %ymm5,%ymm10,%ymm10
- vpor 32(%rcx),%ymm6,%ymm6
-
- vpaddq %ymm2,%ymm9,%ymm2
- subq $64,%rdx
- jz .Ltail_avx2
- jmp .Loop_avx2
-
-.align 32
-.Loop_avx2:
-
- vpaddq %ymm0,%ymm7,%ymm0
- vmovdqa 0(%rsp),%ymm7
- vpaddq %ymm1,%ymm8,%ymm1
- vmovdqa 32(%rsp),%ymm8
- vpaddq %ymm3,%ymm10,%ymm3
- vmovdqa 96(%rsp),%ymm9
- vpaddq %ymm4,%ymm6,%ymm4
- vmovdqa 48(%rax),%ymm10
- vmovdqa 112(%rax),%ymm5
-
- vpmuludq %ymm2,%ymm7,%ymm13
- vpmuludq %ymm2,%ymm8,%ymm14
- vpmuludq %ymm2,%ymm9,%ymm15
- vpmuludq %ymm2,%ymm10,%ymm11
- vpmuludq %ymm2,%ymm5,%ymm12
-
- vpmuludq %ymm0,%ymm8,%ymm6
- vpmuludq %ymm1,%ymm8,%ymm2
- vpaddq %ymm6,%ymm12,%ymm12
- vpaddq %ymm2,%ymm13,%ymm13
- vpmuludq %ymm3,%ymm8,%ymm6
- vpmuludq 64(%rsp),%ymm4,%ymm2
- vpaddq %ymm6,%ymm15,%ymm15
- vpaddq %ymm2,%ymm11,%ymm11
- vmovdqa -16(%rax),%ymm8
-
- vpmuludq %ymm0,%ymm7,%ymm6
- vpmuludq %ymm1,%ymm7,%ymm2
- vpaddq %ymm6,%ymm11,%ymm11
- vpaddq %ymm2,%ymm12,%ymm12
- vpmuludq %ymm3,%ymm7,%ymm6
- vpmuludq %ymm4,%ymm7,%ymm2
- vmovdqu 0(%rsi),%xmm7
- vpaddq %ymm6,%ymm14,%ymm14
- vpaddq %ymm2,%ymm15,%ymm15
- vinserti128 $1,32(%rsi),%ymm7,%ymm7
-
- vpmuludq %ymm3,%ymm8,%ymm6
- vpmuludq %ymm4,%ymm8,%ymm2
- vmovdqu 16(%rsi),%xmm8
- vpaddq %ymm6,%ymm11,%ymm11
- vpaddq %ymm2,%ymm12,%ymm12
- vmovdqa 16(%rax),%ymm2
- vpmuludq %ymm1,%ymm9,%ymm6
- vpmuludq %ymm0,%ymm9,%ymm9
- vpaddq %ymm6,%ymm14,%ymm14
- vpaddq %ymm9,%ymm13,%ymm13
- vinserti128 $1,48(%rsi),%ymm8,%ymm8
- leaq 64(%rsi),%rsi
-
- vpmuludq %ymm1,%ymm2,%ymm6
- vpmuludq %ymm0,%ymm2,%ymm2
- vpsrldq $6,%ymm7,%ymm9
- vpaddq %ymm6,%ymm15,%ymm15
- vpaddq %ymm2,%ymm14,%ymm14
- vpmuludq %ymm3,%ymm10,%ymm6
- vpmuludq %ymm4,%ymm10,%ymm2
- vpsrldq $6,%ymm8,%ymm10
- vpaddq %ymm6,%ymm12,%ymm12
- vpaddq %ymm2,%ymm13,%ymm13
- vpunpckhqdq %ymm8,%ymm7,%ymm6
-
- vpmuludq %ymm3,%ymm5,%ymm3
- vpmuludq %ymm4,%ymm5,%ymm4
- vpunpcklqdq %ymm8,%ymm7,%ymm7
- vpaddq %ymm3,%ymm13,%ymm2
- vpaddq %ymm4,%ymm14,%ymm3
- vpunpcklqdq %ymm10,%ymm9,%ymm10
- vpmuludq 80(%rax),%ymm0,%ymm4
- vpmuludq %ymm1,%ymm5,%ymm0
- vmovdqa 64(%rcx),%ymm5
- vpaddq %ymm4,%ymm15,%ymm4
- vpaddq %ymm0,%ymm11,%ymm0
-
- vpsrlq $26,%ymm3,%ymm14
- vpand %ymm5,%ymm3,%ymm3
- vpaddq %ymm14,%ymm4,%ymm4
-
- vpsrlq $26,%ymm0,%ymm11
- vpand %ymm5,%ymm0,%ymm0
- vpaddq %ymm11,%ymm12,%ymm1
-
- vpsrlq $26,%ymm4,%ymm15
- vpand %ymm5,%ymm4,%ymm4
-
- vpsrlq $4,%ymm10,%ymm9
-
- vpsrlq $26,%ymm1,%ymm12
- vpand %ymm5,%ymm1,%ymm1
- vpaddq %ymm12,%ymm2,%ymm2
-
- vpaddq %ymm15,%ymm0,%ymm0
- vpsllq $2,%ymm15,%ymm15
- vpaddq %ymm15,%ymm0,%ymm0
-
- vpand %ymm5,%ymm9,%ymm9
- vpsrlq $26,%ymm7,%ymm8
-
- vpsrlq $26,%ymm2,%ymm13
- vpand %ymm5,%ymm2,%ymm2
- vpaddq %ymm13,%ymm3,%ymm3
-
- vpaddq %ymm9,%ymm2,%ymm2
- vpsrlq $30,%ymm10,%ymm10
-
- vpsrlq $26,%ymm0,%ymm11
- vpand %ymm5,%ymm0,%ymm0
- vpaddq %ymm11,%ymm1,%ymm1
-
- vpsrlq $40,%ymm6,%ymm6
-
- vpsrlq $26,%ymm3,%ymm14
- vpand %ymm5,%ymm3,%ymm3
- vpaddq %ymm14,%ymm4,%ymm4
-
- vpand %ymm5,%ymm7,%ymm7
- vpand %ymm5,%ymm8,%ymm8
- vpand %ymm5,%ymm10,%ymm10
- vpor 32(%rcx),%ymm6,%ymm6
-
- subq $64,%rdx
- jnz .Loop_avx2
-
-.byte 0x66,0x90
-.Ltail_avx2:
-
- vpaddq %ymm0,%ymm7,%ymm0
- vmovdqu 4(%rsp),%ymm7
- vpaddq %ymm1,%ymm8,%ymm1
- vmovdqu 36(%rsp),%ymm8
- vpaddq %ymm3,%ymm10,%ymm3
- vmovdqu 100(%rsp),%ymm9
- vpaddq %ymm4,%ymm6,%ymm4
- vmovdqu 52(%rax),%ymm10
- vmovdqu 116(%rax),%ymm5
-
- vpmuludq %ymm2,%ymm7,%ymm13
- vpmuludq %ymm2,%ymm8,%ymm14
- vpmuludq %ymm2,%ymm9,%ymm15
- vpmuludq %ymm2,%ymm10,%ymm11
- vpmuludq %ymm2,%ymm5,%ymm12
-
- vpmuludq %ymm0,%ymm8,%ymm6
- vpmuludq %ymm1,%ymm8,%ymm2
- vpaddq %ymm6,%ymm12,%ymm12
- vpaddq %ymm2,%ymm13,%ymm13
- vpmuludq %ymm3,%ymm8,%ymm6
- vpmuludq 68(%rsp),%ymm4,%ymm2
- vpaddq %ymm6,%ymm15,%ymm15
- vpaddq %ymm2,%ymm11,%ymm11
-
- vpmuludq %ymm0,%ymm7,%ymm6
- vpmuludq %ymm1,%ymm7,%ymm2
- vpaddq %ymm6,%ymm11,%ymm11
- vmovdqu -12(%rax),%ymm8
- vpaddq %ymm2,%ymm12,%ymm12
- vpmuludq %ymm3,%ymm7,%ymm6
- vpmuludq %ymm4,%ymm7,%ymm2
- vpaddq %ymm6,%ymm14,%ymm14
- vpaddq %ymm2,%ymm15,%ymm15
-
- vpmuludq %ymm3,%ymm8,%ymm6
- vpmuludq %ymm4,%ymm8,%ymm2
- vpaddq %ymm6,%ymm11,%ymm11
- vpaddq %ymm2,%ymm12,%ymm12
- vmovdqu 20(%rax),%ymm2
- vpmuludq %ymm1,%ymm9,%ymm6
- vpmuludq %ymm0,%ymm9,%ymm9
- vpaddq %ymm6,%ymm14,%ymm14
- vpaddq %ymm9,%ymm13,%ymm13
-
- vpmuludq %ymm1,%ymm2,%ymm6
- vpmuludq %ymm0,%ymm2,%ymm2
- vpaddq %ymm6,%ymm15,%ymm15
- vpaddq %ymm2,%ymm14,%ymm14
- vpmuludq %ymm3,%ymm10,%ymm6
- vpmuludq %ymm4,%ymm10,%ymm2
- vpaddq %ymm6,%ymm12,%ymm12
- vpaddq %ymm2,%ymm13,%ymm13
-
- vpmuludq %ymm3,%ymm5,%ymm3
- vpmuludq %ymm4,%ymm5,%ymm4
- vpaddq %ymm3,%ymm13,%ymm2
- vpaddq %ymm4,%ymm14,%ymm3
- vpmuludq 84(%rax),%ymm0,%ymm4
- vpmuludq %ymm1,%ymm5,%ymm0
- vmovdqa 64(%rcx),%ymm5
- vpaddq %ymm4,%ymm15,%ymm4
- vpaddq %ymm0,%ymm11,%ymm0
-
- vpsrldq $8,%ymm12,%ymm8
- vpsrldq $8,%ymm2,%ymm9
- vpsrldq $8,%ymm3,%ymm10
- vpsrldq $8,%ymm4,%ymm6
- vpsrldq $8,%ymm0,%ymm7
- vpaddq %ymm8,%ymm12,%ymm12
- vpaddq %ymm9,%ymm2,%ymm2
- vpaddq %ymm10,%ymm3,%ymm3
- vpaddq %ymm6,%ymm4,%ymm4
- vpaddq %ymm7,%ymm0,%ymm0
-
- vpermq $0x2,%ymm3,%ymm10
- vpermq $0x2,%ymm4,%ymm6
- vpermq $0x2,%ymm0,%ymm7
- vpermq $0x2,%ymm12,%ymm8
- vpermq $0x2,%ymm2,%ymm9
- vpaddq %ymm10,%ymm3,%ymm3
- vpaddq %ymm6,%ymm4,%ymm4
- vpaddq %ymm7,%ymm0,%ymm0
- vpaddq %ymm8,%ymm12,%ymm12
- vpaddq %ymm9,%ymm2,%ymm2
-
- vpsrlq $26,%ymm3,%ymm14
- vpand %ymm5,%ymm3,%ymm3
- vpaddq %ymm14,%ymm4,%ymm4
-
- vpsrlq $26,%ymm0,%ymm11
- vpand %ymm5,%ymm0,%ymm0
- vpaddq %ymm11,%ymm12,%ymm1
-
- vpsrlq $26,%ymm4,%ymm15
- vpand %ymm5,%ymm4,%ymm4
-
- vpsrlq $26,%ymm1,%ymm12
- vpand %ymm5,%ymm1,%ymm1
- vpaddq %ymm12,%ymm2,%ymm2
-
- vpaddq %ymm15,%ymm0,%ymm0
- vpsllq $2,%ymm15,%ymm15
- vpaddq %ymm15,%ymm0,%ymm0
-
- vpsrlq $26,%ymm2,%ymm13
- vpand %ymm5,%ymm2,%ymm2
- vpaddq %ymm13,%ymm3,%ymm3
-
- vpsrlq $26,%ymm0,%ymm11
- vpand %ymm5,%ymm0,%ymm0
- vpaddq %ymm11,%ymm1,%ymm1
-
- vpsrlq $26,%ymm3,%ymm14
- vpand %ymm5,%ymm3,%ymm3
- vpaddq %ymm14,%ymm4,%ymm4
-
- vmovd %xmm0,-112(%rdi)
- vmovd %xmm1,-108(%rdi)
- vmovd %xmm2,-104(%rdi)
- vmovd %xmm3,-100(%rdi)
- vmovd %xmm4,-96(%rdi)
- leaq -8(%r10),%rsp
-
- vzeroupper
- ret
-
-ENDPROC(poly1305_blocks_avx2)
-#endif /* CONFIG_AS_AVX2 */
-
-#ifdef CONFIG_AS_AVX512
-.align 32
-ENTRY(poly1305_blocks_avx512)
-
- movl 20(%rdi),%r8d
- cmpq $128,%rdx
- jae .Lblocks_avx2_512
- testl %r8d,%r8d
- jz .Lblocks
-
-.Lblocks_avx2_512:
- andq $-16,%rdx
- jz .Lno_data_avx2_512
-
- vzeroupper
-
- testl %r8d,%r8d
- jz .Lbase2_64_avx2_512
-
- testq $63,%rdx
- jz .Leven_avx2_512
-
- pushq %rbx
- pushq %r12
- pushq %r13
- pushq %r14
- pushq %r15
- pushq %rdi
-
-.Lblocks_avx2_body_512:
-
- movq %rdx,%r15
-
- movq 0(%rdi),%r8
- movq 8(%rdi),%r9
- movl 16(%rdi),%r10d
-
- movq 24(%rdi),%r11
- movq 32(%rdi),%r13
-
-
- movl %r8d,%r14d
- andq $-2147483648,%r8
- movq %r9,%r12
- movl %r9d,%ebx
- andq $-2147483648,%r9
-
- shrq $6,%r8
- shlq $52,%r12
- addq %r8,%r14
- shrq $12,%rbx
- shrq $18,%r9
- addq %r12,%r14
- adcq %r9,%rbx
-
- movq %r10,%r8
- shlq $40,%r8
- shrq $24,%r10
- addq %r8,%rbx
- adcq $0,%r10
-
- movq $-4,%r9
- movq %r10,%r8
- andq %r10,%r9
- shrq $2,%r8
- andq $3,%r10
- addq %r9,%r8
- addq %r8,%r14
- adcq $0,%rbx
- adcq $0,%r10
-
- movq %r13,%r12
- movq %r13,%rax
- shrq $2,%r13
- addq %r12,%r13
-
-.Lbase2_26_pre_avx2_512:
- addq 0(%rsi),%r14
- adcq 8(%rsi),%rbx
- leaq 16(%rsi),%rsi
- adcq %rcx,%r10
- subq $16,%r15
-
- movq %rdi,0(%rsp)
- __poly1305_block
- movq 0(%rsp),%rdi
- movq %r12,%rax
-
- testq $63,%r15
- jnz .Lbase2_26_pre_avx2_512
-
- testq %rcx,%rcx
- jz .Lstore_base2_64_avx2_512
-
-
- movq %r14,%rax
- movq %r14,%rdx
- shrq $52,%r14
- movq %rbx,%r11
- movq %rbx,%r12
- shrq $26,%rdx
- andq $0x3ffffff,%rax
- shlq $12,%r11
- andq $0x3ffffff,%rdx
- shrq $14,%rbx
- orq %r11,%r14
- shlq $24,%r10
- andq $0x3ffffff,%r14
- shrq $40,%r12
- andq $0x3ffffff,%rbx
- orq %r12,%r10
-
- testq %r15,%r15
- jz .Lstore_base2_26_avx2_512
-
- vmovd %eax,%xmm0
- vmovd %edx,%xmm1
- vmovd %r14d,%xmm2
- vmovd %ebx,%xmm3
- vmovd %r10d,%xmm4
- jmp .Lproceed_avx2_512
-
-.align 32
-.Lstore_base2_64_avx2_512:
- movq %r14,0(%rdi)
- movq %rbx,8(%rdi)
- movq %r10,16(%rdi)
- jmp .Ldone_avx2_512
-
-.align 16
-.Lstore_base2_26_avx2_512:
- movl %eax,0(%rdi)
- movl %edx,4(%rdi)
- movl %r14d,8(%rdi)
- movl %ebx,12(%rdi)
- movl %r10d,16(%rdi)
-.align 16
-.Ldone_avx2_512:
- movq 8(%rsp),%r15
- movq 16(%rsp),%r14
- movq 24(%rsp),%r13
- movq 32(%rsp),%r12
- movq 40(%rsp),%rbx
- leaq 48(%rsp),%rsp
-
-.Lno_data_avx2_512:
-.Lblocks_avx2_epilogue_512:
- ret
-
-
-.align 32
-.Lbase2_64_avx2_512:
-
- pushq %rbx
- pushq %r12
- pushq %r13
- pushq %r14
- pushq %r15
- pushq %rdi
-
-.Lbase2_64_avx2_body_512:
-
- movq %rdx,%r15
-
- movq 24(%rdi),%r11
- movq 32(%rdi),%r13
-
- movq 0(%rdi),%r14
- movq 8(%rdi),%rbx
- movl 16(%rdi),%r10d
-
- movq %r13,%r12
- movq %r13,%rax
- shrq $2,%r13
- addq %r12,%r13
-
- testq $63,%rdx
- jz .Linit_avx2_512
-
-.Lbase2_64_pre_avx2_512:
- addq 0(%rsi),%r14
- adcq 8(%rsi),%rbx
- leaq 16(%rsi),%rsi
- adcq %rcx,%r10
- subq $16,%r15
-
- movq %rdi,0(%rsp)
- __poly1305_block
- movq 0(%rsp),%rdi
- movq %r12,%rax
-
- testq $63,%r15
- jnz .Lbase2_64_pre_avx2_512
-
-.Linit_avx2_512:
-
- movq %r14,%rax
- movq %r14,%rdx
- shrq $52,%r14
- movq %rbx,%r8
- movq %rbx,%r9
- shrq $26,%rdx
- andq $0x3ffffff,%rax
- shlq $12,%r8
- andq $0x3ffffff,%rdx
- shrq $14,%rbx
- orq %r8,%r14
- shlq $24,%r10
- andq $0x3ffffff,%r14
- shrq $40,%r9
- andq $0x3ffffff,%rbx
- orq %r9,%r10
-
- vmovd %eax,%xmm0
- vmovd %edx,%xmm1
- vmovd %r14d,%xmm2
- vmovd %ebx,%xmm3
- vmovd %r10d,%xmm4
- movl $1,20(%rdi)
-
- __poly1305_init_avx
-
-.Lproceed_avx2_512:
- movq %r15,%rdx
-
- movq 8(%rsp),%r15
- movq 16(%rsp),%r14
- movq 24(%rsp),%r13
- movq 32(%rsp),%r12
- movq 40(%rsp),%rbx
- leaq 48(%rsp),%rax
- leaq 48(%rsp),%rsp
-
-.Lbase2_64_avx2_epilogue_512:
- jmp .Ldo_avx2_512
-
-
-.align 32
-.Leven_avx2_512:
-
- vmovd 0(%rdi),%xmm0
- vmovd 4(%rdi),%xmm1
- vmovd 8(%rdi),%xmm2
- vmovd 12(%rdi),%xmm3
- vmovd 16(%rdi),%xmm4
-
-.Ldo_avx2_512:
- cmpq $512,%rdx
- jae .Lblocks_avx512
-.Lskip_avx512:
- leaq 8(%rsp),%r10
-
- subq $0x128,%rsp
- leaq .Lconst(%rip),%rcx
- leaq 48+64(%rdi),%rdi
- vmovdqa 96(%rcx),%ymm7
-
-
- vmovdqu -64(%rdi),%xmm9
- andq $-512,%rsp
- vmovdqu -48(%rdi),%xmm10
- vmovdqu -32(%rdi),%xmm6
- vmovdqu -16(%rdi),%xmm11
- vmovdqu 0(%rdi),%xmm12
- vmovdqu 16(%rdi),%xmm13
- leaq 144(%rsp),%rax
- vmovdqu 32(%rdi),%xmm14
- vpermd %ymm9,%ymm7,%ymm9
- vmovdqu 48(%rdi),%xmm15
- vpermd %ymm10,%ymm7,%ymm10
- vmovdqu 64(%rdi),%xmm5
- vpermd %ymm6,%ymm7,%ymm6
- vmovdqa %ymm9,0(%rsp)
- vpermd %ymm11,%ymm7,%ymm11
- vmovdqa %ymm10,32-144(%rax)
- vpermd %ymm12,%ymm7,%ymm12
- vmovdqa %ymm6,64-144(%rax)
- vpermd %ymm13,%ymm7,%ymm13
- vmovdqa %ymm11,96-144(%rax)
- vpermd %ymm14,%ymm7,%ymm14
- vmovdqa %ymm12,128-144(%rax)
- vpermd %ymm15,%ymm7,%ymm15
- vmovdqa %ymm13,160-144(%rax)
- vpermd %ymm5,%ymm7,%ymm5
- vmovdqa %ymm14,192-144(%rax)
- vmovdqa %ymm15,224-144(%rax)
- vmovdqa %ymm5,256-144(%rax)
- vmovdqa 64(%rcx),%ymm5
-
-
-
- vmovdqu 0(%rsi),%xmm7
- vmovdqu 16(%rsi),%xmm8
- vinserti128 $1,32(%rsi),%ymm7,%ymm7
- vinserti128 $1,48(%rsi),%ymm8,%ymm8
- leaq 64(%rsi),%rsi
-
- vpsrldq $6,%ymm7,%ymm9
- vpsrldq $6,%ymm8,%ymm10
- vpunpckhqdq %ymm8,%ymm7,%ymm6
- vpunpcklqdq %ymm10,%ymm9,%ymm9
- vpunpcklqdq %ymm8,%ymm7,%ymm7
-
- vpsrlq $30,%ymm9,%ymm10
- vpsrlq $4,%ymm9,%ymm9
- vpsrlq $26,%ymm7,%ymm8
- vpsrlq $40,%ymm6,%ymm6
- vpand %ymm5,%ymm9,%ymm9
- vpand %ymm5,%ymm7,%ymm7
- vpand %ymm5,%ymm8,%ymm8
- vpand %ymm5,%ymm10,%ymm10
- vpor 32(%rcx),%ymm6,%ymm6
-
- vpaddq %ymm2,%ymm9,%ymm2
- subq $64,%rdx
- jz .Ltail_avx2_512
- jmp .Loop_avx2_512
-
-.align 32
-.Loop_avx2_512:
-
- vpaddq %ymm0,%ymm7,%ymm0
- vmovdqa 0(%rsp),%ymm7
- vpaddq %ymm1,%ymm8,%ymm1
- vmovdqa 32(%rsp),%ymm8
- vpaddq %ymm3,%ymm10,%ymm3
- vmovdqa 96(%rsp),%ymm9
- vpaddq %ymm4,%ymm6,%ymm4
- vmovdqa 48(%rax),%ymm10
- vmovdqa 112(%rax),%ymm5
-
- vpmuludq %ymm2,%ymm7,%ymm13
- vpmuludq %ymm2,%ymm8,%ymm14
- vpmuludq %ymm2,%ymm9,%ymm15
- vpmuludq %ymm2,%ymm10,%ymm11
- vpmuludq %ymm2,%ymm5,%ymm12
-
- vpmuludq %ymm0,%ymm8,%ymm6
- vpmuludq %ymm1,%ymm8,%ymm2
- vpaddq %ymm6,%ymm12,%ymm12
- vpaddq %ymm2,%ymm13,%ymm13
- vpmuludq %ymm3,%ymm8,%ymm6
- vpmuludq 64(%rsp),%ymm4,%ymm2
- vpaddq %ymm6,%ymm15,%ymm15
- vpaddq %ymm2,%ymm11,%ymm11
- vmovdqa -16(%rax),%ymm8
-
- vpmuludq %ymm0,%ymm7,%ymm6
- vpmuludq %ymm1,%ymm7,%ymm2
- vpaddq %ymm6,%ymm11,%ymm11
- vpaddq %ymm2,%ymm12,%ymm12
- vpmuludq %ymm3,%ymm7,%ymm6
- vpmuludq %ymm4,%ymm7,%ymm2
- vmovdqu 0(%rsi),%xmm7
- vpaddq %ymm6,%ymm14,%ymm14
- vpaddq %ymm2,%ymm15,%ymm15
- vinserti128 $1,32(%rsi),%ymm7,%ymm7
-
- vpmuludq %ymm3,%ymm8,%ymm6
- vpmuludq %ymm4,%ymm8,%ymm2
- vmovdqu 16(%rsi),%xmm8
- vpaddq %ymm6,%ymm11,%ymm11
- vpaddq %ymm2,%ymm12,%ymm12
- vmovdqa 16(%rax),%ymm2
- vpmuludq %ymm1,%ymm9,%ymm6
- vpmuludq %ymm0,%ymm9,%ymm9
- vpaddq %ymm6,%ymm14,%ymm14
- vpaddq %ymm9,%ymm13,%ymm13
- vinserti128 $1,48(%rsi),%ymm8,%ymm8
- leaq 64(%rsi),%rsi
-
- vpmuludq %ymm1,%ymm2,%ymm6
- vpmuludq %ymm0,%ymm2,%ymm2
- vpsrldq $6,%ymm7,%ymm9
- vpaddq %ymm6,%ymm15,%ymm15
- vpaddq %ymm2,%ymm14,%ymm14
- vpmuludq %ymm3,%ymm10,%ymm6
- vpmuludq %ymm4,%ymm10,%ymm2
- vpsrldq $6,%ymm8,%ymm10
- vpaddq %ymm6,%ymm12,%ymm12
- vpaddq %ymm2,%ymm13,%ymm13
- vpunpckhqdq %ymm8,%ymm7,%ymm6
-
- vpmuludq %ymm3,%ymm5,%ymm3
- vpmuludq %ymm4,%ymm5,%ymm4
- vpunpcklqdq %ymm8,%ymm7,%ymm7
- vpaddq %ymm3,%ymm13,%ymm2
- vpaddq %ymm4,%ymm14,%ymm3
- vpunpcklqdq %ymm10,%ymm9,%ymm10
- vpmuludq 80(%rax),%ymm0,%ymm4
- vpmuludq %ymm1,%ymm5,%ymm0
- vmovdqa 64(%rcx),%ymm5
- vpaddq %ymm4,%ymm15,%ymm4
- vpaddq %ymm0,%ymm11,%ymm0
-
- vpsrlq $26,%ymm3,%ymm14
- vpand %ymm5,%ymm3,%ymm3
- vpaddq %ymm14,%ymm4,%ymm4
-
- vpsrlq $26,%ymm0,%ymm11
- vpand %ymm5,%ymm0,%ymm0
- vpaddq %ymm11,%ymm12,%ymm1
-
- vpsrlq $26,%ymm4,%ymm15
- vpand %ymm5,%ymm4,%ymm4
-
- vpsrlq $4,%ymm10,%ymm9
-
- vpsrlq $26,%ymm1,%ymm12
- vpand %ymm5,%ymm1,%ymm1
- vpaddq %ymm12,%ymm2,%ymm2
-
- vpaddq %ymm15,%ymm0,%ymm0
- vpsllq $2,%ymm15,%ymm15
- vpaddq %ymm15,%ymm0,%ymm0
-
- vpand %ymm5,%ymm9,%ymm9
- vpsrlq $26,%ymm7,%ymm8
-
- vpsrlq $26,%ymm2,%ymm13
- vpand %ymm5,%ymm2,%ymm2
- vpaddq %ymm13,%ymm3,%ymm3
-
- vpaddq %ymm9,%ymm2,%ymm2
- vpsrlq $30,%ymm10,%ymm10
-
- vpsrlq $26,%ymm0,%ymm11
- vpand %ymm5,%ymm0,%ymm0
- vpaddq %ymm11,%ymm1,%ymm1
-
- vpsrlq $40,%ymm6,%ymm6
-
- vpsrlq $26,%ymm3,%ymm14
- vpand %ymm5,%ymm3,%ymm3
- vpaddq %ymm14,%ymm4,%ymm4
-
- vpand %ymm5,%ymm7,%ymm7
- vpand %ymm5,%ymm8,%ymm8
- vpand %ymm5,%ymm10,%ymm10
- vpor 32(%rcx),%ymm6,%ymm6
-
- subq $64,%rdx
- jnz .Loop_avx2_512
-
-.byte 0x66,0x90
-.Ltail_avx2_512:
-
- vpaddq %ymm0,%ymm7,%ymm0
- vmovdqu 4(%rsp),%ymm7
- vpaddq %ymm1,%ymm8,%ymm1
- vmovdqu 36(%rsp),%ymm8
- vpaddq %ymm3,%ymm10,%ymm3
- vmovdqu 100(%rsp),%ymm9
- vpaddq %ymm4,%ymm6,%ymm4
- vmovdqu 52(%rax),%ymm10
- vmovdqu 116(%rax),%ymm5
-
- vpmuludq %ymm2,%ymm7,%ymm13
- vpmuludq %ymm2,%ymm8,%ymm14
- vpmuludq %ymm2,%ymm9,%ymm15
- vpmuludq %ymm2,%ymm10,%ymm11
- vpmuludq %ymm2,%ymm5,%ymm12
-
- vpmuludq %ymm0,%ymm8,%ymm6
- vpmuludq %ymm1,%ymm8,%ymm2
- vpaddq %ymm6,%ymm12,%ymm12
- vpaddq %ymm2,%ymm13,%ymm13
- vpmuludq %ymm3,%ymm8,%ymm6
- vpmuludq 68(%rsp),%ymm4,%ymm2
- vpaddq %ymm6,%ymm15,%ymm15
- vpaddq %ymm2,%ymm11,%ymm11
-
- vpmuludq %ymm0,%ymm7,%ymm6
- vpmuludq %ymm1,%ymm7,%ymm2
- vpaddq %ymm6,%ymm11,%ymm11
- vmovdqu -12(%rax),%ymm8
- vpaddq %ymm2,%ymm12,%ymm12
- vpmuludq %ymm3,%ymm7,%ymm6
- vpmuludq %ymm4,%ymm7,%ymm2
- vpaddq %ymm6,%ymm14,%ymm14
- vpaddq %ymm2,%ymm15,%ymm15
-
- vpmuludq %ymm3,%ymm8,%ymm6
- vpmuludq %ymm4,%ymm8,%ymm2
- vpaddq %ymm6,%ymm11,%ymm11
- vpaddq %ymm2,%ymm12,%ymm12
- vmovdqu 20(%rax),%ymm2
- vpmuludq %ymm1,%ymm9,%ymm6
- vpmuludq %ymm0,%ymm9,%ymm9
- vpaddq %ymm6,%ymm14,%ymm14
- vpaddq %ymm9,%ymm13,%ymm13
-
- vpmuludq %ymm1,%ymm2,%ymm6
- vpmuludq %ymm0,%ymm2,%ymm2
- vpaddq %ymm6,%ymm15,%ymm15
- vpaddq %ymm2,%ymm14,%ymm14
- vpmuludq %ymm3,%ymm10,%ymm6
- vpmuludq %ymm4,%ymm10,%ymm2
- vpaddq %ymm6,%ymm12,%ymm12
- vpaddq %ymm2,%ymm13,%ymm13
-
- vpmuludq %ymm3,%ymm5,%ymm3
- vpmuludq %ymm4,%ymm5,%ymm4
- vpaddq %ymm3,%ymm13,%ymm2
- vpaddq %ymm4,%ymm14,%ymm3
- vpmuludq 84(%rax),%ymm0,%ymm4
- vpmuludq %ymm1,%ymm5,%ymm0
- vmovdqa 64(%rcx),%ymm5
- vpaddq %ymm4,%ymm15,%ymm4
- vpaddq %ymm0,%ymm11,%ymm0
-
- vpsrldq $8,%ymm12,%ymm8
- vpsrldq $8,%ymm2,%ymm9
- vpsrldq $8,%ymm3,%ymm10
- vpsrldq $8,%ymm4,%ymm6
- vpsrldq $8,%ymm0,%ymm7
- vpaddq %ymm8,%ymm12,%ymm12
- vpaddq %ymm9,%ymm2,%ymm2
- vpaddq %ymm10,%ymm3,%ymm3
- vpaddq %ymm6,%ymm4,%ymm4
- vpaddq %ymm7,%ymm0,%ymm0
-
- vpermq $0x2,%ymm3,%ymm10
- vpermq $0x2,%ymm4,%ymm6
- vpermq $0x2,%ymm0,%ymm7
- vpermq $0x2,%ymm12,%ymm8
- vpermq $0x2,%ymm2,%ymm9
- vpaddq %ymm10,%ymm3,%ymm3
- vpaddq %ymm6,%ymm4,%ymm4
- vpaddq %ymm7,%ymm0,%ymm0
- vpaddq %ymm8,%ymm12,%ymm12
- vpaddq %ymm9,%ymm2,%ymm2
-
- vpsrlq $26,%ymm3,%ymm14
- vpand %ymm5,%ymm3,%ymm3
- vpaddq %ymm14,%ymm4,%ymm4
-
- vpsrlq $26,%ymm0,%ymm11
- vpand %ymm5,%ymm0,%ymm0
- vpaddq %ymm11,%ymm12,%ymm1
-
- vpsrlq $26,%ymm4,%ymm15
- vpand %ymm5,%ymm4,%ymm4
-
- vpsrlq $26,%ymm1,%ymm12
- vpand %ymm5,%ymm1,%ymm1
- vpaddq %ymm12,%ymm2,%ymm2
-
- vpaddq %ymm15,%ymm0,%ymm0
- vpsllq $2,%ymm15,%ymm15
- vpaddq %ymm15,%ymm0,%ymm0
-
- vpsrlq $26,%ymm2,%ymm13
- vpand %ymm5,%ymm2,%ymm2
- vpaddq %ymm13,%ymm3,%ymm3
-
- vpsrlq $26,%ymm0,%ymm11
- vpand %ymm5,%ymm0,%ymm0
- vpaddq %ymm11,%ymm1,%ymm1
-
- vpsrlq $26,%ymm3,%ymm14
- vpand %ymm5,%ymm3,%ymm3
- vpaddq %ymm14,%ymm4,%ymm4
-
- vmovd %xmm0,-112(%rdi)
- vmovd %xmm1,-108(%rdi)
- vmovd %xmm2,-104(%rdi)
- vmovd %xmm3,-100(%rdi)
- vmovd %xmm4,-96(%rdi)
- leaq -8(%r10),%rsp
-
- vzeroupper
- ret
-
-.Lblocks_avx512:
-
- movl $15,%eax
- kmovw %eax,%k2
- leaq 8(%rsp),%r10
-
- subq $0x128,%rsp
- leaq .Lconst(%rip),%rcx
- leaq 48+64(%rdi),%rdi
- vmovdqa 96(%rcx),%ymm9
-
- vmovdqu32 -64(%rdi),%zmm16{%k2}{z}
- andq $-512,%rsp
- vmovdqu32 -48(%rdi),%zmm17{%k2}{z}
- movq $0x20,%rax
- vmovdqu32 -32(%rdi),%zmm21{%k2}{z}
- vmovdqu32 -16(%rdi),%zmm18{%k2}{z}
- vmovdqu32 0(%rdi),%zmm22{%k2}{z}
- vmovdqu32 16(%rdi),%zmm19{%k2}{z}
- vmovdqu32 32(%rdi),%zmm23{%k2}{z}
- vmovdqu32 48(%rdi),%zmm20{%k2}{z}
- vmovdqu32 64(%rdi),%zmm24{%k2}{z}
- vpermd %zmm16,%zmm9,%zmm16
- vpbroadcastq 64(%rcx),%zmm5
- vpermd %zmm17,%zmm9,%zmm17
- vpermd %zmm21,%zmm9,%zmm21
- vpermd %zmm18,%zmm9,%zmm18
- vmovdqa64 %zmm16,0(%rsp){%k2}
- vpsrlq $32,%zmm16,%zmm7
- vpermd %zmm22,%zmm9,%zmm22
- vmovdqu64 %zmm17,0(%rsp,%rax,1){%k2}
- vpsrlq $32,%zmm17,%zmm8
- vpermd %zmm19,%zmm9,%zmm19
- vmovdqa64 %zmm21,64(%rsp){%k2}
- vpermd %zmm23,%zmm9,%zmm23
- vpermd %zmm20,%zmm9,%zmm20
- vmovdqu64 %zmm18,64(%rsp,%rax,1){%k2}
- vpermd %zmm24,%zmm9,%zmm24
- vmovdqa64 %zmm22,128(%rsp){%k2}
- vmovdqu64 %zmm19,128(%rsp,%rax,1){%k2}
- vmovdqa64 %zmm23,192(%rsp){%k2}
- vmovdqu64 %zmm20,192(%rsp,%rax,1){%k2}
- vmovdqa64 %zmm24,256(%rsp){%k2}
-
- vpmuludq %zmm7,%zmm16,%zmm11
- vpmuludq %zmm7,%zmm17,%zmm12
- vpmuludq %zmm7,%zmm18,%zmm13
- vpmuludq %zmm7,%zmm19,%zmm14
- vpmuludq %zmm7,%zmm20,%zmm15
- vpsrlq $32,%zmm18,%zmm9
-
- vpmuludq %zmm8,%zmm24,%zmm25
- vpmuludq %zmm8,%zmm16,%zmm26
- vpmuludq %zmm8,%zmm17,%zmm27
- vpmuludq %zmm8,%zmm18,%zmm28
- vpmuludq %zmm8,%zmm19,%zmm29
- vpsrlq $32,%zmm19,%zmm10
- vpaddq %zmm25,%zmm11,%zmm11
- vpaddq %zmm26,%zmm12,%zmm12
- vpaddq %zmm27,%zmm13,%zmm13
- vpaddq %zmm28,%zmm14,%zmm14
- vpaddq %zmm29,%zmm15,%zmm15
-
- vpmuludq %zmm9,%zmm23,%zmm25
- vpmuludq %zmm9,%zmm24,%zmm26
- vpmuludq %zmm9,%zmm17,%zmm28
- vpmuludq %zmm9,%zmm18,%zmm29
- vpmuludq %zmm9,%zmm16,%zmm27
- vpsrlq $32,%zmm20,%zmm6
- vpaddq %zmm25,%zmm11,%zmm11
- vpaddq %zmm26,%zmm12,%zmm12
- vpaddq %zmm28,%zmm14,%zmm14
- vpaddq %zmm29,%zmm15,%zmm15
- vpaddq %zmm27,%zmm13,%zmm13
-
- vpmuludq %zmm10,%zmm22,%zmm25
- vpmuludq %zmm10,%zmm16,%zmm28
- vpmuludq %zmm10,%zmm17,%zmm29
- vpmuludq %zmm10,%zmm23,%zmm26
- vpmuludq %zmm10,%zmm24,%zmm27
- vpaddq %zmm25,%zmm11,%zmm11
- vpaddq %zmm28,%zmm14,%zmm14
- vpaddq %zmm29,%zmm15,%zmm15
- vpaddq %zmm26,%zmm12,%zmm12
- vpaddq %zmm27,%zmm13,%zmm13
-
- vpmuludq %zmm6,%zmm24,%zmm28
- vpmuludq %zmm6,%zmm16,%zmm29
- vpmuludq %zmm6,%zmm21,%zmm25
- vpmuludq %zmm6,%zmm22,%zmm26
- vpmuludq %zmm6,%zmm23,%zmm27
- vpaddq %zmm28,%zmm14,%zmm14
- vpaddq %zmm29,%zmm15,%zmm15
- vpaddq %zmm25,%zmm11,%zmm11
- vpaddq %zmm26,%zmm12,%zmm12
- vpaddq %zmm27,%zmm13,%zmm13
-
- vmovdqu64 0(%rsi),%zmm10
- vmovdqu64 64(%rsi),%zmm6
- leaq 128(%rsi),%rsi
-
- vpsrlq $26,%zmm14,%zmm28
- vpandq %zmm5,%zmm14,%zmm14
- vpaddq %zmm28,%zmm15,%zmm15
-
- vpsrlq $26,%zmm11,%zmm25
- vpandq %zmm5,%zmm11,%zmm11
- vpaddq %zmm25,%zmm12,%zmm12
-
- vpsrlq $26,%zmm15,%zmm29
- vpandq %zmm5,%zmm15,%zmm15
-
- vpsrlq $26,%zmm12,%zmm26
- vpandq %zmm5,%zmm12,%zmm12
- vpaddq %zmm26,%zmm13,%zmm13
-
- vpaddq %zmm29,%zmm11,%zmm11
- vpsllq $2,%zmm29,%zmm29
- vpaddq %zmm29,%zmm11,%zmm11
-
- vpsrlq $26,%zmm13,%zmm27
- vpandq %zmm5,%zmm13,%zmm13
- vpaddq %zmm27,%zmm14,%zmm14
-
- vpsrlq $26,%zmm11,%zmm25
- vpandq %zmm5,%zmm11,%zmm11
- vpaddq %zmm25,%zmm12,%zmm12
-
- vpsrlq $26,%zmm14,%zmm28
- vpandq %zmm5,%zmm14,%zmm14
- vpaddq %zmm28,%zmm15,%zmm15
-
- vpunpcklqdq %zmm6,%zmm10,%zmm7
- vpunpckhqdq %zmm6,%zmm10,%zmm6
-
- vmovdqa32 128(%rcx),%zmm25
- movl $0x7777,%eax
- kmovw %eax,%k1
-
- vpermd %zmm16,%zmm25,%zmm16
- vpermd %zmm17,%zmm25,%zmm17
- vpermd %zmm18,%zmm25,%zmm18
- vpermd %zmm19,%zmm25,%zmm19
- vpermd %zmm20,%zmm25,%zmm20
-
- vpermd %zmm11,%zmm25,%zmm16{%k1}
- vpermd %zmm12,%zmm25,%zmm17{%k1}
- vpermd %zmm13,%zmm25,%zmm18{%k1}
- vpermd %zmm14,%zmm25,%zmm19{%k1}
- vpermd %zmm15,%zmm25,%zmm20{%k1}
-
- vpslld $2,%zmm17,%zmm21
- vpslld $2,%zmm18,%zmm22
- vpslld $2,%zmm19,%zmm23
- vpslld $2,%zmm20,%zmm24
- vpaddd %zmm17,%zmm21,%zmm21
- vpaddd %zmm18,%zmm22,%zmm22
- vpaddd %zmm19,%zmm23,%zmm23
- vpaddd %zmm20,%zmm24,%zmm24
-
- vpbroadcastq 32(%rcx),%zmm30
-
- vpsrlq $52,%zmm7,%zmm9
- vpsllq $12,%zmm6,%zmm10
- vporq %zmm10,%zmm9,%zmm9
- vpsrlq $26,%zmm7,%zmm8
- vpsrlq $14,%zmm6,%zmm10
- vpsrlq $40,%zmm6,%zmm6
- vpandq %zmm5,%zmm9,%zmm9
- vpandq %zmm5,%zmm7,%zmm7
-
- vpaddq %zmm2,%zmm9,%zmm2
- subq $192,%rdx
- jbe .Ltail_avx512
- jmp .Loop_avx512
-
-.align 32
-.Loop_avx512:
-
- vpmuludq %zmm2,%zmm17,%zmm14
- vpaddq %zmm0,%zmm7,%zmm0
- vpmuludq %zmm2,%zmm18,%zmm15
- vpandq %zmm5,%zmm8,%zmm8
- vpmuludq %zmm2,%zmm23,%zmm11
- vpandq %zmm5,%zmm10,%zmm10
- vpmuludq %zmm2,%zmm24,%zmm12
- vporq %zmm30,%zmm6,%zmm6
- vpmuludq %zmm2,%zmm16,%zmm13
- vpaddq %zmm1,%zmm8,%zmm1
- vpaddq %zmm3,%zmm10,%zmm3
- vpaddq %zmm4,%zmm6,%zmm4
-
- vmovdqu64 0(%rsi),%zmm10
- vmovdqu64 64(%rsi),%zmm6
- leaq 128(%rsi),%rsi
- vpmuludq %zmm0,%zmm19,%zmm28
- vpmuludq %zmm0,%zmm20,%zmm29
- vpmuludq %zmm0,%zmm16,%zmm25
- vpmuludq %zmm0,%zmm17,%zmm26
- vpaddq %zmm28,%zmm14,%zmm14
- vpaddq %zmm29,%zmm15,%zmm15
- vpaddq %zmm25,%zmm11,%zmm11
- vpaddq %zmm26,%zmm12,%zmm12
-
- vpmuludq %zmm1,%zmm18,%zmm28
- vpmuludq %zmm1,%zmm19,%zmm29
- vpmuludq %zmm1,%zmm24,%zmm25
- vpmuludq %zmm0,%zmm18,%zmm27
- vpaddq %zmm28,%zmm14,%zmm14
- vpaddq %zmm29,%zmm15,%zmm15
- vpaddq %zmm25,%zmm11,%zmm11
- vpaddq %zmm27,%zmm13,%zmm13
-
- vpunpcklqdq %zmm6,%zmm10,%zmm7
- vpunpckhqdq %zmm6,%zmm10,%zmm6
-
- vpmuludq %zmm3,%zmm16,%zmm28
- vpmuludq %zmm3,%zmm17,%zmm29
- vpmuludq %zmm1,%zmm16,%zmm26
- vpmuludq %zmm1,%zmm17,%zmm27
- vpaddq %zmm28,%zmm14,%zmm14
- vpaddq %zmm29,%zmm15,%zmm15
- vpaddq %zmm26,%zmm12,%zmm12
- vpaddq %zmm27,%zmm13,%zmm13
-
- vpmuludq %zmm4,%zmm24,%zmm28
- vpmuludq %zmm4,%zmm16,%zmm29
- vpmuludq %zmm3,%zmm22,%zmm25
- vpmuludq %zmm3,%zmm23,%zmm26
- vpaddq %zmm28,%zmm14,%zmm14
- vpmuludq %zmm3,%zmm24,%zmm27
- vpaddq %zmm29,%zmm15,%zmm15
- vpaddq %zmm25,%zmm11,%zmm11
- vpaddq %zmm26,%zmm12,%zmm12
- vpaddq %zmm27,%zmm13,%zmm13
-
- vpmuludq %zmm4,%zmm21,%zmm25
- vpmuludq %zmm4,%zmm22,%zmm26
- vpmuludq %zmm4,%zmm23,%zmm27
- vpaddq %zmm25,%zmm11,%zmm0
- vpaddq %zmm26,%zmm12,%zmm1
- vpaddq %zmm27,%zmm13,%zmm2
-
- vpsrlq $52,%zmm7,%zmm9
- vpsllq $12,%zmm6,%zmm10
-
- vpsrlq $26,%zmm14,%zmm3
- vpandq %zmm5,%zmm14,%zmm14
- vpaddq %zmm3,%zmm15,%zmm4
-
- vporq %zmm10,%zmm9,%zmm9
-
- vpsrlq $26,%zmm0,%zmm11
- vpandq %zmm5,%zmm0,%zmm0
- vpaddq %zmm11,%zmm1,%zmm1
-
- vpandq %zmm5,%zmm9,%zmm9
-
- vpsrlq $26,%zmm4,%zmm15
- vpandq %zmm5,%zmm4,%zmm4
-
- vpsrlq $26,%zmm1,%zmm12
- vpandq %zmm5,%zmm1,%zmm1
- vpaddq %zmm12,%zmm2,%zmm2
-
- vpaddq %zmm15,%zmm0,%zmm0
- vpsllq $2,%zmm15,%zmm15
- vpaddq %zmm15,%zmm0,%zmm0
-
- vpaddq %zmm9,%zmm2,%zmm2
- vpsrlq $26,%zmm7,%zmm8
-
- vpsrlq $26,%zmm2,%zmm13
- vpandq %zmm5,%zmm2,%zmm2
- vpaddq %zmm13,%zmm14,%zmm3
-
- vpsrlq $14,%zmm6,%zmm10
-
- vpsrlq $26,%zmm0,%zmm11
- vpandq %zmm5,%zmm0,%zmm0
- vpaddq %zmm11,%zmm1,%zmm1
-
- vpsrlq $40,%zmm6,%zmm6
-
- vpsrlq $26,%zmm3,%zmm14
- vpandq %zmm5,%zmm3,%zmm3
- vpaddq %zmm14,%zmm4,%zmm4
-
- vpandq %zmm5,%zmm7,%zmm7
-
- subq $128,%rdx
- ja .Loop_avx512
-
-.Ltail_avx512:
-
- vpsrlq $32,%zmm16,%zmm16
- vpsrlq $32,%zmm17,%zmm17
- vpsrlq $32,%zmm18,%zmm18
- vpsrlq $32,%zmm23,%zmm23
- vpsrlq $32,%zmm24,%zmm24
- vpsrlq $32,%zmm19,%zmm19
- vpsrlq $32,%zmm20,%zmm20
- vpsrlq $32,%zmm21,%zmm21
- vpsrlq $32,%zmm22,%zmm22
-
- leaq (%rsi,%rdx,1),%rsi
-
- vpaddq %zmm0,%zmm7,%zmm0
-
- vpmuludq %zmm2,%zmm17,%zmm14
- vpmuludq %zmm2,%zmm18,%zmm15
- vpmuludq %zmm2,%zmm23,%zmm11
- vpandq %zmm5,%zmm8,%zmm8
- vpmuludq %zmm2,%zmm24,%zmm12
- vpandq %zmm5,%zmm10,%zmm10
- vpmuludq %zmm2,%zmm16,%zmm13
- vporq %zmm30,%zmm6,%zmm6
- vpaddq %zmm1,%zmm8,%zmm1
- vpaddq %zmm3,%zmm10,%zmm3
- vpaddq %zmm4,%zmm6,%zmm4
-
- vmovdqu 0(%rsi),%xmm7
- vpmuludq %zmm0,%zmm19,%zmm28
- vpmuludq %zmm0,%zmm20,%zmm29
- vpmuludq %zmm0,%zmm16,%zmm25
- vpmuludq %zmm0,%zmm17,%zmm26
- vpaddq %zmm28,%zmm14,%zmm14
- vpaddq %zmm29,%zmm15,%zmm15
- vpaddq %zmm25,%zmm11,%zmm11
- vpaddq %zmm26,%zmm12,%zmm12
-
- vmovdqu 16(%rsi),%xmm8
- vpmuludq %zmm1,%zmm18,%zmm28
- vpmuludq %zmm1,%zmm19,%zmm29
- vpmuludq %zmm1,%zmm24,%zmm25
- vpmuludq %zmm0,%zmm18,%zmm27
- vpaddq %zmm28,%zmm14,%zmm14
- vpaddq %zmm29,%zmm15,%zmm15
- vpaddq %zmm25,%zmm11,%zmm11
- vpaddq %zmm27,%zmm13,%zmm13
-
- vinserti128 $1,32(%rsi),%ymm7,%ymm7
- vpmuludq %zmm3,%zmm16,%zmm28
- vpmuludq %zmm3,%zmm17,%zmm29
- vpmuludq %zmm1,%zmm16,%zmm26
- vpmuludq %zmm1,%zmm17,%zmm27
- vpaddq %zmm28,%zmm14,%zmm14
- vpaddq %zmm29,%zmm15,%zmm15
- vpaddq %zmm26,%zmm12,%zmm12
- vpaddq %zmm27,%zmm13,%zmm13
-
- vinserti128 $1,48(%rsi),%ymm8,%ymm8
- vpmuludq %zmm4,%zmm24,%zmm28
- vpmuludq %zmm4,%zmm16,%zmm29
- vpmuludq %zmm3,%zmm22,%zmm25
- vpmuludq %zmm3,%zmm23,%zmm26
- vpmuludq %zmm3,%zmm24,%zmm27
- vpaddq %zmm28,%zmm14,%zmm3
- vpaddq %zmm29,%zmm15,%zmm15
- vpaddq %zmm25,%zmm11,%zmm11
- vpaddq %zmm26,%zmm12,%zmm12
- vpaddq %zmm27,%zmm13,%zmm13
-
- vpmuludq %zmm4,%zmm21,%zmm25
- vpmuludq %zmm4,%zmm22,%zmm26
- vpmuludq %zmm4,%zmm23,%zmm27
- vpaddq %zmm25,%zmm11,%zmm0
- vpaddq %zmm26,%zmm12,%zmm1
- vpaddq %zmm27,%zmm13,%zmm2
-
- movl $1,%eax
- vpermq $0xb1,%zmm3,%zmm14
- vpermq $0xb1,%zmm15,%zmm4
- vpermq $0xb1,%zmm0,%zmm11
- vpermq $0xb1,%zmm1,%zmm12
- vpermq $0xb1,%zmm2,%zmm13
- vpaddq %zmm14,%zmm3,%zmm3
- vpaddq %zmm15,%zmm4,%zmm4
- vpaddq %zmm11,%zmm0,%zmm0
- vpaddq %zmm12,%zmm1,%zmm1
- vpaddq %zmm13,%zmm2,%zmm2
-
- kmovw %eax,%k3
- vpermq $0x2,%zmm3,%zmm14
- vpermq $0x2,%zmm4,%zmm15
- vpermq $0x2,%zmm0,%zmm11
- vpermq $0x2,%zmm1,%zmm12
- vpermq $0x2,%zmm2,%zmm13
- vpaddq %zmm14,%zmm3,%zmm3
- vpaddq %zmm15,%zmm4,%zmm4
- vpaddq %zmm11,%zmm0,%zmm0
- vpaddq %zmm12,%zmm1,%zmm1
- vpaddq %zmm13,%zmm2,%zmm2
-
- vextracti64x4 $0x1,%zmm3,%ymm14
- vextracti64x4 $0x1,%zmm4,%ymm15
- vextracti64x4 $0x1,%zmm0,%ymm11
- vextracti64x4 $0x1,%zmm1,%ymm12
- vextracti64x4 $0x1,%zmm2,%ymm13
- vpaddq %zmm14,%zmm3,%zmm3{%k3}{z}
- vpaddq %zmm15,%zmm4,%zmm4{%k3}{z}
- vpaddq %zmm11,%zmm0,%zmm0{%k3}{z}
- vpaddq %zmm12,%zmm1,%zmm1{%k3}{z}
- vpaddq %zmm13,%zmm2,%zmm2{%k3}{z}
-
- vpsrlq $26,%ymm3,%ymm14
- vpand %ymm5,%ymm3,%ymm3
- vpsrldq $6,%ymm7,%ymm9
- vpsrldq $6,%ymm8,%ymm10
- vpunpckhqdq %ymm8,%ymm7,%ymm6
- vpaddq %ymm14,%ymm4,%ymm4
-
- vpsrlq $26,%ymm0,%ymm11
- vpand %ymm5,%ymm0,%ymm0
- vpunpcklqdq %ymm10,%ymm9,%ymm9
- vpunpcklqdq %ymm8,%ymm7,%ymm7
- vpaddq %ymm11,%ymm1,%ymm1
-
- vpsrlq $26,%ymm4,%ymm15
- vpand %ymm5,%ymm4,%ymm4
-
- vpsrlq $26,%ymm1,%ymm12
- vpand %ymm5,%ymm1,%ymm1
- vpsrlq $30,%ymm9,%ymm10
- vpsrlq $4,%ymm9,%ymm9
- vpaddq %ymm12,%ymm2,%ymm2
-
- vpaddq %ymm15,%ymm0,%ymm0
- vpsllq $2,%ymm15,%ymm15
- vpsrlq $26,%ymm7,%ymm8
- vpsrlq $40,%ymm6,%ymm6
- vpaddq %ymm15,%ymm0,%ymm0
-
- vpsrlq $26,%ymm2,%ymm13
- vpand %ymm5,%ymm2,%ymm2
- vpand %ymm5,%ymm9,%ymm9
- vpand %ymm5,%ymm7,%ymm7
- vpaddq %ymm13,%ymm3,%ymm3
-
- vpsrlq $26,%ymm0,%ymm11
- vpand %ymm5,%ymm0,%ymm0
- vpaddq %ymm2,%ymm9,%ymm2
- vpand %ymm5,%ymm8,%ymm8
- vpaddq %ymm11,%ymm1,%ymm1
-
- vpsrlq $26,%ymm3,%ymm14
- vpand %ymm5,%ymm3,%ymm3
- vpand %ymm5,%ymm10,%ymm10
- vpor 32(%rcx),%ymm6,%ymm6
- vpaddq %ymm14,%ymm4,%ymm4
-
- leaq 144(%rsp),%rax
- addq $64,%rdx
- jnz .Ltail_avx2_512
-
- vpsubq %ymm9,%ymm2,%ymm2
- vmovd %xmm0,-112(%rdi)
- vmovd %xmm1,-108(%rdi)
- vmovd %xmm2,-104(%rdi)
- vmovd %xmm3,-100(%rdi)
- vmovd %xmm4,-96(%rdi)
- vzeroall
- leaq -8(%r10),%rsp
-
- ret
-
-ENDPROC(poly1305_blocks_avx512)
-#endif /* CONFIG_AS_AVX512 */