diff options
author | Samuel Neves <sneves@dei.uc.pt> | 2018-11-12 08:14:51 +0000 |
---|---|---|
committer | Jason A. Donenfeld <Jason@zx2c4.com> | 2018-11-14 23:59:05 -0800 |
commit | 44e2ef7b23f4b68008ed5c910a7cb881f9c0939f (patch) | |
tree | f0f2e43c46a819eed4f055321ecf34a95b8e504f /src/crypto/zinc/poly1305/poly1305-x86_64.S | |
parent | 5c67177dcc6a23ceccaf8e69daf92a8a12212732 (diff) |
chacha20,poly1305: switch to perlasm originals on x86_64
Signed-off-by: Samuel Neves <sneves@dei.uc.pt>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Diffstat (limited to 'src/crypto/zinc/poly1305/poly1305-x86_64.S')
-rw-r--r-- | src/crypto/zinc/poly1305/poly1305-x86_64.S | 2792 |
1 files changed, 0 insertions, 2792 deletions
diff --git a/src/crypto/zinc/poly1305/poly1305-x86_64.S b/src/crypto/zinc/poly1305/poly1305-x86_64.S deleted file mode 100644 index 3c3f2b4..0000000 --- a/src/crypto/zinc/poly1305/poly1305-x86_64.S +++ /dev/null @@ -1,2792 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */ -/* - * Copyright (C) 2017 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved. - * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. - * Copyright (C) 2006-2017 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved. - * - * This is based in part on Andy Polyakov's implementation from CRYPTOGAMS. - */ - -#include <linux/linkage.h> - -.section .rodata.cst192.Lconst, "aM", @progbits, 192 -.align 64 -.Lconst: -.long 0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0 -.long 16777216,0,16777216,0,16777216,0,16777216,0 -.long 0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0 -.long 2,2,2,3,2,0,2,1 -.long 0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7 - -.text - -.align 32 -ENTRY(poly1305_init_x86_64) - xorq %rax,%rax - movq %rax,0(%rdi) - movq %rax,8(%rdi) - movq %rax,16(%rdi) - - cmpq $0,%rsi - je .Lno_key - - movq $0x0ffffffc0fffffff,%rax - movq $0x0ffffffc0ffffffc,%rcx - andq 0(%rsi),%rax - andq 8(%rsi),%rcx - movq %rax,24(%rdi) - movq %rcx,32(%rdi) - movl $1,%eax -.Lno_key: - ret -ENDPROC(poly1305_init_x86_64) - -.align 32 -ENTRY(poly1305_blocks_x86_64) -.Lblocks: - shrq $4,%rdx - jz .Lno_data - - pushq %rbx - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - pushq %rdi - -.Lblocks_body: - - movq %rdx,%r15 - - movq 24(%rdi),%r11 - movq 32(%rdi),%r13 - - movq 0(%rdi),%r14 - movq 8(%rdi),%rbx - movq 16(%rdi),%r10 - - movq %r13,%r12 - shrq $2,%r13 - movq %r12,%rax - addq %r12,%r13 - jmp .Loop - -.align 32 -.Loop: - - addq 0(%rsi),%r14 - adcq 8(%rsi),%rbx - leaq 16(%rsi),%rsi - adcq %rcx,%r10 - mulq %r14 - movq %rax,%r9 - movq %r11,%rax - movq %rdx,%rdi - - mulq %r14 - movq %rax,%r14 - movq %r11,%rax - movq %rdx,%r8 - - mulq %rbx - addq %rax,%r9 - movq %r13,%rax - adcq %rdx,%rdi - - mulq %rbx - movq %r10,%rbx - addq %rax,%r14 - adcq %rdx,%r8 - - imulq %r13,%rbx - addq %rbx,%r9 - movq %r8,%rbx - adcq $0,%rdi - - imulq %r11,%r10 - addq %r9,%rbx - movq $-4,%rax - adcq %r10,%rdi - - andq %rdi,%rax - movq %rdi,%r10 - shrq $2,%rdi - andq $3,%r10 - addq %rdi,%rax - addq %rax,%r14 - adcq $0,%rbx - adcq $0,%r10 - - movq %r12,%rax - decq %r15 - jnz .Loop - - movq 0(%rsp),%rdi - - movq %r14,0(%rdi) - movq %rbx,8(%rdi) - movq %r10,16(%rdi) - - movq 8(%rsp),%r15 - movq 16(%rsp),%r14 - movq 24(%rsp),%r13 - movq 32(%rsp),%r12 - movq 40(%rsp),%rbx - leaq 48(%rsp),%rsp -.Lno_data: -.Lblocks_epilogue: - ret -ENDPROC(poly1305_blocks_x86_64) - -.align 32 -ENTRY(poly1305_emit_x86_64) -.Lemit: - movq 0(%rdi),%r8 - movq 8(%rdi),%r9 - movq 16(%rdi),%r10 - - movq %r8,%rax - addq $5,%r8 - movq %r9,%rcx - adcq $0,%r9 - adcq $0,%r10 - shrq $2,%r10 - cmovnzq %r8,%rax - cmovnzq %r9,%rcx - - addq 0(%rdx),%rax - adcq 8(%rdx),%rcx - movq %rax,0(%rsi) - movq %rcx,8(%rsi) - - ret -ENDPROC(poly1305_emit_x86_64) - -.macro __poly1305_block - mulq %r14 - movq %rax,%r9 - movq %r11,%rax - movq %rdx,%rdi - - mulq %r14 - movq %rax,%r14 - movq %r11,%rax - movq %rdx,%r8 - - mulq %rbx - addq %rax,%r9 - movq %r13,%rax - adcq %rdx,%rdi - - mulq %rbx - movq %r10,%rbx - addq %rax,%r14 - adcq %rdx,%r8 - - imulq %r13,%rbx - addq %rbx,%r9 - movq %r8,%rbx - adcq $0,%rdi - - imulq %r11,%r10 - addq %r9,%rbx - movq $-4,%rax - adcq %r10,%rdi - - andq %rdi,%rax - movq %rdi,%r10 - shrq $2,%rdi - andq $3,%r10 - addq %rdi,%rax - addq %rax,%r14 - adcq $0,%rbx - adcq $0,%r10 -.endm - -.macro __poly1305_init_avx - movq %r11,%r14 - movq %r12,%rbx - xorq %r10,%r10 - - leaq 48+64(%rdi),%rdi - - movq %r12,%rax - movq %rdi,0(%rsp) - __poly1305_block - movq 0(%rsp),%rdi - - movl $0x3ffffff,%eax - movl $0x3ffffff,%edx - movq %r14,%r8 - andl %r14d,%eax - movq %r11,%r9 - andl %r11d,%edx - movl %eax,-64(%rdi) - shrq $26,%r8 - movl %edx,-60(%rdi) - shrq $26,%r9 - - movl $0x3ffffff,%eax - movl $0x3ffffff,%edx - andl %r8d,%eax - andl %r9d,%edx - movl %eax,-48(%rdi) - leal (%rax,%rax,4),%eax - movl %edx,-44(%rdi) - leal (%rdx,%rdx,4),%edx - movl %eax,-32(%rdi) - shrq $26,%r8 - movl %edx,-28(%rdi) - shrq $26,%r9 - - movq %rbx,%rax - movq %r12,%rdx - shlq $12,%rax - shlq $12,%rdx - orq %r8,%rax - orq %r9,%rdx - andl $0x3ffffff,%eax - andl $0x3ffffff,%edx - movl %eax,-16(%rdi) - leal (%rax,%rax,4),%eax - movl %edx,-12(%rdi) - leal (%rdx,%rdx,4),%edx - movl %eax,0(%rdi) - movq %rbx,%r8 - movl %edx,4(%rdi) - movq %r12,%r9 - - movl $0x3ffffff,%eax - movl $0x3ffffff,%edx - shrq $14,%r8 - shrq $14,%r9 - andl %r8d,%eax - andl %r9d,%edx - movl %eax,16(%rdi) - leal (%rax,%rax,4),%eax - movl %edx,20(%rdi) - leal (%rdx,%rdx,4),%edx - movl %eax,32(%rdi) - shrq $26,%r8 - movl %edx,36(%rdi) - shrq $26,%r9 - - movq %r10,%rax - shlq $24,%rax - orq %rax,%r8 - movl %r8d,48(%rdi) - leaq (%r8,%r8,4),%r8 - movl %r9d,52(%rdi) - leaq (%r9,%r9,4),%r9 - movl %r8d,64(%rdi) - movl %r9d,68(%rdi) - - movq %r12,%rax - movq %rdi,0(%rsp) - __poly1305_block - movq 0(%rsp),%rdi - - movl $0x3ffffff,%eax - movq %r14,%r8 - andl %r14d,%eax - shrq $26,%r8 - movl %eax,-52(%rdi) - - movl $0x3ffffff,%edx - andl %r8d,%edx - movl %edx,-36(%rdi) - leal (%rdx,%rdx,4),%edx - shrq $26,%r8 - movl %edx,-20(%rdi) - - movq %rbx,%rax - shlq $12,%rax - orq %r8,%rax - andl $0x3ffffff,%eax - movl %eax,-4(%rdi) - leal (%rax,%rax,4),%eax - movq %rbx,%r8 - movl %eax,12(%rdi) - - movl $0x3ffffff,%edx - shrq $14,%r8 - andl %r8d,%edx - movl %edx,28(%rdi) - leal (%rdx,%rdx,4),%edx - shrq $26,%r8 - movl %edx,44(%rdi) - - movq %r10,%rax - shlq $24,%rax - orq %rax,%r8 - movl %r8d,60(%rdi) - leaq (%r8,%r8,4),%r8 - movl %r8d,76(%rdi) - - movq %r12,%rax - movq %rdi,0(%rsp) - __poly1305_block - movq 0(%rsp),%rdi - - movl $0x3ffffff,%eax - movq %r14,%r8 - andl %r14d,%eax - shrq $26,%r8 - movl %eax,-56(%rdi) - - movl $0x3ffffff,%edx - andl %r8d,%edx - movl %edx,-40(%rdi) - leal (%rdx,%rdx,4),%edx - shrq $26,%r8 - movl %edx,-24(%rdi) - - movq %rbx,%rax - shlq $12,%rax - orq %r8,%rax - andl $0x3ffffff,%eax - movl %eax,-8(%rdi) - leal (%rax,%rax,4),%eax - movq %rbx,%r8 - movl %eax,8(%rdi) - - movl $0x3ffffff,%edx - shrq $14,%r8 - andl %r8d,%edx - movl %edx,24(%rdi) - leal (%rdx,%rdx,4),%edx - shrq $26,%r8 - movl %edx,40(%rdi) - - movq %r10,%rax - shlq $24,%rax - orq %rax,%r8 - movl %r8d,56(%rdi) - leaq (%r8,%r8,4),%r8 - movl %r8d,72(%rdi) - - leaq -48-64(%rdi),%rdi -.endm - -#ifdef CONFIG_AS_AVX -.align 32 -ENTRY(poly1305_blocks_avx) - - movl 20(%rdi),%r8d - cmpq $128,%rdx - jae .Lblocks_avx - testl %r8d,%r8d - jz .Lblocks - -.Lblocks_avx: - andq $-16,%rdx - jz .Lno_data_avx - - vzeroupper - - testl %r8d,%r8d - jz .Lbase2_64_avx - - testq $31,%rdx - jz .Leven_avx - - pushq %rbx - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - pushq %rdi - -.Lblocks_avx_body: - - movq %rdx,%r15 - - movq 0(%rdi),%r8 - movq 8(%rdi),%r9 - movl 16(%rdi),%r10d - - movq 24(%rdi),%r11 - movq 32(%rdi),%r13 - - - movl %r8d,%r14d - andq $-2147483648,%r8 - movq %r9,%r12 - movl %r9d,%ebx - andq $-2147483648,%r9 - - shrq $6,%r8 - shlq $52,%r12 - addq %r8,%r14 - shrq $12,%rbx - shrq $18,%r9 - addq %r12,%r14 - adcq %r9,%rbx - - movq %r10,%r8 - shlq $40,%r8 - shrq $24,%r10 - addq %r8,%rbx - adcq $0,%r10 - - movq $-4,%r9 - movq %r10,%r8 - andq %r10,%r9 - shrq $2,%r8 - andq $3,%r10 - addq %r9,%r8 - addq %r8,%r14 - adcq $0,%rbx - adcq $0,%r10 - - movq %r13,%r12 - movq %r13,%rax - shrq $2,%r13 - addq %r12,%r13 - - addq 0(%rsi),%r14 - adcq 8(%rsi),%rbx - leaq 16(%rsi),%rsi - adcq %rcx,%r10 - - movq %rdi,0(%rsp) - __poly1305_block - movq 0(%rsp),%rdi - - testq %rcx,%rcx - jz .Lstore_base2_64_avx - - - movq %r14,%rax - movq %r14,%rdx - shrq $52,%r14 - movq %rbx,%r11 - movq %rbx,%r12 - shrq $26,%rdx - andq $0x3ffffff,%rax - shlq $12,%r11 - andq $0x3ffffff,%rdx - shrq $14,%rbx - orq %r11,%r14 - shlq $24,%r10 - andq $0x3ffffff,%r14 - shrq $40,%r12 - andq $0x3ffffff,%rbx - orq %r12,%r10 - - subq $16,%r15 - jz .Lstore_base2_26_avx - - vmovd %eax,%xmm0 - vmovd %edx,%xmm1 - vmovd %r14d,%xmm2 - vmovd %ebx,%xmm3 - vmovd %r10d,%xmm4 - jmp .Lproceed_avx - -.align 32 -.Lstore_base2_64_avx: - movq %r14,0(%rdi) - movq %rbx,8(%rdi) - movq %r10,16(%rdi) - jmp .Ldone_avx - -.align 16 -.Lstore_base2_26_avx: - movl %eax,0(%rdi) - movl %edx,4(%rdi) - movl %r14d,8(%rdi) - movl %ebx,12(%rdi) - movl %r10d,16(%rdi) -.align 16 -.Ldone_avx: - movq 8(%rsp),%r15 - movq 16(%rsp),%r14 - movq 24(%rsp),%r13 - movq 32(%rsp),%r12 - movq 40(%rsp),%rbx - leaq 48(%rsp),%rsp - -.Lno_data_avx: -.Lblocks_avx_epilogue: - ret - -.align 32 -.Lbase2_64_avx: - - pushq %rbx - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - pushq %rdi - -.Lbase2_64_avx_body: - - movq %rdx,%r15 - - movq 24(%rdi),%r11 - movq 32(%rdi),%r13 - - movq 0(%rdi),%r14 - movq 8(%rdi),%rbx - movl 16(%rdi),%r10d - - movq %r13,%r12 - movq %r13,%rax - shrq $2,%r13 - addq %r12,%r13 - - testq $31,%rdx - jz .Linit_avx - - addq 0(%rsi),%r14 - adcq 8(%rsi),%rbx - leaq 16(%rsi),%rsi - adcq %rcx,%r10 - subq $16,%r15 - - movq %rdi,0(%rsp) - __poly1305_block - movq 0(%rsp),%rdi - -.Linit_avx: - - movq %r14,%rax - movq %r14,%rdx - shrq $52,%r14 - movq %rbx,%r8 - movq %rbx,%r9 - shrq $26,%rdx - andq $0x3ffffff,%rax - shlq $12,%r8 - andq $0x3ffffff,%rdx - shrq $14,%rbx - orq %r8,%r14 - shlq $24,%r10 - andq $0x3ffffff,%r14 - shrq $40,%r9 - andq $0x3ffffff,%rbx - orq %r9,%r10 - - vmovd %eax,%xmm0 - vmovd %edx,%xmm1 - vmovd %r14d,%xmm2 - vmovd %ebx,%xmm3 - vmovd %r10d,%xmm4 - movl $1,20(%rdi) - - __poly1305_init_avx - -.Lproceed_avx: - movq %r15,%rdx - - movq 8(%rsp),%r15 - movq 16(%rsp),%r14 - movq 24(%rsp),%r13 - movq 32(%rsp),%r12 - movq 40(%rsp),%rbx - leaq 48(%rsp),%rax - leaq 48(%rsp),%rsp - -.Lbase2_64_avx_epilogue: - jmp .Ldo_avx - - -.align 32 -.Leven_avx: - vmovd 0(%rdi),%xmm0 - vmovd 4(%rdi),%xmm1 - vmovd 8(%rdi),%xmm2 - vmovd 12(%rdi),%xmm3 - vmovd 16(%rdi),%xmm4 - -.Ldo_avx: - leaq 8(%rsp),%r10 - andq $-32,%rsp - subq $8,%rsp - leaq -88(%rsp),%r11 - subq $0x178,%rsp - subq $64,%rdx - leaq -32(%rsi),%rax - cmovcq %rax,%rsi - - vmovdqu 48(%rdi),%xmm14 - leaq 112(%rdi),%rdi - leaq .Lconst(%rip),%rcx - - vmovdqu 32(%rsi),%xmm5 - vmovdqu 48(%rsi),%xmm6 - vmovdqa 64(%rcx),%xmm15 - - vpsrldq $6,%xmm5,%xmm7 - vpsrldq $6,%xmm6,%xmm8 - vpunpckhqdq %xmm6,%xmm5,%xmm9 - vpunpcklqdq %xmm6,%xmm5,%xmm5 - vpunpcklqdq %xmm8,%xmm7,%xmm8 - - vpsrlq $40,%xmm9,%xmm9 - vpsrlq $26,%xmm5,%xmm6 - vpand %xmm15,%xmm5,%xmm5 - vpsrlq $4,%xmm8,%xmm7 - vpand %xmm15,%xmm6,%xmm6 - vpsrlq $30,%xmm8,%xmm8 - vpand %xmm15,%xmm7,%xmm7 - vpand %xmm15,%xmm8,%xmm8 - vpor 32(%rcx),%xmm9,%xmm9 - - jbe .Lskip_loop_avx - - - vmovdqu -48(%rdi),%xmm11 - vmovdqu -32(%rdi),%xmm12 - vpshufd $0xEE,%xmm14,%xmm13 - vpshufd $0x44,%xmm14,%xmm10 - vmovdqa %xmm13,-144(%r11) - vmovdqa %xmm10,0(%rsp) - vpshufd $0xEE,%xmm11,%xmm14 - vmovdqu -16(%rdi),%xmm10 - vpshufd $0x44,%xmm11,%xmm11 - vmovdqa %xmm14,-128(%r11) - vmovdqa %xmm11,16(%rsp) - vpshufd $0xEE,%xmm12,%xmm13 - vmovdqu 0(%rdi),%xmm11 - vpshufd $0x44,%xmm12,%xmm12 - vmovdqa %xmm13,-112(%r11) - vmovdqa %xmm12,32(%rsp) - vpshufd $0xEE,%xmm10,%xmm14 - vmovdqu 16(%rdi),%xmm12 - vpshufd $0x44,%xmm10,%xmm10 - vmovdqa %xmm14,-96(%r11) - vmovdqa %xmm10,48(%rsp) - vpshufd $0xEE,%xmm11,%xmm13 - vmovdqu 32(%rdi),%xmm10 - vpshufd $0x44,%xmm11,%xmm11 - vmovdqa %xmm13,-80(%r11) - vmovdqa %xmm11,64(%rsp) - vpshufd $0xEE,%xmm12,%xmm14 - vmovdqu 48(%rdi),%xmm11 - vpshufd $0x44,%xmm12,%xmm12 - vmovdqa %xmm14,-64(%r11) - vmovdqa %xmm12,80(%rsp) - vpshufd $0xEE,%xmm10,%xmm13 - vmovdqu 64(%rdi),%xmm12 - vpshufd $0x44,%xmm10,%xmm10 - vmovdqa %xmm13,-48(%r11) - vmovdqa %xmm10,96(%rsp) - vpshufd $0xEE,%xmm11,%xmm14 - vpshufd $0x44,%xmm11,%xmm11 - vmovdqa %xmm14,-32(%r11) - vmovdqa %xmm11,112(%rsp) - vpshufd $0xEE,%xmm12,%xmm13 - vmovdqa 0(%rsp),%xmm14 - vpshufd $0x44,%xmm12,%xmm12 - vmovdqa %xmm13,-16(%r11) - vmovdqa %xmm12,128(%rsp) - - jmp .Loop_avx - -.align 32 -.Loop_avx: - - vpmuludq %xmm5,%xmm14,%xmm10 - vpmuludq %xmm6,%xmm14,%xmm11 - vmovdqa %xmm2,32(%r11) - vpmuludq %xmm7,%xmm14,%xmm12 - vmovdqa 16(%rsp),%xmm2 - vpmuludq %xmm8,%xmm14,%xmm13 - vpmuludq %xmm9,%xmm14,%xmm14 - - vmovdqa %xmm0,0(%r11) - vpmuludq 32(%rsp),%xmm9,%xmm0 - vmovdqa %xmm1,16(%r11) - vpmuludq %xmm8,%xmm2,%xmm1 - vpaddq %xmm0,%xmm10,%xmm10 - vpaddq %xmm1,%xmm14,%xmm14 - vmovdqa %xmm3,48(%r11) - vpmuludq %xmm7,%xmm2,%xmm0 - vpmuludq %xmm6,%xmm2,%xmm1 - vpaddq %xmm0,%xmm13,%xmm13 - vmovdqa 48(%rsp),%xmm3 - vpaddq %xmm1,%xmm12,%xmm12 - vmovdqa %xmm4,64(%r11) - vpmuludq %xmm5,%xmm2,%xmm2 - vpmuludq %xmm7,%xmm3,%xmm0 - vpaddq %xmm2,%xmm11,%xmm11 - - vmovdqa 64(%rsp),%xmm4 - vpaddq %xmm0,%xmm14,%xmm14 - vpmuludq %xmm6,%xmm3,%xmm1 - vpmuludq %xmm5,%xmm3,%xmm3 - vpaddq %xmm1,%xmm13,%xmm13 - vmovdqa 80(%rsp),%xmm2 - vpaddq %xmm3,%xmm12,%xmm12 - vpmuludq %xmm9,%xmm4,%xmm0 - vpmuludq %xmm8,%xmm4,%xmm4 - vpaddq %xmm0,%xmm11,%xmm11 - vmovdqa 96(%rsp),%xmm3 - vpaddq %xmm4,%xmm10,%xmm10 - - vmovdqa 128(%rsp),%xmm4 - vpmuludq %xmm6,%xmm2,%xmm1 - vpmuludq %xmm5,%xmm2,%xmm2 - vpaddq %xmm1,%xmm14,%xmm14 - vpaddq %xmm2,%xmm13,%xmm13 - vpmuludq %xmm9,%xmm3,%xmm0 - vpmuludq %xmm8,%xmm3,%xmm1 - vpaddq %xmm0,%xmm12,%xmm12 - vmovdqu 0(%rsi),%xmm0 - vpaddq %xmm1,%xmm11,%xmm11 - vpmuludq %xmm7,%xmm3,%xmm3 - vpmuludq %xmm7,%xmm4,%xmm7 - vpaddq %xmm3,%xmm10,%xmm10 - - vmovdqu 16(%rsi),%xmm1 - vpaddq %xmm7,%xmm11,%xmm11 - vpmuludq %xmm8,%xmm4,%xmm8 - vpmuludq %xmm9,%xmm4,%xmm9 - vpsrldq $6,%xmm0,%xmm2 - vpaddq %xmm8,%xmm12,%xmm12 - vpaddq %xmm9,%xmm13,%xmm13 - vpsrldq $6,%xmm1,%xmm3 - vpmuludq 112(%rsp),%xmm5,%xmm9 - vpmuludq %xmm6,%xmm4,%xmm5 - vpunpckhqdq %xmm1,%xmm0,%xmm4 - vpaddq %xmm9,%xmm14,%xmm14 - vmovdqa -144(%r11),%xmm9 - vpaddq %xmm5,%xmm10,%xmm10 - - vpunpcklqdq %xmm1,%xmm0,%xmm0 - vpunpcklqdq %xmm3,%xmm2,%xmm3 - - - vpsrldq $5,%xmm4,%xmm4 - vpsrlq $26,%xmm0,%xmm1 - vpand %xmm15,%xmm0,%xmm0 - vpsrlq $4,%xmm3,%xmm2 - vpand %xmm15,%xmm1,%xmm1 - vpand 0(%rcx),%xmm4,%xmm4 - vpsrlq $30,%xmm3,%xmm3 - vpand %xmm15,%xmm2,%xmm2 - vpand %xmm15,%xmm3,%xmm3 - vpor 32(%rcx),%xmm4,%xmm4 - - vpaddq 0(%r11),%xmm0,%xmm0 - vpaddq 16(%r11),%xmm1,%xmm1 - vpaddq 32(%r11),%xmm2,%xmm2 - vpaddq 48(%r11),%xmm3,%xmm3 - vpaddq 64(%r11),%xmm4,%xmm4 - - leaq 32(%rsi),%rax - leaq 64(%rsi),%rsi - subq $64,%rdx - cmovcq %rax,%rsi - - vpmuludq %xmm0,%xmm9,%xmm5 - vpmuludq %xmm1,%xmm9,%xmm6 - vpaddq %xmm5,%xmm10,%xmm10 - vpaddq %xmm6,%xmm11,%xmm11 - vmovdqa -128(%r11),%xmm7 - vpmuludq %xmm2,%xmm9,%xmm5 - vpmuludq %xmm3,%xmm9,%xmm6 - vpaddq %xmm5,%xmm12,%xmm12 - vpaddq %xmm6,%xmm13,%xmm13 - vpmuludq %xmm4,%xmm9,%xmm9 - vpmuludq -112(%r11),%xmm4,%xmm5 - vpaddq %xmm9,%xmm14,%xmm14 - - vpaddq %xmm5,%xmm10,%xmm10 - vpmuludq %xmm2,%xmm7,%xmm6 - vpmuludq %xmm3,%xmm7,%xmm5 - vpaddq %xmm6,%xmm13,%xmm13 - vmovdqa -96(%r11),%xmm8 - vpaddq %xmm5,%xmm14,%xmm14 - vpmuludq %xmm1,%xmm7,%xmm6 - vpmuludq %xmm0,%xmm7,%xmm7 - vpaddq %xmm6,%xmm12,%xmm12 - vpaddq %xmm7,%xmm11,%xmm11 - - vmovdqa -80(%r11),%xmm9 - vpmuludq %xmm2,%xmm8,%xmm5 - vpmuludq %xmm1,%xmm8,%xmm6 - vpaddq %xmm5,%xmm14,%xmm14 - vpaddq %xmm6,%xmm13,%xmm13 - vmovdqa -64(%r11),%xmm7 - vpmuludq %xmm0,%xmm8,%xmm8 - vpmuludq %xmm4,%xmm9,%xmm5 - vpaddq %xmm8,%xmm12,%xmm12 - vpaddq %xmm5,%xmm11,%xmm11 - vmovdqa -48(%r11),%xmm8 - vpmuludq %xmm3,%xmm9,%xmm9 - vpmuludq %xmm1,%xmm7,%xmm6 - vpaddq %xmm9,%xmm10,%xmm10 - - vmovdqa -16(%r11),%xmm9 - vpaddq %xmm6,%xmm14,%xmm14 - vpmuludq %xmm0,%xmm7,%xmm7 - vpmuludq %xmm4,%xmm8,%xmm5 - vpaddq %xmm7,%xmm13,%xmm13 - vpaddq %xmm5,%xmm12,%xmm12 - vmovdqu 32(%rsi),%xmm5 - vpmuludq %xmm3,%xmm8,%xmm7 - vpmuludq %xmm2,%xmm8,%xmm8 - vpaddq %xmm7,%xmm11,%xmm11 - vmovdqu 48(%rsi),%xmm6 - vpaddq %xmm8,%xmm10,%xmm10 - - vpmuludq %xmm2,%xmm9,%xmm2 - vpmuludq %xmm3,%xmm9,%xmm3 - vpsrldq $6,%xmm5,%xmm7 - vpaddq %xmm2,%xmm11,%xmm11 - vpmuludq %xmm4,%xmm9,%xmm4 - vpsrldq $6,%xmm6,%xmm8 - vpaddq %xmm3,%xmm12,%xmm2 - vpaddq %xmm4,%xmm13,%xmm3 - vpmuludq -32(%r11),%xmm0,%xmm4 - vpmuludq %xmm1,%xmm9,%xmm0 - vpunpckhqdq %xmm6,%xmm5,%xmm9 - vpaddq %xmm4,%xmm14,%xmm4 - vpaddq %xmm0,%xmm10,%xmm0 - - vpunpcklqdq %xmm6,%xmm5,%xmm5 - vpunpcklqdq %xmm8,%xmm7,%xmm8 - - - vpsrldq $5,%xmm9,%xmm9 - vpsrlq $26,%xmm5,%xmm6 - vmovdqa 0(%rsp),%xmm14 - vpand %xmm15,%xmm5,%xmm5 - vpsrlq $4,%xmm8,%xmm7 - vpand %xmm15,%xmm6,%xmm6 - vpand 0(%rcx),%xmm9,%xmm9 - vpsrlq $30,%xmm8,%xmm8 - vpand %xmm15,%xmm7,%xmm7 - vpand %xmm15,%xmm8,%xmm8 - vpor 32(%rcx),%xmm9,%xmm9 - - vpsrlq $26,%xmm3,%xmm13 - vpand %xmm15,%xmm3,%xmm3 - vpaddq %xmm13,%xmm4,%xmm4 - - vpsrlq $26,%xmm0,%xmm10 - vpand %xmm15,%xmm0,%xmm0 - vpaddq %xmm10,%xmm11,%xmm1 - - vpsrlq $26,%xmm4,%xmm10 - vpand %xmm15,%xmm4,%xmm4 - - vpsrlq $26,%xmm1,%xmm11 - vpand %xmm15,%xmm1,%xmm1 - vpaddq %xmm11,%xmm2,%xmm2 - - vpaddq %xmm10,%xmm0,%xmm0 - vpsllq $2,%xmm10,%xmm10 - vpaddq %xmm10,%xmm0,%xmm0 - - vpsrlq $26,%xmm2,%xmm12 - vpand %xmm15,%xmm2,%xmm2 - vpaddq %xmm12,%xmm3,%xmm3 - - vpsrlq $26,%xmm0,%xmm10 - vpand %xmm15,%xmm0,%xmm0 - vpaddq %xmm10,%xmm1,%xmm1 - - vpsrlq $26,%xmm3,%xmm13 - vpand %xmm15,%xmm3,%xmm3 - vpaddq %xmm13,%xmm4,%xmm4 - - ja .Loop_avx - -.Lskip_loop_avx: - vpshufd $0x10,%xmm14,%xmm14 - addq $32,%rdx - jnz .Long_tail_avx - - vpaddq %xmm2,%xmm7,%xmm7 - vpaddq %xmm0,%xmm5,%xmm5 - vpaddq %xmm1,%xmm6,%xmm6 - vpaddq %xmm3,%xmm8,%xmm8 - vpaddq %xmm4,%xmm9,%xmm9 - -.Long_tail_avx: - vmovdqa %xmm2,32(%r11) - vmovdqa %xmm0,0(%r11) - vmovdqa %xmm1,16(%r11) - vmovdqa %xmm3,48(%r11) - vmovdqa %xmm4,64(%r11) - - vpmuludq %xmm7,%xmm14,%xmm12 - vpmuludq %xmm5,%xmm14,%xmm10 - vpshufd $0x10,-48(%rdi),%xmm2 - vpmuludq %xmm6,%xmm14,%xmm11 - vpmuludq %xmm8,%xmm14,%xmm13 - vpmuludq %xmm9,%xmm14,%xmm14 - - vpmuludq %xmm8,%xmm2,%xmm0 - vpaddq %xmm0,%xmm14,%xmm14 - vpshufd $0x10,-32(%rdi),%xmm3 - vpmuludq %xmm7,%xmm2,%xmm1 - vpaddq %xmm1,%xmm13,%xmm13 - vpshufd $0x10,-16(%rdi),%xmm4 - vpmuludq %xmm6,%xmm2,%xmm0 - vpaddq %xmm0,%xmm12,%xmm12 - vpmuludq %xmm5,%xmm2,%xmm2 - vpaddq %xmm2,%xmm11,%xmm11 - vpmuludq %xmm9,%xmm3,%xmm3 - vpaddq %xmm3,%xmm10,%xmm10 - - vpshufd $0x10,0(%rdi),%xmm2 - vpmuludq %xmm7,%xmm4,%xmm1 - vpaddq %xmm1,%xmm14,%xmm14 - vpmuludq %xmm6,%xmm4,%xmm0 - vpaddq %xmm0,%xmm13,%xmm13 - vpshufd $0x10,16(%rdi),%xmm3 - vpmuludq %xmm5,%xmm4,%xmm4 - vpaddq %xmm4,%xmm12,%xmm12 - vpmuludq %xmm9,%xmm2,%xmm1 - vpaddq %xmm1,%xmm11,%xmm11 - vpshufd $0x10,32(%rdi),%xmm4 - vpmuludq %xmm8,%xmm2,%xmm2 - vpaddq %xmm2,%xmm10,%xmm10 - - vpmuludq %xmm6,%xmm3,%xmm0 - vpaddq %xmm0,%xmm14,%xmm14 - vpmuludq %xmm5,%xmm3,%xmm3 - vpaddq %xmm3,%xmm13,%xmm13 - vpshufd $0x10,48(%rdi),%xmm2 - vpmuludq %xmm9,%xmm4,%xmm1 - vpaddq %xmm1,%xmm12,%xmm12 - vpshufd $0x10,64(%rdi),%xmm3 - vpmuludq %xmm8,%xmm4,%xmm0 - vpaddq %xmm0,%xmm11,%xmm11 - vpmuludq %xmm7,%xmm4,%xmm4 - vpaddq %xmm4,%xmm10,%xmm10 - - vpmuludq %xmm5,%xmm2,%xmm2 - vpaddq %xmm2,%xmm14,%xmm14 - vpmuludq %xmm9,%xmm3,%xmm1 - vpaddq %xmm1,%xmm13,%xmm13 - vpmuludq %xmm8,%xmm3,%xmm0 - vpaddq %xmm0,%xmm12,%xmm12 - vpmuludq %xmm7,%xmm3,%xmm1 - vpaddq %xmm1,%xmm11,%xmm11 - vpmuludq %xmm6,%xmm3,%xmm3 - vpaddq %xmm3,%xmm10,%xmm10 - - jz .Lshort_tail_avx - - vmovdqu 0(%rsi),%xmm0 - vmovdqu 16(%rsi),%xmm1 - - vpsrldq $6,%xmm0,%xmm2 - vpsrldq $6,%xmm1,%xmm3 - vpunpckhqdq %xmm1,%xmm0,%xmm4 - vpunpcklqdq %xmm1,%xmm0,%xmm0 - vpunpcklqdq %xmm3,%xmm2,%xmm3 - - vpsrlq $40,%xmm4,%xmm4 - vpsrlq $26,%xmm0,%xmm1 - vpand %xmm15,%xmm0,%xmm0 - vpsrlq $4,%xmm3,%xmm2 - vpand %xmm15,%xmm1,%xmm1 - vpsrlq $30,%xmm3,%xmm3 - vpand %xmm15,%xmm2,%xmm2 - vpand %xmm15,%xmm3,%xmm3 - vpor 32(%rcx),%xmm4,%xmm4 - - vpshufd $0x32,-64(%rdi),%xmm9 - vpaddq 0(%r11),%xmm0,%xmm0 - vpaddq 16(%r11),%xmm1,%xmm1 - vpaddq 32(%r11),%xmm2,%xmm2 - vpaddq 48(%r11),%xmm3,%xmm3 - vpaddq 64(%r11),%xmm4,%xmm4 - - vpmuludq %xmm0,%xmm9,%xmm5 - vpaddq %xmm5,%xmm10,%xmm10 - vpmuludq %xmm1,%xmm9,%xmm6 - vpaddq %xmm6,%xmm11,%xmm11 - vpmuludq %xmm2,%xmm9,%xmm5 - vpaddq %xmm5,%xmm12,%xmm12 - vpshufd $0x32,-48(%rdi),%xmm7 - vpmuludq %xmm3,%xmm9,%xmm6 - vpaddq %xmm6,%xmm13,%xmm13 - vpmuludq %xmm4,%xmm9,%xmm9 - vpaddq %xmm9,%xmm14,%xmm14 - - vpmuludq %xmm3,%xmm7,%xmm5 - vpaddq %xmm5,%xmm14,%xmm14 - vpshufd $0x32,-32(%rdi),%xmm8 - vpmuludq %xmm2,%xmm7,%xmm6 - vpaddq %xmm6,%xmm13,%xmm13 - vpshufd $0x32,-16(%rdi),%xmm9 - vpmuludq %xmm1,%xmm7,%xmm5 - vpaddq %xmm5,%xmm12,%xmm12 - vpmuludq %xmm0,%xmm7,%xmm7 - vpaddq %xmm7,%xmm11,%xmm11 - vpmuludq %xmm4,%xmm8,%xmm8 - vpaddq %xmm8,%xmm10,%xmm10 - - vpshufd $0x32,0(%rdi),%xmm7 - vpmuludq %xmm2,%xmm9,%xmm6 - vpaddq %xmm6,%xmm14,%xmm14 - vpmuludq %xmm1,%xmm9,%xmm5 - vpaddq %xmm5,%xmm13,%xmm13 - vpshufd $0x32,16(%rdi),%xmm8 - vpmuludq %xmm0,%xmm9,%xmm9 - vpaddq %xmm9,%xmm12,%xmm12 - vpmuludq %xmm4,%xmm7,%xmm6 - vpaddq %xmm6,%xmm11,%xmm11 - vpshufd $0x32,32(%rdi),%xmm9 - vpmuludq %xmm3,%xmm7,%xmm7 - vpaddq %xmm7,%xmm10,%xmm10 - - vpmuludq %xmm1,%xmm8,%xmm5 - vpaddq %xmm5,%xmm14,%xmm14 - vpmuludq %xmm0,%xmm8,%xmm8 - vpaddq %xmm8,%xmm13,%xmm13 - vpshufd $0x32,48(%rdi),%xmm7 - vpmuludq %xmm4,%xmm9,%xmm6 - vpaddq %xmm6,%xmm12,%xmm12 - vpshufd $0x32,64(%rdi),%xmm8 - vpmuludq %xmm3,%xmm9,%xmm5 - vpaddq %xmm5,%xmm11,%xmm11 - vpmuludq %xmm2,%xmm9,%xmm9 - vpaddq %xmm9,%xmm10,%xmm10 - - vpmuludq %xmm0,%xmm7,%xmm7 - vpaddq %xmm7,%xmm14,%xmm14 - vpmuludq %xmm4,%xmm8,%xmm6 - vpaddq %xmm6,%xmm13,%xmm13 - vpmuludq %xmm3,%xmm8,%xmm5 - vpaddq %xmm5,%xmm12,%xmm12 - vpmuludq %xmm2,%xmm8,%xmm6 - vpaddq %xmm6,%xmm11,%xmm11 - vpmuludq %xmm1,%xmm8,%xmm8 - vpaddq %xmm8,%xmm10,%xmm10 - -.Lshort_tail_avx: - - vpsrldq $8,%xmm14,%xmm9 - vpsrldq $8,%xmm13,%xmm8 - vpsrldq $8,%xmm11,%xmm6 - vpsrldq $8,%xmm10,%xmm5 - vpsrldq $8,%xmm12,%xmm7 - vpaddq %xmm8,%xmm13,%xmm13 - vpaddq %xmm9,%xmm14,%xmm14 - vpaddq %xmm5,%xmm10,%xmm10 - vpaddq %xmm6,%xmm11,%xmm11 - vpaddq %xmm7,%xmm12,%xmm12 - - vpsrlq $26,%xmm13,%xmm3 - vpand %xmm15,%xmm13,%xmm13 - vpaddq %xmm3,%xmm14,%xmm14 - - vpsrlq $26,%xmm10,%xmm0 - vpand %xmm15,%xmm10,%xmm10 - vpaddq %xmm0,%xmm11,%xmm11 - - vpsrlq $26,%xmm14,%xmm4 - vpand %xmm15,%xmm14,%xmm14 - - vpsrlq $26,%xmm11,%xmm1 - vpand %xmm15,%xmm11,%xmm11 - vpaddq %xmm1,%xmm12,%xmm12 - - vpaddq %xmm4,%xmm10,%xmm10 - vpsllq $2,%xmm4,%xmm4 - vpaddq %xmm4,%xmm10,%xmm10 - - vpsrlq $26,%xmm12,%xmm2 - vpand %xmm15,%xmm12,%xmm12 - vpaddq %xmm2,%xmm13,%xmm13 - - vpsrlq $26,%xmm10,%xmm0 - vpand %xmm15,%xmm10,%xmm10 - vpaddq %xmm0,%xmm11,%xmm11 - - vpsrlq $26,%xmm13,%xmm3 - vpand %xmm15,%xmm13,%xmm13 - vpaddq %xmm3,%xmm14,%xmm14 - - vmovd %xmm10,-112(%rdi) - vmovd %xmm11,-108(%rdi) - vmovd %xmm12,-104(%rdi) - vmovd %xmm13,-100(%rdi) - vmovd %xmm14,-96(%rdi) - leaq -8(%r10),%rsp - - vzeroupper - ret -ENDPROC(poly1305_blocks_avx) - -.align 32 -ENTRY(poly1305_emit_avx) - cmpl $0,20(%rdi) - je .Lemit - - movl 0(%rdi),%eax - movl 4(%rdi),%ecx - movl 8(%rdi),%r8d - movl 12(%rdi),%r11d - movl 16(%rdi),%r10d - - shlq $26,%rcx - movq %r8,%r9 - shlq $52,%r8 - addq %rcx,%rax - shrq $12,%r9 - addq %rax,%r8 - adcq $0,%r9 - - shlq $14,%r11 - movq %r10,%rax - shrq $24,%r10 - addq %r11,%r9 - shlq $40,%rax - addq %rax,%r9 - adcq $0,%r10 - - movq %r10,%rax - movq %r10,%rcx - andq $3,%r10 - shrq $2,%rax - andq $-4,%rcx - addq %rcx,%rax - addq %rax,%r8 - adcq $0,%r9 - adcq $0,%r10 - - movq %r8,%rax - addq $5,%r8 - movq %r9,%rcx - adcq $0,%r9 - adcq $0,%r10 - shrq $2,%r10 - cmovnzq %r8,%rax - cmovnzq %r9,%rcx - - addq 0(%rdx),%rax - adcq 8(%rdx),%rcx - movq %rax,0(%rsi) - movq %rcx,8(%rsi) - - ret -ENDPROC(poly1305_emit_avx) -#endif /* CONFIG_AS_AVX */ - -#ifdef CONFIG_AS_AVX2 -.align 32 -ENTRY(poly1305_blocks_avx2) - - movl 20(%rdi),%r8d - cmpq $128,%rdx - jae .Lblocks_avx2 - testl %r8d,%r8d - jz .Lblocks - -.Lblocks_avx2: - andq $-16,%rdx - jz .Lno_data_avx2 - - vzeroupper - - testl %r8d,%r8d - jz .Lbase2_64_avx2 - - testq $63,%rdx - jz .Leven_avx2 - - pushq %rbx - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - pushq %rdi - -.Lblocks_avx2_body: - - movq %rdx,%r15 - - movq 0(%rdi),%r8 - movq 8(%rdi),%r9 - movl 16(%rdi),%r10d - - movq 24(%rdi),%r11 - movq 32(%rdi),%r13 - - - movl %r8d,%r14d - andq $-2147483648,%r8 - movq %r9,%r12 - movl %r9d,%ebx - andq $-2147483648,%r9 - - shrq $6,%r8 - shlq $52,%r12 - addq %r8,%r14 - shrq $12,%rbx - shrq $18,%r9 - addq %r12,%r14 - adcq %r9,%rbx - - movq %r10,%r8 - shlq $40,%r8 - shrq $24,%r10 - addq %r8,%rbx - adcq $0,%r10 - - movq $-4,%r9 - movq %r10,%r8 - andq %r10,%r9 - shrq $2,%r8 - andq $3,%r10 - addq %r9,%r8 - addq %r8,%r14 - adcq $0,%rbx - adcq $0,%r10 - - movq %r13,%r12 - movq %r13,%rax - shrq $2,%r13 - addq %r12,%r13 - -.Lbase2_26_pre_avx2: - addq 0(%rsi),%r14 - adcq 8(%rsi),%rbx - leaq 16(%rsi),%rsi - adcq %rcx,%r10 - subq $16,%r15 - - movq %rdi,0(%rsp) - __poly1305_block - movq 0(%rsp),%rdi - movq %r12,%rax - - testq $63,%r15 - jnz .Lbase2_26_pre_avx2 - - testq %rcx,%rcx - jz .Lstore_base2_64_avx2 - - - movq %r14,%rax - movq %r14,%rdx - shrq $52,%r14 - movq %rbx,%r11 - movq %rbx,%r12 - shrq $26,%rdx - andq $0x3ffffff,%rax - shlq $12,%r11 - andq $0x3ffffff,%rdx - shrq $14,%rbx - orq %r11,%r14 - shlq $24,%r10 - andq $0x3ffffff,%r14 - shrq $40,%r12 - andq $0x3ffffff,%rbx - orq %r12,%r10 - - testq %r15,%r15 - jz .Lstore_base2_26_avx2 - - vmovd %eax,%xmm0 - vmovd %edx,%xmm1 - vmovd %r14d,%xmm2 - vmovd %ebx,%xmm3 - vmovd %r10d,%xmm4 - jmp .Lproceed_avx2 - -.align 32 -.Lstore_base2_64_avx2: - movq %r14,0(%rdi) - movq %rbx,8(%rdi) - movq %r10,16(%rdi) - jmp .Ldone_avx2 - -.align 16 -.Lstore_base2_26_avx2: - movl %eax,0(%rdi) - movl %edx,4(%rdi) - movl %r14d,8(%rdi) - movl %ebx,12(%rdi) - movl %r10d,16(%rdi) -.align 16 -.Ldone_avx2: - movq 8(%rsp),%r15 - movq 16(%rsp),%r14 - movq 24(%rsp),%r13 - movq 32(%rsp),%r12 - movq 40(%rsp),%rbx - leaq 48(%rsp),%rsp - -.Lno_data_avx2: -.Lblocks_avx2_epilogue: - ret - - -.align 32 -.Lbase2_64_avx2: - - - pushq %rbx - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - pushq %rdi - -.Lbase2_64_avx2_body: - - movq %rdx,%r15 - - movq 24(%rdi),%r11 - movq 32(%rdi),%r13 - - movq 0(%rdi),%r14 - movq 8(%rdi),%rbx - movl 16(%rdi),%r10d - - movq %r13,%r12 - movq %r13,%rax - shrq $2,%r13 - addq %r12,%r13 - - testq $63,%rdx - jz .Linit_avx2 - -.Lbase2_64_pre_avx2: - addq 0(%rsi),%r14 - adcq 8(%rsi),%rbx - leaq 16(%rsi),%rsi - adcq %rcx,%r10 - subq $16,%r15 - - movq %rdi,0(%rsp) - __poly1305_block - movq 0(%rsp),%rdi - movq %r12,%rax - - testq $63,%r15 - jnz .Lbase2_64_pre_avx2 - -.Linit_avx2: - - movq %r14,%rax - movq %r14,%rdx - shrq $52,%r14 - movq %rbx,%r8 - movq %rbx,%r9 - shrq $26,%rdx - andq $0x3ffffff,%rax - shlq $12,%r8 - andq $0x3ffffff,%rdx - shrq $14,%rbx - orq %r8,%r14 - shlq $24,%r10 - andq $0x3ffffff,%r14 - shrq $40,%r9 - andq $0x3ffffff,%rbx - orq %r9,%r10 - - vmovd %eax,%xmm0 - vmovd %edx,%xmm1 - vmovd %r14d,%xmm2 - vmovd %ebx,%xmm3 - vmovd %r10d,%xmm4 - movl $1,20(%rdi) - - __poly1305_init_avx - -.Lproceed_avx2: - movq %r15,%rdx - - movq 8(%rsp),%r15 - movq 16(%rsp),%r14 - movq 24(%rsp),%r13 - movq 32(%rsp),%r12 - movq 40(%rsp),%rbx - leaq 48(%rsp),%rax - leaq 48(%rsp),%rsp - -.Lbase2_64_avx2_epilogue: - jmp .Ldo_avx2 - - -.align 32 -.Leven_avx2: - - vmovd 0(%rdi),%xmm0 - vmovd 4(%rdi),%xmm1 - vmovd 8(%rdi),%xmm2 - vmovd 12(%rdi),%xmm3 - vmovd 16(%rdi),%xmm4 - -.Ldo_avx2: - leaq 8(%rsp),%r10 - subq $0x128,%rsp - leaq .Lconst(%rip),%rcx - leaq 48+64(%rdi),%rdi - vmovdqa 96(%rcx),%ymm7 - - - vmovdqu -64(%rdi),%xmm9 - andq $-512,%rsp - vmovdqu -48(%rdi),%xmm10 - vmovdqu -32(%rdi),%xmm6 - vmovdqu -16(%rdi),%xmm11 - vmovdqu 0(%rdi),%xmm12 - vmovdqu 16(%rdi),%xmm13 - leaq 144(%rsp),%rax - vmovdqu 32(%rdi),%xmm14 - vpermd %ymm9,%ymm7,%ymm9 - vmovdqu 48(%rdi),%xmm15 - vpermd %ymm10,%ymm7,%ymm10 - vmovdqu 64(%rdi),%xmm5 - vpermd %ymm6,%ymm7,%ymm6 - vmovdqa %ymm9,0(%rsp) - vpermd %ymm11,%ymm7,%ymm11 - vmovdqa %ymm10,32-144(%rax) - vpermd %ymm12,%ymm7,%ymm12 - vmovdqa %ymm6,64-144(%rax) - vpermd %ymm13,%ymm7,%ymm13 - vmovdqa %ymm11,96-144(%rax) - vpermd %ymm14,%ymm7,%ymm14 - vmovdqa %ymm12,128-144(%rax) - vpermd %ymm15,%ymm7,%ymm15 - vmovdqa %ymm13,160-144(%rax) - vpermd %ymm5,%ymm7,%ymm5 - vmovdqa %ymm14,192-144(%rax) - vmovdqa %ymm15,224-144(%rax) - vmovdqa %ymm5,256-144(%rax) - vmovdqa 64(%rcx),%ymm5 - - - - vmovdqu 0(%rsi),%xmm7 - vmovdqu 16(%rsi),%xmm8 - vinserti128 $1,32(%rsi),%ymm7,%ymm7 - vinserti128 $1,48(%rsi),%ymm8,%ymm8 - leaq 64(%rsi),%rsi - - vpsrldq $6,%ymm7,%ymm9 - vpsrldq $6,%ymm8,%ymm10 - vpunpckhqdq %ymm8,%ymm7,%ymm6 - vpunpcklqdq %ymm10,%ymm9,%ymm9 - vpunpcklqdq %ymm8,%ymm7,%ymm7 - - vpsrlq $30,%ymm9,%ymm10 - vpsrlq $4,%ymm9,%ymm9 - vpsrlq $26,%ymm7,%ymm8 - vpsrlq $40,%ymm6,%ymm6 - vpand %ymm5,%ymm9,%ymm9 - vpand %ymm5,%ymm7,%ymm7 - vpand %ymm5,%ymm8,%ymm8 - vpand %ymm5,%ymm10,%ymm10 - vpor 32(%rcx),%ymm6,%ymm6 - - vpaddq %ymm2,%ymm9,%ymm2 - subq $64,%rdx - jz .Ltail_avx2 - jmp .Loop_avx2 - -.align 32 -.Loop_avx2: - - vpaddq %ymm0,%ymm7,%ymm0 - vmovdqa 0(%rsp),%ymm7 - vpaddq %ymm1,%ymm8,%ymm1 - vmovdqa 32(%rsp),%ymm8 - vpaddq %ymm3,%ymm10,%ymm3 - vmovdqa 96(%rsp),%ymm9 - vpaddq %ymm4,%ymm6,%ymm4 - vmovdqa 48(%rax),%ymm10 - vmovdqa 112(%rax),%ymm5 - - vpmuludq %ymm2,%ymm7,%ymm13 - vpmuludq %ymm2,%ymm8,%ymm14 - vpmuludq %ymm2,%ymm9,%ymm15 - vpmuludq %ymm2,%ymm10,%ymm11 - vpmuludq %ymm2,%ymm5,%ymm12 - - vpmuludq %ymm0,%ymm8,%ymm6 - vpmuludq %ymm1,%ymm8,%ymm2 - vpaddq %ymm6,%ymm12,%ymm12 - vpaddq %ymm2,%ymm13,%ymm13 - vpmuludq %ymm3,%ymm8,%ymm6 - vpmuludq 64(%rsp),%ymm4,%ymm2 - vpaddq %ymm6,%ymm15,%ymm15 - vpaddq %ymm2,%ymm11,%ymm11 - vmovdqa -16(%rax),%ymm8 - - vpmuludq %ymm0,%ymm7,%ymm6 - vpmuludq %ymm1,%ymm7,%ymm2 - vpaddq %ymm6,%ymm11,%ymm11 - vpaddq %ymm2,%ymm12,%ymm12 - vpmuludq %ymm3,%ymm7,%ymm6 - vpmuludq %ymm4,%ymm7,%ymm2 - vmovdqu 0(%rsi),%xmm7 - vpaddq %ymm6,%ymm14,%ymm14 - vpaddq %ymm2,%ymm15,%ymm15 - vinserti128 $1,32(%rsi),%ymm7,%ymm7 - - vpmuludq %ymm3,%ymm8,%ymm6 - vpmuludq %ymm4,%ymm8,%ymm2 - vmovdqu 16(%rsi),%xmm8 - vpaddq %ymm6,%ymm11,%ymm11 - vpaddq %ymm2,%ymm12,%ymm12 - vmovdqa 16(%rax),%ymm2 - vpmuludq %ymm1,%ymm9,%ymm6 - vpmuludq %ymm0,%ymm9,%ymm9 - vpaddq %ymm6,%ymm14,%ymm14 - vpaddq %ymm9,%ymm13,%ymm13 - vinserti128 $1,48(%rsi),%ymm8,%ymm8 - leaq 64(%rsi),%rsi - - vpmuludq %ymm1,%ymm2,%ymm6 - vpmuludq %ymm0,%ymm2,%ymm2 - vpsrldq $6,%ymm7,%ymm9 - vpaddq %ymm6,%ymm15,%ymm15 - vpaddq %ymm2,%ymm14,%ymm14 - vpmuludq %ymm3,%ymm10,%ymm6 - vpmuludq %ymm4,%ymm10,%ymm2 - vpsrldq $6,%ymm8,%ymm10 - vpaddq %ymm6,%ymm12,%ymm12 - vpaddq %ymm2,%ymm13,%ymm13 - vpunpckhqdq %ymm8,%ymm7,%ymm6 - - vpmuludq %ymm3,%ymm5,%ymm3 - vpmuludq %ymm4,%ymm5,%ymm4 - vpunpcklqdq %ymm8,%ymm7,%ymm7 - vpaddq %ymm3,%ymm13,%ymm2 - vpaddq %ymm4,%ymm14,%ymm3 - vpunpcklqdq %ymm10,%ymm9,%ymm10 - vpmuludq 80(%rax),%ymm0,%ymm4 - vpmuludq %ymm1,%ymm5,%ymm0 - vmovdqa 64(%rcx),%ymm5 - vpaddq %ymm4,%ymm15,%ymm4 - vpaddq %ymm0,%ymm11,%ymm0 - - vpsrlq $26,%ymm3,%ymm14 - vpand %ymm5,%ymm3,%ymm3 - vpaddq %ymm14,%ymm4,%ymm4 - - vpsrlq $26,%ymm0,%ymm11 - vpand %ymm5,%ymm0,%ymm0 - vpaddq %ymm11,%ymm12,%ymm1 - - vpsrlq $26,%ymm4,%ymm15 - vpand %ymm5,%ymm4,%ymm4 - - vpsrlq $4,%ymm10,%ymm9 - - vpsrlq $26,%ymm1,%ymm12 - vpand %ymm5,%ymm1,%ymm1 - vpaddq %ymm12,%ymm2,%ymm2 - - vpaddq %ymm15,%ymm0,%ymm0 - vpsllq $2,%ymm15,%ymm15 - vpaddq %ymm15,%ymm0,%ymm0 - - vpand %ymm5,%ymm9,%ymm9 - vpsrlq $26,%ymm7,%ymm8 - - vpsrlq $26,%ymm2,%ymm13 - vpand %ymm5,%ymm2,%ymm2 - vpaddq %ymm13,%ymm3,%ymm3 - - vpaddq %ymm9,%ymm2,%ymm2 - vpsrlq $30,%ymm10,%ymm10 - - vpsrlq $26,%ymm0,%ymm11 - vpand %ymm5,%ymm0,%ymm0 - vpaddq %ymm11,%ymm1,%ymm1 - - vpsrlq $40,%ymm6,%ymm6 - - vpsrlq $26,%ymm3,%ymm14 - vpand %ymm5,%ymm3,%ymm3 - vpaddq %ymm14,%ymm4,%ymm4 - - vpand %ymm5,%ymm7,%ymm7 - vpand %ymm5,%ymm8,%ymm8 - vpand %ymm5,%ymm10,%ymm10 - vpor 32(%rcx),%ymm6,%ymm6 - - subq $64,%rdx - jnz .Loop_avx2 - -.byte 0x66,0x90 -.Ltail_avx2: - - vpaddq %ymm0,%ymm7,%ymm0 - vmovdqu 4(%rsp),%ymm7 - vpaddq %ymm1,%ymm8,%ymm1 - vmovdqu 36(%rsp),%ymm8 - vpaddq %ymm3,%ymm10,%ymm3 - vmovdqu 100(%rsp),%ymm9 - vpaddq %ymm4,%ymm6,%ymm4 - vmovdqu 52(%rax),%ymm10 - vmovdqu 116(%rax),%ymm5 - - vpmuludq %ymm2,%ymm7,%ymm13 - vpmuludq %ymm2,%ymm8,%ymm14 - vpmuludq %ymm2,%ymm9,%ymm15 - vpmuludq %ymm2,%ymm10,%ymm11 - vpmuludq %ymm2,%ymm5,%ymm12 - - vpmuludq %ymm0,%ymm8,%ymm6 - vpmuludq %ymm1,%ymm8,%ymm2 - vpaddq %ymm6,%ymm12,%ymm12 - vpaddq %ymm2,%ymm13,%ymm13 - vpmuludq %ymm3,%ymm8,%ymm6 - vpmuludq 68(%rsp),%ymm4,%ymm2 - vpaddq %ymm6,%ymm15,%ymm15 - vpaddq %ymm2,%ymm11,%ymm11 - - vpmuludq %ymm0,%ymm7,%ymm6 - vpmuludq %ymm1,%ymm7,%ymm2 - vpaddq %ymm6,%ymm11,%ymm11 - vmovdqu -12(%rax),%ymm8 - vpaddq %ymm2,%ymm12,%ymm12 - vpmuludq %ymm3,%ymm7,%ymm6 - vpmuludq %ymm4,%ymm7,%ymm2 - vpaddq %ymm6,%ymm14,%ymm14 - vpaddq %ymm2,%ymm15,%ymm15 - - vpmuludq %ymm3,%ymm8,%ymm6 - vpmuludq %ymm4,%ymm8,%ymm2 - vpaddq %ymm6,%ymm11,%ymm11 - vpaddq %ymm2,%ymm12,%ymm12 - vmovdqu 20(%rax),%ymm2 - vpmuludq %ymm1,%ymm9,%ymm6 - vpmuludq %ymm0,%ymm9,%ymm9 - vpaddq %ymm6,%ymm14,%ymm14 - vpaddq %ymm9,%ymm13,%ymm13 - - vpmuludq %ymm1,%ymm2,%ymm6 - vpmuludq %ymm0,%ymm2,%ymm2 - vpaddq %ymm6,%ymm15,%ymm15 - vpaddq %ymm2,%ymm14,%ymm14 - vpmuludq %ymm3,%ymm10,%ymm6 - vpmuludq %ymm4,%ymm10,%ymm2 - vpaddq %ymm6,%ymm12,%ymm12 - vpaddq %ymm2,%ymm13,%ymm13 - - vpmuludq %ymm3,%ymm5,%ymm3 - vpmuludq %ymm4,%ymm5,%ymm4 - vpaddq %ymm3,%ymm13,%ymm2 - vpaddq %ymm4,%ymm14,%ymm3 - vpmuludq 84(%rax),%ymm0,%ymm4 - vpmuludq %ymm1,%ymm5,%ymm0 - vmovdqa 64(%rcx),%ymm5 - vpaddq %ymm4,%ymm15,%ymm4 - vpaddq %ymm0,%ymm11,%ymm0 - - vpsrldq $8,%ymm12,%ymm8 - vpsrldq $8,%ymm2,%ymm9 - vpsrldq $8,%ymm3,%ymm10 - vpsrldq $8,%ymm4,%ymm6 - vpsrldq $8,%ymm0,%ymm7 - vpaddq %ymm8,%ymm12,%ymm12 - vpaddq %ymm9,%ymm2,%ymm2 - vpaddq %ymm10,%ymm3,%ymm3 - vpaddq %ymm6,%ymm4,%ymm4 - vpaddq %ymm7,%ymm0,%ymm0 - - vpermq $0x2,%ymm3,%ymm10 - vpermq $0x2,%ymm4,%ymm6 - vpermq $0x2,%ymm0,%ymm7 - vpermq $0x2,%ymm12,%ymm8 - vpermq $0x2,%ymm2,%ymm9 - vpaddq %ymm10,%ymm3,%ymm3 - vpaddq %ymm6,%ymm4,%ymm4 - vpaddq %ymm7,%ymm0,%ymm0 - vpaddq %ymm8,%ymm12,%ymm12 - vpaddq %ymm9,%ymm2,%ymm2 - - vpsrlq $26,%ymm3,%ymm14 - vpand %ymm5,%ymm3,%ymm3 - vpaddq %ymm14,%ymm4,%ymm4 - - vpsrlq $26,%ymm0,%ymm11 - vpand %ymm5,%ymm0,%ymm0 - vpaddq %ymm11,%ymm12,%ymm1 - - vpsrlq $26,%ymm4,%ymm15 - vpand %ymm5,%ymm4,%ymm4 - - vpsrlq $26,%ymm1,%ymm12 - vpand %ymm5,%ymm1,%ymm1 - vpaddq %ymm12,%ymm2,%ymm2 - - vpaddq %ymm15,%ymm0,%ymm0 - vpsllq $2,%ymm15,%ymm15 - vpaddq %ymm15,%ymm0,%ymm0 - - vpsrlq $26,%ymm2,%ymm13 - vpand %ymm5,%ymm2,%ymm2 - vpaddq %ymm13,%ymm3,%ymm3 - - vpsrlq $26,%ymm0,%ymm11 - vpand %ymm5,%ymm0,%ymm0 - vpaddq %ymm11,%ymm1,%ymm1 - - vpsrlq $26,%ymm3,%ymm14 - vpand %ymm5,%ymm3,%ymm3 - vpaddq %ymm14,%ymm4,%ymm4 - - vmovd %xmm0,-112(%rdi) - vmovd %xmm1,-108(%rdi) - vmovd %xmm2,-104(%rdi) - vmovd %xmm3,-100(%rdi) - vmovd %xmm4,-96(%rdi) - leaq -8(%r10),%rsp - - vzeroupper - ret - -ENDPROC(poly1305_blocks_avx2) -#endif /* CONFIG_AS_AVX2 */ - -#ifdef CONFIG_AS_AVX512 -.align 32 -ENTRY(poly1305_blocks_avx512) - - movl 20(%rdi),%r8d - cmpq $128,%rdx - jae .Lblocks_avx2_512 - testl %r8d,%r8d - jz .Lblocks - -.Lblocks_avx2_512: - andq $-16,%rdx - jz .Lno_data_avx2_512 - - vzeroupper - - testl %r8d,%r8d - jz .Lbase2_64_avx2_512 - - testq $63,%rdx - jz .Leven_avx2_512 - - pushq %rbx - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - pushq %rdi - -.Lblocks_avx2_body_512: - - movq %rdx,%r15 - - movq 0(%rdi),%r8 - movq 8(%rdi),%r9 - movl 16(%rdi),%r10d - - movq 24(%rdi),%r11 - movq 32(%rdi),%r13 - - - movl %r8d,%r14d - andq $-2147483648,%r8 - movq %r9,%r12 - movl %r9d,%ebx - andq $-2147483648,%r9 - - shrq $6,%r8 - shlq $52,%r12 - addq %r8,%r14 - shrq $12,%rbx - shrq $18,%r9 - addq %r12,%r14 - adcq %r9,%rbx - - movq %r10,%r8 - shlq $40,%r8 - shrq $24,%r10 - addq %r8,%rbx - adcq $0,%r10 - - movq $-4,%r9 - movq %r10,%r8 - andq %r10,%r9 - shrq $2,%r8 - andq $3,%r10 - addq %r9,%r8 - addq %r8,%r14 - adcq $0,%rbx - adcq $0,%r10 - - movq %r13,%r12 - movq %r13,%rax - shrq $2,%r13 - addq %r12,%r13 - -.Lbase2_26_pre_avx2_512: - addq 0(%rsi),%r14 - adcq 8(%rsi),%rbx - leaq 16(%rsi),%rsi - adcq %rcx,%r10 - subq $16,%r15 - - movq %rdi,0(%rsp) - __poly1305_block - movq 0(%rsp),%rdi - movq %r12,%rax - - testq $63,%r15 - jnz .Lbase2_26_pre_avx2_512 - - testq %rcx,%rcx - jz .Lstore_base2_64_avx2_512 - - - movq %r14,%rax - movq %r14,%rdx - shrq $52,%r14 - movq %rbx,%r11 - movq %rbx,%r12 - shrq $26,%rdx - andq $0x3ffffff,%rax - shlq $12,%r11 - andq $0x3ffffff,%rdx - shrq $14,%rbx - orq %r11,%r14 - shlq $24,%r10 - andq $0x3ffffff,%r14 - shrq $40,%r12 - andq $0x3ffffff,%rbx - orq %r12,%r10 - - testq %r15,%r15 - jz .Lstore_base2_26_avx2_512 - - vmovd %eax,%xmm0 - vmovd %edx,%xmm1 - vmovd %r14d,%xmm2 - vmovd %ebx,%xmm3 - vmovd %r10d,%xmm4 - jmp .Lproceed_avx2_512 - -.align 32 -.Lstore_base2_64_avx2_512: - movq %r14,0(%rdi) - movq %rbx,8(%rdi) - movq %r10,16(%rdi) - jmp .Ldone_avx2_512 - -.align 16 -.Lstore_base2_26_avx2_512: - movl %eax,0(%rdi) - movl %edx,4(%rdi) - movl %r14d,8(%rdi) - movl %ebx,12(%rdi) - movl %r10d,16(%rdi) -.align 16 -.Ldone_avx2_512: - movq 8(%rsp),%r15 - movq 16(%rsp),%r14 - movq 24(%rsp),%r13 - movq 32(%rsp),%r12 - movq 40(%rsp),%rbx - leaq 48(%rsp),%rsp - -.Lno_data_avx2_512: -.Lblocks_avx2_epilogue_512: - ret - - -.align 32 -.Lbase2_64_avx2_512: - - pushq %rbx - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - pushq %rdi - -.Lbase2_64_avx2_body_512: - - movq %rdx,%r15 - - movq 24(%rdi),%r11 - movq 32(%rdi),%r13 - - movq 0(%rdi),%r14 - movq 8(%rdi),%rbx - movl 16(%rdi),%r10d - - movq %r13,%r12 - movq %r13,%rax - shrq $2,%r13 - addq %r12,%r13 - - testq $63,%rdx - jz .Linit_avx2_512 - -.Lbase2_64_pre_avx2_512: - addq 0(%rsi),%r14 - adcq 8(%rsi),%rbx - leaq 16(%rsi),%rsi - adcq %rcx,%r10 - subq $16,%r15 - - movq %rdi,0(%rsp) - __poly1305_block - movq 0(%rsp),%rdi - movq %r12,%rax - - testq $63,%r15 - jnz .Lbase2_64_pre_avx2_512 - -.Linit_avx2_512: - - movq %r14,%rax - movq %r14,%rdx - shrq $52,%r14 - movq %rbx,%r8 - movq %rbx,%r9 - shrq $26,%rdx - andq $0x3ffffff,%rax - shlq $12,%r8 - andq $0x3ffffff,%rdx - shrq $14,%rbx - orq %r8,%r14 - shlq $24,%r10 - andq $0x3ffffff,%r14 - shrq $40,%r9 - andq $0x3ffffff,%rbx - orq %r9,%r10 - - vmovd %eax,%xmm0 - vmovd %edx,%xmm1 - vmovd %r14d,%xmm2 - vmovd %ebx,%xmm3 - vmovd %r10d,%xmm4 - movl $1,20(%rdi) - - __poly1305_init_avx - -.Lproceed_avx2_512: - movq %r15,%rdx - - movq 8(%rsp),%r15 - movq 16(%rsp),%r14 - movq 24(%rsp),%r13 - movq 32(%rsp),%r12 - movq 40(%rsp),%rbx - leaq 48(%rsp),%rax - leaq 48(%rsp),%rsp - -.Lbase2_64_avx2_epilogue_512: - jmp .Ldo_avx2_512 - - -.align 32 -.Leven_avx2_512: - - vmovd 0(%rdi),%xmm0 - vmovd 4(%rdi),%xmm1 - vmovd 8(%rdi),%xmm2 - vmovd 12(%rdi),%xmm3 - vmovd 16(%rdi),%xmm4 - -.Ldo_avx2_512: - cmpq $512,%rdx - jae .Lblocks_avx512 -.Lskip_avx512: - leaq 8(%rsp),%r10 - - subq $0x128,%rsp - leaq .Lconst(%rip),%rcx - leaq 48+64(%rdi),%rdi - vmovdqa 96(%rcx),%ymm7 - - - vmovdqu -64(%rdi),%xmm9 - andq $-512,%rsp - vmovdqu -48(%rdi),%xmm10 - vmovdqu -32(%rdi),%xmm6 - vmovdqu -16(%rdi),%xmm11 - vmovdqu 0(%rdi),%xmm12 - vmovdqu 16(%rdi),%xmm13 - leaq 144(%rsp),%rax - vmovdqu 32(%rdi),%xmm14 - vpermd %ymm9,%ymm7,%ymm9 - vmovdqu 48(%rdi),%xmm15 - vpermd %ymm10,%ymm7,%ymm10 - vmovdqu 64(%rdi),%xmm5 - vpermd %ymm6,%ymm7,%ymm6 - vmovdqa %ymm9,0(%rsp) - vpermd %ymm11,%ymm7,%ymm11 - vmovdqa %ymm10,32-144(%rax) - vpermd %ymm12,%ymm7,%ymm12 - vmovdqa %ymm6,64-144(%rax) - vpermd %ymm13,%ymm7,%ymm13 - vmovdqa %ymm11,96-144(%rax) - vpermd %ymm14,%ymm7,%ymm14 - vmovdqa %ymm12,128-144(%rax) - vpermd %ymm15,%ymm7,%ymm15 - vmovdqa %ymm13,160-144(%rax) - vpermd %ymm5,%ymm7,%ymm5 - vmovdqa %ymm14,192-144(%rax) - vmovdqa %ymm15,224-144(%rax) - vmovdqa %ymm5,256-144(%rax) - vmovdqa 64(%rcx),%ymm5 - - - - vmovdqu 0(%rsi),%xmm7 - vmovdqu 16(%rsi),%xmm8 - vinserti128 $1,32(%rsi),%ymm7,%ymm7 - vinserti128 $1,48(%rsi),%ymm8,%ymm8 - leaq 64(%rsi),%rsi - - vpsrldq $6,%ymm7,%ymm9 - vpsrldq $6,%ymm8,%ymm10 - vpunpckhqdq %ymm8,%ymm7,%ymm6 - vpunpcklqdq %ymm10,%ymm9,%ymm9 - vpunpcklqdq %ymm8,%ymm7,%ymm7 - - vpsrlq $30,%ymm9,%ymm10 - vpsrlq $4,%ymm9,%ymm9 - vpsrlq $26,%ymm7,%ymm8 - vpsrlq $40,%ymm6,%ymm6 - vpand %ymm5,%ymm9,%ymm9 - vpand %ymm5,%ymm7,%ymm7 - vpand %ymm5,%ymm8,%ymm8 - vpand %ymm5,%ymm10,%ymm10 - vpor 32(%rcx),%ymm6,%ymm6 - - vpaddq %ymm2,%ymm9,%ymm2 - subq $64,%rdx - jz .Ltail_avx2_512 - jmp .Loop_avx2_512 - -.align 32 -.Loop_avx2_512: - - vpaddq %ymm0,%ymm7,%ymm0 - vmovdqa 0(%rsp),%ymm7 - vpaddq %ymm1,%ymm8,%ymm1 - vmovdqa 32(%rsp),%ymm8 - vpaddq %ymm3,%ymm10,%ymm3 - vmovdqa 96(%rsp),%ymm9 - vpaddq %ymm4,%ymm6,%ymm4 - vmovdqa 48(%rax),%ymm10 - vmovdqa 112(%rax),%ymm5 - - vpmuludq %ymm2,%ymm7,%ymm13 - vpmuludq %ymm2,%ymm8,%ymm14 - vpmuludq %ymm2,%ymm9,%ymm15 - vpmuludq %ymm2,%ymm10,%ymm11 - vpmuludq %ymm2,%ymm5,%ymm12 - - vpmuludq %ymm0,%ymm8,%ymm6 - vpmuludq %ymm1,%ymm8,%ymm2 - vpaddq %ymm6,%ymm12,%ymm12 - vpaddq %ymm2,%ymm13,%ymm13 - vpmuludq %ymm3,%ymm8,%ymm6 - vpmuludq 64(%rsp),%ymm4,%ymm2 - vpaddq %ymm6,%ymm15,%ymm15 - vpaddq %ymm2,%ymm11,%ymm11 - vmovdqa -16(%rax),%ymm8 - - vpmuludq %ymm0,%ymm7,%ymm6 - vpmuludq %ymm1,%ymm7,%ymm2 - vpaddq %ymm6,%ymm11,%ymm11 - vpaddq %ymm2,%ymm12,%ymm12 - vpmuludq %ymm3,%ymm7,%ymm6 - vpmuludq %ymm4,%ymm7,%ymm2 - vmovdqu 0(%rsi),%xmm7 - vpaddq %ymm6,%ymm14,%ymm14 - vpaddq %ymm2,%ymm15,%ymm15 - vinserti128 $1,32(%rsi),%ymm7,%ymm7 - - vpmuludq %ymm3,%ymm8,%ymm6 - vpmuludq %ymm4,%ymm8,%ymm2 - vmovdqu 16(%rsi),%xmm8 - vpaddq %ymm6,%ymm11,%ymm11 - vpaddq %ymm2,%ymm12,%ymm12 - vmovdqa 16(%rax),%ymm2 - vpmuludq %ymm1,%ymm9,%ymm6 - vpmuludq %ymm0,%ymm9,%ymm9 - vpaddq %ymm6,%ymm14,%ymm14 - vpaddq %ymm9,%ymm13,%ymm13 - vinserti128 $1,48(%rsi),%ymm8,%ymm8 - leaq 64(%rsi),%rsi - - vpmuludq %ymm1,%ymm2,%ymm6 - vpmuludq %ymm0,%ymm2,%ymm2 - vpsrldq $6,%ymm7,%ymm9 - vpaddq %ymm6,%ymm15,%ymm15 - vpaddq %ymm2,%ymm14,%ymm14 - vpmuludq %ymm3,%ymm10,%ymm6 - vpmuludq %ymm4,%ymm10,%ymm2 - vpsrldq $6,%ymm8,%ymm10 - vpaddq %ymm6,%ymm12,%ymm12 - vpaddq %ymm2,%ymm13,%ymm13 - vpunpckhqdq %ymm8,%ymm7,%ymm6 - - vpmuludq %ymm3,%ymm5,%ymm3 - vpmuludq %ymm4,%ymm5,%ymm4 - vpunpcklqdq %ymm8,%ymm7,%ymm7 - vpaddq %ymm3,%ymm13,%ymm2 - vpaddq %ymm4,%ymm14,%ymm3 - vpunpcklqdq %ymm10,%ymm9,%ymm10 - vpmuludq 80(%rax),%ymm0,%ymm4 - vpmuludq %ymm1,%ymm5,%ymm0 - vmovdqa 64(%rcx),%ymm5 - vpaddq %ymm4,%ymm15,%ymm4 - vpaddq %ymm0,%ymm11,%ymm0 - - vpsrlq $26,%ymm3,%ymm14 - vpand %ymm5,%ymm3,%ymm3 - vpaddq %ymm14,%ymm4,%ymm4 - - vpsrlq $26,%ymm0,%ymm11 - vpand %ymm5,%ymm0,%ymm0 - vpaddq %ymm11,%ymm12,%ymm1 - - vpsrlq $26,%ymm4,%ymm15 - vpand %ymm5,%ymm4,%ymm4 - - vpsrlq $4,%ymm10,%ymm9 - - vpsrlq $26,%ymm1,%ymm12 - vpand %ymm5,%ymm1,%ymm1 - vpaddq %ymm12,%ymm2,%ymm2 - - vpaddq %ymm15,%ymm0,%ymm0 - vpsllq $2,%ymm15,%ymm15 - vpaddq %ymm15,%ymm0,%ymm0 - - vpand %ymm5,%ymm9,%ymm9 - vpsrlq $26,%ymm7,%ymm8 - - vpsrlq $26,%ymm2,%ymm13 - vpand %ymm5,%ymm2,%ymm2 - vpaddq %ymm13,%ymm3,%ymm3 - - vpaddq %ymm9,%ymm2,%ymm2 - vpsrlq $30,%ymm10,%ymm10 - - vpsrlq $26,%ymm0,%ymm11 - vpand %ymm5,%ymm0,%ymm0 - vpaddq %ymm11,%ymm1,%ymm1 - - vpsrlq $40,%ymm6,%ymm6 - - vpsrlq $26,%ymm3,%ymm14 - vpand %ymm5,%ymm3,%ymm3 - vpaddq %ymm14,%ymm4,%ymm4 - - vpand %ymm5,%ymm7,%ymm7 - vpand %ymm5,%ymm8,%ymm8 - vpand %ymm5,%ymm10,%ymm10 - vpor 32(%rcx),%ymm6,%ymm6 - - subq $64,%rdx - jnz .Loop_avx2_512 - -.byte 0x66,0x90 -.Ltail_avx2_512: - - vpaddq %ymm0,%ymm7,%ymm0 - vmovdqu 4(%rsp),%ymm7 - vpaddq %ymm1,%ymm8,%ymm1 - vmovdqu 36(%rsp),%ymm8 - vpaddq %ymm3,%ymm10,%ymm3 - vmovdqu 100(%rsp),%ymm9 - vpaddq %ymm4,%ymm6,%ymm4 - vmovdqu 52(%rax),%ymm10 - vmovdqu 116(%rax),%ymm5 - - vpmuludq %ymm2,%ymm7,%ymm13 - vpmuludq %ymm2,%ymm8,%ymm14 - vpmuludq %ymm2,%ymm9,%ymm15 - vpmuludq %ymm2,%ymm10,%ymm11 - vpmuludq %ymm2,%ymm5,%ymm12 - - vpmuludq %ymm0,%ymm8,%ymm6 - vpmuludq %ymm1,%ymm8,%ymm2 - vpaddq %ymm6,%ymm12,%ymm12 - vpaddq %ymm2,%ymm13,%ymm13 - vpmuludq %ymm3,%ymm8,%ymm6 - vpmuludq 68(%rsp),%ymm4,%ymm2 - vpaddq %ymm6,%ymm15,%ymm15 - vpaddq %ymm2,%ymm11,%ymm11 - - vpmuludq %ymm0,%ymm7,%ymm6 - vpmuludq %ymm1,%ymm7,%ymm2 - vpaddq %ymm6,%ymm11,%ymm11 - vmovdqu -12(%rax),%ymm8 - vpaddq %ymm2,%ymm12,%ymm12 - vpmuludq %ymm3,%ymm7,%ymm6 - vpmuludq %ymm4,%ymm7,%ymm2 - vpaddq %ymm6,%ymm14,%ymm14 - vpaddq %ymm2,%ymm15,%ymm15 - - vpmuludq %ymm3,%ymm8,%ymm6 - vpmuludq %ymm4,%ymm8,%ymm2 - vpaddq %ymm6,%ymm11,%ymm11 - vpaddq %ymm2,%ymm12,%ymm12 - vmovdqu 20(%rax),%ymm2 - vpmuludq %ymm1,%ymm9,%ymm6 - vpmuludq %ymm0,%ymm9,%ymm9 - vpaddq %ymm6,%ymm14,%ymm14 - vpaddq %ymm9,%ymm13,%ymm13 - - vpmuludq %ymm1,%ymm2,%ymm6 - vpmuludq %ymm0,%ymm2,%ymm2 - vpaddq %ymm6,%ymm15,%ymm15 - vpaddq %ymm2,%ymm14,%ymm14 - vpmuludq %ymm3,%ymm10,%ymm6 - vpmuludq %ymm4,%ymm10,%ymm2 - vpaddq %ymm6,%ymm12,%ymm12 - vpaddq %ymm2,%ymm13,%ymm13 - - vpmuludq %ymm3,%ymm5,%ymm3 - vpmuludq %ymm4,%ymm5,%ymm4 - vpaddq %ymm3,%ymm13,%ymm2 - vpaddq %ymm4,%ymm14,%ymm3 - vpmuludq 84(%rax),%ymm0,%ymm4 - vpmuludq %ymm1,%ymm5,%ymm0 - vmovdqa 64(%rcx),%ymm5 - vpaddq %ymm4,%ymm15,%ymm4 - vpaddq %ymm0,%ymm11,%ymm0 - - vpsrldq $8,%ymm12,%ymm8 - vpsrldq $8,%ymm2,%ymm9 - vpsrldq $8,%ymm3,%ymm10 - vpsrldq $8,%ymm4,%ymm6 - vpsrldq $8,%ymm0,%ymm7 - vpaddq %ymm8,%ymm12,%ymm12 - vpaddq %ymm9,%ymm2,%ymm2 - vpaddq %ymm10,%ymm3,%ymm3 - vpaddq %ymm6,%ymm4,%ymm4 - vpaddq %ymm7,%ymm0,%ymm0 - - vpermq $0x2,%ymm3,%ymm10 - vpermq $0x2,%ymm4,%ymm6 - vpermq $0x2,%ymm0,%ymm7 - vpermq $0x2,%ymm12,%ymm8 - vpermq $0x2,%ymm2,%ymm9 - vpaddq %ymm10,%ymm3,%ymm3 - vpaddq %ymm6,%ymm4,%ymm4 - vpaddq %ymm7,%ymm0,%ymm0 - vpaddq %ymm8,%ymm12,%ymm12 - vpaddq %ymm9,%ymm2,%ymm2 - - vpsrlq $26,%ymm3,%ymm14 - vpand %ymm5,%ymm3,%ymm3 - vpaddq %ymm14,%ymm4,%ymm4 - - vpsrlq $26,%ymm0,%ymm11 - vpand %ymm5,%ymm0,%ymm0 - vpaddq %ymm11,%ymm12,%ymm1 - - vpsrlq $26,%ymm4,%ymm15 - vpand %ymm5,%ymm4,%ymm4 - - vpsrlq $26,%ymm1,%ymm12 - vpand %ymm5,%ymm1,%ymm1 - vpaddq %ymm12,%ymm2,%ymm2 - - vpaddq %ymm15,%ymm0,%ymm0 - vpsllq $2,%ymm15,%ymm15 - vpaddq %ymm15,%ymm0,%ymm0 - - vpsrlq $26,%ymm2,%ymm13 - vpand %ymm5,%ymm2,%ymm2 - vpaddq %ymm13,%ymm3,%ymm3 - - vpsrlq $26,%ymm0,%ymm11 - vpand %ymm5,%ymm0,%ymm0 - vpaddq %ymm11,%ymm1,%ymm1 - - vpsrlq $26,%ymm3,%ymm14 - vpand %ymm5,%ymm3,%ymm3 - vpaddq %ymm14,%ymm4,%ymm4 - - vmovd %xmm0,-112(%rdi) - vmovd %xmm1,-108(%rdi) - vmovd %xmm2,-104(%rdi) - vmovd %xmm3,-100(%rdi) - vmovd %xmm4,-96(%rdi) - leaq -8(%r10),%rsp - - vzeroupper - ret - -.Lblocks_avx512: - - movl $15,%eax - kmovw %eax,%k2 - leaq 8(%rsp),%r10 - - subq $0x128,%rsp - leaq .Lconst(%rip),%rcx - leaq 48+64(%rdi),%rdi - vmovdqa 96(%rcx),%ymm9 - - vmovdqu32 -64(%rdi),%zmm16{%k2}{z} - andq $-512,%rsp - vmovdqu32 -48(%rdi),%zmm17{%k2}{z} - movq $0x20,%rax - vmovdqu32 -32(%rdi),%zmm21{%k2}{z} - vmovdqu32 -16(%rdi),%zmm18{%k2}{z} - vmovdqu32 0(%rdi),%zmm22{%k2}{z} - vmovdqu32 16(%rdi),%zmm19{%k2}{z} - vmovdqu32 32(%rdi),%zmm23{%k2}{z} - vmovdqu32 48(%rdi),%zmm20{%k2}{z} - vmovdqu32 64(%rdi),%zmm24{%k2}{z} - vpermd %zmm16,%zmm9,%zmm16 - vpbroadcastq 64(%rcx),%zmm5 - vpermd %zmm17,%zmm9,%zmm17 - vpermd %zmm21,%zmm9,%zmm21 - vpermd %zmm18,%zmm9,%zmm18 - vmovdqa64 %zmm16,0(%rsp){%k2} - vpsrlq $32,%zmm16,%zmm7 - vpermd %zmm22,%zmm9,%zmm22 - vmovdqu64 %zmm17,0(%rsp,%rax,1){%k2} - vpsrlq $32,%zmm17,%zmm8 - vpermd %zmm19,%zmm9,%zmm19 - vmovdqa64 %zmm21,64(%rsp){%k2} - vpermd %zmm23,%zmm9,%zmm23 - vpermd %zmm20,%zmm9,%zmm20 - vmovdqu64 %zmm18,64(%rsp,%rax,1){%k2} - vpermd %zmm24,%zmm9,%zmm24 - vmovdqa64 %zmm22,128(%rsp){%k2} - vmovdqu64 %zmm19,128(%rsp,%rax,1){%k2} - vmovdqa64 %zmm23,192(%rsp){%k2} - vmovdqu64 %zmm20,192(%rsp,%rax,1){%k2} - vmovdqa64 %zmm24,256(%rsp){%k2} - - vpmuludq %zmm7,%zmm16,%zmm11 - vpmuludq %zmm7,%zmm17,%zmm12 - vpmuludq %zmm7,%zmm18,%zmm13 - vpmuludq %zmm7,%zmm19,%zmm14 - vpmuludq %zmm7,%zmm20,%zmm15 - vpsrlq $32,%zmm18,%zmm9 - - vpmuludq %zmm8,%zmm24,%zmm25 - vpmuludq %zmm8,%zmm16,%zmm26 - vpmuludq %zmm8,%zmm17,%zmm27 - vpmuludq %zmm8,%zmm18,%zmm28 - vpmuludq %zmm8,%zmm19,%zmm29 - vpsrlq $32,%zmm19,%zmm10 - vpaddq %zmm25,%zmm11,%zmm11 - vpaddq %zmm26,%zmm12,%zmm12 - vpaddq %zmm27,%zmm13,%zmm13 - vpaddq %zmm28,%zmm14,%zmm14 - vpaddq %zmm29,%zmm15,%zmm15 - - vpmuludq %zmm9,%zmm23,%zmm25 - vpmuludq %zmm9,%zmm24,%zmm26 - vpmuludq %zmm9,%zmm17,%zmm28 - vpmuludq %zmm9,%zmm18,%zmm29 - vpmuludq %zmm9,%zmm16,%zmm27 - vpsrlq $32,%zmm20,%zmm6 - vpaddq %zmm25,%zmm11,%zmm11 - vpaddq %zmm26,%zmm12,%zmm12 - vpaddq %zmm28,%zmm14,%zmm14 - vpaddq %zmm29,%zmm15,%zmm15 - vpaddq %zmm27,%zmm13,%zmm13 - - vpmuludq %zmm10,%zmm22,%zmm25 - vpmuludq %zmm10,%zmm16,%zmm28 - vpmuludq %zmm10,%zmm17,%zmm29 - vpmuludq %zmm10,%zmm23,%zmm26 - vpmuludq %zmm10,%zmm24,%zmm27 - vpaddq %zmm25,%zmm11,%zmm11 - vpaddq %zmm28,%zmm14,%zmm14 - vpaddq %zmm29,%zmm15,%zmm15 - vpaddq %zmm26,%zmm12,%zmm12 - vpaddq %zmm27,%zmm13,%zmm13 - - vpmuludq %zmm6,%zmm24,%zmm28 - vpmuludq %zmm6,%zmm16,%zmm29 - vpmuludq %zmm6,%zmm21,%zmm25 - vpmuludq %zmm6,%zmm22,%zmm26 - vpmuludq %zmm6,%zmm23,%zmm27 - vpaddq %zmm28,%zmm14,%zmm14 - vpaddq %zmm29,%zmm15,%zmm15 - vpaddq %zmm25,%zmm11,%zmm11 - vpaddq %zmm26,%zmm12,%zmm12 - vpaddq %zmm27,%zmm13,%zmm13 - - vmovdqu64 0(%rsi),%zmm10 - vmovdqu64 64(%rsi),%zmm6 - leaq 128(%rsi),%rsi - - vpsrlq $26,%zmm14,%zmm28 - vpandq %zmm5,%zmm14,%zmm14 - vpaddq %zmm28,%zmm15,%zmm15 - - vpsrlq $26,%zmm11,%zmm25 - vpandq %zmm5,%zmm11,%zmm11 - vpaddq %zmm25,%zmm12,%zmm12 - - vpsrlq $26,%zmm15,%zmm29 - vpandq %zmm5,%zmm15,%zmm15 - - vpsrlq $26,%zmm12,%zmm26 - vpandq %zmm5,%zmm12,%zmm12 - vpaddq %zmm26,%zmm13,%zmm13 - - vpaddq %zmm29,%zmm11,%zmm11 - vpsllq $2,%zmm29,%zmm29 - vpaddq %zmm29,%zmm11,%zmm11 - - vpsrlq $26,%zmm13,%zmm27 - vpandq %zmm5,%zmm13,%zmm13 - vpaddq %zmm27,%zmm14,%zmm14 - - vpsrlq $26,%zmm11,%zmm25 - vpandq %zmm5,%zmm11,%zmm11 - vpaddq %zmm25,%zmm12,%zmm12 - - vpsrlq $26,%zmm14,%zmm28 - vpandq %zmm5,%zmm14,%zmm14 - vpaddq %zmm28,%zmm15,%zmm15 - - vpunpcklqdq %zmm6,%zmm10,%zmm7 - vpunpckhqdq %zmm6,%zmm10,%zmm6 - - vmovdqa32 128(%rcx),%zmm25 - movl $0x7777,%eax - kmovw %eax,%k1 - - vpermd %zmm16,%zmm25,%zmm16 - vpermd %zmm17,%zmm25,%zmm17 - vpermd %zmm18,%zmm25,%zmm18 - vpermd %zmm19,%zmm25,%zmm19 - vpermd %zmm20,%zmm25,%zmm20 - - vpermd %zmm11,%zmm25,%zmm16{%k1} - vpermd %zmm12,%zmm25,%zmm17{%k1} - vpermd %zmm13,%zmm25,%zmm18{%k1} - vpermd %zmm14,%zmm25,%zmm19{%k1} - vpermd %zmm15,%zmm25,%zmm20{%k1} - - vpslld $2,%zmm17,%zmm21 - vpslld $2,%zmm18,%zmm22 - vpslld $2,%zmm19,%zmm23 - vpslld $2,%zmm20,%zmm24 - vpaddd %zmm17,%zmm21,%zmm21 - vpaddd %zmm18,%zmm22,%zmm22 - vpaddd %zmm19,%zmm23,%zmm23 - vpaddd %zmm20,%zmm24,%zmm24 - - vpbroadcastq 32(%rcx),%zmm30 - - vpsrlq $52,%zmm7,%zmm9 - vpsllq $12,%zmm6,%zmm10 - vporq %zmm10,%zmm9,%zmm9 - vpsrlq $26,%zmm7,%zmm8 - vpsrlq $14,%zmm6,%zmm10 - vpsrlq $40,%zmm6,%zmm6 - vpandq %zmm5,%zmm9,%zmm9 - vpandq %zmm5,%zmm7,%zmm7 - - vpaddq %zmm2,%zmm9,%zmm2 - subq $192,%rdx - jbe .Ltail_avx512 - jmp .Loop_avx512 - -.align 32 -.Loop_avx512: - - vpmuludq %zmm2,%zmm17,%zmm14 - vpaddq %zmm0,%zmm7,%zmm0 - vpmuludq %zmm2,%zmm18,%zmm15 - vpandq %zmm5,%zmm8,%zmm8 - vpmuludq %zmm2,%zmm23,%zmm11 - vpandq %zmm5,%zmm10,%zmm10 - vpmuludq %zmm2,%zmm24,%zmm12 - vporq %zmm30,%zmm6,%zmm6 - vpmuludq %zmm2,%zmm16,%zmm13 - vpaddq %zmm1,%zmm8,%zmm1 - vpaddq %zmm3,%zmm10,%zmm3 - vpaddq %zmm4,%zmm6,%zmm4 - - vmovdqu64 0(%rsi),%zmm10 - vmovdqu64 64(%rsi),%zmm6 - leaq 128(%rsi),%rsi - vpmuludq %zmm0,%zmm19,%zmm28 - vpmuludq %zmm0,%zmm20,%zmm29 - vpmuludq %zmm0,%zmm16,%zmm25 - vpmuludq %zmm0,%zmm17,%zmm26 - vpaddq %zmm28,%zmm14,%zmm14 - vpaddq %zmm29,%zmm15,%zmm15 - vpaddq %zmm25,%zmm11,%zmm11 - vpaddq %zmm26,%zmm12,%zmm12 - - vpmuludq %zmm1,%zmm18,%zmm28 - vpmuludq %zmm1,%zmm19,%zmm29 - vpmuludq %zmm1,%zmm24,%zmm25 - vpmuludq %zmm0,%zmm18,%zmm27 - vpaddq %zmm28,%zmm14,%zmm14 - vpaddq %zmm29,%zmm15,%zmm15 - vpaddq %zmm25,%zmm11,%zmm11 - vpaddq %zmm27,%zmm13,%zmm13 - - vpunpcklqdq %zmm6,%zmm10,%zmm7 - vpunpckhqdq %zmm6,%zmm10,%zmm6 - - vpmuludq %zmm3,%zmm16,%zmm28 - vpmuludq %zmm3,%zmm17,%zmm29 - vpmuludq %zmm1,%zmm16,%zmm26 - vpmuludq %zmm1,%zmm17,%zmm27 - vpaddq %zmm28,%zmm14,%zmm14 - vpaddq %zmm29,%zmm15,%zmm15 - vpaddq %zmm26,%zmm12,%zmm12 - vpaddq %zmm27,%zmm13,%zmm13 - - vpmuludq %zmm4,%zmm24,%zmm28 - vpmuludq %zmm4,%zmm16,%zmm29 - vpmuludq %zmm3,%zmm22,%zmm25 - vpmuludq %zmm3,%zmm23,%zmm26 - vpaddq %zmm28,%zmm14,%zmm14 - vpmuludq %zmm3,%zmm24,%zmm27 - vpaddq %zmm29,%zmm15,%zmm15 - vpaddq %zmm25,%zmm11,%zmm11 - vpaddq %zmm26,%zmm12,%zmm12 - vpaddq %zmm27,%zmm13,%zmm13 - - vpmuludq %zmm4,%zmm21,%zmm25 - vpmuludq %zmm4,%zmm22,%zmm26 - vpmuludq %zmm4,%zmm23,%zmm27 - vpaddq %zmm25,%zmm11,%zmm0 - vpaddq %zmm26,%zmm12,%zmm1 - vpaddq %zmm27,%zmm13,%zmm2 - - vpsrlq $52,%zmm7,%zmm9 - vpsllq $12,%zmm6,%zmm10 - - vpsrlq $26,%zmm14,%zmm3 - vpandq %zmm5,%zmm14,%zmm14 - vpaddq %zmm3,%zmm15,%zmm4 - - vporq %zmm10,%zmm9,%zmm9 - - vpsrlq $26,%zmm0,%zmm11 - vpandq %zmm5,%zmm0,%zmm0 - vpaddq %zmm11,%zmm1,%zmm1 - - vpandq %zmm5,%zmm9,%zmm9 - - vpsrlq $26,%zmm4,%zmm15 - vpandq %zmm5,%zmm4,%zmm4 - - vpsrlq $26,%zmm1,%zmm12 - vpandq %zmm5,%zmm1,%zmm1 - vpaddq %zmm12,%zmm2,%zmm2 - - vpaddq %zmm15,%zmm0,%zmm0 - vpsllq $2,%zmm15,%zmm15 - vpaddq %zmm15,%zmm0,%zmm0 - - vpaddq %zmm9,%zmm2,%zmm2 - vpsrlq $26,%zmm7,%zmm8 - - vpsrlq $26,%zmm2,%zmm13 - vpandq %zmm5,%zmm2,%zmm2 - vpaddq %zmm13,%zmm14,%zmm3 - - vpsrlq $14,%zmm6,%zmm10 - - vpsrlq $26,%zmm0,%zmm11 - vpandq %zmm5,%zmm0,%zmm0 - vpaddq %zmm11,%zmm1,%zmm1 - - vpsrlq $40,%zmm6,%zmm6 - - vpsrlq $26,%zmm3,%zmm14 - vpandq %zmm5,%zmm3,%zmm3 - vpaddq %zmm14,%zmm4,%zmm4 - - vpandq %zmm5,%zmm7,%zmm7 - - subq $128,%rdx - ja .Loop_avx512 - -.Ltail_avx512: - - vpsrlq $32,%zmm16,%zmm16 - vpsrlq $32,%zmm17,%zmm17 - vpsrlq $32,%zmm18,%zmm18 - vpsrlq $32,%zmm23,%zmm23 - vpsrlq $32,%zmm24,%zmm24 - vpsrlq $32,%zmm19,%zmm19 - vpsrlq $32,%zmm20,%zmm20 - vpsrlq $32,%zmm21,%zmm21 - vpsrlq $32,%zmm22,%zmm22 - - leaq (%rsi,%rdx,1),%rsi - - vpaddq %zmm0,%zmm7,%zmm0 - - vpmuludq %zmm2,%zmm17,%zmm14 - vpmuludq %zmm2,%zmm18,%zmm15 - vpmuludq %zmm2,%zmm23,%zmm11 - vpandq %zmm5,%zmm8,%zmm8 - vpmuludq %zmm2,%zmm24,%zmm12 - vpandq %zmm5,%zmm10,%zmm10 - vpmuludq %zmm2,%zmm16,%zmm13 - vporq %zmm30,%zmm6,%zmm6 - vpaddq %zmm1,%zmm8,%zmm1 - vpaddq %zmm3,%zmm10,%zmm3 - vpaddq %zmm4,%zmm6,%zmm4 - - vmovdqu 0(%rsi),%xmm7 - vpmuludq %zmm0,%zmm19,%zmm28 - vpmuludq %zmm0,%zmm20,%zmm29 - vpmuludq %zmm0,%zmm16,%zmm25 - vpmuludq %zmm0,%zmm17,%zmm26 - vpaddq %zmm28,%zmm14,%zmm14 - vpaddq %zmm29,%zmm15,%zmm15 - vpaddq %zmm25,%zmm11,%zmm11 - vpaddq %zmm26,%zmm12,%zmm12 - - vmovdqu 16(%rsi),%xmm8 - vpmuludq %zmm1,%zmm18,%zmm28 - vpmuludq %zmm1,%zmm19,%zmm29 - vpmuludq %zmm1,%zmm24,%zmm25 - vpmuludq %zmm0,%zmm18,%zmm27 - vpaddq %zmm28,%zmm14,%zmm14 - vpaddq %zmm29,%zmm15,%zmm15 - vpaddq %zmm25,%zmm11,%zmm11 - vpaddq %zmm27,%zmm13,%zmm13 - - vinserti128 $1,32(%rsi),%ymm7,%ymm7 - vpmuludq %zmm3,%zmm16,%zmm28 - vpmuludq %zmm3,%zmm17,%zmm29 - vpmuludq %zmm1,%zmm16,%zmm26 - vpmuludq %zmm1,%zmm17,%zmm27 - vpaddq %zmm28,%zmm14,%zmm14 - vpaddq %zmm29,%zmm15,%zmm15 - vpaddq %zmm26,%zmm12,%zmm12 - vpaddq %zmm27,%zmm13,%zmm13 - - vinserti128 $1,48(%rsi),%ymm8,%ymm8 - vpmuludq %zmm4,%zmm24,%zmm28 - vpmuludq %zmm4,%zmm16,%zmm29 - vpmuludq %zmm3,%zmm22,%zmm25 - vpmuludq %zmm3,%zmm23,%zmm26 - vpmuludq %zmm3,%zmm24,%zmm27 - vpaddq %zmm28,%zmm14,%zmm3 - vpaddq %zmm29,%zmm15,%zmm15 - vpaddq %zmm25,%zmm11,%zmm11 - vpaddq %zmm26,%zmm12,%zmm12 - vpaddq %zmm27,%zmm13,%zmm13 - - vpmuludq %zmm4,%zmm21,%zmm25 - vpmuludq %zmm4,%zmm22,%zmm26 - vpmuludq %zmm4,%zmm23,%zmm27 - vpaddq %zmm25,%zmm11,%zmm0 - vpaddq %zmm26,%zmm12,%zmm1 - vpaddq %zmm27,%zmm13,%zmm2 - - movl $1,%eax - vpermq $0xb1,%zmm3,%zmm14 - vpermq $0xb1,%zmm15,%zmm4 - vpermq $0xb1,%zmm0,%zmm11 - vpermq $0xb1,%zmm1,%zmm12 - vpermq $0xb1,%zmm2,%zmm13 - vpaddq %zmm14,%zmm3,%zmm3 - vpaddq %zmm15,%zmm4,%zmm4 - vpaddq %zmm11,%zmm0,%zmm0 - vpaddq %zmm12,%zmm1,%zmm1 - vpaddq %zmm13,%zmm2,%zmm2 - - kmovw %eax,%k3 - vpermq $0x2,%zmm3,%zmm14 - vpermq $0x2,%zmm4,%zmm15 - vpermq $0x2,%zmm0,%zmm11 - vpermq $0x2,%zmm1,%zmm12 - vpermq $0x2,%zmm2,%zmm13 - vpaddq %zmm14,%zmm3,%zmm3 - vpaddq %zmm15,%zmm4,%zmm4 - vpaddq %zmm11,%zmm0,%zmm0 - vpaddq %zmm12,%zmm1,%zmm1 - vpaddq %zmm13,%zmm2,%zmm2 - - vextracti64x4 $0x1,%zmm3,%ymm14 - vextracti64x4 $0x1,%zmm4,%ymm15 - vextracti64x4 $0x1,%zmm0,%ymm11 - vextracti64x4 $0x1,%zmm1,%ymm12 - vextracti64x4 $0x1,%zmm2,%ymm13 - vpaddq %zmm14,%zmm3,%zmm3{%k3}{z} - vpaddq %zmm15,%zmm4,%zmm4{%k3}{z} - vpaddq %zmm11,%zmm0,%zmm0{%k3}{z} - vpaddq %zmm12,%zmm1,%zmm1{%k3}{z} - vpaddq %zmm13,%zmm2,%zmm2{%k3}{z} - - vpsrlq $26,%ymm3,%ymm14 - vpand %ymm5,%ymm3,%ymm3 - vpsrldq $6,%ymm7,%ymm9 - vpsrldq $6,%ymm8,%ymm10 - vpunpckhqdq %ymm8,%ymm7,%ymm6 - vpaddq %ymm14,%ymm4,%ymm4 - - vpsrlq $26,%ymm0,%ymm11 - vpand %ymm5,%ymm0,%ymm0 - vpunpcklqdq %ymm10,%ymm9,%ymm9 - vpunpcklqdq %ymm8,%ymm7,%ymm7 - vpaddq %ymm11,%ymm1,%ymm1 - - vpsrlq $26,%ymm4,%ymm15 - vpand %ymm5,%ymm4,%ymm4 - - vpsrlq $26,%ymm1,%ymm12 - vpand %ymm5,%ymm1,%ymm1 - vpsrlq $30,%ymm9,%ymm10 - vpsrlq $4,%ymm9,%ymm9 - vpaddq %ymm12,%ymm2,%ymm2 - - vpaddq %ymm15,%ymm0,%ymm0 - vpsllq $2,%ymm15,%ymm15 - vpsrlq $26,%ymm7,%ymm8 - vpsrlq $40,%ymm6,%ymm6 - vpaddq %ymm15,%ymm0,%ymm0 - - vpsrlq $26,%ymm2,%ymm13 - vpand %ymm5,%ymm2,%ymm2 - vpand %ymm5,%ymm9,%ymm9 - vpand %ymm5,%ymm7,%ymm7 - vpaddq %ymm13,%ymm3,%ymm3 - - vpsrlq $26,%ymm0,%ymm11 - vpand %ymm5,%ymm0,%ymm0 - vpaddq %ymm2,%ymm9,%ymm2 - vpand %ymm5,%ymm8,%ymm8 - vpaddq %ymm11,%ymm1,%ymm1 - - vpsrlq $26,%ymm3,%ymm14 - vpand %ymm5,%ymm3,%ymm3 - vpand %ymm5,%ymm10,%ymm10 - vpor 32(%rcx),%ymm6,%ymm6 - vpaddq %ymm14,%ymm4,%ymm4 - - leaq 144(%rsp),%rax - addq $64,%rdx - jnz .Ltail_avx2_512 - - vpsubq %ymm9,%ymm2,%ymm2 - vmovd %xmm0,-112(%rdi) - vmovd %xmm1,-108(%rdi) - vmovd %xmm2,-104(%rdi) - vmovd %xmm3,-100(%rdi) - vmovd %xmm4,-96(%rdi) - vzeroall - leaq -8(%r10),%rsp - - ret - -ENDPROC(poly1305_blocks_avx512) -#endif /* CONFIG_AS_AVX512 */ |