author    Jason A. Donenfeld <Jason@zx2c4.com>   2018-02-01 16:21:51 +0100
committer Jason A. Donenfeld <Jason@zx2c4.com>   2018-03-09 13:47:23 +0100
commit    186be2742c948351c27bc068102252e10a28959b (patch)
tree      729fc328045a250e0f78a186455877a0bcff2ab1 /src
parent    fd54417e41c13f021609bf4f328bd7b9b8411e30 (diff)
curve25519: use precomp implementation instead of sandy2x

It's faster and doesn't use the FPU.

Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
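For background on the "doesn't use the FPU" point: on x86, kernel code may only touch SSE/AVX registers inside a kernel_fpu_begin()/kernel_fpu_end() section, which saves and restores the vector register state and disables preemption. The AVX-based sandy2x code removed below therefore needs a wrapper along these lines, whereas the precomp implementation runs entirely on general-purpose registers and can be called directly. This is a minimal illustrative sketch, not code from this commit; the wrapper name curve25519_avx and its fallback convention are assumptions.

#include <linux/types.h>
#include <asm/fpu/api.h> /* kernel_fpu_begin, kernel_fpu_end */
#include <asm/simd.h>    /* may_use_simd */

/* Hypothetical sketch: bracketing an AVX ladder such as
 * curve25519_sandy2x_ladder() for use in kernel context. */
static bool curve25519_avx(u8 mypublic[32], const u8 secret[32],
			   const u8 basepoint[32])
{
	if (!may_use_simd())
		return false;   /* caller falls back to an integer-only path */
	kernel_fpu_begin();     /* save vector state, disable preemption */
	/* ... run the AVX Montgomery ladder here ... */
	kernel_fpu_end();       /* restore vector state */
	return true;
}

Dropping this bracketing (and the save/restore cost it implies on every call) is part of why the integer-only precomp code comes out faster in kernel context.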
Diffstat (limited to 'src')
-rw-r--r--  src/Kbuild                       2
-rw-r--r--  src/crypto/curve25519-x86_64.S   3261
-rw-r--r--  src/crypto/curve25519-x86_64.h   2220
-rw-r--r--  src/crypto/curve25519.c          26
4 files changed, 2071 insertions, 3438 deletions
diff --git a/src/Kbuild b/src/Kbuild
index 5ffc1b9..3569ec3 100644
--- a/src/Kbuild
+++ b/src/Kbuild
@@ -9,7 +9,7 @@ ccflags-y += -D'pr_fmt(fmt)=KBUILD_MODNAME ": " fmt'
wireguard-y := main.o noise.o device.o peer.o timers.o queueing.o send.o receive.o socket.o hashtables.o allowedips.o ratelimiter.o cookie.o netlink.o
wireguard-y += crypto/curve25519.o crypto/chacha20poly1305.o crypto/blake2s.o
-wireguard-$(CONFIG_X86_64) += crypto/chacha20-x86_64.o crypto/poly1305-x86_64.o crypto/blake2s-x86_64.o crypto/curve25519-x86_64.o
+wireguard-$(CONFIG_X86_64) += crypto/chacha20-x86_64.o crypto/poly1305-x86_64.o crypto/blake2s-x86_64.o
wireguard-$(CONFIG_ARM) += crypto/chacha20-arm.o crypto/poly1305-arm.o crypto/curve25519-arm.o
wireguard-$(CONFIG_ARM64) += crypto/chacha20-arm64.o crypto/poly1305-arm64.o
wireguard-$(if $(filter yy,$(CONFIG_MIPS)$(CONFIG_64BIT)),y,n) += crypto/poly1305-mips64.o
diff --git a/src/crypto/curve25519-x86_64.S b/src/crypto/curve25519-x86_64.S
deleted file mode 100644
index 57fe50f..0000000
--- a/src/crypto/curve25519-x86_64.S
+++ /dev/null
@@ -1,3261 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0
- *
- * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
- *
- * Based on algorithms from Tung Chou <blueprint@crypto.tw>
- */
-
-#include <linux/linkage.h>
-
-.data
-.align 16
-curve25519_sandy2x_v0_0: .quad 0, 0
-curve25519_sandy2x_v1_0: .quad 1, 0
-curve25519_sandy2x_v2_1: .quad 2, 1
-curve25519_sandy2x_v9_0: .quad 9, 0
-curve25519_sandy2x_v9_9: .quad 9, 9
-curve25519_sandy2x_v19_19: .quad 19, 19
-curve25519_sandy2x_v38_1: .quad 38, 1
-curve25519_sandy2x_v38_38: .quad 38, 38
-curve25519_sandy2x_v121666_121666: .quad 121666, 121666
-curve25519_sandy2x_m25: .quad 33554431, 33554431
-curve25519_sandy2x_m26: .quad 67108863, 67108863
-curve25519_sandy2x_subc0: .quad 0x07FFFFDA, 0x03FFFFFE
-curve25519_sandy2x_subc2: .quad 0x07FFFFFE, 0x03FFFFFE
-curve25519_sandy2x_REDMASK51: .quad 0x0007FFFFFFFFFFFF
-
-.text
-.align 32
-#ifdef CONFIG_AS_AVX
-ENTRY(curve25519_sandy2x_fe51_mul)
- push %rbp
- mov %rsp,%rbp
- sub $96,%rsp
- and $-32,%rsp
- movq %r11,0(%rsp)
- movq %r12,8(%rsp)
- movq %r13,16(%rsp)
- movq %r14,24(%rsp)
- movq %r15,32(%rsp)
- movq %rbx,40(%rsp)
- movq %rbp,48(%rsp)
- movq %rdi,56(%rsp)
- mov %rdx,%rcx
- movq 24(%rsi),%rdx
- imulq $19,%rdx,%rax
- movq %rax,64(%rsp)
- mulq 16(%rcx)
- mov %rax,%r8
- mov %rdx,%r9
- movq 32(%rsi),%rdx
- imulq $19,%rdx,%rax
- movq %rax,72(%rsp)
- mulq 8(%rcx)
- add %rax,%r8
- adc %rdx,%r9
- movq 0(%rsi),%rax
- mulq 0(%rcx)
- add %rax,%r8
- adc %rdx,%r9
- movq 0(%rsi),%rax
- mulq 8(%rcx)
- mov %rax,%r10
- mov %rdx,%r11
- movq 0(%rsi),%rax
- mulq 16(%rcx)
- mov %rax,%r12
- mov %rdx,%r13
- movq 0(%rsi),%rax
- mulq 24(%rcx)
- mov %rax,%r14
- mov %rdx,%r15
- movq 0(%rsi),%rax
- mulq 32(%rcx)
- mov %rax,%rbx
- mov %rdx,%rbp
- movq 8(%rsi),%rax
- mulq 0(%rcx)
- add %rax,%r10
- adc %rdx,%r11
- movq 8(%rsi),%rax
- mulq 8(%rcx)
- add %rax,%r12
- adc %rdx,%r13
- movq 8(%rsi),%rax
- mulq 16(%rcx)
- add %rax,%r14
- adc %rdx,%r15
- movq 8(%rsi),%rax
- mulq 24(%rcx)
- add %rax,%rbx
- adc %rdx,%rbp
- movq 8(%rsi),%rdx
- imulq $19,%rdx,%rax
- mulq 32(%rcx)
- add %rax,%r8
- adc %rdx,%r9
- movq 16(%rsi),%rax
- mulq 0(%rcx)
- add %rax,%r12
- adc %rdx,%r13
- movq 16(%rsi),%rax
- mulq 8(%rcx)
- add %rax,%r14
- adc %rdx,%r15
- movq 16(%rsi),%rax
- mulq 16(%rcx)
- add %rax,%rbx
- adc %rdx,%rbp
- movq 16(%rsi),%rdx
- imulq $19,%rdx,%rax
- mulq 24(%rcx)
- add %rax,%r8
- adc %rdx,%r9
- movq 16(%rsi),%rdx
- imulq $19,%rdx,%rax
- mulq 32(%rcx)
- add %rax,%r10
- adc %rdx,%r11
- movq 24(%rsi),%rax
- mulq 0(%rcx)
- add %rax,%r14
- adc %rdx,%r15
- movq 24(%rsi),%rax
- mulq 8(%rcx)
- add %rax,%rbx
- adc %rdx,%rbp
- movq 64(%rsp),%rax
- mulq 24(%rcx)
- add %rax,%r10
- adc %rdx,%r11
- movq 64(%rsp),%rax
- mulq 32(%rcx)
- add %rax,%r12
- adc %rdx,%r13
- movq 32(%rsi),%rax
- mulq 0(%rcx)
- add %rax,%rbx
- adc %rdx,%rbp
- movq 72(%rsp),%rax
- mulq 16(%rcx)
- add %rax,%r10
- adc %rdx,%r11
- movq 72(%rsp),%rax
- mulq 24(%rcx)
- add %rax,%r12
- adc %rdx,%r13
- movq 72(%rsp),%rax
- mulq 32(%rcx)
- add %rax,%r14
- adc %rdx,%r15
- movq curve25519_sandy2x_REDMASK51(%rip),%rsi
- shld $13,%r8,%r9
- and %rsi,%r8
- shld $13,%r10,%r11
- and %rsi,%r10
- add %r9,%r10
- shld $13,%r12,%r13
- and %rsi,%r12
- add %r11,%r12
- shld $13,%r14,%r15
- and %rsi,%r14
- add %r13,%r14
- shld $13,%rbx,%rbp
- and %rsi,%rbx
- add %r15,%rbx
- imulq $19,%rbp,%rdx
- add %rdx,%r8
- mov %r8,%rdx
- shr $51,%rdx
- add %r10,%rdx
- mov %rdx,%rcx
- shr $51,%rdx
- and %rsi,%r8
- add %r12,%rdx
- mov %rdx,%r9
- shr $51,%rdx
- and %rsi,%rcx
- add %r14,%rdx
- mov %rdx,%rax
- shr $51,%rdx
- and %rsi,%r9
- add %rbx,%rdx
- mov %rdx,%r10
- shr $51,%rdx
- and %rsi,%rax
- imulq $19,%rdx,%rdx
- add %rdx,%r8
- and %rsi,%r10
- movq %r8,0(%rdi)
- movq %rcx,8(%rdi)
- movq %r9,16(%rdi)
- movq %rax,24(%rdi)
- movq %r10,32(%rdi)
- movq 0(%rsp),%r11
- movq 8(%rsp),%r12
- movq 16(%rsp),%r13
- movq 24(%rsp),%r14
- movq 32(%rsp),%r15
- movq 40(%rsp),%rbx
- movq 48(%rsp),%rbp
- leave
- ret
-ENDPROC(curve25519_sandy2x_fe51_mul)
-
-.align 32
-ENTRY(curve25519_sandy2x_fe51_nsquare)
- push %rbp
- mov %rsp,%rbp
- sub $64,%rsp
- and $-32,%rsp
- movq %r11,0(%rsp)
- movq %r12,8(%rsp)
- movq %r13,16(%rsp)
- movq %r14,24(%rsp)
- movq %r15,32(%rsp)
- movq %rbx,40(%rsp)
- movq %rbp,48(%rsp)
- movq 0(%rsi),%rcx
- movq 8(%rsi),%r8
- movq 16(%rsi),%r9
- movq 24(%rsi),%rax
- movq 32(%rsi),%rsi
- movq %r9,16(%rdi)
- movq %rax,24(%rdi)
- movq %rsi,32(%rdi)
- mov %rdx,%rsi
-
- .align 16
- .Lloop:
- sub $1,%rsi
- mov %rcx,%rax
- mul %rcx
- add %rcx,%rcx
- mov %rax,%r9
- mov %rdx,%r10
- mov %rcx,%rax
- mul %r8
- mov %rax,%r11
- mov %rdx,%r12
- mov %rcx,%rax
- mulq 16(%rdi)
- mov %rax,%r13
- mov %rdx,%r14
- mov %rcx,%rax
- mulq 24(%rdi)
- mov %rax,%r15
- mov %rdx,%rbx
- mov %rcx,%rax
- mulq 32(%rdi)
- mov %rax,%rcx
- mov %rdx,%rbp
- mov %r8,%rax
- mul %r8
- add %r8,%r8
- add %rax,%r13
- adc %rdx,%r14
- mov %r8,%rax
- mulq 16(%rdi)
- add %rax,%r15
- adc %rdx,%rbx
- mov %r8,%rax
- imulq $19, %r8,%r8
- mulq 24(%rdi)
- add %rax,%rcx
- adc %rdx,%rbp
- mov %r8,%rax
- mulq 32(%rdi)
- add %rax,%r9
- adc %rdx,%r10
- movq 16(%rdi),%rax
- mulq 16(%rdi)
- add %rax,%rcx
- adc %rdx,%rbp
- shld $13,%rcx,%rbp
- movq 16(%rdi),%rax
- imulq $38, %rax,%rax
- mulq 24(%rdi)
- add %rax,%r9
- adc %rdx,%r10
- shld $13,%r9,%r10
- movq 16(%rdi),%rax
- imulq $38, %rax,%rax
- mulq 32(%rdi)
- add %rax,%r11
- adc %rdx,%r12
- movq 24(%rdi),%rax
- imulq $19, %rax,%rax
- mulq 24(%rdi)
- add %rax,%r11
- adc %rdx,%r12
- shld $13,%r11,%r12
- movq 24(%rdi),%rax
- imulq $38, %rax,%rax
- mulq 32(%rdi)
- add %rax,%r13
- adc %rdx,%r14
- shld $13,%r13,%r14
- movq 32(%rdi),%rax
- imulq $19, %rax,%rax
- mulq 32(%rdi)
- add %rax,%r15
- adc %rdx,%rbx
- shld $13,%r15,%rbx
- movq curve25519_sandy2x_REDMASK51(%rip),%rdx
- and %rdx,%rcx
- add %rbx,%rcx
- and %rdx,%r9
- and %rdx,%r11
- add %r10,%r11
- and %rdx,%r13
- add %r12,%r13
- and %rdx,%r15
- add %r14,%r15
- imulq $19, %rbp,%rbp
- lea (%r9,%rbp),%r9
- mov %r9,%rax
- shr $51,%r9
- add %r11,%r9
- and %rdx,%rax
- mov %r9,%r8
- shr $51,%r9
- add %r13,%r9
- and %rdx,%r8
- mov %r9,%r10
- shr $51,%r9
- add %r15,%r9
- and %rdx,%r10
- movq %r10,16(%rdi)
- mov %r9,%r10
- shr $51,%r9
- add %rcx,%r9
- and %rdx,%r10
- movq %r10,24(%rdi)
- mov %r9,%r10
- shr $51,%r9
- imulq $19, %r9,%r9
- lea (%rax,%r9),%rcx
- and %rdx,%r10
- movq %r10,32(%rdi)
- cmp $0,%rsi
- jne .Lloop
-
- movq %rcx,0(%rdi)
- movq %r8,8(%rdi)
- movq 0(%rsp),%r11
- movq 8(%rsp),%r12
- movq 16(%rsp),%r13
- movq 24(%rsp),%r14
- movq 32(%rsp),%r15
- movq 40(%rsp),%rbx
- movq 48(%rsp),%rbp
- leave
- ret
-ENDPROC(curve25519_sandy2x_fe51_nsquare)
-
-.align 32
-ENTRY(curve25519_sandy2x_fe51_pack)
- push %rbp
- mov %rsp,%rbp
- sub $32,%rsp
- and $-32,%rsp
- movq %r11,0(%rsp)
- movq %r12,8(%rsp)
- movq 0(%rsi),%rdx
- movq 8(%rsi),%rcx
- movq 16(%rsi),%r8
- movq 24(%rsi),%r9
- movq 32(%rsi),%rsi
- movq curve25519_sandy2x_REDMASK51(%rip),%rax
- lea -18(%rax),%r10
- mov $3,%r11
-
- .align 16
- .Lreduceloop:
- mov %rdx,%r12
- shr $51,%r12
- and %rax,%rdx
- add %r12,%rcx
- mov %rcx,%r12
- shr $51,%r12
- and %rax,%rcx
- add %r12,%r8
- mov %r8,%r12
- shr $51,%r12
- and %rax,%r8
- add %r12,%r9
- mov %r9,%r12
- shr $51,%r12
- and %rax,%r9
- add %r12,%rsi
- mov %rsi,%r12
- shr $51,%r12
- and %rax,%rsi
- imulq $19, %r12,%r12
- add %r12,%rdx
- sub $1,%r11
- ja .Lreduceloop
-
- mov $1,%r12
- cmp %r10,%rdx
- cmovl %r11,%r12
- cmp %rax,%rcx
- cmovne %r11,%r12
- cmp %rax,%r8
- cmovne %r11,%r12
- cmp %rax,%r9
- cmovne %r11,%r12
- cmp %rax,%rsi
- cmovne %r11,%r12
- neg %r12
- and %r12,%rax
- and %r12,%r10
- sub %r10,%rdx
- sub %rax,%rcx
- sub %rax,%r8
- sub %rax,%r9
- sub %rax,%rsi
- mov %rdx,%rax
- and $0xFF,%eax
- movb %al,0(%rdi)
- mov %rdx,%rax
- shr $8,%rax
- and $0xFF,%eax
- movb %al,1(%rdi)
- mov %rdx,%rax
- shr $16,%rax
- and $0xFF,%eax
- movb %al,2(%rdi)
- mov %rdx,%rax
- shr $24,%rax
- and $0xFF,%eax
- movb %al,3(%rdi)
- mov %rdx,%rax
- shr $32,%rax
- and $0xFF,%eax
- movb %al,4(%rdi)
- mov %rdx,%rax
- shr $40,%rax
- and $0xFF,%eax
- movb %al,5(%rdi)
- mov %rdx,%rdx
- shr $48,%rdx
- mov %rcx,%rax
- shl $3,%rax
- and $0xF8,%eax
- xor %rdx,%rax
- movb %al,6(%rdi)
- mov %rcx,%rdx
- shr $5,%rdx
- and $0xFF,%edx
- movb %dl,7(%rdi)
- mov %rcx,%rdx
- shr $13,%rdx
- and $0xFF,%edx
- movb %dl,8(%rdi)
- mov %rcx,%rdx
- shr $21,%rdx
- and $0xFF,%edx
- movb %dl,9(%rdi)
- mov %rcx,%rdx
- shr $29,%rdx
- and $0xFF,%edx
- movb %dl,10(%rdi)
- mov %rcx,%rdx
- shr $37,%rdx
- and $0xFF,%edx
- movb %dl,11(%rdi)
- mov %rcx,%rdx
- shr $45,%rdx
- mov %r8,%rcx
- shl $6,%rcx
- and $0xC0,%ecx
- xor %rdx,%rcx
- movb %cl,12(%rdi)
- mov %r8,%rdx
- shr $2,%rdx
- and $0xFF,%edx
- movb %dl,13(%rdi)
- mov %r8,%rdx
- shr $10,%rdx
- and $0xFF,%edx
- movb %dl,14(%rdi)
- mov %r8,%rdx
- shr $18,%rdx
- and $0xFF,%edx
- movb %dl,15(%rdi)
- mov %r8,%rdx
- shr $26,%rdx
- and $0xFF,%edx
- movb %dl,16(%rdi)
- mov %r8,%rdx
- shr $34,%rdx
- and $0xFF,%edx
- movb %dl,17(%rdi)
- mov %r8,%rdx
- shr $42,%rdx
- movb %dl,18(%rdi)
- mov %r8,%rdx
- shr $50,%rdx
- mov %r9,%rcx
- shl $1,%rcx
- and $0xFE,%ecx
- xor %rdx,%rcx
- movb %cl,19(%rdi)
- mov %r9,%rdx
- shr $7,%rdx
- and $0xFF,%edx
- movb %dl,20(%rdi)
- mov %r9,%rdx
- shr $15,%rdx
- and $0xFF,%edx
- movb %dl,21(%rdi)
- mov %r9,%rdx
- shr $23,%rdx
- and $0xFF,%edx
- movb %dl,22(%rdi)
- mov %r9,%rdx
- shr $31,%rdx
- and $0xFF,%edx
- movb %dl,23(%rdi)
- mov %r9,%rdx
- shr $39,%rdx
- and $0xFF,%edx
- movb %dl,24(%rdi)
- mov %r9,%rdx
- shr $47,%rdx
- mov %rsi,%rcx
- shl $4,%rcx
- and $0xF0,%ecx
- xor %rdx,%rcx
- movb %cl,25(%rdi)
- mov %rsi,%rdx
- shr $4,%rdx
- and $0xFF,%edx
- movb %dl,26(%rdi)
- mov %rsi,%rdx
- shr $12,%rdx
- and $0xFF,%edx
- movb %dl,27(%rdi)
- mov %rsi,%rdx
- shr $20,%rdx
- and $0xFF,%edx
- movb %dl,28(%rdi)
- mov %rsi,%rdx
- shr $28,%rdx
- and $0xFF,%edx
- movb %dl,29(%rdi)
- mov %rsi,%rdx
- shr $36,%rdx
- and $0xFF,%edx
- movb %dl,30(%rdi)
- mov %rsi,%rsi
- shr $44,%rsi
- movb %sil,31(%rdi)
- movq 0(%rsp),%r11
- movq 8(%rsp),%r12
- leave
- ret
-ENDPROC(curve25519_sandy2x_fe51_pack)
-
-.align 32
-ENTRY(curve25519_sandy2x_ladder)
- push %rbp
- mov %rsp,%rbp
- sub $1856,%rsp
- and $-32,%rsp
- movq %r11,1824(%rsp)
- movq %r12,1832(%rsp)
- movq %r13,1840(%rsp)
- movq %r14,1848(%rsp)
- vmovdqa curve25519_sandy2x_v0_0(%rip),%xmm0
- vmovdqa curve25519_sandy2x_v1_0(%rip),%xmm1
- vmovdqu 0(%rdi),%xmm2
- vmovdqa %xmm2,0(%rsp)
- vmovdqu 16(%rdi),%xmm2
- vmovdqa %xmm2,16(%rsp)
- vmovdqu 32(%rdi),%xmm2
- vmovdqa %xmm2,32(%rsp)
- vmovdqu 48(%rdi),%xmm2
- vmovdqa %xmm2,48(%rsp)
- vmovdqu 64(%rdi),%xmm2
- vmovdqa %xmm2,64(%rsp)
- vmovdqa %xmm1,80(%rsp)
- vmovdqa %xmm0,96(%rsp)
- vmovdqa %xmm0,112(%rsp)
- vmovdqa %xmm0,128(%rsp)
- vmovdqa %xmm0,144(%rsp)
- vmovdqa %xmm1,%xmm0
- vpxor %xmm1,%xmm1,%xmm1
- vpxor %xmm2,%xmm2,%xmm2
- vpxor %xmm3,%xmm3,%xmm3
- vpxor %xmm4,%xmm4,%xmm4
- vpxor %xmm5,%xmm5,%xmm5
- vpxor %xmm6,%xmm6,%xmm6
- vpxor %xmm7,%xmm7,%xmm7
- vpxor %xmm8,%xmm8,%xmm8
- vpxor %xmm9,%xmm9,%xmm9
- vmovdqu 0(%rdi),%xmm10
- vmovdqa %xmm10,160(%rsp)
- vmovdqu 16(%rdi),%xmm10
- vmovdqa %xmm10,176(%rsp)
- vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm10,%xmm10
- vmovdqa %xmm10,192(%rsp)
- vmovdqu 32(%rdi),%xmm10
- vmovdqa %xmm10,208(%rsp)
- vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm10,%xmm10
- vmovdqa %xmm10,224(%rsp)
- vmovdqu 48(%rdi),%xmm10
- vmovdqa %xmm10,240(%rsp)
- vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm10,%xmm10
- vmovdqa %xmm10,256(%rsp)
- vmovdqu 64(%rdi),%xmm10
- vmovdqa %xmm10,272(%rsp)
- vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm10,%xmm10
- vmovdqa %xmm10,288(%rsp)
- vmovdqu 8(%rdi),%xmm10
- vpmuludq curve25519_sandy2x_v2_1(%rip),%xmm10,%xmm10
- vmovdqa %xmm10,304(%rsp)
- vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm10,%xmm10
- vmovdqa %xmm10,320(%rsp)
- vmovdqu 24(%rdi),%xmm10
- vpmuludq curve25519_sandy2x_v2_1(%rip),%xmm10,%xmm10
- vmovdqa %xmm10,336(%rsp)
- vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm10,%xmm10
- vmovdqa %xmm10,352(%rsp)
- vmovdqu 40(%rdi),%xmm10
- vpmuludq curve25519_sandy2x_v2_1(%rip),%xmm10,%xmm10
- vmovdqa %xmm10,368(%rsp)
- vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm10,%xmm10
- vmovdqa %xmm10,384(%rsp)
- vmovdqu 56(%rdi),%xmm10
- vpmuludq curve25519_sandy2x_v2_1(%rip),%xmm10,%xmm10
- vmovdqa %xmm10,400(%rsp)
- vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm10,%xmm10
- vmovdqa %xmm10,416(%rsp)
- vmovdqu 0(%rdi),%xmm10
- vmovdqu 64(%rdi),%xmm11
- vblendps $12, %xmm11, %xmm10, %xmm10
- vpshufd $2,%xmm10,%xmm10
- vpmuludq curve25519_sandy2x_v38_1(%rip),%xmm10,%xmm10
- vmovdqa %xmm10,432(%rsp)
- movq 0(%rsi),%rdx
- movq 8(%rsi),%rcx
- movq 16(%rsi),%r8
- movq 24(%rsi),%r9
- shrd $1,%rcx,%rdx
- shrd $1,%r8,%rcx
- shrd $1,%r9,%r8
- shr $1,%r9
- xorq 0(%rsi),%rdx
- xorq 8(%rsi),%rcx
- xorq 16(%rsi),%r8
- xorq 24(%rsi),%r9
- leaq 800(%rsp),%rsi
- mov $64,%rax
-
- .align 16
- .Lladder_small_loop:
- mov %rdx,%r10
- mov %rcx,%r11
- mov %r8,%r12
- mov %r9,%r13
- shr $1,%rdx
- shr $1,%rcx
- shr $1,%r8
- shr $1,%r9
- and $1,%r10d
- and $1,%r11d
- and $1,%r12d
- and $1,%r13d
- neg %r10
- neg %r11
- neg %r12
- neg %r13
- movl %r10d,0(%rsi)
- movl %r11d,256(%rsi)
- movl %r12d,512(%rsi)
- movl %r13d,768(%rsi)
- add $4,%rsi
- sub $1,%rax
- jne .Lladder_small_loop
- mov $255,%rdx
- add $760,%rsi
-
- .align 16
- .Lladder_loop:
- sub $1,%rdx
- vbroadcastss 0(%rsi),%xmm10
- sub $4,%rsi
- vmovdqa 0(%rsp),%xmm11
- vmovdqa 80(%rsp),%xmm12
- vpxor %xmm11,%xmm0,%xmm13
- vpand %xmm10,%xmm13,%xmm13
- vpxor %xmm13,%xmm0,%xmm0
- vpxor %xmm13,%xmm11,%xmm11
- vpxor %xmm12,%xmm1,%xmm13
- vpand %xmm10,%xmm13,%xmm13
- vpxor %xmm13,%xmm1,%xmm1
- vpxor %xmm13,%xmm12,%xmm12
- vmovdqa 16(%rsp),%xmm13
- vmovdqa 96(%rsp),%xmm14
- vpxor %xmm13,%xmm2,%xmm15
- vpand %xmm10,%xmm15,%xmm15
- vpxor %xmm15,%xmm2,%xmm2
- vpxor %xmm15,%xmm13,%xmm13
- vpxor %xmm14,%xmm3,%xmm15
- vpand %xmm10,%xmm15,%xmm15
- vpxor %xmm15,%xmm3,%xmm3
- vpxor %xmm15,%xmm14,%xmm14
- vmovdqa %xmm13,0(%rsp)
- vmovdqa %xmm14,16(%rsp)
- vmovdqa 32(%rsp),%xmm13
- vmovdqa 112(%rsp),%xmm14
- vpxor %xmm13,%xmm4,%xmm15
- vpand %xmm10,%xmm15,%xmm15
- vpxor %xmm15,%xmm4,%xmm4
- vpxor %xmm15,%xmm13,%xmm13
- vpxor %xmm14,%xmm5,%xmm15
- vpand %xmm10,%xmm15,%xmm15
- vpxor %xmm15,%xmm5,%xmm5
- vpxor %xmm15,%xmm14,%xmm14
- vmovdqa %xmm13,32(%rsp)
- vmovdqa %xmm14,80(%rsp)
- vmovdqa 48(%rsp),%xmm13
- vmovdqa 128(%rsp),%xmm14
- vpxor %xmm13,%xmm6,%xmm15
- vpand %xmm10,%xmm15,%xmm15
- vpxor %xmm15,%xmm6,%xmm6
- vpxor %xmm15,%xmm13,%xmm13
- vpxor %xmm14,%xmm7,%xmm15
- vpand %xmm10,%xmm15,%xmm15
- vpxor %xmm15,%xmm7,%xmm7
- vpxor %xmm15,%xmm14,%xmm14
- vmovdqa %xmm13,48(%rsp)
- vmovdqa %xmm14,96(%rsp)
- vmovdqa 64(%rsp),%xmm13
- vmovdqa 144(%rsp),%xmm14
- vpxor %xmm13,%xmm8,%xmm15
- vpand %xmm10,%xmm15,%xmm15
- vpxor %xmm15,%xmm8,%xmm8
- vpxor %xmm15,%xmm13,%xmm13
- vpxor %xmm14,%xmm9,%xmm15
- vpand %xmm10,%xmm15,%xmm15
- vpxor %xmm15,%xmm9,%xmm9
- vpxor %xmm15,%xmm14,%xmm14
- vmovdqa %xmm13,64(%rsp)
- vmovdqa %xmm14,112(%rsp)
- vpaddq curve25519_sandy2x_subc0(%rip),%xmm11,%xmm10
- vpsubq %xmm12,%xmm10,%xmm10
- vpaddq %xmm12,%xmm11,%xmm11
- vpunpckhqdq %xmm10,%xmm11,%xmm12
- vpunpcklqdq %xmm10,%xmm11,%xmm10
- vpaddq %xmm1,%xmm0,%xmm11
- vpaddq curve25519_sandy2x_subc0(%rip),%xmm0,%xmm0
- vpsubq %xmm1,%xmm0,%xmm0
- vpunpckhqdq %xmm11,%xmm0,%xmm1
- vpunpcklqdq %xmm11,%xmm0,%xmm0
- vpmuludq %xmm0,%xmm10,%xmm11
- vpmuludq %xmm1,%xmm10,%xmm13
- vmovdqa %xmm1,128(%rsp)
- vpaddq %xmm1,%xmm1,%xmm1
- vpmuludq %xmm0,%xmm12,%xmm14
- vmovdqa %xmm0,144(%rsp)
- vpaddq %xmm14,%xmm13,%xmm13
- vpmuludq %xmm1,%xmm12,%xmm0
- vmovdqa %xmm1,448(%rsp)
- vpaddq %xmm3,%xmm2,%xmm1
- vpaddq curve25519_sandy2x_subc2(%rip),%xmm2,%xmm2
- vpsubq %xmm3,%xmm2,%xmm2
- vpunpckhqdq %xmm1,%xmm2,%xmm3
- vpunpcklqdq %xmm1,%xmm2,%xmm1
- vpmuludq %xmm1,%xmm10,%xmm2
- vpaddq %xmm2,%xmm0,%xmm0
- vpmuludq %xmm3,%xmm10,%xmm2
- vmovdqa %xmm3,464(%rsp)
- vpaddq %xmm3,%xmm3,%xmm3
- vpmuludq %xmm1,%xmm12,%xmm14
- vmovdqa %xmm1,480(%rsp)
- vpaddq %xmm14,%xmm2,%xmm2
- vpmuludq %xmm3,%xmm12,%xmm1
- vmovdqa %xmm3,496(%rsp)
- vpaddq %xmm5,%xmm4,%xmm3
- vpaddq curve25519_sandy2x_subc2(%rip),%xmm4,%xmm4
- vpsubq %xmm5,%xmm4,%xmm4
- vpunpckhqdq %xmm3,%xmm4,%xmm5
- vpunpcklqdq %xmm3,%xmm4,%xmm3
- vpmuludq %xmm3,%xmm10,%xmm4
- vpaddq %xmm4,%xmm1,%xmm1
- vpmuludq %xmm5,%xmm10,%xmm4
- vmovdqa %xmm5,512(%rsp)
- vpaddq %xmm5,%xmm5,%xmm5
- vpmuludq %xmm3,%xmm12,%xmm14
- vmovdqa %xmm3,528(%rsp)
- vpaddq %xmm14,%xmm4,%xmm4
- vpaddq %xmm7,%xmm6,%xmm3
- vpaddq curve25519_sandy2x_subc2(%rip),%xmm6,%xmm6
- vpsubq %xmm7,%xmm6,%xmm6
- vpunpckhqdq %xmm3,%xmm6,%xmm7
- vpunpcklqdq %xmm3,%xmm6,%xmm3
- vpmuludq %xmm3,%xmm10,%xmm6
- vpmuludq %xmm5,%xmm12,%xmm14
- vmovdqa %xmm5,544(%rsp)
- vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm5,%xmm5
- vmovdqa %xmm5,560(%rsp)
- vpaddq %xmm14,%xmm6,%xmm6
- vpmuludq %xmm7,%xmm10,%xmm5
- vmovdqa %xmm7,576(%rsp)
- vpaddq %xmm7,%xmm7,%xmm7
- vpmuludq %xmm3,%xmm12,%xmm14
- vmovdqa %xmm3,592(%rsp)
- vpaddq %xmm14,%xmm5,%xmm5
- vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm3,%xmm3
- vmovdqa %xmm3,608(%rsp)
- vpaddq %xmm9,%xmm8,%xmm3
- vpaddq curve25519_sandy2x_subc2(%rip),%xmm8,%xmm8
- vpsubq %xmm9,%xmm8,%xmm8
- vpunpckhqdq %xmm3,%xmm8,%xmm9
- vpunpcklqdq %xmm3,%xmm8,%xmm3
- vmovdqa %xmm3,624(%rsp)
- vpmuludq %xmm7,%xmm12,%xmm8
- vmovdqa %xmm7,640(%rsp)
- vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm7,%xmm7
- vmovdqa %xmm7,656(%rsp)
- vpmuludq %xmm3,%xmm10,%xmm7
- vpaddq %xmm7,%xmm8,%xmm8
- vpmuludq %xmm9,%xmm10,%xmm7
- vmovdqa %xmm9,672(%rsp)
- vpaddq %xmm9,%xmm9,%xmm9
- vpmuludq %xmm3,%xmm12,%xmm10
- vpaddq %xmm10,%xmm7,%xmm7
- vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm3,%xmm3
- vmovdqa %xmm3,688(%rsp)
- vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm12,%xmm12
- vpmuludq %xmm9,%xmm12,%xmm3
- vmovdqa %xmm9,704(%rsp)
- vpaddq %xmm3,%xmm11,%xmm11
- vmovdqa 0(%rsp),%xmm3
- vmovdqa 16(%rsp),%xmm9
- vpaddq curve25519_sandy2x_subc2(%rip),%xmm3,%xmm10
- vpsubq %xmm9,%xmm10,%xmm10
- vpaddq %xmm9,%xmm3,%xmm3
- vpunpckhqdq %xmm10,%xmm3,%xmm9
- vpunpcklqdq %xmm10,%xmm3,%xmm3
- vpmuludq 144(%rsp),%xmm3,%xmm10
- vpaddq %xmm10,%xmm0,%xmm0
- vpmuludq 128(%rsp),%xmm3,%xmm10
- vpaddq %xmm10,%xmm2,%xmm2
- vpmuludq 480(%rsp),%xmm3,%xmm10
- vpaddq %xmm10,%xmm1,%xmm1
- vpmuludq 464(%rsp),%xmm3,%xmm10
- vpaddq %xmm10,%xmm4,%xmm4
- vpmuludq 528(%rsp),%xmm3,%xmm10
- vpaddq %xmm10,%xmm6,%xmm6
- vpmuludq 512(%rsp),%xmm3,%xmm10
- vpaddq %xmm10,%xmm5,%xmm5
- vpmuludq 592(%rsp),%xmm3,%xmm10
- vpaddq %xmm10,%xmm8,%xmm8
- vpmuludq 576(%rsp),%xmm3,%xmm10
- vpaddq %xmm10,%xmm7,%xmm7
- vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm3,%xmm3
- vpmuludq 624(%rsp),%xmm3,%xmm10
- vpaddq %xmm10,%xmm11,%xmm11
- vpmuludq 672(%rsp),%xmm3,%xmm3
- vpaddq %xmm3,%xmm13,%xmm13
- vpmuludq 144(%rsp),%xmm9,%xmm3
- vpaddq %xmm3,%xmm2,%xmm2
- vpmuludq 448(%rsp),%xmm9,%xmm3
- vpaddq %xmm3,%xmm1,%xmm1
- vpmuludq 480(%rsp),%xmm9,%xmm3
- vpaddq %xmm3,%xmm4,%xmm4
- vpmuludq 496(%rsp),%xmm9,%xmm3
- vpaddq %xmm3,%xmm6,%xmm6
- vpmuludq 528(%rsp),%xmm9,%xmm3
- vpaddq %xmm3,%xmm5,%xmm5
- vpmuludq 544(%rsp),%xmm9,%xmm3
- vpaddq %xmm3,%xmm8,%xmm8
- vpmuludq 592(%rsp),%xmm9,%xmm3
- vpaddq %xmm3,%xmm7,%xmm7
- vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm9,%xmm9
- vpmuludq 640(%rsp),%xmm9,%xmm3
- vpaddq %xmm3,%xmm11,%xmm11
- vpmuludq 624(%rsp),%xmm9,%xmm3
- vpaddq %xmm3,%xmm13,%xmm13
- vpmuludq 704(%rsp),%xmm9,%xmm9
- vpaddq %xmm9,%xmm0,%xmm0
- vmovdqa 32(%rsp),%xmm3
- vmovdqa 80(%rsp),%xmm9
- vpaddq curve25519_sandy2x_subc2(%rip),%xmm3,%xmm10
- vpsubq %xmm9,%xmm10,%xmm10
- vpaddq %xmm9,%xmm3,%xmm3
- vpunpckhqdq %xmm10,%xmm3,%xmm9
- vpunpcklqdq %xmm10,%xmm3,%xmm3
- vpmuludq 144(%rsp),%xmm3,%xmm10
- vpaddq %xmm10,%xmm1,%xmm1
- vpmuludq 128(%rsp),%xmm3,%xmm10
- vpaddq %xmm10,%xmm4,%xmm4
- vpmuludq 480(%rsp),%xmm3,%xmm10
- vpaddq %xmm10,%xmm6,%xmm6
- vpmuludq 464(%rsp),%xmm3,%xmm10
- vpaddq %xmm10,%xmm5,%xmm5
- vpmuludq 528(%rsp),%xmm3,%xmm10
- vpaddq %xmm10,%xmm8,%xmm8
- vpmuludq 512(%rsp),%xmm3,%xmm10
- vpaddq %xmm10,%xmm7,%xmm7
- vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm3,%xmm3
- vpmuludq 592(%rsp),%xmm3,%xmm10
- vpaddq %xmm10,%xmm11,%xmm11
- vpmuludq 576(%rsp),%xmm3,%xmm10
- vpaddq %xmm10,%xmm13,%xmm13
- vpmuludq 624(%rsp),%xmm3,%xmm10
- vpaddq %xmm10,%xmm0,%xmm0
- vpmuludq 672(%rsp),%xmm3,%xmm3
- vpaddq %xmm3,%xmm2,%xmm2
- vpmuludq 144(%rsp),%xmm9,%xmm3
- vpaddq %xmm3,%xmm4,%xmm4
- vpmuludq 448(%rsp),%xmm9,%xmm3
- vpaddq %xmm3,%xmm6,%xmm6
- vpmuludq 480(%rsp),%xmm9,%xmm3
- vpaddq %xmm3,%xmm5,%xmm5
- vpmuludq 496(%rsp),%xmm9,%xmm3
- vpaddq %xmm3,%xmm8,%xmm8
- vpmuludq 528(%rsp),%xmm9,%xmm3
- vpaddq %xmm3,%xmm7,%xmm7
- vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm9,%xmm9
- vpmuludq 544(%rsp),%xmm9,%xmm3
- vpaddq %xmm3,%xmm11,%xmm11
- vpmuludq 592(%rsp),%xmm9,%xmm3
- vpaddq %xmm3,%xmm13,%xmm13
- vpmuludq 640(%rsp),%xmm9,%xmm3
- vpaddq %xmm3,%xmm0,%xmm0
- vpmuludq 624(%rsp),%xmm9,%xmm3
- vpaddq %xmm3,%xmm2,%xmm2
- vpmuludq 704(%rsp),%xmm9,%xmm9
- vpaddq %xmm9,%xmm1,%xmm1
- vmovdqa 48(%rsp),%xmm3
- vmovdqa 96(%rsp),%xmm9
- vpaddq curve25519_sandy2x_subc2(%rip),%xmm3,%xmm10
- vpsubq %xmm9,%xmm10,%xmm10
- vpaddq %xmm9,%xmm3,%xmm3
- vpunpckhqdq %xmm10,%xmm3,%xmm9
- vpunpcklqdq %xmm10,%xmm3,%xmm3
- vpmuludq 144(%rsp),%xmm3,%xmm10
- vpaddq %xmm10,%xmm6,%xmm6
- vpmuludq 128(%rsp),%xmm3,%xmm10
- vpaddq %xmm10,%xmm5,%xmm5
- vpmuludq 480(%rsp),%xmm3,%xmm10
- vpaddq %xmm10,%xmm8,%xmm8
- vpmuludq 464(%rsp),%xmm3,%xmm10
- vpaddq %xmm10,%xmm7,%xmm7
- vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm3,%xmm3
- vpmuludq 528(%rsp),%xmm3,%xmm10
- vpaddq %xmm10,%xmm11,%xmm11
- vpmuludq 512(%rsp),%xmm3,%xmm10
- vpaddq %xmm10,%xmm13,%xmm13
- vpmuludq 592(%rsp),%xmm3,%xmm10
- vpaddq %xmm10,%xmm0,%xmm0
- vpmuludq 576(%rsp),%xmm3,%xmm10
- vpaddq %xmm10,%xmm2,%xmm2
- vpmuludq 624(%rsp),%xmm3,%xmm10
- vpaddq %xmm10,%xmm1,%xmm1
- vpmuludq 672(%rsp),%xmm3,%xmm3
- vpaddq %xmm3,%xmm4,%xmm4
- vpmuludq 144(%rsp),%xmm9,%xmm3
- vpaddq %xmm3,%xmm5,%xmm5
- vpmuludq 448(%rsp),%xmm9,%xmm3
- vpaddq %xmm3,%xmm8,%xmm8
- vpmuludq 480(%rsp),%xmm9,%xmm3
- vpaddq %xmm3,%xmm7,%xmm7
- vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm9,%xmm9
- vpmuludq 496(%rsp),%xmm9,%xmm3
- vpaddq %xmm3,%xmm11,%xmm11
- vpmuludq 528(%rsp),%xmm9,%xmm3
- vpaddq %xmm3,%xmm13,%xmm13
- vpmuludq 544(%rsp),%xmm9,%xmm3
- vpaddq %xmm3,%xmm0,%xmm0
- vpmuludq 592(%rsp),%xmm9,%xmm3
- vpaddq %xmm3,%xmm2,%xmm2
- vpmuludq 640(%rsp),%xmm9,%xmm3
- vpaddq %xmm3,%xmm1,%xmm1
- vpmuludq 624(%rsp),%xmm9,%xmm3
- vpaddq %xmm3,%xmm4,%xmm4
- vpmuludq 704(%rsp),%xmm9,%xmm9
- vpaddq %xmm9,%xmm6,%xmm6
- vmovdqa 64(%rsp),%xmm3
- vmovdqa 112(%rsp),%xmm9
- vpaddq curve25519_sandy2x_subc2(%rip),%xmm3,%xmm10
- vpsubq %xmm9,%xmm10,%xmm10
- vpaddq %xmm9,%xmm3,%xmm3
- vpunpckhqdq %xmm10,%xmm3,%xmm9
- vpunpcklqdq %xmm10,%xmm3,%xmm3
- vpmuludq 144(%rsp),%xmm3,%xmm10
- vpaddq %xmm10,%xmm8,%xmm8
- vpmuludq 128(%rsp),%xmm3,%xmm10
- vpaddq %xmm10,%xmm7,%xmm7
- vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm3,%xmm3
- vpmuludq 480(%rsp),%xmm3,%xmm10
- vpaddq %xmm10,%xmm11,%xmm11
- vpmuludq 464(%rsp),%xmm3,%xmm10
- vpaddq %xmm10,%xmm13,%xmm13
- vpmuludq 528(%rsp),%xmm3,%xmm10
- vpaddq %xmm10,%xmm0,%xmm0
- vpmuludq 512(%rsp),%xmm3,%xmm10
- vpaddq %xmm10,%xmm2,%xmm2
- vpmuludq 592(%rsp),%xmm3,%xmm10
- vpaddq %xmm10,%xmm1,%xmm1
- vpmuludq 576(%rsp),%xmm3,%xmm10
- vpaddq %xmm10,%xmm4,%xmm4
- vpmuludq 624(%rsp),%xmm3,%xmm10
- vpaddq %xmm10,%xmm6,%xmm6
- vpmuludq 672(%rsp),%xmm3,%xmm3
- vpaddq %xmm3,%xmm5,%xmm5
- vpmuludq 144(%rsp),%xmm9,%xmm3
- vpaddq %xmm3,%xmm7,%xmm7
- vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm9,%xmm9
- vpmuludq 448(%rsp),%xmm9,%xmm3
- vpaddq %xmm3,%xmm11,%xmm11
- vpmuludq 480(%rsp),%xmm9,%xmm3
- vpaddq %xmm3,%xmm13,%xmm13
- vpmuludq 496(%rsp),%xmm9,%xmm3
- vpaddq %xmm3,%xmm0,%xmm0
- vpmuludq 528(%rsp),%xmm9,%xmm3
- vpaddq %xmm3,%xmm2,%xmm2
- vpmuludq 544(%rsp),%xmm9,%xmm3
- vpaddq %xmm3,%xmm1,%xmm1
- vpmuludq 592(%rsp),%xmm9,%xmm3
- vpaddq %xmm3,%xmm4,%xmm4
- vpmuludq 640(%rsp),%xmm9,%xmm3
- vpaddq %xmm3,%xmm6,%xmm6
- vpmuludq 624(%rsp),%xmm9,%xmm3
- vpaddq %xmm3,%xmm5,%xmm5
- vpmuludq 704(%rsp),%xmm9,%xmm9
- vpaddq %xmm9,%xmm8,%xmm8
- vpsrlq $25,%xmm4,%xmm3
- vpaddq %xmm3,%xmm6,%xmm6
- vpand curve25519_sandy2x_m25(%rip),%xmm4,%xmm4
- vpsrlq $26,%xmm11,%xmm3
- vpaddq %xmm3,%xmm13,%xmm13
- vpand curve25519_sandy2x_m26(%rip),%xmm11,%xmm11
- vpsrlq $26,%xmm6,%xmm3
- vpaddq %xmm3,%xmm5,%xmm5
- vpand curve25519_sandy2x_m26(%rip),%xmm6,%xmm6
- vpsrlq $25,%xmm13,%xmm3
- vpaddq %xmm3,%xmm0,%xmm0
- vpand curve25519_sandy2x_m25(%rip),%xmm13,%xmm13
- vpsrlq $25,%xmm5,%xmm3
- vpaddq %xmm3,%xmm8,%xmm8
- vpand curve25519_sandy2x_m25(%rip),%xmm5,%xmm5
- vpsrlq $26,%xmm0,%xmm3
- vpaddq %xmm3,%xmm2,%xmm2
- vpand curve25519_sandy2x_m26(%rip),%xmm0,%xmm0
- vpsrlq $26,%xmm8,%xmm3
- vpaddq %xmm3,%xmm7,%xmm7
- vpand curve25519_sandy2x_m26(%rip),%xmm8,%xmm8
- vpsrlq $25,%xmm2,%xmm3
- vpaddq %xmm3,%xmm1,%xmm1
- vpand curve25519_sandy2x_m25(%rip),%xmm2,%xmm2
- vpsrlq $25,%xmm7,%xmm3
- vpsllq $4,%xmm3,%xmm9
- vpaddq %xmm3,%xmm11,%xmm11
- vpsllq $1,%xmm3,%xmm3
- vpaddq %xmm3,%xmm9,%xmm9
- vpaddq %xmm9,%xmm11,%xmm11
- vpand curve25519_sandy2x_m25(%rip),%xmm7,%xmm7
- vpsrlq $26,%xmm1,%xmm3
- vpaddq %xmm3,%xmm4,%xmm4
- vpand curve25519_sandy2x_m26(%rip),%xmm1,%xmm1
- vpsrlq $26,%xmm11,%xmm3
- vpaddq %xmm3,%xmm13,%xmm13
- vpand curve25519_sandy2x_m26(%rip),%xmm11,%xmm11
- vpsrlq $25,%xmm4,%xmm3
- vpaddq %xmm3,%xmm6,%xmm6
- vpand curve25519_sandy2x_m25(%rip),%xmm4,%xmm4
- vpunpcklqdq %xmm13,%xmm11,%xmm3
- vpunpckhqdq %xmm13,%xmm11,%xmm9
- vpaddq curve25519_sandy2x_subc0(%rip),%xmm9,%xmm10
- vpsubq %xmm3,%xmm10,%xmm10
- vpaddq %xmm9,%xmm3,%xmm3
- vpunpckhqdq %xmm3,%xmm10,%xmm9
- vpunpcklqdq %xmm3,%xmm10,%xmm10
- vpmuludq %xmm10,%xmm10,%xmm3
- vpaddq %xmm10,%xmm10,%xmm10
- vpmuludq %xmm9,%xmm10,%xmm11
- vpunpcklqdq %xmm2,%xmm0,%xmm12
- vpunpckhqdq %xmm2,%xmm0,%xmm0
- vpaddq curve25519_sandy2x_subc2(%rip),%xmm0,%xmm2
- vpsubq %xmm12,%xmm2,%xmm2
- vpaddq %xmm0,%xmm12,%xmm12
- vpunpckhqdq %xmm12,%xmm2,%xmm0
- vpunpcklqdq %xmm12,%xmm2,%xmm2
- vpmuludq %xmm2,%xmm10,%xmm12
- vpaddq %xmm9,%xmm9,%xmm13
- vpmuludq %xmm13,%xmm9,%xmm9
- vpaddq %xmm9,%xmm12,%xmm12
- vpmuludq %xmm0,%xmm10,%xmm9
- vpmuludq %xmm2,%xmm13,%xmm14
- vpaddq %xmm14,%xmm9,%xmm9
- vpunpcklqdq %xmm4,%xmm1,%xmm14
- vpunpckhqdq %xmm4,%xmm1,%xmm1
- vpaddq curve25519_sandy2x_subc2(%rip),%xmm1,%xmm4
- vpsubq %xmm14,%xmm4,%xmm4
- vpaddq %xmm1,%xmm14,%xmm14
- vpunpckhqdq %xmm14,%xmm4,%xmm1
- vpunpcklqdq %xmm14,%xmm4,%xmm4
- vmovdqa %xmm1,0(%rsp)
- vpaddq %xmm1,%xmm1,%xmm1
- vmovdqa %xmm1,16(%rsp)
- vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm1,%xmm1
- vmovdqa %xmm1,32(%rsp)
- vpmuludq %xmm4,%xmm10,%xmm1
- vpmuludq %xmm2,%xmm2,%xmm14
- vpaddq %xmm14,%xmm1,%xmm1
- vpmuludq 0(%rsp),%xmm10,%xmm14
- vpmuludq %xmm4,%xmm13,%xmm15
- vpaddq %xmm15,%xmm14,%xmm14
- vpunpcklqdq %xmm5,%xmm6,%xmm15
- vpunpckhqdq %xmm5,%xmm6,%xmm5
- vpaddq curve25519_sandy2x_subc2(%rip),%xmm5,%xmm6
- vpsubq %xmm15,%xmm6,%xmm6
- vpaddq %xmm5,%xmm15,%xmm15
- vpunpckhqdq %xmm15,%xmm6,%xmm5
- vpunpcklqdq %xmm15,%xmm6,%xmm6
- vmovdqa %xmm6,48(%rsp)
- vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm6,%xmm6
- vmovdqa %xmm6,64(%rsp)
- vmovdqa %xmm5,80(%rsp)
- vpmuludq curve25519_sandy2x_v38_38(%rip),%xmm5,%xmm5
- vmovdqa %xmm5,96(%rsp)
- vpmuludq 48(%rsp),%xmm10,%xmm5
- vpaddq %xmm0,%xmm0,%xmm6
- vpmuludq %xmm6,%xmm0,%xmm0
- vpaddq %xmm0,%xmm5,%xmm5
- vpmuludq 80(%rsp),%xmm10,%xmm0
- vpmuludq %xmm4,%xmm6,%xmm15
- vpaddq %xmm15,%xmm0,%xmm0
- vpmuludq %xmm6,%xmm13,%xmm15
- vpaddq %xmm15,%xmm1,%xmm1
- vpmuludq %xmm6,%xmm2,%xmm15
- vpaddq %xmm15,%xmm14,%xmm14
- vpunpcklqdq %xmm7,%xmm8,%xmm15
- vpunpckhqdq %xmm7,%xmm8,%xmm7
- vpaddq curve25519_sandy2x_subc2(%rip),%xmm7,%xmm8
- vpsubq %xmm15,%xmm8,%xmm8
- vpaddq %xmm7,%xmm15,%xmm15
- vpunpckhqdq %xmm15,%xmm8,%xmm7
- vpunpcklqdq %xmm15,%xmm8,%xmm8
- vmovdqa %xmm8,112(%rsp)
- vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm8,%xmm8
- vmovdqa %xmm8,448(%rsp)
- vpmuludq 112(%rsp),%xmm10,%xmm8
- vpmuludq %xmm7,%xmm10,%xmm10
- vpmuludq curve25519_sandy2x_v38_38(%rip),%xmm7,%xmm15
- vpmuludq %xmm15,%xmm7,%xmm7
- vpaddq %xmm7,%xmm8,%xmm8
- vpmuludq %xmm15,%xmm13,%xmm7
- vpaddq %xmm7,%xmm3,%xmm3
- vpmuludq %xmm15,%xmm2,%xmm7
- vpaddq %xmm7,%xmm11,%xmm11
- vpmuludq 80(%rsp),%xmm13,%xmm7
- vpaddq %xmm7,%xmm7,%xmm7
- vpaddq %xmm7,%xmm8,%xmm8
- vpmuludq 16(%rsp),%xmm13,%xmm7
- vpaddq %xmm7,%xmm5,%xmm5
- vpmuludq 48(%rsp),%xmm13,%xmm7
- vpaddq %xmm7,%xmm0,%xmm0
- vpmuludq 112(%rsp),%xmm13,%xmm7
- vpaddq %xmm7,%xmm10,%xmm10
- vpmuludq %xmm15,%xmm6,%xmm7
- vpaddq %xmm7,%xmm12,%xmm12
- vpmuludq %xmm15,%xmm4,%xmm7
- vpaddq %xmm7,%xmm9,%xmm9
- vpaddq %xmm2,%xmm2,%xmm2
- vpmuludq %xmm4,%xmm2,%xmm7
- vpaddq %xmm7,%xmm5,%xmm5
- vpmuludq 448(%rsp),%xmm2,%xmm7
- vpaddq %xmm7,%xmm3,%xmm3
- vpmuludq 448(%rsp),%xmm6,%xmm7
- vpaddq %xmm7,%xmm11,%xmm11
- vpmuludq 0(%rsp),%xmm2,%xmm7
- vpaddq %xmm7,%xmm0,%xmm0
- vpmuludq 48(%rsp),%xmm2,%xmm7
- vpaddq %xmm7,%xmm8,%xmm8
- vpmuludq 80(%rsp),%xmm2,%xmm2
- vpaddq %xmm2,%xmm10,%xmm10
- vpmuludq 96(%rsp),%xmm4,%xmm2
- vpaddq %xmm2,%xmm11,%xmm11
- vpmuludq %xmm4,%xmm4,%xmm2
- vpaddq %xmm2,%xmm8,%xmm8
- vpaddq %xmm4,%xmm4,%xmm2
- vpmuludq 448(%rsp),%xmm2,%xmm4
- vpaddq %xmm4,%xmm12,%xmm12
- vpmuludq 16(%rsp),%xmm15,%xmm4
- vpaddq %xmm4,%xmm1,%xmm1
- vpmuludq 48(%rsp),%xmm15,%xmm4
- vpaddq %xmm4,%xmm14,%xmm14
- vpmuludq 96(%rsp),%xmm6,%xmm4
- vpaddq %xmm4,%xmm3,%xmm3
- vmovdqa 16(%rsp),%xmm4
- vpmuludq 448(%rsp),%xmm4,%xmm4
- vpaddq %xmm4,%xmm9,%xmm9
- vpmuludq 16(%rsp),%xmm6,%xmm4
- vpaddq %xmm4,%xmm8,%xmm8
- vpmuludq 48(%rsp),%xmm6,%xmm4
- vpaddq %xmm4,%xmm10,%xmm10
- vpmuludq 80(%rsp),%xmm15,%xmm4
- vpaddq %xmm4,%xmm4,%xmm4
- vpaddq %xmm4,%xmm5,%xmm5
- vpmuludq 112(%rsp),%xmm15,%xmm4
- vpaddq %xmm4,%xmm0,%xmm0
- vmovdqa 48(%rsp),%xmm4
- vpaddq %xmm4,%xmm4,%xmm4
- vpmuludq 448(%rsp),%xmm4,%xmm4
- vpaddq %xmm4,%xmm1,%xmm1
- vmovdqa 80(%rsp),%xmm4
- vpaddq %xmm4,%xmm4,%xmm4
- vpmuludq 448(%rsp),%xmm4,%xmm4
- vpaddq %xmm4,%xmm14,%xmm14
- vpmuludq 64(%rsp),%xmm2,%xmm4
- vpaddq %xmm4,%xmm3,%xmm3
- vmovdqa 16(%rsp),%xmm4
- vpmuludq 64(%rsp),%xmm4,%xmm4
- vpaddq %xmm4,%xmm11,%xmm11
- vmovdqa 16(%rsp),%xmm4
- vpmuludq 96(%rsp),%xmm4,%xmm4
- vpaddq %xmm4,%xmm12,%xmm12
- vmovdqa 48(%rsp),%xmm4
- vpmuludq 96(%rsp),%xmm4,%xmm4
- vpaddq %xmm4,%xmm9,%xmm9
- vpmuludq 0(%rsp),%xmm2,%xmm2
- vpaddq %xmm2,%xmm10,%xmm10
- vmovdqa 32(%rsp),%xmm2
- vpmuludq 0(%rsp),%xmm2,%xmm2
- vpaddq %xmm2,%xmm3,%xmm3
- vmovdqa 64(%rsp),%xmm2
- vpmuludq 48(%rsp),%xmm2,%xmm2
- vpaddq %xmm2,%xmm12,%xmm12
- vmovdqa 96(%rsp),%xmm2
- vpmuludq 80(%rsp),%xmm2,%xmm2
- vpaddq %xmm2,%xmm1,%xmm1
- vmovdqa 448(%rsp),%xmm2
- vpmuludq 112(%rsp),%xmm2,%xmm2
- vpaddq %xmm2,%xmm5,%xmm5
- vpsrlq $26,%xmm3,%xmm2
- vpaddq %xmm2,%xmm11,%xmm11
- vpand curve25519_sandy2x_m26(%rip),%xmm3,%xmm3
- vpsrlq $25,%xmm14,%xmm2
- vpaddq %xmm2,%xmm5,%xmm5
- vpand curve25519_sandy2x_m25(%rip),%xmm14,%xmm14
- vpsrlq $25,%xmm11,%xmm2
- vpaddq %xmm2,%xmm12,%xmm12
- vpand curve25519_sandy2x_m25(%rip),%xmm11,%xmm11
- vpsrlq $26,%xmm5,%xmm2
- vpaddq %xmm2,%xmm0,%xmm0
- vpand curve25519_sandy2x_m26(%rip),%xmm5,%xmm5
- vpsrlq $26,%xmm12,%xmm2
- vpaddq %xmm2,%xmm9,%xmm9
- vpand curve25519_sandy2x_m26(%rip),%xmm12,%xmm12
- vpsrlq $25,%xmm0,%xmm2
- vpaddq %xmm2,%xmm8,%xmm8
- vpand curve25519_sandy2x_m25(%rip),%xmm0,%xmm0
- vpsrlq $25,%xmm9,%xmm2
- vpaddq %xmm2,%xmm1,%xmm1
- vpand curve25519_sandy2x_m25(%rip),%xmm9,%xmm9
- vpsrlq $26,%xmm8,%xmm2
- vpaddq %xmm2,%xmm10,%xmm10
- vpand curve25519_sandy2x_m26(%rip),%xmm8,%xmm8
- vpsrlq $26,%xmm1,%xmm2
- vpaddq %xmm2,%xmm14,%xmm14
- vpand curve25519_sandy2x_m26(%rip),%xmm1,%xmm1
- vpsrlq $25,%xmm10,%xmm2
- vpsllq $4,%xmm2,%xmm4
- vpaddq %xmm2,%xmm3,%xmm3
- vpsllq $1,%xmm2,%xmm2
- vpaddq %xmm2,%xmm4,%xmm4
- vpaddq %xmm4,%xmm3,%xmm3
- vpand curve25519_sandy2x_m25(%rip),%xmm10,%xmm10
- vpsrlq $25,%xmm14,%xmm2
- vpaddq %xmm2,%xmm5,%xmm5
- vpand curve25519_sandy2x_m25(%rip),%xmm14,%xmm14
- vpsrlq $26,%xmm3,%xmm2
- vpaddq %xmm2,%xmm11,%xmm11
- vpand curve25519_sandy2x_m26(%rip),%xmm3,%xmm3
- vpunpckhqdq %xmm11,%xmm3,%xmm2
- vmovdqa %xmm2,0(%rsp)
- vpshufd $0,%xmm3,%xmm2
- vpshufd $0,%xmm11,%xmm3
- vpmuludq 160(%rsp),%xmm2,%xmm4
- vpmuludq 432(%rsp),%xmm3,%xmm6
- vpaddq %xmm6,%xmm4,%xmm4
- vpmuludq 176(%rsp),%xmm2,%xmm6
- vpmuludq 304(%rsp),%xmm3,%xmm7
- vpaddq %xmm7,%xmm6,%xmm6
- vpmuludq 208(%rsp),%xmm2,%xmm7
- vpmuludq 336(%rsp),%xmm3,%xmm11
- vpaddq %xmm11,%xmm7,%xmm7
- vpmuludq 240(%rsp),%xmm2,%xmm11
- vpmuludq 368(%rsp),%xmm3,%xmm13
- vpaddq %xmm13,%xmm11,%xmm11
- vpmuludq 272(%rsp),%xmm2,%xmm2
- vpmuludq 400(%rsp),%xmm3,%xmm3
- vpaddq %xmm3,%xmm2,%xmm2
- vpunpckhqdq %xmm9,%xmm12,%xmm3
- vmovdqa %xmm3,16(%rsp)
- vpshufd $0,%xmm12,%xmm3
- vpshufd $0,%xmm9,%xmm9
- vpmuludq 288(%rsp),%xmm3,%xmm12
- vpaddq %xmm12,%xmm4,%xmm4
- vpmuludq 416(%rsp),%xmm9,%xmm12
- vpaddq %xmm12,%xmm4,%xmm4
- vpmuludq 160(%rsp),%xmm3,%xmm12
- vpaddq %xmm12,%xmm6,%xmm6
- vpmuludq 432(%rsp),%xmm9,%xmm12
- vpaddq %xmm12,%xmm6,%xmm6
- vpmuludq 176(%rsp),%xmm3,%xmm12
- vpaddq %xmm12,%xmm7,%xmm7
- vpmuludq 304(%rsp),%xmm9,%xmm12
- vpaddq %xmm12,%xmm7,%xmm7
- vpmuludq 208(%rsp),%xmm3,%xmm12
- vpaddq %xmm12,%xmm11,%xmm11
- vpmuludq 336(%rsp),%xmm9,%xmm12
- vpaddq %xmm12,%xmm11,%xmm11
- vpmuludq 240(%rsp),%xmm3,%xmm3
- vpaddq %xmm3,%xmm2,%xmm2
- vpmuludq 368(%rsp),%xmm9,%xmm3
- vpaddq %xmm3,%xmm2,%xmm2
- vpunpckhqdq %xmm14,%xmm1,%xmm3
- vmovdqa %xmm3,32(%rsp)
- vpshufd $0,%xmm1,%xmm1
- vpshufd $0,%xmm14,%xmm3
- vpmuludq 256(%rsp),%xmm1,%xmm9
- vpaddq %xmm9,%xmm4,%xmm4
- vpmuludq 384(%rsp),%xmm3,%xmm9
- vpaddq %xmm9,%xmm4,%xmm4
- vpmuludq 288(%rsp),%xmm1,%xmm9
- vpaddq %xmm9,%xmm6,%xmm6
- vpmuludq 416(%rsp),%xmm3,%xmm9
- vpaddq %xmm9,%xmm6,%xmm6
- vpmuludq 160(%rsp),%xmm1,%xmm9
- vpaddq %xmm9,%xmm7,%xmm7
- vpmuludq 432(%rsp),%xmm3,%xmm9
- vpaddq %xmm9,%xmm7,%xmm7
- vpmuludq 176(%rsp),%xmm1,%xmm9
- vpaddq %xmm9,%xmm11,%xmm11
- vpmuludq 304(%rsp),%xmm3,%xmm9
- vpaddq %xmm9,%xmm11,%xmm11
- vpmuludq 208(%rsp),%xmm1,%xmm1
- vpaddq %xmm1,%xmm2,%xmm2
- vpmuludq 336(%rsp),%xmm3,%xmm1
- vpaddq %xmm1,%xmm2,%xmm2
- vpunpckhqdq %xmm0,%xmm5,%xmm1
- vmovdqa %xmm1,48(%rsp)
- vpshufd $0,%xmm5,%xmm1
- vpshufd $0,%xmm0,%xmm0
- vpmuludq 224(%rsp),%xmm1,%xmm3
- vpaddq %xmm3,%xmm4,%xmm4
- vpmuludq 352(%rsp),%xmm0,%xmm3
- vpaddq %xmm3,%xmm4,%xmm4
- vpmuludq 256(%rsp),%xmm1,%xmm3
- vpaddq %xmm3,%xmm6,%xmm6
- vpmuludq 384(%rsp),%xmm0,%xmm3
- vpaddq %xmm3,%xmm6,%xmm6
- vpmuludq 288(%rsp),%xmm1,%xmm3
- vpaddq %xmm3,%xmm7,%xmm7
- vpmuludq 416(%rsp),%xmm0,%xmm3
- vpaddq %xmm3,%xmm7,%xmm7
- vpmuludq 160(%rsp),%xmm1,%xmm3
- vpaddq %xmm3,%xmm11,%xmm11
- vpmuludq 432(%rsp),%xmm0,%xmm3
- vpaddq %xmm3,%xmm11,%xmm11
- vpmuludq 176(%rsp),%xmm1,%xmm1
- vpaddq %xmm1,%xmm2,%xmm2
- vpmuludq 304(%rsp),%xmm0,%xmm0
- vpaddq %xmm0,%xmm2,%xmm2
- vpunpckhqdq %xmm10,%xmm8,%xmm0
- vmovdqa %xmm0,64(%rsp)
- vpshufd $0,%xmm8,%xmm0
- vpshufd $0,%xmm10,%xmm1
- vpmuludq 192(%rsp),%xmm0,%xmm3
- vpaddq %xmm3,%xmm4,%xmm4
- vpmuludq 320(%rsp),%xmm1,%xmm3
- vpaddq %xmm3,%xmm4,%xmm4
- vpmuludq 224(%rsp),%xmm0,%xmm3
- vpaddq %xmm3,%xmm6,%xmm6
- vpmuludq 352(%rsp),%xmm1,%xmm3
- vpaddq %xmm3,%xmm6,%xmm6
- vpmuludq 256(%rsp),%xmm0,%xmm3
- vpaddq %xmm3,%xmm7,%xmm7
- vpmuludq 384(%rsp),%xmm1,%xmm3
- vpaddq %xmm3,%xmm7,%xmm7
- vpmuludq 288(%rsp),%xmm0,%xmm3
- vpaddq %xmm3,%xmm11,%xmm11
- vpmuludq 416(%rsp),%xmm1,%xmm3
- vpaddq %xmm3,%xmm11,%xmm11
- vpmuludq 160(%rsp),%xmm0,%xmm0
- vpaddq %xmm0,%xmm2,%xmm2
- vpmuludq 432(%rsp),%xmm1,%xmm0
- vpaddq %xmm0,%xmm2,%xmm2
- vmovdqa %xmm4,80(%rsp)
- vmovdqa %xmm6,96(%rsp)
- vmovdqa %xmm7,112(%rsp)
- vmovdqa %xmm11,448(%rsp)
- vmovdqa %xmm2,496(%rsp)
- vmovdqa 144(%rsp),%xmm0
- vpmuludq %xmm0,%xmm0,%xmm1
- vpaddq %xmm0,%xmm0,%xmm0
- vmovdqa 128(%rsp),%xmm2
- vpmuludq %xmm2,%xmm0,%xmm3
- vmovdqa 480(%rsp),%xmm4
- vpmuludq %xmm4,%xmm0,%xmm5
- vmovdqa 464(%rsp),%xmm6
- vpmuludq %xmm6,%xmm0,%xmm7
- vmovdqa 528(%rsp),%xmm8
- vpmuludq %xmm8,%xmm0,%xmm9
- vpmuludq 512(%rsp),%xmm0,%xmm10
- vpmuludq 592(%rsp),%xmm0,%xmm11
- vpmuludq 576(%rsp),%xmm0,%xmm12
- vpmuludq 624(%rsp),%xmm0,%xmm13
- vmovdqa 672(%rsp),%xmm14
- vpmuludq %xmm14,%xmm0,%xmm0
- vpmuludq curve25519_sandy2x_v38_38(%rip),%xmm14,%xmm15
- vpmuludq %xmm15,%xmm14,%xmm14
- vpaddq %xmm14,%xmm13,%xmm13
- vpaddq %xmm6,%xmm6,%xmm14
- vpmuludq %xmm14,%xmm6,%xmm6
- vpaddq %xmm6,%xmm11,%xmm11
- vpaddq %xmm2,%xmm2,%xmm6
- vpmuludq %xmm6,%xmm2,%xmm2
- vpaddq %xmm2,%xmm5,%xmm5
- vpmuludq %xmm15,%xmm6,%xmm2
- vpaddq %xmm2,%xmm1,%xmm1
- vpmuludq %xmm15,%xmm4,%xmm2
- vpaddq %xmm2,%xmm3,%xmm3
- vpmuludq 544(%rsp),%xmm6,%xmm2
- vpaddq %xmm2,%xmm11,%xmm11
- vpmuludq 592(%rsp),%xmm6,%xmm2
- vpaddq %xmm2,%xmm12,%xmm12
- vpmuludq 640(%rsp),%xmm6,%xmm2
- vpaddq %xmm2,%xmm13,%xmm13
- vpmuludq 624(%rsp),%xmm6,%xmm2
- vpaddq %xmm2,%xmm0,%xmm0
- vpmuludq %xmm4,%xmm6,%xmm2
- vpaddq %xmm2,%xmm7,%xmm7
- vpmuludq %xmm14,%xmm6,%xmm2
- vpaddq %xmm2,%xmm9,%xmm9
- vpmuludq %xmm8,%xmm6,%xmm2
- vpaddq %xmm2,%xmm10,%xmm10
- vpmuludq %xmm15,%xmm14,%xmm2
- vpaddq %xmm2,%xmm5,%xmm5
- vpmuludq %xmm15,%xmm8,%xmm2
- vpaddq %xmm2,%xmm7,%xmm7
- vpmuludq %xmm4,%xmm4,%xmm2
- vpaddq %xmm2,%xmm9,%xmm9
- vpmuludq %xmm14,%xmm4,%xmm2
- vpaddq %xmm2,%xmm10,%xmm10
- vpaddq %xmm4,%xmm4,%xmm2
- vpmuludq %xmm8,%xmm2,%xmm4
- vpaddq %xmm4,%xmm11,%xmm11
- vpmuludq 688(%rsp),%xmm2,%xmm4
- vpaddq %xmm4,%xmm1,%xmm1
- vpmuludq 688(%rsp),%xmm14,%xmm4
- vpaddq %xmm4,%xmm3,%xmm3
- vpmuludq 512(%rsp),%xmm2,%xmm4
- vpaddq %xmm4,%xmm12,%xmm12
- vpmuludq 592(%rsp),%xmm2,%xmm4
- vpaddq %xmm4,%xmm13,%xmm13
- vpmuludq 576(%rsp),%xmm2,%xmm2
- vpaddq %xmm2,%xmm0,%xmm0
- vpmuludq 656(%rsp),%xmm8,%xmm2
- vpaddq %xmm2,%xmm3,%xmm3
- vpmuludq %xmm8,%xmm14,%xmm2
- vpaddq %xmm2,%xmm12,%xmm12
- vpmuludq %xmm8,%xmm8,%xmm2
- vpaddq %xmm2,%xmm13,%xmm13
- vpaddq %xmm8,%xmm8,%xmm2
- vpmuludq 688(%rsp),%xmm2,%xmm4
- vpaddq %xmm4,%xmm5,%xmm5
- vpmuludq 544(%rsp),%xmm15,%xmm4
- vpaddq %xmm4,%xmm9,%xmm9
- vpmuludq 592(%rsp),%xmm15,%xmm4
- vpaddq %xmm4,%xmm10,%xmm10
- vpmuludq 656(%rsp),%xmm14,%xmm4
- vpaddq %xmm4,%xmm1,%xmm1
- vmovdqa 544(%rsp),%xmm4
- vpmuludq 688(%rsp),%xmm4,%xmm4
- vpaddq %xmm4,%xmm7,%xmm7
- vpmuludq 544(%rsp),%xmm14,%xmm4
- vpaddq %xmm4,%xmm13,%xmm13
- vpmuludq 592(%rsp),%xmm14,%xmm4
- vpaddq %xmm4,%xmm0,%xmm0
- vpmuludq 640(%rsp),%xmm15,%xmm4
- vpaddq %xmm4,%xmm11,%xmm11
- vpmuludq 624(%rsp),%xmm15,%xmm4
- vpaddq %xmm4,%xmm12,%xmm12
- vmovdqa 592(%rsp),%xmm4
- vpaddq %xmm4,%xmm4,%xmm4
- vpmuludq 688(%rsp),%xmm4,%xmm4
- vpaddq %xmm4,%xmm9,%xmm9
- vpmuludq 608(%rsp),%xmm2,%xmm4
- vpaddq %xmm4,%xmm1,%xmm1
- vmovdqa 544(%rsp),%xmm4
- vpmuludq 608(%rsp),%xmm4,%xmm4
- vpaddq %xmm4,%xmm3,%xmm3
- vmovdqa 544(%rsp),%xmm4
- vpmuludq 656(%rsp),%xmm4,%xmm4
- vpaddq %xmm4,%xmm5,%xmm5
- vmovdqa 592(%rsp),%xmm4
- vpmuludq 656(%rsp),%xmm4,%xmm4
- vpaddq %xmm4,%xmm7,%xmm7
- vmovdqa 640(%rsp),%xmm4
- vpmuludq 688(%rsp),%xmm4,%xmm4
- vpaddq %xmm4,%xmm10,%xmm10
- vpmuludq 512(%rsp),%xmm2,%xmm2
- vpaddq %xmm2,%xmm0,%xmm0
- vmovdqa 560(%rsp),%xmm2
- vpmuludq 512(%rsp),%xmm2,%xmm2
- vpaddq %xmm2,%xmm1,%xmm1
- vmovdqa 608(%rsp),%xmm2
- vpmuludq 592(%rsp),%xmm2,%xmm2
- vpaddq %xmm2,%xmm5,%xmm5
- vmovdqa 656(%rsp),%xmm2
- vpmuludq 576(%rsp),%xmm2,%xmm2
- vpaddq %xmm2,%xmm9,%xmm9
- vmovdqa 688(%rsp),%xmm2
- vpmuludq 624(%rsp),%xmm2,%xmm2
- vpaddq %xmm2,%xmm11,%xmm11
- vpsrlq $26,%xmm1,%xmm2
- vpaddq %xmm2,%xmm3,%xmm3
- vpand curve25519_sandy2x_m26(%rip),%xmm1,%xmm1
- vpsrlq $25,%xmm10,%xmm2
- vpaddq %xmm2,%xmm11,%xmm11
- vpand curve25519_sandy2x_m25(%rip),%xmm10,%xmm10
- vpsrlq $25,%xmm3,%xmm2
- vpaddq %xmm2,%xmm5,%xmm5
- vpand curve25519_sandy2x_m25(%rip),%xmm3,%xmm3
- vpsrlq $26,%xmm11,%xmm2
- vpaddq %xmm2,%xmm12,%xmm12
- vpand curve25519_sandy2x_m26(%rip),%xmm11,%xmm11
- vpsrlq $26,%xmm5,%xmm2
- vpaddq %xmm2,%xmm7,%xmm7
- vpand curve25519_sandy2x_m26(%rip),%xmm5,%xmm5
- vpsrlq $25,%xmm12,%xmm2
- vpaddq %xmm2,%xmm13,%xmm13
- vpand curve25519_sandy2x_m25(%rip),%xmm12,%xmm12
- vpsrlq $25,%xmm7,%xmm2
- vpaddq %xmm2,%xmm9,%xmm9
- vpand curve25519_sandy2x_m25(%rip),%xmm7,%xmm7
- vpsrlq $26,%xmm13,%xmm2
- vpaddq %xmm2,%xmm0,%xmm0
- vpand curve25519_sandy2x_m26(%rip),%xmm13,%xmm13
- vpsrlq $26,%xmm9,%xmm2
- vpaddq %xmm2,%xmm10,%xmm10
- vpand curve25519_sandy2x_m26(%rip),%xmm9,%xmm9
- vpsrlq $25,%xmm0,%xmm2
- vpsllq $4,%xmm2,%xmm4
- vpaddq %xmm2,%xmm1,%xmm1
- vpsllq $1,%xmm2,%xmm2
- vpaddq %xmm2,%xmm4,%xmm4
- vpaddq %xmm4,%xmm1,%xmm1
- vpand curve25519_sandy2x_m25(%rip),%xmm0,%xmm0
- vpsrlq $25,%xmm10,%xmm2
- vpaddq %xmm2,%xmm11,%xmm11
- vpand curve25519_sandy2x_m25(%rip),%xmm10,%xmm10
- vpsrlq $26,%xmm1,%xmm2
- vpaddq %xmm2,%xmm3,%xmm3
- vpand curve25519_sandy2x_m26(%rip),%xmm1,%xmm1
- vpunpckhqdq %xmm3,%xmm1,%xmm2
- vpunpcklqdq %xmm3,%xmm1,%xmm1
- vmovdqa %xmm1,464(%rsp)
- vpaddq curve25519_sandy2x_subc0(%rip),%xmm2,%xmm3
- vpsubq %xmm1,%xmm3,%xmm3
- vpunpckhqdq %xmm3,%xmm2,%xmm1
- vpunpcklqdq %xmm3,%xmm2,%xmm2
- vmovdqa %xmm2,480(%rsp)
- vmovdqa %xmm1,512(%rsp)
- vpsllq $1,%xmm1,%xmm1
- vmovdqa %xmm1,528(%rsp)
- vpmuludq curve25519_sandy2x_v121666_121666(%rip),%xmm3,%xmm3
- vmovdqa 80(%rsp),%xmm1
- vpunpcklqdq %xmm1,%xmm3,%xmm2
- vpunpckhqdq %xmm1,%xmm3,%xmm1
- vpunpckhqdq %xmm7,%xmm5,%xmm3
- vpunpcklqdq %xmm7,%xmm5,%xmm4
- vmovdqa %xmm4,544(%rsp)
- vpaddq curve25519_sandy2x_subc2(%rip),%xmm3,%xmm5
- vpsubq %xmm4,%xmm5,%xmm5
- vpunpckhqdq %xmm5,%xmm3,%xmm4
- vpunpcklqdq %xmm5,%xmm3,%xmm3
- vmovdqa %xmm3,560(%rsp)
- vmovdqa %xmm4,576(%rsp)
- vpsllq $1,%xmm4,%xmm4
- vmovdqa %xmm4,592(%rsp)
- vpmuludq curve25519_sandy2x_v121666_121666(%rip),%xmm5,%xmm5
- vmovdqa 96(%rsp),%xmm3
- vpunpcklqdq %xmm3,%xmm5,%xmm4
- vpunpckhqdq %xmm3,%xmm5,%xmm3
- vpunpckhqdq %xmm10,%xmm9,%xmm5
- vpunpcklqdq %xmm10,%xmm9,%xmm6
- vmovdqa %xmm6,608(%rsp)
- vpaddq curve25519_sandy2x_subc2(%rip),%xmm5,%xmm7
- vpsubq %xmm6,%xmm7,%xmm7
- vpunpckhqdq %xmm7,%xmm5,%xmm6
- vpunpcklqdq %xmm7,%xmm5,%xmm5
- vmovdqa %xmm5,624(%rsp)
- vmovdqa %xmm6,640(%rsp)
- vpsllq $1,%xmm6,%xmm6
- vmovdqa %xmm6,656(%rsp)
- vpmuludq curve25519_sandy2x_v121666_121666(%rip),%xmm7,%xmm7
- vmovdqa 112(%rsp),%xmm5
- vpunpcklqdq %xmm5,%xmm7,%xmm6
- vpunpckhqdq %xmm5,%xmm7,%xmm5
- vpunpckhqdq %xmm12,%xmm11,%xmm7
- vpunpcklqdq %xmm12,%xmm11,%xmm8
- vmovdqa %xmm8,672(%rsp)
- vpaddq curve25519_sandy2x_subc2(%rip),%xmm7,%xmm9
- vpsubq %xmm8,%xmm9,%xmm9
- vpunpckhqdq %xmm9,%xmm7,%xmm8
- vpunpcklqdq %xmm9,%xmm7,%xmm7
- vmovdqa %xmm7,688(%rsp)
- vmovdqa %xmm8,704(%rsp)
- vpsllq $1,%xmm8,%xmm8
- vmovdqa %xmm8,720(%rsp)
- vpmuludq curve25519_sandy2x_v121666_121666(%rip),%xmm9,%xmm9
- vmovdqa 448(%rsp),%xmm7
- vpunpcklqdq %xmm7,%xmm9,%xmm8
- vpunpckhqdq %xmm7,%xmm9,%xmm7
- vpunpckhqdq %xmm0,%xmm13,%xmm9
- vpunpcklqdq %xmm0,%xmm13,%xmm0
- vmovdqa %xmm0,448(%rsp)
- vpaddq curve25519_sandy2x_subc2(%rip),%xmm9,%xmm10
- vpsubq %xmm0,%xmm10,%xmm10
- vpunpckhqdq %xmm10,%xmm9,%xmm0
- vpunpcklqdq %xmm10,%xmm9,%xmm9
- vmovdqa %xmm9,736(%rsp)
- vmovdqa %xmm0,752(%rsp)
- vpsllq $1,%xmm0,%xmm0
- vmovdqa %xmm0,768(%rsp)
- vpmuludq curve25519_sandy2x_v121666_121666(%rip),%xmm10,%xmm10
- vmovdqa 496(%rsp),%xmm0
- vpunpcklqdq %xmm0,%xmm10,%xmm9
- vpunpckhqdq %xmm0,%xmm10,%xmm0
- vpsrlq $26,%xmm2,%xmm10
- vpaddq %xmm10,%xmm1,%xmm1
- vpand curve25519_sandy2x_m26(%rip),%xmm2,%xmm2
- vpsrlq $25,%xmm5,%xmm10
- vpaddq %xmm10,%xmm8,%xmm8
- vpand curve25519_sandy2x_m25(%rip),%xmm5,%xmm5
- vpsrlq $25,%xmm1,%xmm10
- vpaddq %xmm10,%xmm4,%xmm4
- vpand curve25519_sandy2x_m25(%rip),%xmm1,%xmm1
- vpsrlq $26,%xmm8,%xmm10
- vpaddq %xmm10,%xmm7,%xmm7
- vpand curve25519_sandy2x_m26(%rip),%xmm8,%xmm8
- vpsrlq $26,%xmm4,%xmm10
- vpaddq %xmm10,%xmm3,%xmm3
- vpand curve25519_sandy2x_m26(%rip),%xmm4,%xmm4
- vpsrlq $25,%xmm7,%xmm10
- vpaddq %xmm10,%xmm9,%xmm9
- vpand curve25519_sandy2x_m25(%rip),%xmm7,%xmm7
- vpsrlq $25,%xmm3,%xmm10
- vpaddq %xmm10,%xmm6,%xmm6
- vpand curve25519_sandy2x_m25(%rip),%xmm3,%xmm3
- vpsrlq $26,%xmm9,%xmm10
- vpaddq %xmm10,%xmm0,%xmm0
- vpand curve25519_sandy2x_m26(%rip),%xmm9,%xmm9
- vpsrlq $26,%xmm6,%xmm10
- vpaddq %xmm10,%xmm5,%xmm5
- vpand curve25519_sandy2x_m26(%rip),%xmm6,%xmm6
- vpsrlq $25,%xmm0,%xmm10
- vpsllq $4,%xmm10,%xmm11
- vpaddq %xmm10,%xmm2,%xmm2
- vpsllq $1,%xmm10,%xmm10
- vpaddq %xmm10,%xmm11,%xmm11
- vpaddq %xmm11,%xmm2,%xmm2
- vpand curve25519_sandy2x_m25(%rip),%xmm0,%xmm0
- vpsrlq $25,%xmm5,%xmm10
- vpaddq %xmm10,%xmm8,%xmm8
- vpand curve25519_sandy2x_m25(%rip),%xmm5,%xmm5
- vpsrlq $26,%xmm2,%xmm10
- vpaddq %xmm10,%xmm1,%xmm1
- vpand curve25519_sandy2x_m26(%rip),%xmm2,%xmm2
- vpunpckhqdq %xmm1,%xmm2,%xmm10
- vmovdqa %xmm10,80(%rsp)
- vpunpcklqdq %xmm1,%xmm2,%xmm1
- vpunpckhqdq %xmm3,%xmm4,%xmm2
- vmovdqa %xmm2,96(%rsp)
- vpunpcklqdq %xmm3,%xmm4,%xmm2
- vpunpckhqdq %xmm5,%xmm6,%xmm3
- vmovdqa %xmm3,112(%rsp)
- vpunpcklqdq %xmm5,%xmm6,%xmm3
- vpunpckhqdq %xmm7,%xmm8,%xmm4
- vmovdqa %xmm4,128(%rsp)
- vpunpcklqdq %xmm7,%xmm8,%xmm4
- vpunpckhqdq %xmm0,%xmm9,%xmm5
- vmovdqa %xmm5,144(%rsp)
- vpunpcklqdq %xmm0,%xmm9,%xmm0
- vmovdqa 464(%rsp),%xmm5
- vpaddq %xmm5,%xmm1,%xmm1
- vpunpcklqdq %xmm1,%xmm5,%xmm6
- vpunpckhqdq %xmm1,%xmm5,%xmm1
- vpmuludq 512(%rsp),%xmm6,%xmm5
- vpmuludq 480(%rsp),%xmm1,%xmm7
- vpaddq %xmm7,%xmm5,%xmm5
- vpmuludq 560(%rsp),%xmm6,%xmm7
- vpmuludq 528(%rsp),%xmm1,%xmm8
- vpaddq %xmm8,%xmm7,%xmm7
- vpmuludq 576(%rsp),%xmm6,%xmm8
- vpmuludq 560(%rsp),%xmm1,%xmm9
- vpaddq %xmm9,%xmm8,%xmm8
- vpmuludq 624(%rsp),%xmm6,%xmm9
- vpmuludq 592(%rsp),%xmm1,%xmm10
- vpaddq %xmm10,%xmm9,%xmm9
- vpmuludq 640(%rsp),%xmm6,%xmm10
- vpmuludq 624(%rsp),%xmm1,%xmm11
- vpaddq %xmm11,%xmm10,%xmm10
- vpmuludq 688(%rsp),%xmm6,%xmm11
- vpmuludq 656(%rsp),%xmm1,%xmm12
- vpaddq %xmm12,%xmm11,%xmm11
- vpmuludq 704(%rsp),%xmm6,%xmm12
- vpmuludq 688(%rsp),%xmm1,%xmm13
- vpaddq %xmm13,%xmm12,%xmm12
- vpmuludq 736(%rsp),%xmm6,%xmm13
- vpmuludq 720(%rsp),%xmm1,%xmm14
- vpaddq %xmm14,%xmm13,%xmm13
- vpmuludq 752(%rsp),%xmm6,%xmm14
- vpmuludq 736(%rsp),%xmm1,%xmm15
- vpaddq %xmm15,%xmm14,%xmm14
- vpmuludq 480(%rsp),%xmm6,%xmm6
- vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm1,%xmm1
- vpmuludq 768(%rsp),%xmm1,%xmm1
- vpaddq %xmm1,%xmm6,%xmm6
- vmovdqa 544(%rsp),%xmm1
- vpaddq %xmm1,%xmm2,%xmm2
- vpunpcklqdq %xmm2,%xmm1,%xmm15
- vpunpckhqdq %xmm2,%xmm1,%xmm1
- vpmuludq 480(%rsp),%xmm15,%xmm2
- vpaddq %xmm2,%xmm7,%xmm7
- vpmuludq 512(%rsp),%xmm15,%xmm2
- vpaddq %xmm2,%xmm8,%xmm8
- vpmuludq 560(%rsp),%xmm15,%xmm2
- vpaddq %xmm2,%xmm9,%xmm9
- vpmuludq 576(%rsp),%xmm15,%xmm2
- vpaddq %xmm2,%xmm10,%xmm10
- vpmuludq 624(%rsp),%xmm15,%xmm2
- vpaddq %xmm2,%xmm11,%xmm11
- vpmuludq 640(%rsp),%xmm15,%xmm2
- vpaddq %xmm2,%xmm12,%xmm12
- vpmuludq 688(%rsp),%xmm15,%xmm2
- vpaddq %xmm2,%xmm13,%xmm13
- vpmuludq 704(%rsp),%xmm15,%xmm2
- vpaddq %xmm2,%xmm14,%xmm14
- vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm15,%xmm15
- vpmuludq 736(%rsp),%xmm15,%xmm2
- vpaddq %xmm2,%xmm6,%xmm6
- vpmuludq 752(%rsp),%xmm15,%xmm15
- vpaddq %xmm15,%xmm5,%xmm5
- vpmuludq 480(%rsp),%xmm1,%xmm2
- vpaddq %xmm2,%xmm8,%xmm8
- vpmuludq 528(%rsp),%xmm1,%xmm2
- vpaddq %xmm2,%xmm9,%xmm9
- vpmuludq 560(%rsp),%xmm1,%xmm2
- vpaddq %xmm2,%xmm10,%xmm10
- vpmuludq 592(%rsp),%xmm1,%xmm2
- vpaddq %xmm2,%xmm11,%xmm11
- vpmuludq 624(%rsp),%xmm1,%xmm2
- vpaddq %xmm2,%xmm12,%xmm12
- vpmuludq 656(%rsp),%xmm1,%xmm2
- vpaddq %xmm2,%xmm13,%xmm13
- vpmuludq 688(%rsp),%xmm1,%xmm2
- vpaddq %xmm2,%xmm14,%xmm14
- vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm1,%xmm1
- vpmuludq 720(%rsp),%xmm1,%xmm2
- vpaddq %xmm2,%xmm6,%xmm6
- vpmuludq 736(%rsp),%xmm1,%xmm2
- vpaddq %xmm2,%xmm5,%xmm5
- vpmuludq 768(%rsp),%xmm1,%xmm1
- vpaddq %xmm1,%xmm7,%xmm7
- vmovdqa 608(%rsp),%xmm1
- vpaddq %xmm1,%xmm3,%xmm3
- vpunpcklqdq %xmm3,%xmm1,%xmm2
- vpunpckhqdq %xmm3,%xmm1,%xmm1
- vpmuludq 480(%rsp),%xmm2,%xmm3
- vpaddq %xmm3,%xmm9,%xmm9
- vpmuludq 512(%rsp),%xmm2,%xmm3
- vpaddq %xmm3,%xmm10,%xmm10
- vpmuludq 560(%rsp),%xmm2,%xmm3
- vpaddq %xmm3,%xmm11,%xmm11
- vpmuludq 576(%rsp),%xmm2,%xmm3
- vpaddq %xmm3,%xmm12,%xmm12
- vpmuludq 624(%rsp),%xmm2,%xmm3
- vpaddq %xmm3,%xmm13,%xmm13
- vpmuludq 640(%rsp),%xmm2,%xmm3
- vpaddq %xmm3,%xmm14,%xmm14
- vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm2,%xmm2
- vpmuludq 688(%rsp),%xmm2,%xmm3
- vpaddq %xmm3,%xmm6,%xmm6
- vpmuludq 704(%rsp),%xmm2,%xmm3
- vpaddq %xmm3,%xmm5,%xmm5
- vpmuludq 736(%rsp),%xmm2,%xmm3
- vpaddq %xmm3,%xmm7,%xmm7
- vpmuludq 752(%rsp),%xmm2,%xmm2
- vpaddq %xmm2,%xmm8,%xmm8
- vpmuludq 480(%rsp),%xmm1,%xmm2
- vpaddq %xmm2,%xmm10,%xmm10
- vpmuludq 528(%rsp),%xmm1,%xmm2
- vpaddq %xmm2,%xmm11,%xmm11
- vpmuludq 560(%rsp),%xmm1,%xmm2
- vpaddq %xmm2,%xmm12,%xmm12
- vpmuludq 592(%rsp),%xmm1,%xmm2
- vpaddq %xmm2,%xmm13,%xmm13
- vpmuludq 624(%rsp),%xmm1,%xmm2
- vpaddq %xmm2,%xmm14,%xmm14
- vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm1,%xmm1
- vpmuludq 656(%rsp),%xmm1,%xmm2
- vpaddq %xmm2,%xmm6,%xmm6
- vpmuludq 688(%rsp),%xmm1,%xmm2
- vpaddq %xmm2,%xmm5,%xmm5
- vpmuludq 720(%rsp),%xmm1,%xmm2
- vpaddq %xmm2,%xmm7,%xmm7
- vpmuludq 736(%rsp),%xmm1,%xmm2
- vpaddq %xmm2,%xmm8,%xmm8
- vpmuludq 768(%rsp),%xmm1,%xmm1
- vpaddq %xmm1,%xmm9,%xmm9
- vmovdqa 672(%rsp),%xmm1
- vpaddq %xmm1,%xmm4,%xmm4
- vpunpcklqdq %xmm4,%xmm1,%xmm2
- vpunpckhqdq %xmm4,%xmm1,%xmm1
- vpmuludq 480(%rsp),%xmm2,%xmm3
- vpaddq %xmm3,%xmm11,%xmm11
- vpmuludq 512(%rsp),%xmm2,%xmm3
- vpaddq %xmm3,%xmm12,%xmm12
- vpmuludq 560(%rsp),%xmm2,%xmm3
- vpaddq %xmm3,%xmm13,%xmm13
- vpmuludq 576(%rsp),%xmm2,%xmm3
- vpaddq %xmm3,%xmm14,%xmm14
- vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm2,%xmm2
- vpmuludq 624(%rsp),%xmm2,%xmm3
- vpaddq %xmm3,%xmm6,%xmm6
- vpmuludq 640(%rsp),%xmm2,%xmm3
- vpaddq %xmm3,%xmm5,%xmm5
- vpmuludq 688(%rsp),%xmm2,%xmm3
- vpaddq %xmm3,%xmm7,%xmm7
- vpmuludq 704(%rsp),%xmm2,%xmm3
- vpaddq %xmm3,%xmm8,%xmm8
- vpmuludq 736(%rsp),%xmm2,%xmm3
- vpaddq %xmm3,%xmm9,%xmm9
- vpmuludq 752(%rsp),%xmm2,%xmm2
- vpaddq %xmm2,%xmm10,%xmm10
- vpmuludq 480(%rsp),%xmm1,%xmm2
- vpaddq %xmm2,%xmm12,%xmm12
- vpmuludq 528(%rsp),%xmm1,%xmm2
- vpaddq %xmm2,%xmm13,%xmm13
- vpmuludq 560(%rsp),%xmm1,%xmm2
- vpaddq %xmm2,%xmm14,%xmm14
- vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm1,%xmm1
- vpmuludq 592(%rsp),%xmm1,%xmm2
- vpaddq %xmm2,%xmm6,%xmm6
- vpmuludq 624(%rsp),%xmm1,%xmm2
- vpaddq %xmm2,%xmm5,%xmm5
- vpmuludq 656(%rsp),%xmm1,%xmm2
- vpaddq %xmm2,%xmm7,%xmm7
- vpmuludq 688(%rsp),%xmm1,%xmm2
- vpaddq %xmm2,%xmm8,%xmm8
- vpmuludq 720(%rsp),%xmm1,%xmm2
- vpaddq %xmm2,%xmm9,%xmm9
- vpmuludq 736(%rsp),%xmm1,%xmm2
- vpaddq %xmm2,%xmm10,%xmm10
- vpmuludq 768(%rsp),%xmm1,%xmm1
- vpaddq %xmm1,%xmm11,%xmm11
- vmovdqa 448(%rsp),%xmm1
- vpaddq %xmm1,%xmm0,%xmm0
- vpunpcklqdq %xmm0,%xmm1,%xmm2
- vpunpckhqdq %xmm0,%xmm1,%xmm0
- vpmuludq 480(%rsp),%xmm2,%xmm1
- vpaddq %xmm1,%xmm13,%xmm13
- vpmuludq 512(%rsp),%xmm2,%xmm1
- vpaddq %xmm1,%xmm14,%xmm14
- vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm2,%xmm2
- vpmuludq 560(%rsp),%xmm2,%xmm1
- vpaddq %xmm1,%xmm6,%xmm6
- vpmuludq 576(%rsp),%xmm2,%xmm1
- vpaddq %xmm1,%xmm5,%xmm5
- vpmuludq 624(%rsp),%xmm2,%xmm1
- vpaddq %xmm1,%xmm7,%xmm7
- vpmuludq 640(%rsp),%xmm2,%xmm1
- vpaddq %xmm1,%xmm8,%xmm8
- vpmuludq 688(%rsp),%xmm2,%xmm1
- vpaddq %xmm1,%xmm9,%xmm9
- vpmuludq 704(%rsp),%xmm2,%xmm1
- vpaddq %xmm1,%xmm10,%xmm10
- vpmuludq 736(%rsp),%xmm2,%xmm1
- vpaddq %xmm1,%xmm11,%xmm11
- vpmuludq 752(%rsp),%xmm2,%xmm2
- vpaddq %xmm2,%xmm12,%xmm12
- vpmuludq 480(%rsp),%xmm0,%xmm1
- vpaddq %xmm1,%xmm14,%xmm14
- vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm0,%xmm0
- vpmuludq 528(%rsp),%xmm0,%xmm1
- vpaddq %xmm1,%xmm6,%xmm6
- vpmuludq 560(%rsp),%xmm0,%xmm1
- vpaddq %xmm1,%xmm5,%xmm5
- vpmuludq 592(%rsp),%xmm0,%xmm1
- vpaddq %xmm1,%xmm7,%xmm7
- vpmuludq 624(%rsp),%xmm0,%xmm1
- vpaddq %xmm1,%xmm8,%xmm8
- vpmuludq 656(%rsp),%xmm0,%xmm1
- vpaddq %xmm1,%xmm9,%xmm9
- vpmuludq 688(%rsp),%xmm0,%xmm1
- vpaddq %xmm1,%xmm10,%xmm10
- vpmuludq 720(%rsp),%xmm0,%xmm1
- vpaddq %xmm1,%xmm11,%xmm11
- vpmuludq 736(%rsp),%xmm0,%xmm1
- vpaddq %xmm1,%xmm12,%xmm12
- vpmuludq 768(%rsp),%xmm0,%xmm0
- vpaddq %xmm0,%xmm13,%xmm13
- vpsrlq $26,%xmm6,%xmm0
- vpaddq %xmm0,%xmm5,%xmm5
- vpand curve25519_sandy2x_m26(%rip),%xmm6,%xmm6
- vpsrlq $25,%xmm10,%xmm0
- vpaddq %xmm0,%xmm11,%xmm11
- vpand curve25519_sandy2x_m25(%rip),%xmm10,%xmm10
- vpsrlq $25,%xmm5,%xmm0
- vpaddq %xmm0,%xmm7,%xmm7
- vpand curve25519_sandy2x_m25(%rip),%xmm5,%xmm5
- vpsrlq $26,%xmm11,%xmm0
- vpaddq %xmm0,%xmm12,%xmm12
- vpand curve25519_sandy2x_m26(%rip),%xmm11,%xmm11
- vpsrlq $26,%xmm7,%xmm0
- vpaddq %xmm0,%xmm8,%xmm8
- vpand curve25519_sandy2x_m26(%rip),%xmm7,%xmm7
- vpsrlq $25,%xmm12,%xmm0
- vpaddq %xmm0,%xmm13,%xmm13
- vpand curve25519_sandy2x_m25(%rip),%xmm12,%xmm12
- vpsrlq $25,%xmm8,%xmm0
- vpaddq %xmm0,%xmm9,%xmm9
- vpand curve25519_sandy2x_m25(%rip),%xmm8,%xmm8
- vpsrlq $26,%xmm13,%xmm0
- vpaddq %xmm0,%xmm14,%xmm14
- vpand curve25519_sandy2x_m26(%rip),%xmm13,%xmm13
- vpsrlq $26,%xmm9,%xmm0
- vpaddq %xmm0,%xmm10,%xmm10
- vpand curve25519_sandy2x_m26(%rip),%xmm9,%xmm9
- vpsrlq $25,%xmm14,%xmm0
- vpsllq $4,%xmm0,%xmm1
- vpaddq %xmm0,%xmm6,%xmm6
- vpsllq $1,%xmm0,%xmm0
- vpaddq %xmm0,%xmm1,%xmm1
- vpaddq %xmm1,%xmm6,%xmm6
- vpand curve25519_sandy2x_m25(%rip),%xmm14,%xmm14
- vpsrlq $25,%xmm10,%xmm0
- vpaddq %xmm0,%xmm11,%xmm11
- vpand curve25519_sandy2x_m25(%rip),%xmm10,%xmm10
- vpsrlq $26,%xmm6,%xmm0
- vpaddq %xmm0,%xmm5,%xmm5
- vpand curve25519_sandy2x_m26(%rip),%xmm6,%xmm6
- vpunpckhqdq %xmm5,%xmm6,%xmm1
- vpunpcklqdq %xmm5,%xmm6,%xmm0
- vpunpckhqdq %xmm8,%xmm7,%xmm3
- vpunpcklqdq %xmm8,%xmm7,%xmm2
- vpunpckhqdq %xmm10,%xmm9,%xmm5
- vpunpcklqdq %xmm10,%xmm9,%xmm4
- vpunpckhqdq %xmm12,%xmm11,%xmm7
- vpunpcklqdq %xmm12,%xmm11,%xmm6
- vpunpckhqdq %xmm14,%xmm13,%xmm9
- vpunpcklqdq %xmm14,%xmm13,%xmm8
- cmp $0,%rdx
- jne .Lladder_loop
- vmovdqu %xmm1,160(%rdi)
- vmovdqu %xmm0,80(%rdi)
- vmovdqu %xmm3,176(%rdi)
- vmovdqu %xmm2,96(%rdi)
- vmovdqu %xmm5,192(%rdi)
- vmovdqu %xmm4,112(%rdi)
- vmovdqu %xmm7,208(%rdi)
- vmovdqu %xmm6,128(%rdi)
- vmovdqu %xmm9,224(%rdi)
- vmovdqu %xmm8,144(%rdi)
- movq 1824(%rsp),%r11
- movq 1832(%rsp),%r12
- movq 1840(%rsp),%r13
- movq 1848(%rsp),%r14
- leave
- ret
-ENDPROC(curve25519_sandy2x_ladder)
-
-.align 32
-ENTRY(curve25519_sandy2x_ladder_base)
- push %rbp
- mov %rsp,%rbp
- sub $1568,%rsp
- and $-32,%rsp
- movq %r11,1536(%rsp)
- movq %r12,1544(%rsp)
- movq %r13,1552(%rsp)
- vmovdqa curve25519_sandy2x_v0_0(%rip),%xmm0
- vmovdqa curve25519_sandy2x_v1_0(%rip),%xmm1
- vmovdqa curve25519_sandy2x_v9_0(%rip),%xmm2
- vmovdqa %xmm2,0(%rsp)
- vmovdqa %xmm0,16(%rsp)
- vmovdqa %xmm0,32(%rsp)
- vmovdqa %xmm0,48(%rsp)
- vmovdqa %xmm0,64(%rsp)
- vmovdqa %xmm1,80(%rsp)
- vmovdqa %xmm0,96(%rsp)
- vmovdqa %xmm0,112(%rsp)
- vmovdqa %xmm0,128(%rsp)
- vmovdqa %xmm0,144(%rsp)
- vmovdqa %xmm1,%xmm0
- vpxor %xmm1,%xmm1,%xmm1
- vpxor %xmm2,%xmm2,%xmm2
- vpxor %xmm3,%xmm3,%xmm3
- vpxor %xmm4,%xmm4,%xmm4
- vpxor %xmm5,%xmm5,%xmm5
- vpxor %xmm6,%xmm6,%xmm6
- vpxor %xmm7,%xmm7,%xmm7
- vpxor %xmm8,%xmm8,%xmm8
- vpxor %xmm9,%xmm9,%xmm9
- movq 0(%rsi),%rdx
- movq 8(%rsi),%rcx
- movq 16(%rsi),%r8
- movq 24(%rsi),%r9
- shrd $1,%rcx,%rdx
- shrd $1,%r8,%rcx
- shrd $1,%r9,%r8
- shr $1,%r9
- xorq 0(%rsi),%rdx
- xorq 8(%rsi),%rcx
- xorq 16(%rsi),%r8
- xorq 24(%rsi),%r9
- leaq 512(%rsp),%rsi
- mov $64,%rax
-
- .align 16
- .Lladder_base_small_loop:
- mov %rdx,%r10
- mov %rcx,%r11
- mov %r8,%r12
- mov %r9,%r13
- shr $1,%rdx
- shr $1,%rcx
- shr $1,%r8
- shr $1,%r9
- and $1,%r10d
- and $1,%r11d
- and $1,%r12d
- and $1,%r13d
- neg %r10
- neg %r11
- neg %r12
- neg %r13
- movl %r10d,0(%rsi)
- movl %r11d,256(%rsi)
- movl %r12d,512(%rsi)
- movl %r13d,768(%rsi)
- add $4,%rsi
- sub $1,%rax
- jne .Lladder_base_small_loop
- mov $255,%rdx
- add $760,%rsi
-
- .align 16
- .Lladder_base_loop:
- sub $1,%rdx
- vbroadcastss 0(%rsi),%xmm10
- sub $4,%rsi
- vmovdqa 0(%rsp),%xmm11
- vmovdqa 80(%rsp),%xmm12
- vpxor %xmm11,%xmm0,%xmm13
- vpand %xmm10,%xmm13,%xmm13
- vpxor %xmm13,%xmm0,%xmm0
- vpxor %xmm13,%xmm11,%xmm11
- vpxor %xmm12,%xmm1,%xmm13
- vpand %xmm10,%xmm13,%xmm13
- vpxor %xmm13,%xmm1,%xmm1
- vpxor %xmm13,%xmm12,%xmm12
- vmovdqa 16(%rsp),%xmm13
- vmovdqa 96(%rsp),%xmm14
- vpxor %xmm13,%xmm2,%xmm15
- vpand %xmm10,%xmm15,%xmm15
- vpxor %xmm15,%xmm2,%xmm2
- vpxor %xmm15,%xmm13,%xmm13
- vpxor %xmm14,%xmm3,%xmm15
- vpand %xmm10,%xmm15,%xmm15
- vpxor %xmm15,%xmm3,%xmm3
- vpxor %xmm15,%xmm14,%xmm14
- vmovdqa %xmm13,0(%rsp)
- vmovdqa %xmm14,16(%rsp)
- vmovdqa 32(%rsp),%xmm13
- vmovdqa 112(%rsp),%xmm14
- vpxor %xmm13,%xmm4,%xmm15
- vpand %xmm10,%xmm15,%xmm15
- vpxor %xmm15,%xmm4,%xmm4
- vpxor %xmm15,%xmm13,%xmm13
- vpxor %xmm14,%xmm5,%xmm15
- vpand %xmm10,%xmm15,%xmm15
- vpxor %xmm15,%xmm5,%xmm5
- vpxor %xmm15,%xmm14,%xmm14
- vmovdqa %xmm13,32(%rsp)
- vmovdqa %xmm14,80(%rsp)
- vmovdqa 48(%rsp),%xmm13
- vmovdqa 128(%rsp),%xmm14
- vpxor %xmm13,%xmm6,%xmm15
- vpand %xmm10,%xmm15,%xmm15
- vpxor %xmm15,%xmm6,%xmm6
- vpxor %xmm15,%xmm13,%xmm13
- vpxor %xmm14,%xmm7,%xmm15
- vpand %xmm10,%xmm15,%xmm15
- vpxor %xmm15,%xmm7,%xmm7
- vpxor %xmm15,%xmm14,%xmm14
- vmovdqa %xmm13,48(%rsp)
- vmovdqa %xmm14,96(%rsp)
- vmovdqa 64(%rsp),%xmm13
- vmovdqa 144(%rsp),%xmm14
- vpxor %xmm13,%xmm8,%xmm15
- vpand %xmm10,%xmm15,%xmm15
- vpxor %xmm15,%xmm8,%xmm8
- vpxor %xmm15,%xmm13,%xmm13
- vpxor %xmm14,%xmm9,%xmm15
- vpand %xmm10,%xmm15,%xmm15
- vpxor %xmm15,%xmm9,%xmm9
- vpxor %xmm15,%xmm14,%xmm14
- vmovdqa %xmm13,64(%rsp)
- vmovdqa %xmm14,112(%rsp)
- vpaddq curve25519_sandy2x_subc0(%rip),%xmm11,%xmm10
- vpsubq %xmm12,%xmm10,%xmm10
- vpaddq %xmm12,%xmm11,%xmm11
- vpunpckhqdq %xmm10,%xmm11,%xmm12
- vpunpcklqdq %xmm10,%xmm11,%xmm10
- vpaddq %xmm1,%xmm0,%xmm11
- vpaddq curve25519_sandy2x_subc0(%rip),%xmm0,%xmm0
- vpsubq %xmm1,%xmm0,%xmm0
- vpunpckhqdq %xmm11,%xmm0,%xmm1
- vpunpcklqdq %xmm11,%xmm0,%xmm0
- vpmuludq %xmm0,%xmm10,%xmm11
- vpmuludq %xmm1,%xmm10,%xmm13
- vmovdqa %xmm1,128(%rsp)
- vpaddq %xmm1,%xmm1,%xmm1
- vpmuludq %xmm0,%xmm12,%xmm14
- vmovdqa %xmm0,144(%rsp)
- vpaddq %xmm14,%xmm13,%xmm13
- vpmuludq %xmm1,%xmm12,%xmm0
- vmovdqa %xmm1,160(%rsp)
- vpaddq %xmm3,%xmm2,%xmm1
- vpaddq curve25519_sandy2x_subc2(%rip),%xmm2,%xmm2
- vpsubq %xmm3,%xmm2,%xmm2
- vpunpckhqdq %xmm1,%xmm2,%xmm3
- vpunpcklqdq %xmm1,%xmm2,%xmm1
- vpmuludq %xmm1,%xmm10,%xmm2
- vpaddq %xmm2,%xmm0,%xmm0
- vpmuludq %xmm3,%xmm10,%xmm2
- vmovdqa %xmm3,176(%rsp)
- vpaddq %xmm3,%xmm3,%xmm3
- vpmuludq %xmm1,%xmm12,%xmm14
- vmovdqa %xmm1,192(%rsp)
- vpaddq %xmm14,%xmm2,%xmm2
- vpmuludq %xmm3,%xmm12,%xmm1
- vmovdqa %xmm3,208(%rsp)
- vpaddq %xmm5,%xmm4,%xmm3
- vpaddq curve25519_sandy2x_subc2(%rip),%xmm4,%xmm4
- vpsubq %xmm5,%xmm4,%xmm4
- vpunpckhqdq %xmm3,%xmm4,%xmm5
- vpunpcklqdq %xmm3,%xmm4,%xmm3
- vpmuludq %xmm3,%xmm10,%xmm4
- vpaddq %xmm4,%xmm1,%xmm1
- vpmuludq %xmm5,%xmm10,%xmm4
- vmovdqa %xmm5,224(%rsp)
- vpaddq %xmm5,%xmm5,%xmm5
- vpmuludq %xmm3,%xmm12,%xmm14
- vmovdqa %xmm3,240(%rsp)
- vpaddq %xmm14,%xmm4,%xmm4
- vpaddq %xmm7,%xmm6,%xmm3
- vpaddq curve25519_sandy2x_subc2(%rip),%xmm6,%xmm6
- vpsubq %xmm7,%xmm6,%xmm6
- vpunpckhqdq %xmm3,%xmm6,%xmm7
- vpunpcklqdq %xmm3,%xmm6,%xmm3
- vpmuludq %xmm3,%xmm10,%xmm6
- vpmuludq %xmm5,%xmm12,%xmm14
- vmovdqa %xmm5,256(%rsp)
- vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm5,%xmm5
- vmovdqa %xmm5,272(%rsp)
- vpaddq %xmm14,%xmm6,%xmm6
- vpmuludq %xmm7,%xmm10,%xmm5
- vmovdqa %xmm7,288(%rsp)
- vpaddq %xmm7,%xmm7,%xmm7
- vpmuludq %xmm3,%xmm12,%xmm14
- vmovdqa %xmm3,304(%rsp)
- vpaddq %xmm14,%xmm5,%xmm5
- vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm3,%xmm3
- vmovdqa %xmm3,320(%rsp)
- vpaddq %xmm9,%xmm8,%xmm3
- vpaddq curve25519_sandy2x_subc2(%rip),%xmm8,%xmm8
- vpsubq %xmm9,%xmm8,%xmm8
- vpunpckhqdq %xmm3,%xmm8,%xmm9
- vpunpcklqdq %xmm3,%xmm8,%xmm3
- vmovdqa %xmm3,336(%rsp)
- vpmuludq %xmm7,%xmm12,%xmm8
- vmovdqa %xmm7,352(%rsp)
- vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm7,%xmm7
- vmovdqa %xmm7,368(%rsp)
- vpmuludq %xmm3,%xmm10,%xmm7
- vpaddq %xmm7,%xmm8,%xmm8
- vpmuludq %xmm9,%xmm10,%xmm7
- vmovdqa %xmm9,384(%rsp)
- vpaddq %xmm9,%xmm9,%xmm9
- vpmuludq %xmm3,%xmm12,%xmm10
- vpaddq %xmm10,%xmm7,%xmm7
- vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm3,%xmm3
- vmovdqa %xmm3,400(%rsp)
- vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm12,%xmm12
- vpmuludq %xmm9,%xmm12,%xmm3
- vmovdqa %xmm9,416(%rsp)
- vpaddq %xmm3,%xmm11,%xmm11
- vmovdqa 0(%rsp),%xmm3
- vmovdqa 16(%rsp),%xmm9
- vpaddq curve25519_sandy2x_subc2(%rip),%xmm3,%xmm10
- vpsubq %xmm9,%xmm10,%xmm10
- vpaddq %xmm9,%xmm3,%xmm3
- vpunpckhqdq %xmm10,%xmm3,%xmm9
- vpunpcklqdq %xmm10,%xmm3,%xmm3
- vpmuludq 144(%rsp),%xmm3,%xmm10
- vpaddq %xmm10,%xmm0,%xmm0
- vpmuludq 128(%rsp),%xmm3,%xmm10
- vpaddq %xmm10,%xmm2,%xmm2
- vpmuludq 192(%rsp),%xmm3,%xmm10
- vpaddq %xmm10,%xmm1,%xmm1
- vpmuludq 176(%rsp),%xmm3,%xmm10
- vpaddq %xmm10,%xmm4,%xmm4
- vpmuludq 240(%rsp),%xmm3,%xmm10
- vpaddq %xmm10,%xmm6,%xmm6
- vpmuludq 224(%rsp),%xmm3,%xmm10
- vpaddq %xmm10,%xmm5,%xmm5
- vpmuludq 304(%rsp),%xmm3,%xmm10
- vpaddq %xmm10,%xmm8,%xmm8
- vpmuludq 288(%rsp),%xmm3,%xmm10
- vpaddq %xmm10,%xmm7,%xmm7
- vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm3,%xmm3
- vpmuludq 336(%rsp),%xmm3,%xmm10
- vpaddq %xmm10,%xmm11,%xmm11
- vpmuludq 384(%rsp),%xmm3,%xmm3
- vpaddq %xmm3,%xmm13,%xmm13
- vpmuludq 144(%rsp),%xmm9,%xmm3
- vpaddq %xmm3,%xmm2,%xmm2
- vpmuludq 160(%rsp),%xmm9,%xmm3
- vpaddq %xmm3,%xmm1,%xmm1
- vpmuludq 192(%rsp),%xmm9,%xmm3
- vpaddq %xmm3,%xmm4,%xmm4
- vpmuludq 208(%rsp),%xmm9,%xmm3
- vpaddq %xmm3,%xmm6,%xmm6
- vpmuludq 240(%rsp),%xmm9,%xmm3
- vpaddq %xmm3,%xmm5,%xmm5
- vpmuludq 256(%rsp),%xmm9,%xmm3
- vpaddq %xmm3,%xmm8,%xmm8
- vpmuludq 304(%rsp),%xmm9,%xmm3
- vpaddq %xmm3,%xmm7,%xmm7
- vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm9,%xmm9
- vpmuludq 352(%rsp),%xmm9,%xmm3
- vpaddq %xmm3,%xmm11,%xmm11
- vpmuludq 336(%rsp),%xmm9,%xmm3
- vpaddq %xmm3,%xmm13,%xmm13
- vpmuludq 416(%rsp),%xmm9,%xmm9
- vpaddq %xmm9,%xmm0,%xmm0
- vmovdqa 32(%rsp),%xmm3
- vmovdqa 80(%rsp),%xmm9
- vpaddq curve25519_sandy2x_subc2(%rip),%xmm3,%xmm10
- vpsubq %xmm9,%xmm10,%xmm10
- vpaddq %xmm9,%xmm3,%xmm3
- vpunpckhqdq %xmm10,%xmm3,%xmm9
- vpunpcklqdq %xmm10,%xmm3,%xmm3
- vpmuludq 144(%rsp),%xmm3,%xmm10
- vpaddq %xmm10,%xmm1,%xmm1
- vpmuludq 128(%rsp),%xmm3,%xmm10
- vpaddq %xmm10,%xmm4,%xmm4
- vpmuludq 192(%rsp),%xmm3,%xmm10
- vpaddq %xmm10,%xmm6,%xmm6
- vpmuludq 176(%rsp),%xmm3,%xmm10
- vpaddq %xmm10,%xmm5,%xmm5
- vpmuludq 240(%rsp),%xmm3,%xmm10
- vpaddq %xmm10,%xmm8,%xmm8
- vpmuludq 224(%rsp),%xmm3,%xmm10
- vpaddq %xmm10,%xmm7,%xmm7
- vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm3,%xmm3
- vpmuludq 304(%rsp),%xmm3,%xmm10
- vpaddq %xmm10,%xmm11,%xmm11
- vpmuludq 288(%rsp),%xmm3,%xmm10
- vpaddq %xmm10,%xmm13,%xmm13
- vpmuludq 336(%rsp),%xmm3,%xmm10
- vpaddq %xmm10,%xmm0,%xmm0
- vpmuludq 384(%rsp),%xmm3,%xmm3
- vpaddq %xmm3,%xmm2,%xmm2
- vpmuludq 144(%rsp),%xmm9,%xmm3
- vpaddq %xmm3,%xmm4,%xmm4
- vpmuludq 160(%rsp),%xmm9,%xmm3
- vpaddq %xmm3,%xmm6,%xmm6
- vpmuludq 192(%rsp),%xmm9,%xmm3
- vpaddq %xmm3,%xmm5,%xmm5
- vpmuludq 208(%rsp),%xmm9,%xmm3
- vpaddq %xmm3,%xmm8,%xmm8
- vpmuludq 240(%rsp),%xmm9,%xmm3
- vpaddq %xmm3,%xmm7,%xmm7
- vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm9,%xmm9
- vpmuludq 256(%rsp),%xmm9,%xmm3
- vpaddq %xmm3,%xmm11,%xmm11
- vpmuludq 304(%rsp),%xmm9,%xmm3
- vpaddq %xmm3,%xmm13,%xmm13
- vpmuludq 352(%rsp),%xmm9,%xmm3
- vpaddq %xmm3,%xmm0,%xmm0
- vpmuludq 336(%rsp),%xmm9,%xmm3
- vpaddq %xmm3,%xmm2,%xmm2
- vpmuludq 416(%rsp),%xmm9,%xmm9
- vpaddq %xmm9,%xmm1,%xmm1
- vmovdqa 48(%rsp),%xmm3
- vmovdqa 96(%rsp),%xmm9
- vpaddq curve25519_sandy2x_subc2(%rip),%xmm3,%xmm10
- vpsubq %xmm9,%xmm10,%xmm10
- vpaddq %xmm9,%xmm3,%xmm3
- vpunpckhqdq %xmm10,%xmm3,%xmm9
- vpunpcklqdq %xmm10,%xmm3,%xmm3
- vpmuludq 144(%rsp),%xmm3,%xmm10
- vpaddq %xmm10,%xmm6,%xmm6
- vpmuludq 128(%rsp),%xmm3,%xmm10
- vpaddq %xmm10,%xmm5,%xmm5
- vpmuludq 192(%rsp),%xmm3,%xmm10
- vpaddq %xmm10,%xmm8,%xmm8
- vpmuludq 176(%rsp),%xmm3,%xmm10
- vpaddq %xmm10,%xmm7,%xmm7
- vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm3,%xmm3
- vpmuludq 240(%rsp),%xmm3,%xmm10
- vpaddq %xmm10,%xmm11,%xmm11
- vpmuludq 224(%rsp),%xmm3,%xmm10
- vpaddq %xmm10,%xmm13,%xmm13
- vpmuludq 304(%rsp),%xmm3,%xmm10
- vpaddq %xmm10,%xmm0,%xmm0
- vpmuludq 288(%rsp),%xmm3,%xmm10
- vpaddq %xmm10,%xmm2,%xmm2
- vpmuludq 336(%rsp),%xmm3,%xmm10
- vpaddq %xmm10,%xmm1,%xmm1
- vpmuludq 384(%rsp),%xmm3,%xmm3
- vpaddq %xmm3,%xmm4,%xmm4
- vpmuludq 144(%rsp),%xmm9,%xmm3
- vpaddq %xmm3,%xmm5,%xmm5
- vpmuludq 160(%rsp),%xmm9,%xmm3
- vpaddq %xmm3,%xmm8,%xmm8
- vpmuludq 192(%rsp),%xmm9,%xmm3
- vpaddq %xmm3,%xmm7,%xmm7
- vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm9,%xmm9
- vpmuludq 208(%rsp),%xmm9,%xmm3
- vpaddq %xmm3,%xmm11,%xmm11
- vpmuludq 240(%rsp),%xmm9,%xmm3
- vpaddq %xmm3,%xmm13,%xmm13
- vpmuludq 256(%rsp),%xmm9,%xmm3
- vpaddq %xmm3,%xmm0,%xmm0
- vpmuludq 304(%rsp),%xmm9,%xmm3
- vpaddq %xmm3,%xmm2,%xmm2
- vpmuludq 352(%rsp),%xmm9,%xmm3
- vpaddq %xmm3,%xmm1,%xmm1
- vpmuludq 336(%rsp),%xmm9,%xmm3
- vpaddq %xmm3,%xmm4,%xmm4
- vpmuludq 416(%rsp),%xmm9,%xmm9
- vpaddq %xmm9,%xmm6,%xmm6
- vmovdqa 64(%rsp),%xmm3
- vmovdqa 112(%rsp),%xmm9
- vpaddq curve25519_sandy2x_subc2(%rip),%xmm3,%xmm10
- vpsubq %xmm9,%xmm10,%xmm10
- vpaddq %xmm9,%xmm3,%xmm3
- vpunpckhqdq %xmm10,%xmm3,%xmm9
- vpunpcklqdq %xmm10,%xmm3,%xmm3
- vpmuludq 144(%rsp),%xmm3,%xmm10
- vpaddq %xmm10,%xmm8,%xmm8
- vpmuludq 128(%rsp),%xmm3,%xmm10
- vpaddq %xmm10,%xmm7,%xmm7
- vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm3,%xmm3
- vpmuludq 192(%rsp),%xmm3,%xmm10
- vpaddq %xmm10,%xmm11,%xmm11
- vpmuludq 176(%rsp),%xmm3,%xmm10
- vpaddq %xmm10,%xmm13,%xmm13
- vpmuludq 240(%rsp),%xmm3,%xmm10
- vpaddq %xmm10,%xmm0,%xmm0
- vpmuludq 224(%rsp),%xmm3,%xmm10
- vpaddq %xmm10,%xmm2,%xmm2
- vpmuludq 304(%rsp),%xmm3,%xmm10
- vpaddq %xmm10,%xmm1,%xmm1
- vpmuludq 288(%rsp),%xmm3,%xmm10
- vpaddq %xmm10,%xmm4,%xmm4
- vpmuludq 336(%rsp),%xmm3,%xmm10
- vpaddq %xmm10,%xmm6,%xmm6
- vpmuludq 384(%rsp),%xmm3,%xmm3
- vpaddq %xmm3,%xmm5,%xmm5
- vpmuludq 144(%rsp),%xmm9,%xmm3
- vpaddq %xmm3,%xmm7,%xmm7
- vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm9,%xmm9
- vpmuludq 160(%rsp),%xmm9,%xmm3
- vpaddq %xmm3,%xmm11,%xmm11
- vpmuludq 192(%rsp),%xmm9,%xmm3
- vpaddq %xmm3,%xmm13,%xmm13
- vpmuludq 208(%rsp),%xmm9,%xmm3
- vpaddq %xmm3,%xmm0,%xmm0
- vpmuludq 240(%rsp),%xmm9,%xmm3
- vpaddq %xmm3,%xmm2,%xmm2
- vpmuludq 256(%rsp),%xmm9,%xmm3
- vpaddq %xmm3,%xmm1,%xmm1
- vpmuludq 304(%rsp),%xmm9,%xmm3
- vpaddq %xmm3,%xmm4,%xmm4
- vpmuludq 352(%rsp),%xmm9,%xmm3
- vpaddq %xmm3,%xmm6,%xmm6
- vpmuludq 336(%rsp),%xmm9,%xmm3
- vpaddq %xmm3,%xmm5,%xmm5
- vpmuludq 416(%rsp),%xmm9,%xmm9
- vpaddq %xmm9,%xmm8,%xmm8
- vpsrlq $25,%xmm4,%xmm3
- vpaddq %xmm3,%xmm6,%xmm6
- vpand curve25519_sandy2x_m25(%rip),%xmm4,%xmm4
- vpsrlq $26,%xmm11,%xmm3
- vpaddq %xmm3,%xmm13,%xmm13
- vpand curve25519_sandy2x_m26(%rip),%xmm11,%xmm11
- vpsrlq $26,%xmm6,%xmm3
- vpaddq %xmm3,%xmm5,%xmm5
- vpand curve25519_sandy2x_m26(%rip),%xmm6,%xmm6
- vpsrlq $25,%xmm13,%xmm3
- vpaddq %xmm3,%xmm0,%xmm0
- vpand curve25519_sandy2x_m25(%rip),%xmm13,%xmm13
- vpsrlq $25,%xmm5,%xmm3
- vpaddq %xmm3,%xmm8,%xmm8
- vpand curve25519_sandy2x_m25(%rip),%xmm5,%xmm5
- vpsrlq $26,%xmm0,%xmm3
- vpaddq %xmm3,%xmm2,%xmm2
- vpand curve25519_sandy2x_m26(%rip),%xmm0,%xmm0
- vpsrlq $26,%xmm8,%xmm3
- vpaddq %xmm3,%xmm7,%xmm7
- vpand curve25519_sandy2x_m26(%rip),%xmm8,%xmm8
- vpsrlq $25,%xmm2,%xmm3
- vpaddq %xmm3,%xmm1,%xmm1
- vpand curve25519_sandy2x_m25(%rip),%xmm2,%xmm2
- vpsrlq $25,%xmm7,%xmm3
- vpsllq $4,%xmm3,%xmm9
- vpaddq %xmm3,%xmm11,%xmm11
- vpsllq $1,%xmm3,%xmm3
- vpaddq %xmm3,%xmm9,%xmm9
- vpaddq %xmm9,%xmm11,%xmm11
- vpand curve25519_sandy2x_m25(%rip),%xmm7,%xmm7
- vpsrlq $26,%xmm1,%xmm3
- vpaddq %xmm3,%xmm4,%xmm4
- vpand curve25519_sandy2x_m26(%rip),%xmm1,%xmm1
- vpsrlq $26,%xmm11,%xmm3
- vpaddq %xmm3,%xmm13,%xmm13
- vpand curve25519_sandy2x_m26(%rip),%xmm11,%xmm11
- vpsrlq $25,%xmm4,%xmm3
- vpaddq %xmm3,%xmm6,%xmm6
- vpand curve25519_sandy2x_m25(%rip),%xmm4,%xmm4
- vpunpcklqdq %xmm13,%xmm11,%xmm3
- vpunpckhqdq %xmm13,%xmm11,%xmm9
- vpaddq curve25519_sandy2x_subc0(%rip),%xmm9,%xmm10
- vpsubq %xmm3,%xmm10,%xmm10
- vpaddq %xmm9,%xmm3,%xmm3
- vpunpckhqdq %xmm3,%xmm10,%xmm9
- vpunpcklqdq %xmm3,%xmm10,%xmm10
- vpmuludq %xmm10,%xmm10,%xmm3
- vpaddq %xmm10,%xmm10,%xmm10
- vpmuludq %xmm9,%xmm10,%xmm11
- vpunpcklqdq %xmm2,%xmm0,%xmm12
- vpunpckhqdq %xmm2,%xmm0,%xmm0
- vpaddq curve25519_sandy2x_subc2(%rip),%xmm0,%xmm2
- vpsubq %xmm12,%xmm2,%xmm2
- vpaddq %xmm0,%xmm12,%xmm12
- vpunpckhqdq %xmm12,%xmm2,%xmm0
- vpunpcklqdq %xmm12,%xmm2,%xmm2
- vpmuludq %xmm2,%xmm10,%xmm12
- vpaddq %xmm9,%xmm9,%xmm13
- vpmuludq %xmm13,%xmm9,%xmm9
- vpaddq %xmm9,%xmm12,%xmm12
- vpmuludq %xmm0,%xmm10,%xmm9
- vpmuludq %xmm2,%xmm13,%xmm14
- vpaddq %xmm14,%xmm9,%xmm9
- vpunpcklqdq %xmm4,%xmm1,%xmm14
- vpunpckhqdq %xmm4,%xmm1,%xmm1
- vpaddq curve25519_sandy2x_subc2(%rip),%xmm1,%xmm4
- vpsubq %xmm14,%xmm4,%xmm4
- vpaddq %xmm1,%xmm14,%xmm14
- vpunpckhqdq %xmm14,%xmm4,%xmm1
- vpunpcklqdq %xmm14,%xmm4,%xmm4
- vmovdqa %xmm1,0(%rsp)
- vpaddq %xmm1,%xmm1,%xmm1
- vmovdqa %xmm1,16(%rsp)
- vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm1,%xmm1
- vmovdqa %xmm1,32(%rsp)
- vpmuludq %xmm4,%xmm10,%xmm1
- vpmuludq %xmm2,%xmm2,%xmm14
- vpaddq %xmm14,%xmm1,%xmm1
- vpmuludq 0(%rsp),%xmm10,%xmm14
- vpmuludq %xmm4,%xmm13,%xmm15
- vpaddq %xmm15,%xmm14,%xmm14
- vpunpcklqdq %xmm5,%xmm6,%xmm15
- vpunpckhqdq %xmm5,%xmm6,%xmm5
- vpaddq curve25519_sandy2x_subc2(%rip),%xmm5,%xmm6
- vpsubq %xmm15,%xmm6,%xmm6
- vpaddq %xmm5,%xmm15,%xmm15
- vpunpckhqdq %xmm15,%xmm6,%xmm5
- vpunpcklqdq %xmm15,%xmm6,%xmm6
- vmovdqa %xmm6,48(%rsp)
- vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm6,%xmm6
- vmovdqa %xmm6,64(%rsp)
- vmovdqa %xmm5,80(%rsp)
- vpmuludq curve25519_sandy2x_v38_38(%rip),%xmm5,%xmm5
- vmovdqa %xmm5,96(%rsp)
- vpmuludq 48(%rsp),%xmm10,%xmm5
- vpaddq %xmm0,%xmm0,%xmm6
- vpmuludq %xmm6,%xmm0,%xmm0
- vpaddq %xmm0,%xmm5,%xmm5
- vpmuludq 80(%rsp),%xmm10,%xmm0
- vpmuludq %xmm4,%xmm6,%xmm15
- vpaddq %xmm15,%xmm0,%xmm0
- vpmuludq %xmm6,%xmm13,%xmm15
- vpaddq %xmm15,%xmm1,%xmm1
- vpmuludq %xmm6,%xmm2,%xmm15
- vpaddq %xmm15,%xmm14,%xmm14
- vpunpcklqdq %xmm7,%xmm8,%xmm15
- vpunpckhqdq %xmm7,%xmm8,%xmm7
- vpaddq curve25519_sandy2x_subc2(%rip),%xmm7,%xmm8
- vpsubq %xmm15,%xmm8,%xmm8
- vpaddq %xmm7,%xmm15,%xmm15
- vpunpckhqdq %xmm15,%xmm8,%xmm7
- vpunpcklqdq %xmm15,%xmm8,%xmm8
- vmovdqa %xmm8,112(%rsp)
- vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm8,%xmm8
- vmovdqa %xmm8,160(%rsp)
- vpmuludq 112(%rsp),%xmm10,%xmm8
- vpmuludq %xmm7,%xmm10,%xmm10
- vpmuludq curve25519_sandy2x_v38_38(%rip),%xmm7,%xmm15
- vpmuludq %xmm15,%xmm7,%xmm7
- vpaddq %xmm7,%xmm8,%xmm8
- vpmuludq %xmm15,%xmm13,%xmm7
- vpaddq %xmm7,%xmm3,%xmm3
- vpmuludq %xmm15,%xmm2,%xmm7
- vpaddq %xmm7,%xmm11,%xmm11
- vpmuludq 80(%rsp),%xmm13,%xmm7
- vpaddq %xmm7,%xmm7,%xmm7
- vpaddq %xmm7,%xmm8,%xmm8
- vpmuludq 16(%rsp),%xmm13,%xmm7
- vpaddq %xmm7,%xmm5,%xmm5
- vpmuludq 48(%rsp),%xmm13,%xmm7
- vpaddq %xmm7,%xmm0,%xmm0
- vpmuludq 112(%rsp),%xmm13,%xmm7
- vpaddq %xmm7,%xmm10,%xmm10
- vpmuludq %xmm15,%xmm6,%xmm7
- vpaddq %xmm7,%xmm12,%xmm12
- vpmuludq %xmm15,%xmm4,%xmm7
- vpaddq %xmm7,%xmm9,%xmm9
- vpaddq %xmm2,%xmm2,%xmm2
- vpmuludq %xmm4,%xmm2,%xmm7
- vpaddq %xmm7,%xmm5,%xmm5
- vpmuludq 160(%rsp),%xmm2,%xmm7
- vpaddq %xmm7,%xmm3,%xmm3
- vpmuludq 160(%rsp),%xmm6,%xmm7
- vpaddq %xmm7,%xmm11,%xmm11
- vpmuludq 0(%rsp),%xmm2,%xmm7
- vpaddq %xmm7,%xmm0,%xmm0
- vpmuludq 48(%rsp),%xmm2,%xmm7
- vpaddq %xmm7,%xmm8,%xmm8
- vpmuludq 80(%rsp),%xmm2,%xmm2
- vpaddq %xmm2,%xmm10,%xmm10
- vpmuludq 96(%rsp),%xmm4,%xmm2
- vpaddq %xmm2,%xmm11,%xmm11
- vpmuludq %xmm4,%xmm4,%xmm2
- vpaddq %xmm2,%xmm8,%xmm8
- vpaddq %xmm4,%xmm4,%xmm2
- vpmuludq 160(%rsp),%xmm2,%xmm4
- vpaddq %xmm4,%xmm12,%xmm12
- vpmuludq 16(%rsp),%xmm15,%xmm4
- vpaddq %xmm4,%xmm1,%xmm1
- vpmuludq 48(%rsp),%xmm15,%xmm4
- vpaddq %xmm4,%xmm14,%xmm14
- vpmuludq 96(%rsp),%xmm6,%xmm4
- vpaddq %xmm4,%xmm3,%xmm3
- vmovdqa 16(%rsp),%xmm4
- vpmuludq 160(%rsp),%xmm4,%xmm4
- vpaddq %xmm4,%xmm9,%xmm9
- vpmuludq 16(%rsp),%xmm6,%xmm4
- vpaddq %xmm4,%xmm8,%xmm8
- vpmuludq 48(%rsp),%xmm6,%xmm4
- vpaddq %xmm4,%xmm10,%xmm10
- vpmuludq 80(%rsp),%xmm15,%xmm4
- vpaddq %xmm4,%xmm4,%xmm4
- vpaddq %xmm4,%xmm5,%xmm5
- vpmuludq 112(%rsp),%xmm15,%xmm4
- vpaddq %xmm4,%xmm0,%xmm0
- vmovdqa 48(%rsp),%xmm4
- vpaddq %xmm4,%xmm4,%xmm4
- vpmuludq 160(%rsp),%xmm4,%xmm4
- vpaddq %xmm4,%xmm1,%xmm1
- vmovdqa 80(%rsp),%xmm4
- vpaddq %xmm4,%xmm4,%xmm4
- vpmuludq 160(%rsp),%xmm4,%xmm4
- vpaddq %xmm4,%xmm14,%xmm14
- vpmuludq 64(%rsp),%xmm2,%xmm4
- vpaddq %xmm4,%xmm3,%xmm3
- vmovdqa 16(%rsp),%xmm4
- vpmuludq 64(%rsp),%xmm4,%xmm4
- vpaddq %xmm4,%xmm11,%xmm11
- vmovdqa 16(%rsp),%xmm4
- vpmuludq 96(%rsp),%xmm4,%xmm4
- vpaddq %xmm4,%xmm12,%xmm12
- vmovdqa 48(%rsp),%xmm4
- vpmuludq 96(%rsp),%xmm4,%xmm4
- vpaddq %xmm4,%xmm9,%xmm9
- vpmuludq 0(%rsp),%xmm2,%xmm2
- vpaddq %xmm2,%xmm10,%xmm10
- vmovdqa 32(%rsp),%xmm2
- vpmuludq 0(%rsp),%xmm2,%xmm2
- vpaddq %xmm2,%xmm3,%xmm3
- vmovdqa 64(%rsp),%xmm2
- vpmuludq 48(%rsp),%xmm2,%xmm2
- vpaddq %xmm2,%xmm12,%xmm12
- vmovdqa 96(%rsp),%xmm2
- vpmuludq 80(%rsp),%xmm2,%xmm2
- vpaddq %xmm2,%xmm1,%xmm1
- vmovdqa 160(%rsp),%xmm2
- vpmuludq 112(%rsp),%xmm2,%xmm2
- vpaddq %xmm2,%xmm5,%xmm5
- vpsrlq $26,%xmm3,%xmm2
- vpaddq %xmm2,%xmm11,%xmm11
- vpand curve25519_sandy2x_m26(%rip),%xmm3,%xmm3
- vpsrlq $25,%xmm14,%xmm2
- vpaddq %xmm2,%xmm5,%xmm5
- vpand curve25519_sandy2x_m25(%rip),%xmm14,%xmm14
- vpsrlq $25,%xmm11,%xmm2
- vpaddq %xmm2,%xmm12,%xmm12
- vpand curve25519_sandy2x_m25(%rip),%xmm11,%xmm11
- vpsrlq $26,%xmm5,%xmm2
- vpaddq %xmm2,%xmm0,%xmm0
- vpand curve25519_sandy2x_m26(%rip),%xmm5,%xmm5
- vpsrlq $26,%xmm12,%xmm2
- vpaddq %xmm2,%xmm9,%xmm9
- vpand curve25519_sandy2x_m26(%rip),%xmm12,%xmm12
- vpsrlq $25,%xmm0,%xmm2
- vpaddq %xmm2,%xmm8,%xmm8
- vpand curve25519_sandy2x_m25(%rip),%xmm0,%xmm0
- vpsrlq $25,%xmm9,%xmm2
- vpaddq %xmm2,%xmm1,%xmm1
- vpand curve25519_sandy2x_m25(%rip),%xmm9,%xmm9
- vpsrlq $26,%xmm8,%xmm2
- vpaddq %xmm2,%xmm10,%xmm10
- vpand curve25519_sandy2x_m26(%rip),%xmm8,%xmm8
- vpsrlq $26,%xmm1,%xmm2
- vpaddq %xmm2,%xmm14,%xmm14
- vpand curve25519_sandy2x_m26(%rip),%xmm1,%xmm1
- vpsrlq $25,%xmm10,%xmm2
- vpsllq $4,%xmm2,%xmm4
- vpaddq %xmm2,%xmm3,%xmm3
- vpsllq $1,%xmm2,%xmm2
- vpaddq %xmm2,%xmm4,%xmm4
- vpaddq %xmm4,%xmm3,%xmm3
- vpand curve25519_sandy2x_m25(%rip),%xmm10,%xmm10
- vpsrlq $25,%xmm14,%xmm2
- vpaddq %xmm2,%xmm5,%xmm5
- vpand curve25519_sandy2x_m25(%rip),%xmm14,%xmm14
- vpsrlq $26,%xmm3,%xmm2
- vpaddq %xmm2,%xmm11,%xmm11
- vpand curve25519_sandy2x_m26(%rip),%xmm3,%xmm3
- vpunpckhqdq %xmm11,%xmm3,%xmm2
- vmovdqa %xmm2,0(%rsp)
- vpunpcklqdq %xmm11,%xmm3,%xmm2
- vpmuludq curve25519_sandy2x_v9_9(%rip),%xmm2,%xmm2
- vmovdqa %xmm2,80(%rsp)
- vpunpckhqdq %xmm9,%xmm12,%xmm2
- vmovdqa %xmm2,16(%rsp)
- vpunpcklqdq %xmm9,%xmm12,%xmm2
- vpmuludq curve25519_sandy2x_v9_9(%rip),%xmm2,%xmm2
- vmovdqa %xmm2,96(%rsp)
- vpunpckhqdq %xmm14,%xmm1,%xmm2
- vmovdqa %xmm2,32(%rsp)
- vpunpcklqdq %xmm14,%xmm1,%xmm1
- vpmuludq curve25519_sandy2x_v9_9(%rip),%xmm1,%xmm1
- vmovdqa %xmm1,112(%rsp)
- vpunpckhqdq %xmm0,%xmm5,%xmm1
- vmovdqa %xmm1,48(%rsp)
- vpunpcklqdq %xmm0,%xmm5,%xmm0
- vpmuludq curve25519_sandy2x_v9_9(%rip),%xmm0,%xmm0
- vmovdqa %xmm0,160(%rsp)
- vpunpckhqdq %xmm10,%xmm8,%xmm0
- vmovdqa %xmm0,64(%rsp)
- vpunpcklqdq %xmm10,%xmm8,%xmm0
- vpmuludq curve25519_sandy2x_v9_9(%rip),%xmm0,%xmm0
- vmovdqa %xmm0,208(%rsp)
- vmovdqa 144(%rsp),%xmm0
- vpmuludq %xmm0,%xmm0,%xmm1
- vpaddq %xmm0,%xmm0,%xmm0
- vmovdqa 128(%rsp),%xmm2
- vpmuludq %xmm2,%xmm0,%xmm3
- vmovdqa 192(%rsp),%xmm4
- vpmuludq %xmm4,%xmm0,%xmm5
- vmovdqa 176(%rsp),%xmm6
- vpmuludq %xmm6,%xmm0,%xmm7
- vmovdqa 240(%rsp),%xmm8
- vpmuludq %xmm8,%xmm0,%xmm9
- vpmuludq 224(%rsp),%xmm0,%xmm10
- vpmuludq 304(%rsp),%xmm0,%xmm11
- vpmuludq 288(%rsp),%xmm0,%xmm12
- vpmuludq 336(%rsp),%xmm0,%xmm13
- vmovdqa 384(%rsp),%xmm14
- vpmuludq %xmm14,%xmm0,%xmm0
- vpmuludq curve25519_sandy2x_v38_38(%rip),%xmm14,%xmm15
- vpmuludq %xmm15,%xmm14,%xmm14
- vpaddq %xmm14,%xmm13,%xmm13
- vpaddq %xmm6,%xmm6,%xmm14
- vpmuludq %xmm14,%xmm6,%xmm6
- vpaddq %xmm6,%xmm11,%xmm11
- vpaddq %xmm2,%xmm2,%xmm6
- vpmuludq %xmm6,%xmm2,%xmm2
- vpaddq %xmm2,%xmm5,%xmm5
- vpmuludq %xmm15,%xmm6,%xmm2
- vpaddq %xmm2,%xmm1,%xmm1
- vpmuludq %xmm15,%xmm4,%xmm2
- vpaddq %xmm2,%xmm3,%xmm3
- vpmuludq 256(%rsp),%xmm6,%xmm2
- vpaddq %xmm2,%xmm11,%xmm11
- vpmuludq 304(%rsp),%xmm6,%xmm2
- vpaddq %xmm2,%xmm12,%xmm12
- vpmuludq 352(%rsp),%xmm6,%xmm2
- vpaddq %xmm2,%xmm13,%xmm13
- vpmuludq 336(%rsp),%xmm6,%xmm2
- vpaddq %xmm2,%xmm0,%xmm0
- vpmuludq %xmm4,%xmm6,%xmm2
- vpaddq %xmm2,%xmm7,%xmm7
- vpmuludq %xmm14,%xmm6,%xmm2
- vpaddq %xmm2,%xmm9,%xmm9
- vpmuludq %xmm8,%xmm6,%xmm2
- vpaddq %xmm2,%xmm10,%xmm10
- vpmuludq %xmm15,%xmm14,%xmm2
- vpaddq %xmm2,%xmm5,%xmm5
- vpmuludq %xmm15,%xmm8,%xmm2
- vpaddq %xmm2,%xmm7,%xmm7
- vpmuludq %xmm4,%xmm4,%xmm2
- vpaddq %xmm2,%xmm9,%xmm9
- vpmuludq %xmm14,%xmm4,%xmm2
- vpaddq %xmm2,%xmm10,%xmm10
- vpaddq %xmm4,%xmm4,%xmm2
- vpmuludq %xmm8,%xmm2,%xmm4
- vpaddq %xmm4,%xmm11,%xmm11
- vpmuludq 400(%rsp),%xmm2,%xmm4
- vpaddq %xmm4,%xmm1,%xmm1
- vpmuludq 400(%rsp),%xmm14,%xmm4
- vpaddq %xmm4,%xmm3,%xmm3
- vpmuludq 224(%rsp),%xmm2,%xmm4
- vpaddq %xmm4,%xmm12,%xmm12
- vpmuludq 304(%rsp),%xmm2,%xmm4
- vpaddq %xmm4,%xmm13,%xmm13
- vpmuludq 288(%rsp),%xmm2,%xmm2
- vpaddq %xmm2,%xmm0,%xmm0
- vpmuludq 368(%rsp),%xmm8,%xmm2
- vpaddq %xmm2,%xmm3,%xmm3
- vpmuludq %xmm8,%xmm14,%xmm2
- vpaddq %xmm2,%xmm12,%xmm12
- vpmuludq %xmm8,%xmm8,%xmm2
- vpaddq %xmm2,%xmm13,%xmm13
- vpaddq %xmm8,%xmm8,%xmm2
- vpmuludq 400(%rsp),%xmm2,%xmm4
- vpaddq %xmm4,%xmm5,%xmm5
- vpmuludq 256(%rsp),%xmm15,%xmm4
- vpaddq %xmm4,%xmm9,%xmm9
- vpmuludq 304(%rsp),%xmm15,%xmm4
- vpaddq %xmm4,%xmm10,%xmm10
- vpmuludq 368(%rsp),%xmm14,%xmm4
- vpaddq %xmm4,%xmm1,%xmm1
- vmovdqa 256(%rsp),%xmm4
- vpmuludq 400(%rsp),%xmm4,%xmm4
- vpaddq %xmm4,%xmm7,%xmm7
- vpmuludq 256(%rsp),%xmm14,%xmm4
- vpaddq %xmm4,%xmm13,%xmm13
- vpmuludq 304(%rsp),%xmm14,%xmm4
- vpaddq %xmm4,%xmm0,%xmm0
- vpmuludq 352(%rsp),%xmm15,%xmm4
- vpaddq %xmm4,%xmm11,%xmm11
- vpmuludq 336(%rsp),%xmm15,%xmm4
- vpaddq %xmm4,%xmm12,%xmm12
- vmovdqa 304(%rsp),%xmm4
- vpaddq %xmm4,%xmm4,%xmm4
- vpmuludq 400(%rsp),%xmm4,%xmm4
- vpaddq %xmm4,%xmm9,%xmm9
- vpmuludq 320(%rsp),%xmm2,%xmm4
- vpaddq %xmm4,%xmm1,%xmm1
- vmovdqa 256(%rsp),%xmm4
- vpmuludq 320(%rsp),%xmm4,%xmm4
- vpaddq %xmm4,%xmm3,%xmm3
- vmovdqa 256(%rsp),%xmm4
- vpmuludq 368(%rsp),%xmm4,%xmm4
- vpaddq %xmm4,%xmm5,%xmm5
- vmovdqa 304(%rsp),%xmm4
- vpmuludq 368(%rsp),%xmm4,%xmm4
- vpaddq %xmm4,%xmm7,%xmm7
- vmovdqa 352(%rsp),%xmm4
- vpmuludq 400(%rsp),%xmm4,%xmm4
- vpaddq %xmm4,%xmm10,%xmm10
- vpmuludq 224(%rsp),%xmm2,%xmm2
- vpaddq %xmm2,%xmm0,%xmm0
- vmovdqa 272(%rsp),%xmm2
- vpmuludq 224(%rsp),%xmm2,%xmm2
- vpaddq %xmm2,%xmm1,%xmm1
- vmovdqa 320(%rsp),%xmm2
- vpmuludq 304(%rsp),%xmm2,%xmm2
- vpaddq %xmm2,%xmm5,%xmm5
- vmovdqa 368(%rsp),%xmm2
- vpmuludq 288(%rsp),%xmm2,%xmm2
- vpaddq %xmm2,%xmm9,%xmm9
- vmovdqa 400(%rsp),%xmm2
- vpmuludq 336(%rsp),%xmm2,%xmm2
- vpaddq %xmm2,%xmm11,%xmm11
- vpsrlq $26,%xmm1,%xmm2
- vpaddq %xmm2,%xmm3,%xmm3
- vpand curve25519_sandy2x_m26(%rip),%xmm1,%xmm1
- vpsrlq $25,%xmm10,%xmm2
- vpaddq %xmm2,%xmm11,%xmm11
- vpand curve25519_sandy2x_m25(%rip),%xmm10,%xmm10
- vpsrlq $25,%xmm3,%xmm2
- vpaddq %xmm2,%xmm5,%xmm5
- vpand curve25519_sandy2x_m25(%rip),%xmm3,%xmm3
- vpsrlq $26,%xmm11,%xmm2
- vpaddq %xmm2,%xmm12,%xmm12
- vpand curve25519_sandy2x_m26(%rip),%xmm11,%xmm11
- vpsrlq $26,%xmm5,%xmm2
- vpaddq %xmm2,%xmm7,%xmm7
- vpand curve25519_sandy2x_m26(%rip),%xmm5,%xmm5
- vpsrlq $25,%xmm12,%xmm2
- vpaddq %xmm2,%xmm13,%xmm13
- vpand curve25519_sandy2x_m25(%rip),%xmm12,%xmm12
- vpsrlq $25,%xmm7,%xmm2
- vpaddq %xmm2,%xmm9,%xmm9
- vpand curve25519_sandy2x_m25(%rip),%xmm7,%xmm7
- vpsrlq $26,%xmm13,%xmm2
- vpaddq %xmm2,%xmm0,%xmm0
- vpand curve25519_sandy2x_m26(%rip),%xmm13,%xmm13
- vpsrlq $26,%xmm9,%xmm2
- vpaddq %xmm2,%xmm10,%xmm10
- vpand curve25519_sandy2x_m26(%rip),%xmm9,%xmm9
- vpsrlq $25,%xmm0,%xmm2
- vpsllq $4,%xmm2,%xmm4
- vpaddq %xmm2,%xmm1,%xmm1
- vpsllq $1,%xmm2,%xmm2
- vpaddq %xmm2,%xmm4,%xmm4
- vpaddq %xmm4,%xmm1,%xmm1
- vpand curve25519_sandy2x_m25(%rip),%xmm0,%xmm0
- vpsrlq $25,%xmm10,%xmm2
- vpaddq %xmm2,%xmm11,%xmm11
- vpand curve25519_sandy2x_m25(%rip),%xmm10,%xmm10
- vpsrlq $26,%xmm1,%xmm2
- vpaddq %xmm2,%xmm3,%xmm3
- vpand curve25519_sandy2x_m26(%rip),%xmm1,%xmm1
- vpunpckhqdq %xmm3,%xmm1,%xmm2
- vpunpcklqdq %xmm3,%xmm1,%xmm1
- vmovdqa %xmm1,176(%rsp)
- vpaddq curve25519_sandy2x_subc0(%rip),%xmm2,%xmm3
- vpsubq %xmm1,%xmm3,%xmm3
- vpunpckhqdq %xmm3,%xmm2,%xmm1
- vpunpcklqdq %xmm3,%xmm2,%xmm2
- vmovdqa %xmm2,192(%rsp)
- vmovdqa %xmm1,224(%rsp)
- vpsllq $1,%xmm1,%xmm1
- vmovdqa %xmm1,240(%rsp)
- vpmuludq curve25519_sandy2x_v121666_121666(%rip),%xmm3,%xmm3
- vmovdqa 80(%rsp),%xmm1
- vpunpcklqdq %xmm1,%xmm3,%xmm2
- vpunpckhqdq %xmm1,%xmm3,%xmm1
- vpunpckhqdq %xmm7,%xmm5,%xmm3
- vpunpcklqdq %xmm7,%xmm5,%xmm4
- vmovdqa %xmm4,256(%rsp)
- vpaddq curve25519_sandy2x_subc2(%rip),%xmm3,%xmm5
- vpsubq %xmm4,%xmm5,%xmm5
- vpunpckhqdq %xmm5,%xmm3,%xmm4
- vpunpcklqdq %xmm5,%xmm3,%xmm3
- vmovdqa %xmm3,272(%rsp)
- vmovdqa %xmm4,288(%rsp)
- vpsllq $1,%xmm4,%xmm4
- vmovdqa %xmm4,304(%rsp)
- vpmuludq curve25519_sandy2x_v121666_121666(%rip),%xmm5,%xmm5
- vmovdqa 96(%rsp),%xmm3
- vpunpcklqdq %xmm3,%xmm5,%xmm4
- vpunpckhqdq %xmm3,%xmm5,%xmm3
- vpunpckhqdq %xmm10,%xmm9,%xmm5
- vpunpcklqdq %xmm10,%xmm9,%xmm6
- vmovdqa %xmm6,320(%rsp)
- vpaddq curve25519_sandy2x_subc2(%rip),%xmm5,%xmm7
- vpsubq %xmm6,%xmm7,%xmm7
- vpunpckhqdq %xmm7,%xmm5,%xmm6
- vpunpcklqdq %xmm7,%xmm5,%xmm5
- vmovdqa %xmm5,336(%rsp)
- vmovdqa %xmm6,352(%rsp)
- vpsllq $1,%xmm6,%xmm6
- vmovdqa %xmm6,368(%rsp)
- vpmuludq curve25519_sandy2x_v121666_121666(%rip),%xmm7,%xmm7
- vmovdqa 112(%rsp),%xmm5
- vpunpcklqdq %xmm5,%xmm7,%xmm6
- vpunpckhqdq %xmm5,%xmm7,%xmm5
- vpunpckhqdq %xmm12,%xmm11,%xmm7
- vpunpcklqdq %xmm12,%xmm11,%xmm8
- vmovdqa %xmm8,384(%rsp)
- vpaddq curve25519_sandy2x_subc2(%rip),%xmm7,%xmm9
- vpsubq %xmm8,%xmm9,%xmm9
- vpunpckhqdq %xmm9,%xmm7,%xmm8
- vpunpcklqdq %xmm9,%xmm7,%xmm7
- vmovdqa %xmm7,400(%rsp)
- vmovdqa %xmm8,416(%rsp)
- vpsllq $1,%xmm8,%xmm8
- vmovdqa %xmm8,432(%rsp)
- vpmuludq curve25519_sandy2x_v121666_121666(%rip),%xmm9,%xmm9
- vmovdqa 160(%rsp),%xmm7
- vpunpcklqdq %xmm7,%xmm9,%xmm8
- vpunpckhqdq %xmm7,%xmm9,%xmm7
- vpunpckhqdq %xmm0,%xmm13,%xmm9
- vpunpcklqdq %xmm0,%xmm13,%xmm0
- vmovdqa %xmm0,160(%rsp)
- vpaddq curve25519_sandy2x_subc2(%rip),%xmm9,%xmm10
- vpsubq %xmm0,%xmm10,%xmm10
- vpunpckhqdq %xmm10,%xmm9,%xmm0
- vpunpcklqdq %xmm10,%xmm9,%xmm9
- vmovdqa %xmm9,448(%rsp)
- vmovdqa %xmm0,464(%rsp)
- vpsllq $1,%xmm0,%xmm0
- vmovdqa %xmm0,480(%rsp)
- vpmuludq curve25519_sandy2x_v121666_121666(%rip),%xmm10,%xmm10
- vmovdqa 208(%rsp),%xmm0
- vpunpcklqdq %xmm0,%xmm10,%xmm9
- vpunpckhqdq %xmm0,%xmm10,%xmm0
- vpsrlq $26,%xmm2,%xmm10
- vpaddq %xmm10,%xmm1,%xmm1
- vpand curve25519_sandy2x_m26(%rip),%xmm2,%xmm2
- vpsrlq $25,%xmm5,%xmm10
- vpaddq %xmm10,%xmm8,%xmm8
- vpand curve25519_sandy2x_m25(%rip),%xmm5,%xmm5
- vpsrlq $25,%xmm1,%xmm10
- vpaddq %xmm10,%xmm4,%xmm4
- vpand curve25519_sandy2x_m25(%rip),%xmm1,%xmm1
- vpsrlq $26,%xmm8,%xmm10
- vpaddq %xmm10,%xmm7,%xmm7
- vpand curve25519_sandy2x_m26(%rip),%xmm8,%xmm8
- vpsrlq $26,%xmm4,%xmm10
- vpaddq %xmm10,%xmm3,%xmm3
- vpand curve25519_sandy2x_m26(%rip),%xmm4,%xmm4
- vpsrlq $25,%xmm7,%xmm10
- vpaddq %xmm10,%xmm9,%xmm9
- vpand curve25519_sandy2x_m25(%rip),%xmm7,%xmm7
- vpsrlq $25,%xmm3,%xmm10
- vpaddq %xmm10,%xmm6,%xmm6
- vpand curve25519_sandy2x_m25(%rip),%xmm3,%xmm3
- vpsrlq $26,%xmm9,%xmm10
- vpaddq %xmm10,%xmm0,%xmm0
- vpand curve25519_sandy2x_m26(%rip),%xmm9,%xmm9
- vpsrlq $26,%xmm6,%xmm10
- vpaddq %xmm10,%xmm5,%xmm5
- vpand curve25519_sandy2x_m26(%rip),%xmm6,%xmm6
- vpsrlq $25,%xmm0,%xmm10
- vpsllq $4,%xmm10,%xmm11
- vpaddq %xmm10,%xmm2,%xmm2
- vpsllq $1,%xmm10,%xmm10
- vpaddq %xmm10,%xmm11,%xmm11
- vpaddq %xmm11,%xmm2,%xmm2
- vpand curve25519_sandy2x_m25(%rip),%xmm0,%xmm0
- vpsrlq $25,%xmm5,%xmm10
- vpaddq %xmm10,%xmm8,%xmm8
- vpand curve25519_sandy2x_m25(%rip),%xmm5,%xmm5
- vpsrlq $26,%xmm2,%xmm10
- vpaddq %xmm10,%xmm1,%xmm1
- vpand curve25519_sandy2x_m26(%rip),%xmm2,%xmm2
- vpunpckhqdq %xmm1,%xmm2,%xmm10
- vmovdqa %xmm10,80(%rsp)
- vpunpcklqdq %xmm1,%xmm2,%xmm1
- vpunpckhqdq %xmm3,%xmm4,%xmm2
- vmovdqa %xmm2,96(%rsp)
- vpunpcklqdq %xmm3,%xmm4,%xmm2
- vpunpckhqdq %xmm5,%xmm6,%xmm3
- vmovdqa %xmm3,112(%rsp)
- vpunpcklqdq %xmm5,%xmm6,%xmm3
- vpunpckhqdq %xmm7,%xmm8,%xmm4
- vmovdqa %xmm4,128(%rsp)
- vpunpcklqdq %xmm7,%xmm8,%xmm4
- vpunpckhqdq %xmm0,%xmm9,%xmm5
- vmovdqa %xmm5,144(%rsp)
- vpunpcklqdq %xmm0,%xmm9,%xmm0
- vmovdqa 176(%rsp),%xmm5
- vpaddq %xmm5,%xmm1,%xmm1
- vpunpcklqdq %xmm1,%xmm5,%xmm6
- vpunpckhqdq %xmm1,%xmm5,%xmm1
- vpmuludq 224(%rsp),%xmm6,%xmm5
- vpmuludq 192(%rsp),%xmm1,%xmm7
- vpaddq %xmm7,%xmm5,%xmm5
- vpmuludq 272(%rsp),%xmm6,%xmm7
- vpmuludq 240(%rsp),%xmm1,%xmm8
- vpaddq %xmm8,%xmm7,%xmm7
- vpmuludq 288(%rsp),%xmm6,%xmm8
- vpmuludq 272(%rsp),%xmm1,%xmm9
- vpaddq %xmm9,%xmm8,%xmm8
- vpmuludq 336(%rsp),%xmm6,%xmm9
- vpmuludq 304(%rsp),%xmm1,%xmm10
- vpaddq %xmm10,%xmm9,%xmm9
- vpmuludq 352(%rsp),%xmm6,%xmm10
- vpmuludq 336(%rsp),%xmm1,%xmm11
- vpaddq %xmm11,%xmm10,%xmm10
- vpmuludq 400(%rsp),%xmm6,%xmm11
- vpmuludq 368(%rsp),%xmm1,%xmm12
- vpaddq %xmm12,%xmm11,%xmm11
- vpmuludq 416(%rsp),%xmm6,%xmm12
- vpmuludq 400(%rsp),%xmm1,%xmm13
- vpaddq %xmm13,%xmm12,%xmm12
- vpmuludq 448(%rsp),%xmm6,%xmm13
- vpmuludq 432(%rsp),%xmm1,%xmm14
- vpaddq %xmm14,%xmm13,%xmm13
- vpmuludq 464(%rsp),%xmm6,%xmm14
- vpmuludq 448(%rsp),%xmm1,%xmm15
- vpaddq %xmm15,%xmm14,%xmm14
- vpmuludq 192(%rsp),%xmm6,%xmm6
- vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm1,%xmm1
- vpmuludq 480(%rsp),%xmm1,%xmm1
- vpaddq %xmm1,%xmm6,%xmm6
- vmovdqa 256(%rsp),%xmm1
- vpaddq %xmm1,%xmm2,%xmm2
- vpunpcklqdq %xmm2,%xmm1,%xmm15
- vpunpckhqdq %xmm2,%xmm1,%xmm1
- vpmuludq 192(%rsp),%xmm15,%xmm2
- vpaddq %xmm2,%xmm7,%xmm7
- vpmuludq 224(%rsp),%xmm15,%xmm2
- vpaddq %xmm2,%xmm8,%xmm8
- vpmuludq 272(%rsp),%xmm15,%xmm2
- vpaddq %xmm2,%xmm9,%xmm9
- vpmuludq 288(%rsp),%xmm15,%xmm2
- vpaddq %xmm2,%xmm10,%xmm10
- vpmuludq 336(%rsp),%xmm15,%xmm2
- vpaddq %xmm2,%xmm11,%xmm11
- vpmuludq 352(%rsp),%xmm15,%xmm2
- vpaddq %xmm2,%xmm12,%xmm12
- vpmuludq 400(%rsp),%xmm15,%xmm2
- vpaddq %xmm2,%xmm13,%xmm13
- vpmuludq 416(%rsp),%xmm15,%xmm2
- vpaddq %xmm2,%xmm14,%xmm14
- vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm15,%xmm15
- vpmuludq 448(%rsp),%xmm15,%xmm2
- vpaddq %xmm2,%xmm6,%xmm6
- vpmuludq 464(%rsp),%xmm15,%xmm15
- vpaddq %xmm15,%xmm5,%xmm5
- vpmuludq 192(%rsp),%xmm1,%xmm2
- vpaddq %xmm2,%xmm8,%xmm8
- vpmuludq 240(%rsp),%xmm1,%xmm2
- vpaddq %xmm2,%xmm9,%xmm9
- vpmuludq 272(%rsp),%xmm1,%xmm2
- vpaddq %xmm2,%xmm10,%xmm10
- vpmuludq 304(%rsp),%xmm1,%xmm2
- vpaddq %xmm2,%xmm11,%xmm11
- vpmuludq 336(%rsp),%xmm1,%xmm2
- vpaddq %xmm2,%xmm12,%xmm12
- vpmuludq 368(%rsp),%xmm1,%xmm2
- vpaddq %xmm2,%xmm13,%xmm13
- vpmuludq 400(%rsp),%xmm1,%xmm2
- vpaddq %xmm2,%xmm14,%xmm14
- vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm1,%xmm1
- vpmuludq 432(%rsp),%xmm1,%xmm2
- vpaddq %xmm2,%xmm6,%xmm6
- vpmuludq 448(%rsp),%xmm1,%xmm2
- vpaddq %xmm2,%xmm5,%xmm5
- vpmuludq 480(%rsp),%xmm1,%xmm1
- vpaddq %xmm1,%xmm7,%xmm7
- vmovdqa 320(%rsp),%xmm1
- vpaddq %xmm1,%xmm3,%xmm3
- vpunpcklqdq %xmm3,%xmm1,%xmm2
- vpunpckhqdq %xmm3,%xmm1,%xmm1
- vpmuludq 192(%rsp),%xmm2,%xmm3
- vpaddq %xmm3,%xmm9,%xmm9
- vpmuludq 224(%rsp),%xmm2,%xmm3
- vpaddq %xmm3,%xmm10,%xmm10
- vpmuludq 272(%rsp),%xmm2,%xmm3
- vpaddq %xmm3,%xmm11,%xmm11
- vpmuludq 288(%rsp),%xmm2,%xmm3
- vpaddq %xmm3,%xmm12,%xmm12
- vpmuludq 336(%rsp),%xmm2,%xmm3
- vpaddq %xmm3,%xmm13,%xmm13
- vpmuludq 352(%rsp),%xmm2,%xmm3
- vpaddq %xmm3,%xmm14,%xmm14
- vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm2,%xmm2
- vpmuludq 400(%rsp),%xmm2,%xmm3
- vpaddq %xmm3,%xmm6,%xmm6
- vpmuludq 416(%rsp),%xmm2,%xmm3
- vpaddq %xmm3,%xmm5,%xmm5
- vpmuludq 448(%rsp),%xmm2,%xmm3
- vpaddq %xmm3,%xmm7,%xmm7
- vpmuludq 464(%rsp),%xmm2,%xmm2
- vpaddq %xmm2,%xmm8,%xmm8
- vpmuludq 192(%rsp),%xmm1,%xmm2
- vpaddq %xmm2,%xmm10,%xmm10
- vpmuludq 240(%rsp),%xmm1,%xmm2
- vpaddq %xmm2,%xmm11,%xmm11
- vpmuludq 272(%rsp),%xmm1,%xmm2
- vpaddq %xmm2,%xmm12,%xmm12
- vpmuludq 304(%rsp),%xmm1,%xmm2
- vpaddq %xmm2,%xmm13,%xmm13
- vpmuludq 336(%rsp),%xmm1,%xmm2
- vpaddq %xmm2,%xmm14,%xmm14
- vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm1,%xmm1
- vpmuludq 368(%rsp),%xmm1,%xmm2
- vpaddq %xmm2,%xmm6,%xmm6
- vpmuludq 400(%rsp),%xmm1,%xmm2
- vpaddq %xmm2,%xmm5,%xmm5
- vpmuludq 432(%rsp),%xmm1,%xmm2
- vpaddq %xmm2,%xmm7,%xmm7
- vpmuludq 448(%rsp),%xmm1,%xmm2
- vpaddq %xmm2,%xmm8,%xmm8
- vpmuludq 480(%rsp),%xmm1,%xmm1
- vpaddq %xmm1,%xmm9,%xmm9
- vmovdqa 384(%rsp),%xmm1
- vpaddq %xmm1,%xmm4,%xmm4
- vpunpcklqdq %xmm4,%xmm1,%xmm2
- vpunpckhqdq %xmm4,%xmm1,%xmm1
- vpmuludq 192(%rsp),%xmm2,%xmm3
- vpaddq %xmm3,%xmm11,%xmm11
- vpmuludq 224(%rsp),%xmm2,%xmm3
- vpaddq %xmm3,%xmm12,%xmm12
- vpmuludq 272(%rsp),%xmm2,%xmm3
- vpaddq %xmm3,%xmm13,%xmm13
- vpmuludq 288(%rsp),%xmm2,%xmm3
- vpaddq %xmm3,%xmm14,%xmm14
- vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm2,%xmm2
- vpmuludq 336(%rsp),%xmm2,%xmm3
- vpaddq %xmm3,%xmm6,%xmm6
- vpmuludq 352(%rsp),%xmm2,%xmm3
- vpaddq %xmm3,%xmm5,%xmm5
- vpmuludq 400(%rsp),%xmm2,%xmm3
- vpaddq %xmm3,%xmm7,%xmm7
- vpmuludq 416(%rsp),%xmm2,%xmm3
- vpaddq %xmm3,%xmm8,%xmm8
- vpmuludq 448(%rsp),%xmm2,%xmm3
- vpaddq %xmm3,%xmm9,%xmm9
- vpmuludq 464(%rsp),%xmm2,%xmm2
- vpaddq %xmm2,%xmm10,%xmm10
- vpmuludq 192(%rsp),%xmm1,%xmm2
- vpaddq %xmm2,%xmm12,%xmm12
- vpmuludq 240(%rsp),%xmm1,%xmm2
- vpaddq %xmm2,%xmm13,%xmm13
- vpmuludq 272(%rsp),%xmm1,%xmm2
- vpaddq %xmm2,%xmm14,%xmm14
- vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm1,%xmm1
- vpmuludq 304(%rsp),%xmm1,%xmm2
- vpaddq %xmm2,%xmm6,%xmm6
- vpmuludq 336(%rsp),%xmm1,%xmm2
- vpaddq %xmm2,%xmm5,%xmm5
- vpmuludq 368(%rsp),%xmm1,%xmm2
- vpaddq %xmm2,%xmm7,%xmm7
- vpmuludq 400(%rsp),%xmm1,%xmm2
- vpaddq %xmm2,%xmm8,%xmm8
- vpmuludq 432(%rsp),%xmm1,%xmm2
- vpaddq %xmm2,%xmm9,%xmm9
- vpmuludq 448(%rsp),%xmm1,%xmm2
- vpaddq %xmm2,%xmm10,%xmm10
- vpmuludq 480(%rsp),%xmm1,%xmm1
- vpaddq %xmm1,%xmm11,%xmm11
- vmovdqa 160(%rsp),%xmm1
- vpaddq %xmm1,%xmm0,%xmm0
- vpunpcklqdq %xmm0,%xmm1,%xmm2
- vpunpckhqdq %xmm0,%xmm1,%xmm0
- vpmuludq 192(%rsp),%xmm2,%xmm1
- vpaddq %xmm1,%xmm13,%xmm13
- vpmuludq 224(%rsp),%xmm2,%xmm1
- vpaddq %xmm1,%xmm14,%xmm14
- vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm2,%xmm2
- vpmuludq 272(%rsp),%xmm2,%xmm1
- vpaddq %xmm1,%xmm6,%xmm6
- vpmuludq 288(%rsp),%xmm2,%xmm1
- vpaddq %xmm1,%xmm5,%xmm5
- vpmuludq 336(%rsp),%xmm2,%xmm1
- vpaddq %xmm1,%xmm7,%xmm7
- vpmuludq 352(%rsp),%xmm2,%xmm1
- vpaddq %xmm1,%xmm8,%xmm8
- vpmuludq 400(%rsp),%xmm2,%xmm1
- vpaddq %xmm1,%xmm9,%xmm9
- vpmuludq 416(%rsp),%xmm2,%xmm1
- vpaddq %xmm1,%xmm10,%xmm10
- vpmuludq 448(%rsp),%xmm2,%xmm1
- vpaddq %xmm1,%xmm11,%xmm11
- vpmuludq 464(%rsp),%xmm2,%xmm2
- vpaddq %xmm2,%xmm12,%xmm12
- vpmuludq 192(%rsp),%xmm0,%xmm1
- vpaddq %xmm1,%xmm14,%xmm14
- vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm0,%xmm0
- vpmuludq 240(%rsp),%xmm0,%xmm1
- vpaddq %xmm1,%xmm6,%xmm6
- vpmuludq 272(%rsp),%xmm0,%xmm1
- vpaddq %xmm1,%xmm5,%xmm5
- vpmuludq 304(%rsp),%xmm0,%xmm1
- vpaddq %xmm1,%xmm7,%xmm7
- vpmuludq 336(%rsp),%xmm0,%xmm1
- vpaddq %xmm1,%xmm8,%xmm8
- vpmuludq 368(%rsp),%xmm0,%xmm1
- vpaddq %xmm1,%xmm9,%xmm9
- vpmuludq 400(%rsp),%xmm0,%xmm1
- vpaddq %xmm1,%xmm10,%xmm10
- vpmuludq 432(%rsp),%xmm0,%xmm1
- vpaddq %xmm1,%xmm11,%xmm11
- vpmuludq 448(%rsp),%xmm0,%xmm1
- vpaddq %xmm1,%xmm12,%xmm12
- vpmuludq 480(%rsp),%xmm0,%xmm0
- vpaddq %xmm0,%xmm13,%xmm13
- vpsrlq $26,%xmm6,%xmm0
- vpaddq %xmm0,%xmm5,%xmm5
- vpand curve25519_sandy2x_m26(%rip),%xmm6,%xmm6
- vpsrlq $25,%xmm10,%xmm0
- vpaddq %xmm0,%xmm11,%xmm11
- vpand curve25519_sandy2x_m25(%rip),%xmm10,%xmm10
- vpsrlq $25,%xmm5,%xmm0
- vpaddq %xmm0,%xmm7,%xmm7
- vpand curve25519_sandy2x_m25(%rip),%xmm5,%xmm5
- vpsrlq $26,%xmm11,%xmm0
- vpaddq %xmm0,%xmm12,%xmm12
- vpand curve25519_sandy2x_m26(%rip),%xmm11,%xmm11
- vpsrlq $26,%xmm7,%xmm0
- vpaddq %xmm0,%xmm8,%xmm8
- vpand curve25519_sandy2x_m26(%rip),%xmm7,%xmm7
- vpsrlq $25,%xmm12,%xmm0
- vpaddq %xmm0,%xmm13,%xmm13
- vpand curve25519_sandy2x_m25(%rip),%xmm12,%xmm12
- vpsrlq $25,%xmm8,%xmm0
- vpaddq %xmm0,%xmm9,%xmm9
- vpand curve25519_sandy2x_m25(%rip),%xmm8,%xmm8
- vpsrlq $26,%xmm13,%xmm0
- vpaddq %xmm0,%xmm14,%xmm14
- vpand curve25519_sandy2x_m26(%rip),%xmm13,%xmm13
- vpsrlq $26,%xmm9,%xmm0
- vpaddq %xmm0,%xmm10,%xmm10
- vpand curve25519_sandy2x_m26(%rip),%xmm9,%xmm9
- vpsrlq $25,%xmm14,%xmm0
- vpsllq $4,%xmm0,%xmm1
- vpaddq %xmm0,%xmm6,%xmm6
- vpsllq $1,%xmm0,%xmm0
- vpaddq %xmm0,%xmm1,%xmm1
- vpaddq %xmm1,%xmm6,%xmm6
- vpand curve25519_sandy2x_m25(%rip),%xmm14,%xmm14
- vpsrlq $25,%xmm10,%xmm0
- vpaddq %xmm0,%xmm11,%xmm11
- vpand curve25519_sandy2x_m25(%rip),%xmm10,%xmm10
- vpsrlq $26,%xmm6,%xmm0
- vpaddq %xmm0,%xmm5,%xmm5
- vpand curve25519_sandy2x_m26(%rip),%xmm6,%xmm6
- vpunpckhqdq %xmm5,%xmm6,%xmm1
- vpunpcklqdq %xmm5,%xmm6,%xmm0
- vpunpckhqdq %xmm8,%xmm7,%xmm3
- vpunpcklqdq %xmm8,%xmm7,%xmm2
- vpunpckhqdq %xmm10,%xmm9,%xmm5
- vpunpcklqdq %xmm10,%xmm9,%xmm4
- vpunpckhqdq %xmm12,%xmm11,%xmm7
- vpunpcklqdq %xmm12,%xmm11,%xmm6
- vpunpckhqdq %xmm14,%xmm13,%xmm9
- vpunpcklqdq %xmm14,%xmm13,%xmm8
- cmp $0,%rdx
- jne .Lladder_base_loop
- vmovdqu %xmm1,80(%rdi)
- vmovdqu %xmm0,0(%rdi)
- vmovdqu %xmm3,96(%rdi)
- vmovdqu %xmm2,16(%rdi)
- vmovdqu %xmm5,112(%rdi)
- vmovdqu %xmm4,32(%rdi)
- vmovdqu %xmm7,128(%rdi)
- vmovdqu %xmm6,48(%rdi)
- vmovdqu %xmm9,144(%rdi)
- vmovdqu %xmm8,64(%rdi)
- movq 1536(%rsp),%r11
- movq 1544(%rsp),%r12
- movq 1552(%rsp),%r13
- leave
- ret
-ENDPROC(curve25519_sandy2x_ladder_base)
-#endif /* CONFIG_AS_AVX */
diff --git a/src/crypto/curve25519-x86_64.h b/src/crypto/curve25519-x86_64.h
index c90b13d..49120bd 100644
--- a/src/crypto/curve25519-x86_64.h
+++ b/src/crypto/curve25519-x86_64.h
@@ -1,176 +1,2068 @@
/* SPDX-License-Identifier: GPL-2.0
*
- * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
- *
- * Based on algorithms from Tung Chou <blueprint@crypto.tw>
+ * Copyright (c) 2017 Armando Faz <armfazh@ic.unicamp.br>. All Rights Reserved.
+ * Copyright (C) 2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ * Copyright (C) 2018 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
*/
#include <asm/cpufeature.h>
#include <asm/processor.h>
-#include <asm/fpu/api.h>
-#include <asm/simd.h>
-static bool curve25519_use_avx __ro_after_init;
+static bool curve25519_use_bmi2 __ro_after_init;
+static bool curve25519_use_adx __ro_after_init;
void __init curve25519_fpu_init(void)
{
-#ifndef CONFIG_UML
- curve25519_use_avx = boot_cpu_has(X86_FEATURE_AVX) && cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL);
-#endif
+ curve25519_use_bmi2 = boot_cpu_has(X86_FEATURE_BMI2);
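+ /* The adx code path also uses bmi2 instructions (mulx), so both features must be present. */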
+ curve25519_use_adx = boot_cpu_has(X86_FEATURE_BMI2) && boot_cpu_has(X86_FEATURE_ADX);
+}
+
+enum { NUM_WORDS_ELTFP25519 = 4 };
+typedef __aligned(32) u64 eltfp25519_1w[NUM_WORDS_ELTFP25519];
+typedef __aligned(32) u64 eltfp25519_1w_buffer[2 * NUM_WORDS_ELTFP25519];
+
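+/*
+ * Each helper below computes a full double-width product (or square)
+ * and then reduces it modulo 2^255 - 19; the _adx and _bmi2 suffixes
+ * select the instruction-set variant. All of them assume a scratch
+ * buffer named buffer_1w or buffer_2w is in scope at the call site.
+ */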
+#define mul_eltfp25519_1w_adx(c, a, b) do { \
+ mul_256x256_integer_adx(buffer_1w, a, b); \
+ red_eltfp25519_1w_adx(c, buffer_1w); \
+} while (0)
+
+#define mul_eltfp25519_1w_bmi2(c, a, b) do { \
+ mul_256x256_integer_bmi2(buffer_1w, a, b); \
+ red_eltfp25519_1w_bmi2(c, buffer_1w); \
+} while (0)
+
+#define sqr_eltfp25519_1w_adx(a) do { \
+ sqr_256x256_integer_adx(buffer_1w, a); \
+ red_eltfp25519_1w_adx(a, buffer_1w); \
+} while (0)
+
+#define sqr_eltfp25519_1w_bmi2(a) do { \
+ sqr_256x256_integer_bmi2(buffer_1w, a); \
+ red_eltfp25519_1w_bmi2(a, buffer_1w); \
+} while (0)
+
+#define mul_eltfp25519_2w_adx(c, a, b) do { \
+ mul2_256x256_integer_adx(buffer_2w, a, b); \
+ red_eltfp25519_2w_adx(c, buffer_2w); \
+} while (0)
+
+#define mul_eltfp25519_2w_bmi2(c, a, b) do { \
+ mul2_256x256_integer_bmi2(buffer_2w, a, b); \
+ red_eltfp25519_2w_bmi2(c, buffer_2w); \
+} while (0)
+
+#define sqr_eltfp25519_2w_adx(a) do { \
+ sqr2_256x256_integer_adx(buffer_2w, a); \
+ red_eltfp25519_2w_adx(a, buffer_2w); \
+} while (0)
+
+#define sqr_eltfp25519_2w_bmi2(a) do { \
+ sqr2_256x256_integer_bmi2(buffer_2w, a); \
+ red_eltfp25519_2w_bmi2(a, buffer_2w); \
+} while (0)
+
+#define sqrn_eltfp25519_1w_adx(a, times) do { \
+ int ____counter = (times); \
+ while (____counter-- > 0) \
+ sqr_eltfp25519_1w_adx(a); \
+} while (0)
+
+#define sqrn_eltfp25519_1w_bmi2(a, times) do { \
+ int ____counter = (times); \
+ while (____counter-- > 0) \
+ sqr_eltfp25519_1w_bmi2(a); \
+} while (0)
+
+#define copy_eltfp25519_1w(C, A) do { \
+ (C)[0] = (A)[0]; \
+ (C)[1] = (A)[1]; \
+ (C)[2] = (A)[2]; \
+ (C)[3] = (A)[3]; \
+} while (0)
+
+#define setzero_eltfp25519_1w(C) do { \
+ (C)[0] = 0; \
+ (C)[1] = 0; \
+ (C)[2] = 0; \
+ (C)[3] = 0; \
+} while (0)
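+
+/*
+ * Usage sketch (illustrative only, using the names defined above):
+ * with the scratch buffer declared locally, c = a * b in the field is
+ *
+ *	eltfp25519_1w_buffer buffer_1w;
+ *	eltfp25519_1w a, b, c;
+ *	mul_eltfp25519_1w_adx(c, a, b);
+ *
+ * and sqrn_eltfp25519_1w_adx(a, n) squares a in place n times, e.g.
+ * for the runs of repeated squarings in a Fermat inversion.
+ */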
+
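+/*
+ * Precomputed constants for the base-point ladder: 252 field elements
+ * of four u64 words each, roughly 8 KB (hence the _8k suffix).
+ */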
+__aligned(32) static const u64 table_ladder_8k[252 * NUM_WORDS_ELTFP25519] = {
+ /* 1 */ 0xfffffffffffffff3, 0xffffffffffffffff, 0xffffffffffffffff, 0x5fffffffffffffff,
+ /* 2 */ 0x6b8220f416aafe96, 0x82ebeb2b4f566a34, 0xd5a9a5b075a5950f, 0x5142b2cf4b2488f4,
+ /* 3 */ 0x6aaebc750069680c, 0x89cf7820a0f99c41, 0x2a58d9183b56d0f4, 0x4b5aca80e36011a4,
+ /* 4 */ 0x329132348c29745d, 0xf4a2e616e1642fd7, 0x1e45bb03ff67bc34, 0x306912d0f42a9b4a,
+ /* 5 */ 0xff886507e6af7154, 0x04f50e13dfeec82f, 0xaa512fe82abab5ce, 0x174e251a68d5f222,
+ /* 6 */ 0xcf96700d82028898, 0x1743e3370a2c02c5, 0x379eec98b4e86eaa, 0x0c59888a51e0482e,
+ /* 7 */ 0xfbcbf1d699b5d189, 0xacaef0d58e9fdc84, 0xc1c20d06231f7614, 0x2938218da274f972,
+ /* 8 */ 0xf6af49beff1d7f18, 0xcc541c22387ac9c2, 0x96fcc9ef4015c56b, 0x69c1627c690913a9,
+ /* 9 */ 0x7a86fd2f4733db0e, 0xfdb8c4f29e087de9, 0x095e4b1a8ea2a229, 0x1ad7a7c829b37a79,
+ /* 10 */ 0x342d89cad17ea0c0, 0x67bedda6cced2051, 0x19ca31bf2bb42f74, 0x3df7b4c84980acbb,
+ /* 11 */ 0xa8c6444dc80ad883, 0xb91e440366e3ab85, 0xc215cda00164f6d8, 0x3d867c6ef247e668,
+ /* 12 */ 0xc7dd582bcc3e658c, 0xfd2c4748ee0e5528, 0xa0fd9b95cc9f4f71, 0x7529d871b0675ddf,
+ /* 13 */ 0xb8f568b42d3cbd78, 0x1233011b91f3da82, 0x2dce6ccd4a7c3b62, 0x75e7fc8e9e498603,
+ /* 14 */ 0x2f4f13f1fcd0b6ec, 0xf1a8ca1f29ff7a45, 0xc249c1a72981e29b, 0x6ebe0dbb8c83b56a,
+ /* 15 */ 0x7114fa8d170bb222, 0x65a2dcd5bf93935f, 0xbdc41f68b59c979a, 0x2f0eef79a2ce9289,
+ /* 16 */ 0x42ecbf0c083c37ce, 0x2930bc09ec496322, 0xf294b0c19cfeac0d, 0x3780aa4bedfabb80,
+ /* 17 */ 0x56c17d3e7cead929, 0xe7cb4beb2e5722c5, 0x0ce931732dbfe15a, 0x41b883c7621052f8,
+ /* 18 */ 0xdbf75ca0c3d25350, 0x2936be086eb1e351, 0xc936e03cb4a9b212, 0x1d45bf82322225aa,
+ /* 19 */ 0xe81ab1036a024cc5, 0xe212201c304c9a72, 0xc5d73fba6832b1fc, 0x20ffdb5a4d839581,
+ /* 20 */ 0xa283d367be5d0fad, 0x6c2b25ca8b164475, 0x9d4935467caaf22e, 0x5166408eee85ff49,
+ /* 21 */ 0x3c67baa2fab4e361, 0xb3e433c67ef35cef, 0x5259729241159b1c, 0x6a621892d5b0ab33,
+ /* 22 */ 0x20b74a387555cdcb, 0x532aa10e1208923f, 0xeaa17b7762281dd1, 0x61ab3443f05c44bf,
+ /* 23 */ 0x257a6c422324def8, 0x131c6c1017e3cf7f, 0x23758739f630a257, 0x295a407a01a78580,
+ /* 24 */ 0xf8c443246d5da8d9, 0x19d775450c52fa5d, 0x2afcfc92731bf83d, 0x7d10c8e81b2b4700,
+ /* 25 */ 0xc8e0271f70baa20b, 0x993748867ca63957, 0x5412efb3cb7ed4bb, 0x3196d36173e62975,
+ /* 26 */ 0xde5bcad141c7dffc, 0x47cc8cd2b395c848, 0xa34cd942e11af3cb, 0x0256dbf2d04ecec2,
+ /* 27 */ 0x875ab7e94b0e667f, 0xcad4dd83c0850d10, 0x47f12e8f4e72c79f, 0x5f1a87bb8c85b19b,
+ /* 28 */ 0x7ae9d0b6437f51b8, 0x12c7ce5518879065, 0x2ade09fe5cf77aee, 0x23a05a2f7d2c5627,
+ /* 29 */ 0x5908e128f17c169a, 0xf77498dd8ad0852d, 0x74b4c4ceab102f64, 0x183abadd10139845,
+ /* 30 */ 0xb165ba8daa92aaac, 0xd5c5ef9599386705, 0xbe2f8f0cf8fc40d1, 0x2701e635ee204514,
+ /* 31 */ 0x629fa80020156514, 0xf223868764a8c1ce, 0x5b894fff0b3f060e, 0x60d9944cf708a3fa,
+ /* 32 */ 0xaeea001a1c7a201f, 0xebf16a633ee2ce63, 0x6f7709594c7a07e1, 0x79b958150d0208cb,
+ /* 33 */ 0x24b55e5301d410e7, 0xe3a34edff3fdc84d, 0xd88768e4904032d8, 0x131384427b3aaeec,
+ /* 34 */ 0x8405e51286234f14, 0x14dc4739adb4c529, 0xb8a2b5b250634ffd, 0x2fe2a94ad8a7ff93,
+ /* 35 */ 0xec5c57efe843fadd, 0x2843ce40f0bb9918, 0xa4b561d6cf3d6305, 0x743629bde8fb777e,
+ /* 36 */ 0x343edd46bbaf738f, 0xed981828b101a651, 0xa401760b882c797a, 0x1fc223e28dc88730,
+ /* 37 */ 0x48604e91fc0fba0e, 0xb637f78f052c6fa4, 0x91ccac3d09e9239c, 0x23f7eed4437a687c,
+ /* 38 */ 0x5173b1118d9bd800, 0x29d641b63189d4a7, 0xfdbf177988bbc586, 0x2959894fcad81df5,
+ /* 39 */ 0xaebc8ef3b4bbc899, 0x4148995ab26992b9, 0x24e20b0134f92cfb, 0x40d158894a05dee8,
+ /* 40 */ 0x46b00b1185af76f6, 0x26bac77873187a79, 0x3dc0bf95ab8fff5f, 0x2a608bd8945524d7,
+ /* 41 */ 0x26449588bd446302, 0x7c4bc21c0388439c, 0x8e98a4f383bd11b2, 0x26218d7bc9d876b9,
+ /* 42 */ 0xe3081542997c178a, 0x3c2d29a86fb6606f, 0x5c217736fa279374, 0x7dde05734afeb1fa,
+ /* 43 */ 0x3bf10e3906d42bab, 0xe4f7803e1980649c, 0xe6053bf89595bf7a, 0x394faf38da245530,
+ /* 44 */ 0x7a8efb58896928f4, 0xfbc778e9cc6a113c, 0x72670ce330af596f, 0x48f222a81d3d6cf7,
+ /* 45 */ 0xf01fce410d72caa7, 0x5a20ecc7213b5595, 0x7bc21165c1fa1483, 0x07f89ae31da8a741,
+ /* 46 */ 0x05d2c2b4c6830ff9, 0xd43e330fc6316293, 0xa5a5590a96d3a904, 0x705edb91a65333b6,
+ /* 47 */ 0x048ee15e0bb9a5f7, 0x3240cfca9e0aaf5d, 0x8f4b71ceedc4a40b, 0x621c0da3de544a6d,
+ /* 48 */ 0x92872836a08c4091, 0xce8375b010c91445, 0x8a72eb524f276394, 0x2667fcfa7ec83635,
+ /* 49 */ 0x7f4c173345e8752a, 0x061b47feee7079a5, 0x25dd9afa9f86ff34, 0x3780cef5425dc89c,
+ /* 50 */ 0x1a46035a513bb4e9, 0x3e1ef379ac575ada, 0xc78c5f1c5fa24b50, 0x321a967634fd9f22,
+ /* 51 */ 0x946707b8826e27fa, 0x3dca84d64c506fd0, 0xc189218075e91436, 0x6d9284169b3b8484,
+ /* 52 */ 0x3a67e840383f2ddf, 0x33eec9a30c4f9b75, 0x3ec7c86fa783ef47, 0x26ec449fbac9fbc4,
+ /* 53 */ 0x5c0f38cba09b9e7d, 0x81168cc762a3478c, 0x3e23b0d306fc121c, 0x5a238aa0a5efdcdd,
+ /* 54 */ 0x1ba26121c4ea43ff, 0x36f8c77f7c8832b5, 0x88fbea0b0adcf99a, 0x5ca9938ec25bebf9,
+ /* 55 */ 0xd5436a5e51fccda0, 0x1dbc4797c2cd893b, 0x19346a65d3224a08, 0x0f5034e49b9af466,
+ /* 56 */ 0xf23c3967a1e0b96e, 0xe58b08fa867a4d88, 0xfb2fabc6a7341679, 0x2a75381eb6026946,
+ /* 57 */ 0xc80a3be4c19420ac, 0x66b1f6c681f2b6dc, 0x7cf7036761e93388, 0x25abbbd8a660a4c4,
+ /* 58 */ 0x91ea12ba14fd5198, 0x684950fc4a3cffa9, 0xf826842130f5ad28, 0x3ea988f75301a441,
+ /* 59 */ 0xc978109a695f8c6f, 0x1746eb4a0530c3f3, 0x444d6d77b4459995, 0x75952b8c054e5cc7,
+ /* 60 */ 0xa3703f7915f4d6aa, 0x66c346202f2647d8, 0xd01469df811d644b, 0x77fea47d81a5d71f,
+ /* 61 */ 0xc5e9529ef57ca381, 0x6eeeb4b9ce2f881a, 0xb6e91a28e8009bd6, 0x4b80be3e9afc3fec,
+ /* 62 */ 0x7e3773c526aed2c5, 0x1b4afcb453c9a49d, 0xa920bdd7baffb24d, 0x7c54699f122d400e,
+ /* 63 */ 0xef46c8e14fa94bc8, 0xe0b074ce2952ed5e, 0xbea450e1dbd885d5, 0x61b68649320f712c,
+ /* 64 */ 0x8a485f7309ccbdd1, 0xbd06320d7d4d1a2d, 0x25232973322dbef4, 0x445dc4758c17f770,
+ /* 65 */ 0xdb0434177cc8933c, 0xed6fe82175ea059f, 0x1efebefdc053db34, 0x4adbe867c65daf99,
+ /* 66 */ 0x3acd71a2a90609df, 0xe5e991856dd04050, 0x1ec69b688157c23c, 0x697427f6885cfe4d,
+ /* 67 */ 0xd7be7b9b65e1a851, 0xa03d28d522c536dd, 0x28399d658fd2b645, 0x49e5b7e17c2641e1,
+ /* 68 */ 0x6f8c3a98700457a4, 0x5078f0a25ebb6778, 0xd13c3ccbc382960f, 0x2e003258a7df84b1,
+ /* 69 */ 0x8ad1f39be6296a1c, 0xc1eeaa652a5fbfb2, 0x33ee0673fd26f3cb, 0x59256173a69d2ccc,
+ /* 70 */ 0x41ea07aa4e18fc41, 0xd9fc19527c87a51e, 0xbdaacb805831ca6f, 0x445b652dc916694f,
+ /* 71 */ 0xce92a3a7f2172315, 0x1edc282de11b9964, 0xa1823aafe04c314a, 0x790a2d94437cf586,
+ /* 72 */ 0x71c447fb93f6e009, 0x8922a56722845276, 0xbf70903b204f5169, 0x2f7a89891ba319fe,
+ /* 73 */ 0x02a08eb577e2140c, 0xed9a4ed4427bdcf4, 0x5253ec44e4323cd1, 0x3e88363c14e9355b,
+ /* 74 */ 0xaa66c14277110b8c, 0x1ae0391610a23390, 0x2030bd12c93fc2a2, 0x3ee141579555c7ab,
+ /* 75 */ 0x9214de3a6d6e7d41, 0x3ccdd88607f17efe, 0x674f1288f8e11217, 0x5682250f329f93d0,
+ /* 76 */ 0x6cf00b136d2e396e, 0x6e4cf86f1014debf, 0x5930b1b5bfcc4e83, 0x047069b48aba16b6,
+ /* 77 */ 0x0d4ce4ab69b20793, 0xb24db91a97d0fb9e, 0xcdfa50f54e00d01d, 0x221b1085368bddb5,
+ /* 78 */ 0xe7e59468b1e3d8d2, 0x53c56563bd122f93, 0xeee8a903e0663f09, 0x61efa662cbbe3d42,
+ /* 79 */ 0x2cf8ddddde6eab2a, 0x9bf80ad51435f231, 0x5deadacec9f04973, 0x29275b5d41d29b27,
+ /* 80 */ 0xcfde0f0895ebf14f, 0xb9aab96b054905a7, 0xcae80dd9a1c420fd, 0x0a63bf2f1673bbc7,
+ /* 81 */ 0x092f6e11958fbc8c, 0x672a81e804822fad, 0xcac8351560d52517, 0x6f3f7722c8f192f8,
+ /* 82 */ 0xf8ba90ccc2e894b7, 0x2c7557a438ff9f0d, 0x894d1d855ae52359, 0x68e122157b743d69,
+ /* 83 */ 0xd87e5570cfb919f3, 0x3f2cdecd95798db9, 0x2121154710c0a2ce, 0x3c66a115246dc5b2,
+ /* 84 */ 0xcbedc562294ecb72, 0xba7143c36a280b16, 0x9610c2efd4078b67, 0x6144735d946a4b1e,
+ /* 85 */ 0x536f111ed75b3350, 0x0211db8c2041d81b, 0xf93cb1000e10413c, 0x149dfd3c039e8876,
+ /* 86 */ 0xd479dde46b63155b, 0xb66e15e93c837976, 0xdafde43b1f13e038, 0x5fafda1a2e4b0b35,
+ /* 87 */ 0x3600bbdf17197581, 0x3972050bbe3cd2c2, 0x5938906dbdd5be86, 0x34fce5e43f9b860f,
+ /* 88 */ 0x75a8a4cd42d14d02, 0x828dabc53441df65, 0x33dcabedd2e131d3, 0x3ebad76fb814d25f,
+ /* 89 */ 0xd4906f566f70e10f, 0x5d12f7aa51690f5a, 0x45adb16e76cefcf2, 0x01f768aead232999,
+ /* 90 */ 0x2b6cc77b6248febd, 0x3cd30628ec3aaffd, 0xce1c0b80d4ef486a, 0x4c3bff2ea6f66c23,
+ /* 91 */ 0x3f2ec4094aeaeb5f, 0x61b19b286e372ca7, 0x5eefa966de2a701d, 0x23b20565de55e3ef,
+ /* 92 */ 0xe301ca5279d58557, 0x07b2d4ce27c2874f, 0xa532cd8a9dcf1d67, 0x2a52fee23f2bff56,
+ /* 93 */ 0x8624efb37cd8663d, 0xbbc7ac20ffbd7594, 0x57b85e9c82d37445, 0x7b3052cb86a6ec66,
+ /* 94 */ 0x3482f0ad2525e91e, 0x2cb68043d28edca0, 0xaf4f6d052e1b003a, 0x185f8c2529781b0a,
+ /* 95 */ 0xaa41de5bd80ce0d6, 0x9407b2416853e9d6, 0x563ec36e357f4c3a, 0x4cc4b8dd0e297bce,
+ /* 96 */ 0xa2fc1a52ffb8730e, 0x1811f16e67058e37, 0x10f9a366cddf4ee1, 0x72f4a0c4a0b9f099,
+ /* 97 */ 0x8c16c06f663f4ea7, 0x693b3af74e970fba, 0x2102e7f1d69ec345, 0x0ba53cbc968a8089,
+ /* 98 */ 0xca3d9dc7fea15537, 0x4c6824bb51536493, 0xb9886314844006b1, 0x40d2a72ab454cc60,
+ /* 99 */ 0x5936a1b712570975, 0x91b9d648debda657, 0x3344094bb64330ea, 0x006ba10d12ee51d0,
+ /* 100 */ 0x19228468f5de5d58, 0x0eb12f4c38cc05b0, 0xa1039f9dd5601990, 0x4502d4ce4fff0e0b,
+ /* 101 */ 0xeb2054106837c189, 0xd0f6544c6dd3b93c, 0x40727064c416d74f, 0x6e15c6114b502ef0,
+ /* 102 */ 0x4df2a398cfb1a76b, 0x11256c7419f2f6b1, 0x4a497962066e6043, 0x705b3aab41355b44,
+ /* 103 */ 0x365ef536d797b1d8, 0x00076bd622ddf0db, 0x3bbf33b0e0575a88, 0x3777aa05c8e4ca4d,
+ /* 104 */ 0x392745c85578db5f, 0x6fda4149dbae5ae2, 0xb1f0b00b8adc9867, 0x09963437d36f1da3,
+ /* 105 */ 0x7e824e90a5dc3853, 0xccb5f6641f135cbd, 0x6736d86c87ce8fcc, 0x625f3ce26604249f,
+ /* 106 */ 0xaf8ac8059502f63f, 0x0c05e70a2e351469, 0x35292e9c764b6305, 0x1a394360c7e23ac3,
+ /* 107 */ 0xd5c6d53251183264, 0x62065abd43c2b74f, 0xb5fbf5d03b973f9b, 0x13a3da3661206e5e,
+ /* 108 */ 0xc6bd5837725d94e5, 0x18e30912205016c5, 0x2088ce1570033c68, 0x7fba1f495c837987,
+ /* 109 */ 0x5a8c7423f2f9079d, 0x1735157b34023fc5, 0xe4f9b49ad2fab351, 0x6691ff72c878e33c,
+ /* 110 */ 0x122c2adedc5eff3e, 0xf8dd4bf1d8956cf4, 0xeb86205d9e9e5bda, 0x049b92b9d975c743,
+ /* 111 */ 0xa5379730b0f6c05a, 0x72a0ffacc6f3a553, 0xb0032c34b20dcd6d, 0x470e9dbc88d5164a,
+ /* 112 */ 0xb19cf10ca237c047, 0xb65466711f6c81a2, 0xb3321bd16dd80b43, 0x48c14f600c5fbe8e,
+ /* 113 */ 0x66451c264aa6c803, 0xb66e3904a4fa7da6, 0xd45f19b0b3128395, 0x31602627c3c9bc10,
+ /* 114 */ 0x3120dc4832e4e10d, 0xeb20c46756c717f7, 0x00f52e3f67280294, 0x566d4fc14730c509,
+ /* 115 */ 0x7e3a5d40fd837206, 0xc1e926dc7159547a, 0x216730fba68d6095, 0x22e8c3843f69cea7,
+ /* 116 */ 0x33d074e8930e4b2b, 0xb6e4350e84d15816, 0x5534c26ad6ba2365, 0x7773c12f89f1f3f3,
+ /* 117 */ 0x8cba404da57962aa, 0x5b9897a81999ce56, 0x508e862f121692fc, 0x3a81907fa093c291,
+ /* 118 */ 0x0dded0ff4725a510, 0x10d8cc10673fc503, 0x5b9d151c9f1f4e89, 0x32a5c1d5cb09a44c,
+ /* 119 */ 0x1e0aa442b90541fb, 0x5f85eb7cc1b485db, 0xbee595ce8a9df2e5, 0x25e496c722422236,
+ /* 120 */ 0x5edf3c46cd0fe5b9, 0x34e75a7ed2a43388, 0xe488de11d761e352, 0x0e878a01a085545c,
+ /* 121 */ 0xba493c77e021bb04, 0x2b4d1843c7df899a, 0x9ea37a487ae80d67, 0x67a9958011e41794,
+ /* 122 */ 0x4b58051a6697b065, 0x47e33f7d8d6ba6d4, 0xbb4da8d483ca46c1, 0x68becaa181c2db0d,
+ /* 123 */ 0x8d8980e90b989aa5, 0xf95eb14a2c93c99b, 0x51c6c7c4796e73a2, 0x6e228363b5efb569,
+ /* 124 */ 0xc6bbc0b02dd624c8, 0x777eb47dec8170ee, 0x3cde15a004cfafa9, 0x1dc6bc087160bf9b,
+ /* 125 */ 0x2e07e043eec34002, 0x18e9fc677a68dc7f, 0xd8da03188bd15b9a, 0x48fbc3bb00568253,
+ /* 126 */ 0x57547d4cfb654ce1, 0xd3565b82a058e2ad, 0xf63eaf0bbf154478, 0x47531ef114dfbb18,
+ /* 127 */ 0xe1ec630a4278c587, 0x5507d546ca8e83f3, 0x85e135c63adc0c2b, 0x0aa7efa85682844e,
+ /* 128 */ 0x72691ba8b3e1f615, 0x32b4e9701fbe3ffa, 0x97b6d92e39bb7868, 0x2cfe53dea02e39e8,
+ /* 129 */ 0x687392cd85cd52b0, 0x27ff66c910e29831, 0x97134556a9832d06, 0x269bb0360a84f8a0,
+ /* 130 */ 0x706e55457643f85c, 0x3734a48c9b597d1b, 0x7aee91e8c6efa472, 0x5cd6abc198a9d9e0,
+ /* 131 */ 0x0e04de06cb3ce41a, 0xd8c6eb893402e138, 0x904659bb686e3772, 0x7215c371746ba8c8,
+ /* 132 */ 0xfd12a97eeae4a2d9, 0x9514b7516394f2c5, 0x266fd5809208f294, 0x5c847085619a26b9,
+ /* 133 */ 0x52985410fed694ea, 0x3c905b934a2ed254, 0x10bb47692d3be467, 0x063b3d2d69e5e9e1,
+ /* 134 */ 0x472726eedda57deb, 0xefb6c4ae10f41891, 0x2b1641917b307614, 0x117c554fc4f45b7c,
+ /* 135 */ 0xc07cf3118f9d8812, 0x01dbd82050017939, 0xd7e803f4171b2827, 0x1015e87487d225ea,
+ /* 136 */ 0xc58de3fed23acc4d, 0x50db91c294a7be2d, 0x0b94d43d1c9cf457, 0x6b1640fa6e37524a,
+ /* 137 */ 0x692f346c5fda0d09, 0x200b1c59fa4d3151, 0xb8c46f760777a296, 0x4b38395f3ffdfbcf,
+ /* 138 */ 0x18d25e00be54d671, 0x60d50582bec8aba6, 0x87ad8f263b78b982, 0x50fdf64e9cda0432,
+ /* 139 */ 0x90f567aac578dcf0, 0xef1e9b0ef2a3133b, 0x0eebba9242d9de71, 0x15473c9bf03101c7,
+ /* 140 */ 0x7c77e8ae56b78095, 0xb678e7666e6f078e, 0x2da0b9615348ba1f, 0x7cf931c1ff733f0b,
+ /* 141 */ 0x26b357f50a0a366c, 0xe9708cf42b87d732, 0xc13aeea5f91cb2c0, 0x35d90c991143bb4c,
+ /* 142 */ 0x47c1c404a9a0d9dc, 0x659e58451972d251, 0x3875a8c473b38c31, 0x1fbd9ed379561f24,
+ /* 143 */ 0x11fabc6fd41ec28d, 0x7ef8dfe3cd2a2dca, 0x72e73b5d8c404595, 0x6135fa4954b72f27,
+ /* 144 */ 0xccfc32a2de24b69c, 0x3f55698c1f095d88, 0xbe3350ed5ac3f929, 0x5e9bf806ca477eeb,
+ /* 145 */ 0xe9ce8fb63c309f68, 0x5376f63565e1f9f4, 0xd1afcfb35a6393f1, 0x6632a1ede5623506,
+ /* 146 */ 0x0b7d6c390c2ded4c, 0x56cb3281df04cb1f, 0x66305a1249ecc3c7, 0x5d588b60a38ca72a,
+ /* 147 */ 0xa6ecbf78e8e5f42d, 0x86eeb44b3c8a3eec, 0xec219c48fbd21604, 0x1aaf1af517c36731,
+ /* 148 */ 0xc306a2836769bde7, 0x208280622b1e2adb, 0x8027f51ffbff94a6, 0x76cfa1ce1124f26b,
+ /* 149 */ 0x18eb00562422abb6, 0xf377c4d58f8c29c3, 0x4dbbc207f531561a, 0x0253b7f082128a27,
+ /* 150 */ 0x3d1f091cb62c17e0, 0x4860e1abd64628a9, 0x52d17436309d4253, 0x356f97e13efae576,
+ /* 151 */ 0xd351e11aa150535b, 0x3e6b45bb1dd878cc, 0x0c776128bed92c98, 0x1d34ae93032885b8,
+ /* 152 */ 0x4ba0488ca85ba4c3, 0x985348c33c9ce6ce, 0x66124c6f97bda770, 0x0f81a0290654124a,
+ /* 153 */ 0x9ed09ca6569b86fd, 0x811009fd18af9a2d, 0xff08d03f93d8c20a, 0x52a148199faef26b,
+ /* 154 */ 0x3e03f9dc2d8d1b73, 0x4205801873961a70, 0xc0d987f041a35970, 0x07aa1f15a1c0d549,
+ /* 155 */ 0xdfd46ce08cd27224, 0x6d0a024f934e4239, 0x808a7a6399897b59, 0x0a4556e9e13d95a2,
+ /* 156 */ 0xd21a991fe9c13045, 0x9b0e8548fe7751b8, 0x5da643cb4bf30035, 0x77db28d63940f721,
+ /* 157 */ 0xfc5eeb614adc9011, 0x5229419ae8c411eb, 0x9ec3e7787d1dcf74, 0x340d053e216e4cb5,
+ /* 158 */ 0xcac7af39b48df2b4, 0xc0faec2871a10a94, 0x140a69245ca575ed, 0x0cf1c37134273a4c,
+ /* 159 */ 0xc8ee306ac224b8a5, 0x57eaee7ccb4930b0, 0xa1e806bdaacbe74f, 0x7d9a62742eeb657d,
+ /* 160 */ 0x9eb6b6ef546c4830, 0x885cca1fddb36e2e, 0xe6b9f383ef0d7105, 0x58654fef9d2e0412,
+ /* 161 */ 0xa905c4ffbe0e8e26, 0x942de5df9b31816e, 0x497d723f802e88e1, 0x30684dea602f408d,
+ /* 162 */ 0x21e5a278a3e6cb34, 0xaefb6e6f5b151dc4, 0xb30b8e049d77ca15, 0x28c3c9cf53b98981,
+ /* 163 */ 0x287fb721556cdd2a, 0x0d317ca897022274, 0x7468c7423a543258, 0x4a7f11464eb5642f,
+ /* 164 */ 0xa237a4774d193aa6, 0xd865986ea92129a1, 0x24c515ecf87c1a88, 0x604003575f39f5eb,
+ /* 165 */ 0x47b9f189570a9b27, 0x2b98cede465e4b78, 0x026df551dbb85c20, 0x74fcd91047e21901,
+ /* 166 */ 0x13e2a90a23c1bfa3, 0x0cb0074e478519f6, 0x5ff1cbbe3af6cf44, 0x67fe5438be812dbe,
+ /* 167 */ 0xd13cf64fa40f05b0, 0x054dfb2f32283787, 0x4173915b7f0d2aea, 0x482f144f1f610d4e,
+ /* 168 */ 0xf6210201b47f8234, 0x5d0ae1929e70b990, 0xdcd7f455b049567c, 0x7e93d0f1f0916f01,
+ /* 169 */ 0xdd79cbf18a7db4fa, 0xbe8391bf6f74c62f, 0x027145d14b8291bd, 0x585a73ea2cbf1705,
+ /* 170 */ 0x485ca03e928a0db2, 0x10fc01a5742857e7, 0x2f482edbd6d551a7, 0x0f0433b5048fdb8a,
+ /* 171 */ 0x60da2e8dd7dc6247, 0x88b4c9d38cd4819a, 0x13033ac001f66697, 0x273b24fe3b367d75,
+ /* 172 */ 0xc6e8f66a31b3b9d4, 0x281514a494df49d5, 0xd1726fdfc8b23da7, 0x4b3ae7d103dee548,
+ /* 173 */ 0xc6256e19ce4b9d7e, 0xff5c5cf186e3c61c, 0xacc63ca34b8ec145, 0x74621888fee66574,
+ /* 174 */ 0x956f409645290a1e, 0xef0bf8e3263a962e, 0xed6a50eb5ec2647b, 0x0694283a9dca7502,
+ /* 175 */ 0x769b963643a2dcd1, 0x42b7c8ea09fc5353, 0x4f002aee13397eab, 0x63005e2c19b7d63a,
+ /* 176 */ 0xca6736da63023bea, 0x966c7f6db12a99b7, 0xace09390c537c5e1, 0x0b696063a1aa89ee,
+ /* 177 */ 0xebb03e97288c56e5, 0x432a9f9f938c8be8, 0xa6a5a93d5b717f71, 0x1a5fb4c3e18f9d97,
+ /* 178 */ 0x1c94e7ad1c60cdce, 0xee202a43fc02c4a0, 0x8dafe4d867c46a20, 0x0a10263c8ac27b58,
+ /* 179 */ 0xd0dea9dfe4432a4a, 0x856af87bbe9277c5, 0xce8472acc212c71a, 0x6f151b6d9bbb1e91,
+ /* 180 */ 0x26776c527ceed56a, 0x7d211cb7fbf8faec, 0x37ae66a6fd4609cc, 0x1f81b702d2770c42,
+ /* 181 */ 0x2fb0b057eac58392, 0xe1dd89fe29744e9d, 0xc964f8eb17beb4f8, 0x29571073c9a2d41e,
+ /* 182 */ 0xa948a18981c0e254, 0x2df6369b65b22830, 0xa33eb2d75fcfd3c6, 0x078cd6ec4199a01f,
+ /* 183 */ 0x4a584a41ad900d2f, 0x32142b78e2c74c52, 0x68c4e8338431c978, 0x7f69ea9008689fc2,
+ /* 184 */ 0x52f2c81e46a38265, 0xfd78072d04a832fd, 0x8cd7d5fa25359e94, 0x4de71b7454cc29d2,
+ /* 185 */ 0x42eb60ad1eda6ac9, 0x0aad37dfdbc09c3a, 0x81004b71e33cc191, 0x44e6be345122803c,
+ /* 186 */ 0x03fe8388ba1920db, 0xf5d57c32150db008, 0x49c8c4281af60c29, 0x21edb518de701aee,
+ /* 187 */ 0x7fb63e418f06dc99, 0xa4460d99c166d7b8, 0x24dd5248ce520a83, 0x5ec3ad712b928358,
+ /* 188 */ 0x15022a5fbd17930f, 0xa4f64a77d82570e3, 0x12bc8d6915783712, 0x498194c0fc620abb,
+ /* 189 */ 0x38a2d9d255686c82, 0x785c6bd9193e21f0, 0xe4d5c81ab24a5484, 0x56307860b2e20989,
+ /* 190 */ 0x429d55f78b4d74c4, 0x22f1834643350131, 0x1e60c24598c71fff, 0x59f2f014979983ef,
+ /* 191 */ 0x46a47d56eb494a44, 0x3e22a854d636a18e, 0xb346e15274491c3b, 0x2ceafd4e5390cde7,
+ /* 192 */ 0xba8a8538be0d6675, 0x4b9074bb50818e23, 0xcbdab89085d304c3, 0x61a24fe0e56192c4,
+ /* 193 */ 0xcb7615e6db525bcb, 0xdd7d8c35a567e4ca, 0xe6b4153acafcdd69, 0x2d668e097f3c9766,
+ /* 194 */ 0xa57e7e265ce55ef0, 0x5d9f4e527cd4b967, 0xfbc83606492fd1e5, 0x090d52beb7c3f7ae,
+ /* 195 */ 0x09b9515a1e7b4d7c, 0x1f266a2599da44c0, 0xa1c49548e2c55504, 0x7ef04287126f15cc,
+ /* 196 */ 0xfed1659dbd30ef15, 0x8b4ab9eec4e0277b, 0x884d6236a5df3291, 0x1fd96ea6bf5cf788,
+ /* 197 */ 0x42a161981f190d9a, 0x61d849507e6052c1, 0x9fe113bf285a2cd5, 0x7c22d676dbad85d8,
+ /* 198 */ 0x82e770ed2bfbd27d, 0x4c05b2ece996f5a5, 0xcd40a9c2b0900150, 0x5895319213d9bf64,
+ /* 199 */ 0xe7cc5d703fea2e08, 0xb50c491258e2188c, 0xcce30baa48205bf0, 0x537c659ccfa32d62,
+ /* 200 */ 0x37b6623a98cfc088, 0xfe9bed1fa4d6aca4, 0x04d29b8e56a8d1b0, 0x725f71c40b519575,
+ /* 201 */ 0x28c7f89cd0339ce6, 0x8367b14469ddc18b, 0x883ada83a6a1652c, 0x585f1974034d6c17,
+ /* 202 */ 0x89cfb266f1b19188, 0xe63b4863e7c35217, 0xd88c9da6b4c0526a, 0x3e035c9df0954635,
+ /* 203 */ 0xdd9d5412fb45de9d, 0xdd684532e4cff40d, 0x4b5c999b151d671c, 0x2d8c2cc811e7f690,
+ /* 204 */ 0x7f54be1d90055d40, 0xa464c5df464aaf40, 0x33979624f0e917be, 0x2c018dc527356b30,
+ /* 205 */ 0xa5415024e330b3d4, 0x73ff3d96691652d3, 0x94ec42c4ef9b59f1, 0x0747201618d08e5a,
+ /* 206 */ 0x4d6ca48aca411c53, 0x66415f2fcfa66119, 0x9c4dd40051e227ff, 0x59810bc09a02f7eb,
+ /* 207 */ 0x2a7eb171b3dc101d, 0x441c5ab99ffef68e, 0x32025c9b93b359ea, 0x5e8ce0a71e9d112f,
+ /* 208 */ 0xbfcccb92429503fd, 0xd271ba752f095d55, 0x345ead5e972d091e, 0x18c8df11a83103ba,
+ /* 209 */ 0x90cd949a9aed0f4c, 0xc5d1f4cb6660e37e, 0xb8cac52d56c52e0b, 0x6e42e400c5808e0d,
+ /* 210 */ 0xa3b46966eeaefd23, 0x0c4f1f0be39ecdca, 0x189dc8c9d683a51d, 0x51f27f054c09351b,
+ /* 211 */ 0x4c487ccd2a320682, 0x587ea95bb3df1c96, 0xc8ccf79e555cb8e8, 0x547dc829a206d73d,
+ /* 212 */ 0xb822a6cd80c39b06, 0xe96d54732000d4c6, 0x28535b6f91463b4d, 0x228f4660e2486e1d,
+ /* 213 */ 0x98799538de8d3abf, 0x8cd8330045ebca6e, 0x79952a008221e738, 0x4322e1a7535cd2bb,
+ /* 214 */ 0xb114c11819d1801c, 0x2016e4d84f3f5ec7, 0xdd0e2df409260f4c, 0x5ec362c0ae5f7266,
+ /* 215 */ 0xc0462b18b8b2b4ee, 0x7cc8d950274d1afb, 0xf25f7105436b02d2, 0x43bbf8dcbff9ccd3,
+ /* 216 */ 0xb6ad1767a039e9df, 0xb0714da8f69d3583, 0x5e55fa18b42931f5, 0x4ed5558f33c60961,
+ /* 217 */ 0x1fe37901c647a5dd, 0x593ddf1f8081d357, 0x0249a4fd813fd7a6, 0x69acca274e9caf61,
+ /* 218 */ 0x047ba3ea330721c9, 0x83423fc20e7e1ea0, 0x1df4c0af01314a60, 0x09a62dab89289527,
+ /* 219 */ 0xa5b325a49cc6cb00, 0xe94b5dc654b56cb6, 0x3be28779adc994a0, 0x4296e8f8ba3a4aad,
+ /* 220 */ 0x328689761e451eab, 0x2e4d598bff59594a, 0x49b96853d7a7084a, 0x4980a319601420a8,
+ /* 221 */ 0x9565b9e12f552c42, 0x8a5318db7100fe96, 0x05c90b4d43add0d7, 0x538b4cd66a5d4eda,
+ /* 222 */ 0xf4e94fc3e89f039f, 0x592c9af26f618045, 0x08a36eb5fd4b9550, 0x25fffaf6c2ed1419,
+ /* 223 */ 0x34434459cc79d354, 0xeeecbfb4b1d5476b, 0xddeb34a061615d99, 0x5129cecceb64b773,
+ /* 224 */ 0xee43215894993520, 0x772f9c7cf14c0b3b, 0xd2e2fce306bedad5, 0x715f42b546f06a97,
+ /* 225 */ 0x434ecdceda5b5f1a, 0x0da17115a49741a9, 0x680bd77c73edad2e, 0x487c02354edd9041,
+ /* 226 */ 0xb8efeff3a70ed9c4, 0x56a32aa3e857e302, 0xdf3a68bd48a2a5a0, 0x07f650b73176c444,
+ /* 227 */ 0xe38b9b1626e0ccb1, 0x79e053c18b09fb36, 0x56d90319c9f94964, 0x1ca941e7ac9ff5c4,
+ /* 228 */ 0x49c4df29162fa0bb, 0x8488cf3282b33305, 0x95dfda14cabb437d, 0x3391f78264d5ad86,
+ /* 229 */ 0x729ae06ae2b5095d, 0xd58a58d73259a946, 0xe9834262d13921ed, 0x27fedafaa54bb592,
+ /* 230 */ 0xa99dc5b829ad48bb, 0x5f025742499ee260, 0x802c8ecd5d7513fd, 0x78ceb3ef3f6dd938,
+ /* 231 */ 0xc342f44f8a135d94, 0x7b9edb44828cdda3, 0x9436d11a0537cfe7, 0x5064b164ec1ab4c8,
+ /* 232 */ 0x7020eccfd37eb2fc, 0x1f31ea3ed90d25fc, 0x1b930d7bdfa1bb34, 0x5344467a48113044,
+ /* 233 */ 0x70073170f25e6dfb, 0xe385dc1a50114cc8, 0x2348698ac8fc4f00, 0x2a77a55284dd40d8,
+ /* 234 */ 0xfe06afe0c98c6ce4, 0xc235df96dddfd6e4, 0x1428d01e33bf1ed3, 0x785768ec9300bdaf,
+ /* 235 */ 0x9702e57a91deb63b, 0x61bdb8bfe5ce8b80, 0x645b426f3d1d58ac, 0x4804a82227a557bc,
+ /* 236 */ 0x8e57048ab44d2601, 0x68d6501a4b3a6935, 0xc39c9ec3f9e1c293, 0x4172f257d4de63e2,
+ /* 237 */ 0xd368b450330c6401, 0x040d3017418f2391, 0x2c34bb6090b7d90d, 0x16f649228fdfd51f,
+ /* 238 */ 0xbea6818e2b928ef5, 0xe28ccf91cdc11e72, 0x594aaa68e77a36cd, 0x313034806c7ffd0f,
+ /* 239 */ 0x8a9d27ac2249bd65, 0x19a3b464018e9512, 0xc26ccff352b37ec7, 0x056f68341d797b21,
+ /* 240 */ 0x5e79d6757efd2327, 0xfabdbcb6553afe15, 0xd3e7222c6eaf5a60, 0x7046c76d4dae743b,
+ /* 241 */ 0x660be872b18d4a55, 0x19992518574e1496, 0xc103053a302bdcbb, 0x3ed8e9800b218e8e,
+ /* 242 */ 0x7b0b9239fa75e03e, 0xefe9fb684633c083, 0x98a35fbe391a7793, 0x6065510fe2d0fe34,
+ /* 243 */ 0x55cb668548abad0c, 0xb4584548da87e527, 0x2c43ecea0107c1dd, 0x526028809372de35,
+ /* 244 */ 0x3415c56af9213b1f, 0x5bee1a4d017e98db, 0x13f6b105b5cf709b, 0x5ff20e3482b29ab6,
+ /* 245 */ 0x0aa29c75cc2e6c90, 0xfc7d73ca3a70e206, 0x899fc38fc4b5c515, 0x250386b124ffc207,
+ /* 246 */ 0x54ea28d5ae3d2b56, 0x9913149dd6de60ce, 0x16694fc58f06d6c1, 0x46b23975eb018fc7,
+ /* 247 */ 0x470a6a0fb4b7b4e2, 0x5d92475a8f7253de, 0xabeee5b52fbd3adb, 0x7fa20801a0806968,
+ /* 248 */ 0x76f3faf19f7714d2, 0xb3e840c12f4660c3, 0x0fb4cd8df212744e, 0x4b065a251d3a2dd2,
+ /* 249 */ 0x5cebde383d77cd4a, 0x6adf39df882c9cb1, 0xa2dd242eb09af759, 0x3147c0e50e5f6422,
+ /* 250 */ 0x164ca5101d1350db, 0xf8d13479c33fc962, 0xe640ce4d13e5da08, 0x4bdee0c45061f8ba,
+ /* 251 */ 0xd7c46dc1a4edb1c9, 0x5514d7b6437fd98a, 0x58942f6bb2a1c00b, 0x2dffb2ab1d70710e,
+ /* 252 */ 0xccdfcf2fc18b6d68, 0xa8ebcba8b7806167, 0x980697f95e2937e3, 0x02fbba1cd0126e8c
+};
+
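+/* The constants above are consumed four 64-bit limbs at a time, as
+ * &table_ladder_8k[4 * k], one precomputed field element per step of the
+ * fixed-base ladders in curve25519_adx_base() and curve25519_bmi2_base()
+ * below. */
+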
+/* c is two 512-bit products: c0[0:7]=a0[0:3]*b0[0:3] and c1[8:15]=a1[4:7]*b1[4:7]
+ * a is two 256-bit integers: a0[0:3] and a1[4:7]
+ * b is two 256-bit integers: b0[0:3] and b1[4:7]
+ */
+static void mul2_256x256_integer_adx(u64 *const c, u64 *const a, u64 *const b)
+{
+ asm volatile(
+ "xorl %%r14d, %%r14d ;"
+ "movq (%1), %%rdx; " /* A[0] */
+ "mulx (%2), %%r8, %%r12; " /* A[0]*B[0] */
+ "xorl %%r10d, %%r10d ;"
+ "movq %%r8, (%0) ;"
+ "mulx 8(%2), %%r10, %%rax; " /* A[0]*B[1] */
+ "adox %%r10, %%r12 ;"
+ "mulx 16(%2), %%r8, %%rbx; " /* A[0]*B[2] */
+ "adox %%r8, %%rax ;"
+ "mulx 24(%2), %%r10, %%rcx; " /* A[0]*B[3] */
+ "adox %%r10, %%rbx ;"
+ /******************************************/
+ "adox %%r14, %%rcx ;"
+
+ "movq 8(%1), %%rdx; " /* A[1] */
+ "mulx (%2), %%r8, %%r9; " /* A[1]*B[0] */
+ "adox %%r12, %%r8 ;"
+ "movq %%r8, 8(%0) ;"
+ "mulx 8(%2), %%r10, %%r11; " /* A[1]*B[1] */
+ "adox %%r10, %%r9 ;"
+ "adcx %%r9, %%rax ;"
+ "mulx 16(%2), %%r8, %%r13; " /* A[1]*B[2] */
+ "adox %%r8, %%r11 ;"
+ "adcx %%r11, %%rbx ;"
+ "mulx 24(%2), %%r10, %%r12; " /* A[1]*B[3] */
+ "adox %%r10, %%r13 ;"
+ "adcx %%r13, %%rcx ;"
+ /******************************************/
+ "adox %%r14, %%r12 ;"
+ "adcx %%r14, %%r12 ;"
+
+ "movq 16(%1), %%rdx; " /* A[2] */
+ "xorl %%r10d, %%r10d ;"
+ "mulx (%2), %%r8, %%r9; " /* A[2]*B[0] */
+ "adox %%rax, %%r8 ;"
+ "movq %%r8, 16(%0) ;"
+ "mulx 8(%2), %%r10, %%r11; " /* A[2]*B[1] */
+ "adox %%r10, %%r9 ;"
+ "adcx %%r9, %%rbx ;"
+ "mulx 16(%2), %%r8, %%r13; " /* A[2]*B[2] */
+ "adox %%r8, %%r11 ;"
+ "adcx %%r11, %%rcx ;"
+ "mulx 24(%2), %%r10, %%rax; " /* A[2]*B[3] */
+ "adox %%r10, %%r13 ;"
+ "adcx %%r13, %%r12 ;"
+ /******************************************/
+ "adox %%r14, %%rax ;"
+ "adcx %%r14, %%rax ;"
+
+ "movq 24(%1), %%rdx; " /* A[3] */
+ "xorl %%r10d, %%r10d ;"
+ "mulx (%2), %%r8, %%r9; " /* A[3]*B[0] */
+ "adox %%rbx, %%r8 ;"
+ "movq %%r8, 24(%0) ;"
+ "mulx 8(%2), %%r10, %%r11; " /* A[3]*B[1] */
+ "adox %%r10, %%r9 ;"
+ "adcx %%r9, %%rcx ;"
+ "movq %%rcx, 32(%0) ;"
+ "mulx 16(%2), %%r8, %%r13; " /* A[3]*B[2] */
+ "adox %%r8, %%r11 ;"
+ "adcx %%r11, %%r12 ;"
+ "movq %%r12, 40(%0) ;"
+ "mulx 24(%2), %%r10, %%rbx; " /* A[3]*B[3] */
+ "adox %%r10, %%r13 ;"
+ "adcx %%r13, %%rax ;"
+ "movq %%rax, 48(%0) ;"
+ /******************************************/
+ "adox %%r14, %%rbx ;"
+ "adcx %%r14, %%rbx ;"
+ "movq %%rbx, 56(%0) ;"
+
+ "movq 32(%1), %%rdx; " /* C[0] */
+ "mulx 32(%2), %%r8, %%r12; " /* C[0]*D[0] */
+ "xorl %%r10d, %%r10d ;"
+ "movq %%r8, 64(%0);"
+ "mulx 40(%2), %%r10, %%rax; " /* C[0]*D[1] */
+ "adox %%r10, %%r12 ;"
+ "mulx 48(%2), %%r8, %%rbx; " /* C[0]*D[2] */
+ "adox %%r8, %%rax ;"
+ "mulx 56(%2), %%r10, %%rcx; " /* C[0]*D[3] */
+ "adox %%r10, %%rbx ;"
+ /******************************************/
+ "adox %%r14, %%rcx ;"
+
+ "movq 40(%1), %%rdx; " /* C[1] */
+ "xorl %%r10d, %%r10d ;"
+ "mulx 32(%2), %%r8, %%r9; " /* C[1]*D[0] */
+ "adox %%r12, %%r8 ;"
+ "movq %%r8, 72(%0);"
+ "mulx 40(%2), %%r10, %%r11; " /* C[1]*D[1] */
+ "adox %%r10, %%r9 ;"
+ "adcx %%r9, %%rax ;"
+ "mulx 48(%2), %%r8, %%r13; " /* C[1]*D[2] */
+ "adox %%r8, %%r11 ;"
+ "adcx %%r11, %%rbx ;"
+ "mulx 56(%2), %%r10, %%r12; " /* C[1]*D[3] */
+ "adox %%r10, %%r13 ;"
+ "adcx %%r13, %%rcx ;"
+ /******************************************/
+ "adox %%r14, %%r12 ;"
+ "adcx %%r14, %%r12 ;"
+
+ "movq 48(%1), %%rdx; " /* C[2] */
+ "xorl %%r10d, %%r10d ;"
+ "mulx 32(%2), %%r8, %%r9; " /* C[2]*D[0] */
+ "adox %%rax, %%r8 ;"
+ "movq %%r8, 80(%0);"
+ "mulx 40(%2), %%r10, %%r11; " /* C[2]*D[1] */
+ "adox %%r10, %%r9 ;"
+ "adcx %%r9, %%rbx ;"
+ "mulx 48(%2), %%r8, %%r13; " /* C[2]*D[2] */
+ "adox %%r8, %%r11 ;"
+ "adcx %%r11, %%rcx ;"
+ "mulx 56(%2), %%r10, %%rax; " /* C[2]*D[3] */
+ "adox %%r10, %%r13 ;"
+ "adcx %%r13, %%r12 ;"
+ /******************************************/
+ "adox %%r14, %%rax ;"
+ "adcx %%r14, %%rax ;"
+
+ "movq 56(%1), %%rdx; " /* C[3] */
+ "xorl %%r10d, %%r10d ;"
+ "mulx 32(%2), %%r8, %%r9; " /* C[3]*D[0] */
+ "adox %%rbx, %%r8 ;"
+ "movq %%r8, 88(%0);"
+ "mulx 40(%2), %%r10, %%r11; " /* C[3]*D[1] */
+ "adox %%r10, %%r9 ;"
+ "adcx %%r9, %%rcx ;"
+ "movq %%rcx, 96(%0) ;"
+ "mulx 48(%2), %%r8, %%r13; " /* C[3]*D[2] */
+ "adox %%r8, %%r11 ;"
+ "adcx %%r11, %%r12 ;"
+ "movq %%r12, 104(%0) ;"
+ "mulx 56(%2), %%r10, %%rbx; " /* C[3]*D[3] */
+ "adox %%r10, %%r13 ;"
+ "adcx %%r13, %%rax ;"
+ "movq %%rax, 112(%0) ;"
+ /******************************************/
+ "adox %%r14, %%rbx ;"
+ "adcx %%r14, %%rbx ;"
+ "movq %%rbx, 120(%0) ;"
+ :
+ : "r"(c), "r"(a), "r"(b)
+ : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14");
+}
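+
+/* Illustrative only, not part of the patch: a portable C sketch of the
+ * 256x256-bit schoolbook multiply that both mul2 routines perform on each
+ * 256-bit half; the asm differs only in how the two carry chains are kept
+ * (adcx/adox versus plain add/adc). The name ref_mul_256x256 and the use
+ * of unsigned __int128 are assumptions local to this sketch.
+ */
+static inline void ref_mul_256x256(u64 *c, const u64 *a, const u64 *b)
+{
+ unsigned __int128 t;
+ u64 carry;
+ int i, j;
+
+ for (i = 0; i < 8; ++i)
+ c[i] = 0;
+ for (i = 0; i < 4; ++i) {
+ carry = 0;
+ for (j = 0; j < 4; ++j) {
+ /* c[i+j] += a[i]*b[j] + carry; keep the high half as carry */
+ t = (unsigned __int128)a[i] * b[j] + c[i + j] + carry;
+ c[i + j] = (u64)t;
+ carry = (u64)(t >> 64);
+ }
+ c[i + 4] = carry; /* this limb is still untouched for row i */
+ }
+}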
+
+static void mul2_256x256_integer_bmi2(u64 *const c, u64 *const a, u64 *const b)
+{
+ asm volatile(
+ "movq (%1), %%rdx; " /* A[0] */
+ "mulx (%2), %%r8, %%r12; " /* A[0]*B[0] */
+ "movq %%r8, (%0) ;"
+ "mulx 8(%2), %%r10, %%rax; " /* A[0]*B[1] */
+ "addq %%r10, %%r12 ;"
+ "mulx 16(%2), %%r8, %%rbx; " /* A[0]*B[2] */
+ "adcq %%r8, %%rax ;"
+ "mulx 24(%2), %%r10, %%rcx; " /* A[0]*B[3] */
+ "adcq %%r10, %%rbx ;"
+ /******************************************/
+ "adcq $0, %%rcx ;"
+
+ "movq 8(%1), %%rdx; " /* A[1] */
+ "mulx (%2), %%r8, %%r9; " /* A[1]*B[0] */
+ "addq %%r12, %%r8 ;"
+ "movq %%r8, 8(%0) ;"
+ "mulx 8(%2), %%r10, %%r11; " /* A[1]*B[1] */
+ "adcq %%r10, %%r9 ;"
+ "mulx 16(%2), %%r8, %%r13; " /* A[1]*B[2] */
+ "adcq %%r8, %%r11 ;"
+ "mulx 24(%2), %%r10, %%r12; " /* A[1]*B[3] */
+ "adcq %%r10, %%r13 ;"
+ /******************************************/
+ "adcq $0, %%r12 ;"
+
+ "addq %%r9, %%rax ;"
+ "adcq %%r11, %%rbx ;"
+ "adcq %%r13, %%rcx ;"
+ "adcq $0, %%r12 ;"
+
+ "movq 16(%1), %%rdx; " /* A[2] */
+ "mulx (%2), %%r8, %%r9; " /* A[2]*B[0] */
+ "addq %%rax, %%r8 ;"
+ "movq %%r8, 16(%0) ;"
+ "mulx 8(%2), %%r10, %%r11; " /* A[2]*B[1] */
+ "adcq %%r10, %%r9 ;"
+ "mulx 16(%2), %%r8, %%r13; " /* A[2]*B[2] */
+ "adcq %%r8, %%r11 ;"
+ "mulx 24(%2), %%r10, %%rax; " /* A[2]*B[3] */
+ "adcq %%r10, %%r13 ;"
+ /******************************************/
+ "adcq $0, %%rax ;"
+
+ "addq %%r9, %%rbx ;"
+ "adcq %%r11, %%rcx ;"
+ "adcq %%r13, %%r12 ;"
+ "adcq $0, %%rax ;"
+
+ "movq 24(%1), %%rdx; " /* A[3] */
+ "mulx (%2), %%r8, %%r9; " /* A[3]*B[0] */
+ "addq %%rbx, %%r8 ;"
+ "movq %%r8, 24(%0) ;"
+ "mulx 8(%2), %%r10, %%r11; " /* A[3]*B[1] */
+ "adcq %%r10, %%r9 ;"
+ "mulx 16(%2), %%r8, %%r13; " /* A[3]*B[2] */
+ "adcq %%r8, %%r11 ;"
+ "mulx 24(%2), %%r10, %%rbx; " /* A[3]*B[3] */
+ "adcq %%r10, %%r13 ;"
+ /******************************************/
+ "adcq $0, %%rbx ;"
+
+ "addq %%r9, %%rcx ;"
+ "movq %%rcx, 32(%0) ;"
+ "adcq %%r11, %%r12 ;"
+ "movq %%r12, 40(%0) ;"
+ "adcq %%r13, %%rax ;"
+ "movq %%rax, 48(%0) ;"
+ "adcq $0, %%rbx ;"
+ "movq %%rbx, 56(%0) ;"
+
+ "movq 32(%1), %%rdx; " /* C[0] */
+ "mulx 32(%2), %%r8, %%r12; " /* C[0]*D[0] */
+ "movq %%r8, 64(%0) ;"
+ "mulx 40(%2), %%r10, %%rax; " /* C[0]*D[1] */
+ "addq %%r10, %%r12 ;"
+ "mulx 48(%2), %%r8, %%rbx; " /* C[0]*D[2] */
+ "adcq %%r8, %%rax ;"
+ "mulx 56(%2), %%r10, %%rcx; " /* C[0]*D[3] */
+ "adcq %%r10, %%rbx ;"
+ /******************************************/
+ "adcq $0, %%rcx ;"
+
+ "movq 40(%1), %%rdx; " /* C[1] */
+ "mulx 32(%2), %%r8, %%r9; " /* C[1]*D[0] */
+ "addq %%r12, %%r8 ;"
+ "movq %%r8, 72(%0) ;"
+ "mulx 40(%2), %%r10, %%r11; " /* C[1]*D[1] */
+ "adcq %%r10, %%r9 ;"
+ "mulx 48(%2), %%r8, %%r13; " /* C[1]*D[2] */
+ "adcq %%r8, %%r11 ;"
+ "mulx 56(%2), %%r10, %%r12; " /* C[1]*D[3] */
+ "adcq %%r10, %%r13 ;"
+ /******************************************/
+ "adcq $0, %%r12 ;"
+
+ "addq %%r9, %%rax ;"
+ "adcq %%r11, %%rbx ;"
+ "adcq %%r13, %%rcx ;"
+ "adcq $0, %%r12 ;"
+
+ "movq 48(%1), %%rdx; " /* C[2] */
+ "mulx 32(%2), %%r8, %%r9; " /* C[2]*D[0] */
+ "addq %%rax, %%r8 ;"
+ "movq %%r8, 80(%0) ;"
+ "mulx 40(%2), %%r10, %%r11; " /* C[2]*D[1] */
+ "adcq %%r10, %%r9 ;"
+ "mulx 48(%2), %%r8, %%r13; " /* C[2]*D[2] */
+ "adcq %%r8, %%r11 ;"
+ "mulx 56(%2), %%r10, %%rax; " /* C[2]*D[3] */
+ "adcq %%r10, %%r13 ;"
+ /******************************************/
+ "adcq $0, %%rax ;"
+
+ "addq %%r9, %%rbx ;"
+ "adcq %%r11, %%rcx ;"
+ "adcq %%r13, %%r12 ;"
+ "adcq $0, %%rax ;"
+
+ "movq 56(%1), %%rdx; " /* C[3] */
+ "mulx 32(%2), %%r8, %%r9; " /* C[3]*D[0] */
+ "addq %%rbx, %%r8 ;"
+ "movq %%r8, 88(%0) ;"
+ "mulx 40(%2), %%r10, %%r11; " /* C[3]*D[1] */
+ "adcq %%r10, %%r9 ;"
+ "mulx 48(%2), %%r8, %%r13; " /* C[3]*D[2] */
+ "adcq %%r8, %%r11 ;"
+ "mulx 56(%2), %%r10, %%rbx; " /* C[3]*D[3] */
+ "adcq %%r10, %%r13 ;"
+ /******************************************/
+ "adcq $0, %%rbx ;"
+
+ "addq %%r9, %%rcx ;"
+ "movq %%rcx, 96(%0) ;"
+ "adcq %%r11, %%r12 ;"
+ "movq %%r12, 104(%0) ;"
+ "adcq %%r13, %%rax ;"
+ "movq %%rax, 112(%0) ;"
+ "adcq $0, %%rbx ;"
+ "movq %%rbx, 120(%0) ;"
+ :
+ : "r"(c), "r"(a), "r"(b)
+ : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13");
+}
+
+static void sqr2_256x256_integer_adx(u64 *const c, u64 *const a)
+{
+ asm volatile(
+ "movq (%1), %%rdx ;" /* A[0] */
+ "mulx 8(%1), %%r8, %%r14 ;" /* A[1]*A[0] */
+ "xorl %%r15d, %%r15d;"
+ "mulx 16(%1), %%r9, %%r10 ;" /* A[2]*A[0] */
+ "adcx %%r14, %%r9 ;"
+ "mulx 24(%1), %%rax, %%rcx ;" /* A[3]*A[0] */
+ "adcx %%rax, %%r10 ;"
+ "movq 24(%1), %%rdx ;" /* A[3] */
+ "mulx 8(%1), %%r11, %%r12 ;" /* A[1]*A[3] */
+ "adcx %%rcx, %%r11 ;"
+ "mulx 16(%1), %%rax, %%r13 ;" /* A[2]*A[3] */
+ "adcx %%rax, %%r12 ;"
+ "movq 8(%1), %%rdx ;" /* A[1] */
+ "adcx %%r15, %%r13 ;"
+ "mulx 16(%1), %%rax, %%rcx ;" /* A[2]*A[1] */
+ "movq $0, %%r14 ;"
+ /******************************************/
+ "adcx %%r15, %%r14 ;"
+
+ "xorl %%r15d, %%r15d;"
+ "adox %%rax, %%r10 ;"
+ "adcx %%r8, %%r8 ;"
+ "adox %%rcx, %%r11 ;"
+ "adcx %%r9, %%r9 ;"
+ "adox %%r15, %%r12 ;"
+ "adcx %%r10, %%r10 ;"
+ "adox %%r15, %%r13 ;"
+ "adcx %%r11, %%r11 ;"
+ "adox %%r15, %%r14 ;"
+ "adcx %%r12, %%r12 ;"
+ "adcx %%r13, %%r13 ;"
+ "adcx %%r14, %%r14 ;"
+
+ "movq (%1), %%rdx ;"
+ "mulx %%rdx, %%rax, %%rcx ;" /* A[0]^2 */
+ /*******************/
+ "movq %%rax, 0(%0) ;"
+ "addq %%rcx, %%r8 ;"
+ "movq %%r8, 8(%0) ;"
+ "movq 8(%1), %%rdx ;"
+ "mulx %%rdx, %%rax, %%rcx ;" /* A[1]^2 */
+ "adcq %%rax, %%r9 ;"
+ "movq %%r9, 16(%0) ;"
+ "adcq %%rcx, %%r10 ;"
+ "movq %%r10, 24(%0) ;"
+ "movq 16(%1), %%rdx ;"
+ "mulx %%rdx, %%rax, %%rcx ;" /* A[2]^2 */
+ "adcq %%rax, %%r11 ;"
+ "movq %%r11, 32(%0) ;"
+ "adcq %%rcx, %%r12 ;"
+ "movq %%r12, 40(%0) ;"
+ "movq 24(%1), %%rdx ;"
+ "mulx %%rdx, %%rax, %%rcx ;" /* A[3]^2 */
+ "adcq %%rax, %%r13 ;"
+ "movq %%r13, 48(%0) ;"
+ "adcq %%rcx, %%r14 ;"
+ "movq %%r14, 56(%0) ;"
+
+
+ "movq 32(%1), %%rdx ;" /* B[0] */
+ "mulx 40(%1), %%r8, %%r14 ;" /* B[1]*B[0] */
+ "xorl %%r15d, %%r15d;"
+ "mulx 48(%1), %%r9, %%r10 ;" /* B[2]*B[0] */
+ "adcx %%r14, %%r9 ;"
+ "mulx 56(%1), %%rax, %%rcx ;" /* B[3]*B[0] */
+ "adcx %%rax, %%r10 ;"
+ "movq 56(%1), %%rdx ;" /* B[3] */
+ "mulx 40(%1), %%r11, %%r12 ;" /* B[1]*B[3] */
+ "adcx %%rcx, %%r11 ;"
+ "mulx 48(%1), %%rax, %%r13 ;" /* B[2]*B[3] */
+ "adcx %%rax, %%r12 ;"
+ "movq 40(%1), %%rdx ;" /* B[1] */
+ "adcx %%r15, %%r13 ;"
+ "mulx 48(%1), %%rax, %%rcx ;" /* B[2]*B[1] */
+ "movq $0, %%r14 ;"
+ /******************************************/
+ "adcx %%r15, %%r14 ;"
+
+ "xorl %%r15d, %%r15d;"
+ "adox %%rax, %%r10 ;"
+ "adcx %%r8, %%r8 ;"
+ "adox %%rcx, %%r11 ;"
+ "adcx %%r9, %%r9 ;"
+ "adox %%r15, %%r12 ;"
+ "adcx %%r10, %%r10 ;"
+ "adox %%r15, %%r13 ;"
+ "adcx %%r11, %%r11 ;"
+ "adox %%r15, %%r14 ;"
+ "adcx %%r12, %%r12 ;"
+ "adcx %%r13, %%r13 ;"
+ "adcx %%r14, %%r14 ;"
+
+ "movq 32(%1), %%rdx ;"
+ "mulx %%rdx, %%rax, %%rcx ;" /* B[0]^2 */
+ /*******************/
+ "movq %%rax, 64(%0) ;"
+ "addq %%rcx, %%r8 ;"
+ "movq %%r8, 72(%0) ;"
+ "movq 40(%1), %%rdx ;"
+ "mulx %%rdx, %%rax, %%rcx ;" /* B[1]^2 */
+ "adcq %%rax, %%r9 ;"
+ "movq %%r9, 80(%0) ;"
+ "adcq %%rcx, %%r10 ;"
+ "movq %%r10, 88(%0) ;"
+ "movq 48(%1), %%rdx ;"
+ "mulx %%rdx, %%rax, %%rcx ;" /* B[2]^2 */
+ "adcq %%rax, %%r11 ;"
+ "movq %%r11, 96(%0) ;"
+ "adcq %%rcx, %%r12 ;"
+ "movq %%r12, 104(%0) ;"
+ "movq 56(%1), %%rdx ;"
+ "mulx %%rdx, %%rax, %%rcx ;" /* B[3]^2 */
+ "adcq %%rax, %%r13 ;"
+ "movq %%r13, 112(%0) ;"
+ "adcq %%rcx, %%r14 ;"
+ "movq %%r14, 120(%0) ;"
+ :
+ : "r"(c), "r"(a)
+ : "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15");
+}
+
+static void sqr2_256x256_integer_bmi2(u64 *const c, u64 *const a)
+{
+ asm volatile(
+ "movq 8(%1), %%rdx ;" /* A[1] */
+ "mulx (%1), %%r8, %%r9 ;" /* A[0]*A[1] */
+ "mulx 16(%1), %%r10, %%r11 ;" /* A[2]*A[1] */
+ "mulx 24(%1), %%rcx, %%r14 ;" /* A[3]*A[1] */
+
+ "movq 16(%1), %%rdx ;" /* A[2] */
+ "mulx 24(%1), %%r12, %%r13 ;" /* A[3]*A[2] */
+ "mulx (%1), %%rax, %%rdx ;" /* A[0]*A[2] */
+
+ "addq %%rax, %%r9 ;"
+ "adcq %%rdx, %%r10 ;"
+ "adcq %%rcx, %%r11 ;"
+ "adcq %%r14, %%r12 ;"
+ "adcq $0, %%r13 ;"
+ "movq $0, %%r14 ;"
+ "adcq $0, %%r14 ;"
+
+ "movq (%1), %%rdx ;" /* A[0] */
+ "mulx 24(%1), %%rax, %%rcx ;" /* A[0]*A[3] */
+
+ "addq %%rax, %%r10 ;"
+ "adcq %%rcx, %%r11 ;"
+ "adcq $0, %%r12 ;"
+ "adcq $0, %%r13 ;"
+ "adcq $0, %%r14 ;"
+
+ "shldq $1, %%r13, %%r14 ;"
+ "shldq $1, %%r12, %%r13 ;"
+ "shldq $1, %%r11, %%r12 ;"
+ "shldq $1, %%r10, %%r11 ;"
+ "shldq $1, %%r9, %%r10 ;"
+ "shldq $1, %%r8, %%r9 ;"
+ "shlq $1, %%r8 ;"
+
+ /*******************/
+ "mulx %%rdx, %%rax, %%rcx ; " /* A[0]^2 */
+ /*******************/
+ "movq %%rax, 0(%0) ;"
+ "addq %%rcx, %%r8 ;"
+ "movq %%r8, 8(%0) ;"
+ "movq 8(%1), %%rdx ;"
+ "mulx %%rdx, %%rax, %%rcx ; " /* A[1]^2 */
+ "adcq %%rax, %%r9 ;"
+ "movq %%r9, 16(%0) ;"
+ "adcq %%rcx, %%r10 ;"
+ "movq %%r10, 24(%0) ;"
+ "movq 16(%1), %%rdx ;"
+ "mulx %%rdx, %%rax, %%rcx ; " /* A[2]^2 */
+ "adcq %%rax, %%r11 ;"
+ "movq %%r11, 32(%0) ;"
+ "adcq %%rcx, %%r12 ;"
+ "movq %%r12, 40(%0) ;"
+ "movq 24(%1), %%rdx ;"
+ "mulx %%rdx, %%rax, %%rcx ; " /* A[3]^2 */
+ "adcq %%rax, %%r13 ;"
+ "movq %%r13, 48(%0) ;"
+ "adcq %%rcx, %%r14 ;"
+ "movq %%r14, 56(%0) ;"
+
+ "movq 40(%1), %%rdx ;" /* B[1] */
+ "mulx 32(%1), %%r8, %%r9 ;" /* B[0]*B[1] */
+ "mulx 48(%1), %%r10, %%r11 ;" /* B[2]*B[1] */
+ "mulx 56(%1), %%rcx, %%r14 ;" /* B[3]*B[1] */
+
+ "movq 48(%1), %%rdx ;" /* B[2] */
+ "mulx 56(%1), %%r12, %%r13 ;" /* B[3]*B[2] */
+ "mulx 32(%1), %%rax, %%rdx ;" /* B[0]*B[2] */
+
+ "addq %%rax, %%r9 ;"
+ "adcq %%rdx, %%r10 ;"
+ "adcq %%rcx, %%r11 ;"
+ "adcq %%r14, %%r12 ;"
+ "adcq $0, %%r13 ;"
+ "movq $0, %%r14 ;"
+ "adcq $0, %%r14 ;"
+
+ "movq 32(%1), %%rdx ;" /* B[0] */
+ "mulx 56(%1), %%rax, %%rcx ;" /* B[0]*B[3] */
+
+ "addq %%rax, %%r10 ;"
+ "adcq %%rcx, %%r11 ;"
+ "adcq $0, %%r12 ;"
+ "adcq $0, %%r13 ;"
+ "adcq $0, %%r14 ;"
+
+ "shldq $1, %%r13, %%r14 ;"
+ "shldq $1, %%r12, %%r13 ;"
+ "shldq $1, %%r11, %%r12 ;"
+ "shldq $1, %%r10, %%r11 ;"
+ "shldq $1, %%r9, %%r10 ;"
+ "shldq $1, %%r8, %%r9 ;"
+ "shlq $1, %%r8 ;"
+
+ /*******************/
+ "mulx %%rdx, %%rax, %%rcx ; " /* B[0]^2 */
+ /*******************/
+ "movq %%rax, 64(%0) ;"
+ "addq %%rcx, %%r8 ;"
+ "movq %%r8, 72(%0) ;"
+ "movq 40(%1), %%rdx ;"
+ "mulx %%rdx, %%rax, %%rcx ; " /* B[1]^2 */
+ "adcq %%rax, %%r9 ;"
+ "movq %%r9, 80(%0) ;"
+ "adcq %%rcx, %%r10 ;"
+ "movq %%r10, 88(%0) ;"
+ "movq 48(%1), %%rdx ;"
+ "mulx %%rdx, %%rax, %%rcx ; " /* B[2]^2 */
+ "adcq %%rax, %%r11 ;"
+ "movq %%r11, 96(%0) ;"
+ "adcq %%rcx, %%r12 ;"
+ "movq %%r12, 104(%0) ;"
+ "movq 56(%1), %%rdx ;"
+ "mulx %%rdx, %%rax, %%rcx ; " /* B[3]^2 */
+ "adcq %%rax, %%r13 ;"
+ "movq %%r13, 112(%0) ;"
+ "adcq %%rcx, %%r14 ;"
+ "movq %%r14, 120(%0) ;"
+ :
+ : "r"(c), "r"(a)
+ : "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14");
+}
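+
+/* Illustrative only, not part of the patch: what the squaring routines
+ * above compute. A 256-bit square needs only the six cross products
+ * a[i]*a[j] with i < j, doubled, plus the four squares a[i]^2; the adx
+ * variant doubles with adcx chains, the bmi2 variant with shld/shl.
+ * ref_sqr_256 and unsigned __int128 are assumptions of this sketch.
+ */
+static inline void ref_sqr_256(u64 *c, const u64 *a)
+{
+ unsigned __int128 t, sq;
+ u64 carry;
+ int i, j;
+
+ for (i = 0; i < 8; ++i)
+ c[i] = 0;
+ /* cross products a[i]*a[j], i < j */
+ for (i = 0; i < 4; ++i) {
+ carry = 0;
+ for (j = i + 1; j < 4; ++j) {
+ t = (unsigned __int128)a[i] * a[j] + c[i + j] + carry;
+ c[i + j] = (u64)t;
+ carry = (u64)(t >> 64);
+ }
+ c[i + 4] = carry;
+ }
+ /* double the cross products across the 512-bit value */
+ for (i = 7; i > 0; --i)
+ c[i] = (c[i] << 1) | (c[i - 1] >> 63);
+ c[0] <<= 1; /* c[0] is zero here; kept for symmetry */
+ /* add the diagonal squares a[i]^2 */
+ carry = 0;
+ for (i = 0; i < 4; ++i) {
+ sq = (unsigned __int128)a[i] * a[i];
+ t = (unsigned __int128)c[2 * i] + (u64)sq + carry;
+ c[2 * i] = (u64)t;
+ t = (unsigned __int128)c[2 * i + 1] + (u64)(sq >> 64) + (u64)(t >> 64);
+ c[2 * i + 1] = (u64)t;
+ carry = (u64)(t >> 64);
+ }
+}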
+
+static void red_eltfp25519_2w_adx(u64 *const c, u64 *const a)
+{
+ asm volatile(
+ "movl $38, %%edx; " /* 2*c = 38 = 2^256 */
+ "mulx 32(%1), %%r8, %%r10; " /* c*C[4] */
+ "xorl %%ebx, %%ebx ;"
+ "adox (%1), %%r8 ;"
+ "mulx 40(%1), %%r9, %%r11; " /* c*C[5] */
+ "adcx %%r10, %%r9 ;"
+ "adox 8(%1), %%r9 ;"
+ "mulx 48(%1), %%r10, %%rax; " /* c*C[6] */
+ "adcx %%r11, %%r10 ;"
+ "adox 16(%1), %%r10 ;"
+ "mulx 56(%1), %%r11, %%rcx; " /* c*C[7] */
+ "adcx %%rax, %%r11 ;"
+ "adox 24(%1), %%r11 ;"
+ /***************************************/
+ "adcx %%rbx, %%rcx ;"
+ "adox %%rbx, %%rcx ;"
+ "clc ;"
+ "mulx %%rcx, %%rax, %%rcx ; " /* c*C[4] */
+ "adcx %%rax, %%r8 ;"
+ "adcx %%rcx, %%r9 ;"
+ "movq %%r9, 8(%0) ;"
+ "adcx %%rbx, %%r10 ;"
+ "movq %%r10, 16(%0) ;"
+ "adcx %%rbx, %%r11 ;"
+ "movq %%r11, 24(%0) ;"
+ "mov $0, %%ecx ;"
+ "cmovc %%edx, %%ecx ;"
+ "addq %%rcx, %%r8 ;"
+ "movq %%r8, (%0) ;"
+
+ "mulx 96(%1), %%r8, %%r10; " /* c*C[4] */
+ "xorl %%ebx, %%ebx ;"
+ "adox 64(%1), %%r8 ;"
+ "mulx 104(%1), %%r9, %%r11; " /* c*C[5] */
+ "adcx %%r10, %%r9 ;"
+ "adox 72(%1), %%r9 ;"
+ "mulx 112(%1), %%r10, %%rax; " /* c*C[6] */
+ "adcx %%r11, %%r10 ;"
+ "adox 80(%1), %%r10 ;"
+ "mulx 120(%1), %%r11, %%rcx; " /* c*C[7] */
+ "adcx %%rax, %%r11 ;"
+ "adox 88(%1), %%r11 ;"
+ /****************************************/
+ "adcx %%rbx, %%rcx ;"
+ "adox %%rbx, %%rcx ;"
+ "clc ;"
+ "mulx %%rcx, %%rax, %%rcx ; " /* c*C[4] */
+ "adcx %%rax, %%r8 ;"
+ "adcx %%rcx, %%r9 ;"
+ "movq %%r9, 40(%0) ;"
+ "adcx %%rbx, %%r10 ;"
+ "movq %%r10, 48(%0) ;"
+ "adcx %%rbx, %%r11 ;"
+ "movq %%r11, 56(%0) ;"
+ "mov $0, %%ecx ;"
+ "cmovc %%edx, %%ecx ;"
+ "addq %%rcx, %%r8 ;"
+ "movq %%r8, 32(%0) ;"
+ :
+ : "r"(c), "r"(a)
+ : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11");
+}
+
+static void red_eltfp25519_2w_bmi2(u64 *const c, u64 *const a)
+{
+ asm volatile(
+ "movl $38, %%edx ; " /* 2*c = 38 = 2^256 */
+ "mulx 32(%1), %%r8, %%r10 ;" /* c*C[4] */
+ "mulx 40(%1), %%r9, %%r11 ;" /* c*C[5] */
+ "addq %%r10, %%r9 ;"
+ "mulx 48(%1), %%r10, %%rax ;" /* c*C[6] */
+ "adcq %%r11, %%r10 ;"
+ "mulx 56(%1), %%r11, %%rcx ;" /* c*C[7] */
+ "adcq %%rax, %%r11 ;"
+ /***************************************/
+ "adcq $0, %%rcx ;"
+ "addq (%1), %%r8 ;"
+ "adcq 8(%1), %%r9 ;"
+ "adcq 16(%1), %%r10 ;"
+ "adcq 24(%1), %%r11 ;"
+ "adcq $0, %%rcx ;"
+ "mulx %%rcx, %%rax, %%rcx ;" /* c*C[4] */
+ "addq %%rax, %%r8 ;"
+ "adcq %%rcx, %%r9 ;"
+ "movq %%r9, 8(%0) ;"
+ "adcq $0, %%r10 ;"
+ "movq %%r10, 16(%0) ;"
+ "adcq $0, %%r11 ;"
+ "movq %%r11, 24(%0) ;"
+ "mov $0, %%ecx ;"
+ "cmovc %%edx, %%ecx ;"
+ "addq %%rcx, %%r8 ;"
+ "movq %%r8, (%0) ;"
+
+ "mulx 96(%1), %%r8, %%r10 ;" /* c*C[4] */
+ "mulx 104(%1), %%r9, %%r11 ;" /* c*C[5] */
+ "addq %%r10, %%r9 ;"
+ "mulx 112(%1), %%r10, %%rax ;" /* c*C[6] */
+ "adcq %%r11, %%r10 ;"
+ "mulx 120(%1), %%r11, %%rcx ;" /* c*C[7] */
+ "adcq %%rax, %%r11 ;"
+ /****************************************/
+ "adcq $0, %%rcx ;"
+ "addq 64(%1), %%r8 ;"
+ "adcq 72(%1), %%r9 ;"
+ "adcq 80(%1), %%r10 ;"
+ "adcq 88(%1), %%r11 ;"
+ "adcq $0, %%rcx ;"
+ "mulx %%rcx, %%rax, %%rcx ;" /* c*C[4] */
+ "addq %%rax, %%r8 ;"
+ "adcq %%rcx, %%r9 ;"
+ "movq %%r9, 40(%0) ;"
+ "adcq $0, %%r10 ;"
+ "movq %%r10, 48(%0) ;"
+ "adcq $0, %%r11 ;"
+ "movq %%r11, 56(%0) ;"
+ "mov $0, %%ecx ;"
+ "cmovc %%edx, %%ecx ;"
+ "addq %%rcx, %%r8 ;"
+ "movq %%r8, 32(%0) ;"
+ :
+ : "r"(c), "r"(a)
+ : "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11");
+}
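+
+/* Illustrative only, not part of the patch: both reduction routines fold
+ * the high 256 bits of a 512-bit product back in with weight 38, since
+ * 2^256 = 38 (mod 2^255-19). A C sketch of one such fold, with
+ * ref_red_25519 and unsigned __int128 as local assumptions; unlike the
+ * branch-free asm, this rendering branches:
+ */
+static inline void ref_red_25519(u64 *c, const u64 *a)
+{
+ unsigned __int128 t;
+ u64 carry = 0;
+ int i;
+
+ /* c = a[0..3] + 38*a[4..7] */
+ for (i = 0; i < 4; ++i) {
+ t = (unsigned __int128)38 * a[4 + i] + a[i] + carry;
+ c[i] = (u64)t;
+ carry = (u64)(t >> 64);
+ }
+ /* the carry out is at most 38; fold it once more at weight 38 */
+ t = (unsigned __int128)38 * carry + c[0];
+ c[0] = (u64)t;
+ carry = (u64)(t >> 64);
+ for (i = 1; carry && i < 4; ++i) {
+ t = (unsigned __int128)c[i] + carry;
+ c[i] = (u64)t;
+ carry = (u64)(t >> 64);
+ }
+ /* one last wrap past 2^256 is worth +38, as in the asm's cmovc */
+ if (carry)
+ c[0] += 38;
+}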
+
+static void mul_256x256_integer_adx(u64 *const c, u64 *const a, u64 *const b)
+{
+ asm volatile(
+ "movq (%1), %%rdx; " /* A[0] */
+ "mulx (%2), %%r8, %%r9; " /* A[0]*B[0] */
+ "xorl %%r10d, %%r10d ;"
+ "movq %%r8, (%0) ;"
+ "mulx 8(%2), %%r10, %%r11; " /* A[0]*B[1] */
+ "adox %%r9, %%r10 ;"
+ "movq %%r10, 8(%0) ;"
+ "mulx 16(%2), %%r12, %%r13; " /* A[0]*B[2] */
+ "adox %%r11, %%r12 ;"
+ "mulx 24(%2), %%r14, %%rdx; " /* A[0]*B[3] */
+ "adox %%r13, %%r14 ;"
+ "movq $0, %%rax ;"
+ /******************************************/
+ "adox %%rdx, %%rax ;"
+
+ "movq 8(%1), %%rdx; " /* A[1] */
+ "mulx (%2), %%r8, %%r9; " /* A[1]*B[0] */
+ "xorl %%r10d, %%r10d ;"
+ "adcx 8(%0), %%r8 ;"
+ "movq %%r8, 8(%0) ;"
+ "mulx 8(%2), %%r10, %%r11; " /* A[1]*B[1] */
+ "adox %%r9, %%r10 ;"
+ "adcx %%r12, %%r10 ;"
+ "movq %%r10, 16(%0) ;"
+ "mulx 16(%2), %%r12, %%r13; " /* A[1]*B[2] */
+ "adox %%r11, %%r12 ;"
+ "adcx %%r14, %%r12 ;"
+ "movq $0, %%r8 ;"
+ "mulx 24(%2), %%r14, %%rdx; " /* A[1]*B[3] */
+ "adox %%r13, %%r14 ;"
+ "adcx %%rax, %%r14 ;"
+ "movq $0, %%rax ;"
+ /******************************************/
+ "adox %%rdx, %%rax ;"
+ "adcx %%r8, %%rax ;"
+
+ "movq 16(%1), %%rdx; " /* A[2] */
+ "mulx (%2), %%r8, %%r9; " /* A[2]*B[0] */
+ "xorl %%r10d, %%r10d ;"
+ "adcx 16(%0), %%r8 ;"
+ "movq %%r8, 16(%0) ;"
+ "mulx 8(%2), %%r10, %%r11; " /* A[2]*B[1] */
+ "adox %%r9, %%r10 ;"
+ "adcx %%r12, %%r10 ;"
+ "movq %%r10, 24(%0) ;"
+ "mulx 16(%2), %%r12, %%r13; " /* A[2]*B[2] */
+ "adox %%r11, %%r12 ;"
+ "adcx %%r14, %%r12 ;"
+ "movq $0, %%r8 ;"
+ "mulx 24(%2), %%r14, %%rdx; " /* A[2]*B[3] */
+ "adox %%r13, %%r14 ;"
+ "adcx %%rax, %%r14 ;"
+ "movq $0, %%rax ;"
+ /******************************************/
+ "adox %%rdx, %%rax ;"
+ "adcx %%r8, %%rax ;"
+
+ "movq 24(%1), %%rdx; " /* A[3] */
+ "mulx (%2), %%r8, %%r9; " /* A[3]*B[0] */
+ "xorl %%r10d, %%r10d ;"
+ "adcx 24(%0), %%r8 ;"
+ "movq %%r8, 24(%0) ;"
+ "mulx 8(%2), %%r10, %%r11; " /* A[3]*B[1] */
+ "adox %%r9, %%r10 ;"
+ "adcx %%r12, %%r10 ;"
+ "movq %%r10, 32(%0) ;"
+ "mulx 16(%2), %%r12, %%r13; " /* A[3]*B[2] */
+ "adox %%r11, %%r12 ;"
+ "adcx %%r14, %%r12 ;"
+ "movq %%r12, 40(%0) ;"
+ "movq $0, %%r8 ;"
+ "mulx 24(%2), %%r14, %%rdx; " /* A[3]*B[3] */
+ "adox %%r13, %%r14 ;"
+ "adcx %%rax, %%r14 ;"
+ "movq %%r14, 48(%0) ;"
+ "movq $0, %%rax ;"
+ /******************************************/
+ "adox %%rdx, %%rax ;"
+ "adcx %%r8, %%rax ;"
+ "movq %%rax, 56(%0) ;"
+ :
+ : "r"(c), "r"(a), "r"(b)
+ : "memory", "cc", "%rax", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14");
+}
+
+static void mul_256x256_integer_bmi2(u64 *const c, u64 *const a, u64 *const b)
+{
+ asm volatile(
+ "movq (%1), %%rdx; " /* A[0] */
+ "mulx (%2), %%r8, %%r12; " /* A[0]*B[0] */
+ "movq %%r8, (%0) ;"
+ "mulx 8(%2), %%r10, %%rax; " /* A[0]*B[1] */
+ "addq %%r10, %%r12 ;"
+ "mulx 16(%2), %%r8, %%rbx; " /* A[0]*B[2] */
+ "adcq %%r8, %%rax ;"
+ "mulx 24(%2), %%r10, %%rcx; " /* A[0]*B[3] */
+ "adcq %%r10, %%rbx ;"
+ /******************************************/
+ "adcq $0, %%rcx ;"
+
+ "movq 8(%1), %%rdx; " /* A[1] */
+ "mulx (%2), %%r8, %%r9; " /* A[1]*B[0] */
+ "addq %%r12, %%r8 ;"
+ "movq %%r8, 8(%0) ;"
+ "mulx 8(%2), %%r10, %%r11; " /* A[1]*B[1] */
+ "adcq %%r10, %%r9 ;"
+ "mulx 16(%2), %%r8, %%r13; " /* A[1]*B[2] */
+ "adcq %%r8, %%r11 ;"
+ "mulx 24(%2), %%r10, %%r12; " /* A[1]*B[3] */
+ "adcq %%r10, %%r13 ;"
+ /******************************************/
+ "adcq $0, %%r12 ;"
+
+ "addq %%r9, %%rax ;"
+ "adcq %%r11, %%rbx ;"
+ "adcq %%r13, %%rcx ;"
+ "adcq $0, %%r12 ;"
+
+ "movq 16(%1), %%rdx; " /* A[2] */
+ "mulx (%2), %%r8, %%r9; " /* A[2]*B[0] */
+ "addq %%rax, %%r8 ;"
+ "movq %%r8, 16(%0) ;"
+ "mulx 8(%2), %%r10, %%r11; " /* A[2]*B[1] */
+ "adcq %%r10, %%r9 ;"
+ "mulx 16(%2), %%r8, %%r13; " /* A[2]*B[2] */
+ "adcq %%r8, %%r11 ;"
+ "mulx 24(%2), %%r10, %%rax; " /* A[2]*B[3] */
+ "adcq %%r10, %%r13 ;"
+ /******************************************/
+ "adcq $0, %%rax ;"
+
+ "addq %%r9, %%rbx ;"
+ "adcq %%r11, %%rcx ;"
+ "adcq %%r13, %%r12 ;"
+ "adcq $0, %%rax ;"
+
+ "movq 24(%1), %%rdx; " /* A[3] */
+ "mulx (%2), %%r8, %%r9; " /* A[3]*B[0] */
+ "addq %%rbx, %%r8 ;"
+ "movq %%r8, 24(%0) ;"
+ "mulx 8(%2), %%r10, %%r11; " /* A[3]*B[1] */
+ "adcq %%r10, %%r9 ;"
+ "mulx 16(%2), %%r8, %%r13; " /* A[3]*B[2] */
+ "adcq %%r8, %%r11 ;"
+ "mulx 24(%2), %%r10, %%rbx; " /* A[3]*B[3] */
+ "adcq %%r10, %%r13 ;"
+ /******************************************/
+ "adcq $0, %%rbx ;"
+
+ "addq %%r9, %%rcx ;"
+ "movq %%rcx, 32(%0) ;"
+ "adcq %%r11, %%r12 ;"
+ "movq %%r12, 40(%0) ;"
+ "adcq %%r13, %%rax ;"
+ "movq %%rax, 48(%0) ;"
+ "adcq $0, %%rbx ;"
+ "movq %%rbx, 56(%0) ;"
+ :
+ : "r"(c), "r"(a), "r"(b)
+ : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13");
+}
+
+static void sqr_256x256_integer_adx(u64 *const c, u64 *const a)
+{
+ asm volatile(
+ "movq (%1), %%rdx ;" /* A[0] */
+ "mulx 8(%1), %%r8, %%r14 ;" /* A[1]*A[0] */
+ "xorl %%r15d, %%r15d;"
+ "mulx 16(%1), %%r9, %%r10 ;" /* A[2]*A[0] */
+ "adcx %%r14, %%r9 ;"
+ "mulx 24(%1), %%rax, %%rcx ;" /* A[3]*A[0] */
+ "adcx %%rax, %%r10 ;"
+ "movq 24(%1), %%rdx ;" /* A[3] */
+ "mulx 8(%1), %%r11, %%r12 ;" /* A[1]*A[3] */
+ "adcx %%rcx, %%r11 ;"
+ "mulx 16(%1), %%rax, %%r13 ;" /* A[2]*A[3] */
+ "adcx %%rax, %%r12 ;"
+ "movq 8(%1), %%rdx ;" /* A[1] */
+ "adcx %%r15, %%r13 ;"
+ "mulx 16(%1), %%rax, %%rcx ;" /* A[2]*A[1] */
+ "movq $0, %%r14 ;"
+ /******************************************/
+ "adcx %%r15, %%r14 ;"
+
+ "xorl %%r15d, %%r15d;"
+ "adox %%rax, %%r10 ;"
+ "adcx %%r8, %%r8 ;"
+ "adox %%rcx, %%r11 ;"
+ "adcx %%r9, %%r9 ;"
+ "adox %%r15, %%r12 ;"
+ "adcx %%r10, %%r10 ;"
+ "adox %%r15, %%r13 ;"
+ "adcx %%r11, %%r11 ;"
+ "adox %%r15, %%r14 ;"
+ "adcx %%r12, %%r12 ;"
+ "adcx %%r13, %%r13 ;"
+ "adcx %%r14, %%r14 ;"
+
+ "movq (%1), %%rdx ;"
+ "mulx %%rdx, %%rax, %%rcx ;" /* A[0]^2 */
+ /*******************/
+ "movq %%rax, 0(%0) ;"
+ "addq %%rcx, %%r8 ;"
+ "movq %%r8, 8(%0) ;"
+ "movq 8(%1), %%rdx ;"
+ "mulx %%rdx, %%rax, %%rcx ;" /* A[1]^2 */
+ "adcq %%rax, %%r9 ;"
+ "movq %%r9, 16(%0) ;"
+ "adcq %%rcx, %%r10 ;"
+ "movq %%r10, 24(%0) ;"
+ "movq 16(%1), %%rdx ;"
+ "mulx %%rdx, %%rax, %%rcx ;" /* A[2]^2 */
+ "adcq %%rax, %%r11 ;"
+ "movq %%r11, 32(%0) ;"
+ "adcq %%rcx, %%r12 ;"
+ "movq %%r12, 40(%0) ;"
+ "movq 24(%1), %%rdx ;"
+ "mulx %%rdx, %%rax, %%rcx ;" /* A[3]^2 */
+ "adcq %%rax, %%r13 ;"
+ "movq %%r13, 48(%0) ;"
+ "adcq %%rcx, %%r14 ;"
+ "movq %%r14, 56(%0) ;"
+ :
+ : "r"(c), "r"(a)
+ : "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15");
+}
+
+static void sqr_256x256_integer_bmi2(u64 *const c, u64 *const a)
+{
+ asm volatile(
+ "movq 8(%1), %%rdx ;" /* A[1] */
+ "mulx (%1), %%r8, %%r9 ;" /* A[0]*A[1] */
+ "mulx 16(%1), %%r10, %%r11 ;" /* A[2]*A[1] */
+ "mulx 24(%1), %%rcx, %%r14 ;" /* A[3]*A[1] */
+
+ "movq 16(%1), %%rdx ;" /* A[2] */
+ "mulx 24(%1), %%r12, %%r13 ;" /* A[3]*A[2] */
+ "mulx (%1), %%rax, %%rdx ;" /* A[0]*A[2] */
+
+ "addq %%rax, %%r9 ;"
+ "adcq %%rdx, %%r10 ;"
+ "adcq %%rcx, %%r11 ;"
+ "adcq %%r14, %%r12 ;"
+ "adcq $0, %%r13 ;"
+ "movq $0, %%r14 ;"
+ "adcq $0, %%r14 ;"
+
+ "movq (%1), %%rdx ;" /* A[0] */
+ "mulx 24(%1), %%rax, %%rcx ;" /* A[0]*A[3] */
+
+ "addq %%rax, %%r10 ;"
+ "adcq %%rcx, %%r11 ;"
+ "adcq $0, %%r12 ;"
+ "adcq $0, %%r13 ;"
+ "adcq $0, %%r14 ;"
+
+ "shldq $1, %%r13, %%r14 ;"
+ "shldq $1, %%r12, %%r13 ;"
+ "shldq $1, %%r11, %%r12 ;"
+ "shldq $1, %%r10, %%r11 ;"
+ "shldq $1, %%r9, %%r10 ;"
+ "shldq $1, %%r8, %%r9 ;"
+ "shlq $1, %%r8 ;"
+
+ /*******************/
+ "mulx %%rdx, %%rax, %%rcx ;" /* A[0]^2 */
+ /*******************/
+ "movq %%rax, 0(%0) ;"
+ "addq %%rcx, %%r8 ;"
+ "movq %%r8, 8(%0) ;"
+ "movq 8(%1), %%rdx ;"
+ "mulx %%rdx, %%rax, %%rcx ;" /* A[1]^2 */
+ "adcq %%rax, %%r9 ;"
+ "movq %%r9, 16(%0) ;"
+ "adcq %%rcx, %%r10 ;"
+ "movq %%r10, 24(%0) ;"
+ "movq 16(%1), %%rdx ;"
+ "mulx %%rdx, %%rax, %%rcx ;" /* A[2]^2 */
+ "adcq %%rax, %%r11 ;"
+ "movq %%r11, 32(%0) ;"
+ "adcq %%rcx, %%r12 ;"
+ "movq %%r12, 40(%0) ;"
+ "movq 24(%1), %%rdx ;"
+ "mulx %%rdx, %%rax, %%rcx ;" /* A[3]^2 */
+ "adcq %%rax, %%r13 ;"
+ "movq %%r13, 48(%0) ;"
+ "adcq %%rcx, %%r14 ;"
+ "movq %%r14, 56(%0) ;"
+ :
+ : "r"(c), "r"(a)
+ : "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14");
+}
+
+static void red_eltfp25519_1w_adx(u64 *const c, u64 *const a)
+{
+ asm volatile(
+ "movl $38, %%edx ;" /* 2*c = 38 = 2^256 */
+ "mulx 32(%1), %%r8, %%r10 ;" /* c*C[4] */
+ "xorl %%ebx, %%ebx ;"
+ "adox (%1), %%r8 ;"
+ "mulx 40(%1), %%r9, %%r11 ;" /* c*C[5] */
+ "adcx %%r10, %%r9 ;"
+ "adox 8(%1), %%r9 ;"
+ "mulx 48(%1), %%r10, %%rax ;" /* c*C[6] */
+ "adcx %%r11, %%r10 ;"
+ "adox 16(%1), %%r10 ;"
+ "mulx 56(%1), %%r11, %%rcx ;" /* c*C[7] */
+ "adcx %%rax, %%r11 ;"
+ "adox 24(%1), %%r11 ;"
+ /***************************************/
+ "adcx %%rbx, %%rcx ;"
+ "adox %%rbx, %%rcx ;"
+ "clc ;"
+ "mulx %%rcx, %%rax, %%rcx ;" /* c*C[4] */
+ "adcx %%rax, %%r8 ;"
+ "adcx %%rcx, %%r9 ;"
+ "movq %%r9, 8(%0) ;"
+ "adcx %%rbx, %%r10 ;"
+ "movq %%r10, 16(%0) ;"
+ "adcx %%rbx, %%r11 ;"
+ "movq %%r11, 24(%0) ;"
+ "mov $0, %%ecx ;"
+ "cmovc %%edx, %%ecx ;"
+ "addq %%rcx, %%r8 ;"
+ "movq %%r8, (%0) ;"
+ :
+ : "r"(c), "r"(a)
+ : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11");
+}
+
+static void red_eltfp25519_1w_bmi2(u64 *const c, u64 *const a)
+{
+ asm volatile(
+ "movl $38, %%edx ;" /* 2*c = 38 = 2^256 */
+ "mulx 32(%1), %%r8, %%r10 ;" /* c*C[4] */
+ "mulx 40(%1), %%r9, %%r11 ;" /* c*C[5] */
+ "addq %%r10, %%r9 ;"
+ "mulx 48(%1), %%r10, %%rax ;" /* c*C[6] */
+ "adcq %%r11, %%r10 ;"
+ "mulx 56(%1), %%r11, %%rcx ;" /* c*C[7] */
+ "adcq %%rax, %%r11 ;"
+ /***************************************/
+ "adcq $0, %%rcx ;"
+ "addq (%1), %%r8 ;"
+ "adcq 8(%1), %%r9 ;"
+ "adcq 16(%1), %%r10 ;"
+ "adcq 24(%1), %%r11 ;"
+ "adcq $0, %%rcx ;"
+ "mulx %%rcx, %%rax, %%rcx ;" /* c*C[4] */
+ "addq %%rax, %%r8 ;"
+ "adcq %%rcx, %%r9 ;"
+ "movq %%r9, 8(%0) ;"
+ "adcq $0, %%r10 ;"
+ "movq %%r10, 16(%0) ;"
+ "adcq $0, %%r11 ;"
+ "movq %%r11, 24(%0) ;"
+ "mov $0, %%ecx ;"
+ "cmovc %%edx, %%ecx ;"
+ "addq %%rcx, %%r8 ;"
+ "movq %%r8, (%0) ;"
+ :
+ : "r"(c), "r"(a)
+ : "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11");
+}
+
+static __always_inline void add_eltfp25519_1w_adx(u64 *const c, u64 *const a, u64 *const b)
+{
+ asm volatile(
+ "mov $38, %%eax ;"
+ "xorl %%ecx, %%ecx ;"
+ "movq (%2), %%r8 ;"
+ "adcx (%1), %%r8 ;"
+ "movq 8(%2), %%r9 ;"
+ "adcx 8(%1), %%r9 ;"
+ "movq 16(%2), %%r10 ;"
+ "adcx 16(%1), %%r10 ;"
+ "movq 24(%2), %%r11 ;"
+ "adcx 24(%1), %%r11 ;"
+ "cmovc %%eax, %%ecx ;"
+ "xorl %%eax, %%eax ;"
+ "adcx %%rcx, %%r8 ;"
+ "adcx %%rax, %%r9 ;"
+ "movq %%r9, 8(%0) ;"
+ "adcx %%rax, %%r10 ;"
+ "movq %%r10, 16(%0) ;"
+ "adcx %%rax, %%r11 ;"
+ "movq %%r11, 24(%0) ;"
+ "mov $38, %%ecx ;"
+ "cmovc %%ecx, %%eax ;"
+ "addq %%rax, %%r8 ;"
+ "movq %%r8, (%0) ;"
+ :
+ : "r"(c), "r"(a), "r"(b)
+ : "memory", "cc", "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11");
+}
+
+static __always_inline void add_eltfp25519_1w_bmi2(u64 *const c, u64 *const a, u64 *const b)
+{
+ asm volatile(
+ "mov $38, %%eax ;"
+ "movq (%2), %%r8 ;"
+ "addq (%1), %%r8 ;"
+ "movq 8(%2), %%r9 ;"
+ "adcq 8(%1), %%r9 ;"
+ "movq 16(%2), %%r10 ;"
+ "adcq 16(%1), %%r10 ;"
+ "movq 24(%2), %%r11 ;"
+ "adcq 24(%1), %%r11 ;"
+ "mov $0, %%ecx ;"
+ "cmovc %%eax, %%ecx ;"
+ "addq %%rcx, %%r8 ;"
+ "adcq $0, %%r9 ;"
+ "movq %%r9, 8(%0) ;"
+ "adcq $0, %%r10 ;"
+ "movq %%r10, 16(%0) ;"
+ "adcq $0, %%r11 ;"
+ "movq %%r11, 24(%0) ;"
+ "mov $0, %%ecx ;"
+ "cmovc %%eax, %%ecx ;"
+ "addq %%rcx, %%r8 ;"
+ "movq %%r8, (%0) ;"
+ :
+ : "r"(c), "r"(a), "r"(b)
+ : "memory", "cc", "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11");
+}
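+
+/* Illustrative only, not part of the patch: both adders above compute
+ * a + b in the almost-reduced form used throughout this file; a carry
+ * out of 2^256 is folded back in as +38, twice if needed. A sketch under
+ * the same local assumptions as the ones above:
+ */
+static inline void ref_add_25519(u64 *c, const u64 *a, const u64 *b)
+{
+ unsigned __int128 t = 0;
+ u64 fold;
+ int i;
+
+ for (i = 0; i < 4; ++i) {
+ t += (unsigned __int128)a[i] + b[i];
+ c[i] = (u64)t;
+ t >>= 64;
+ }
+ /* 2^256 = 38 (mod 2^255-19): fold the carry back in */
+ fold = (u64)t * 38;
+ for (i = 0; i < 4; ++i) {
+ t = (unsigned __int128)c[i] + fold;
+ c[i] = (u64)t;
+ fold = (u64)(t >> 64);
+ }
+ c[0] += fold * 38; /* mirrors the asm's second cmovc */
+}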
+
+static __always_inline void sub_eltfp25519_1w(u64 *const c, u64 *const a, u64 *const b)
+{
+ asm volatile(
+ "mov $38, %%eax ;"
+ "movq (%1), %%r8 ;"
+ "subq (%2), %%r8 ;"
+ "movq 8(%1), %%r9 ;"
+ "sbbq 8(%2), %%r9 ;"
+ "movq 16(%1), %%r10 ;"
+ "sbbq 16(%2), %%r10 ;"
+ "movq 24(%1), %%r11 ;"
+ "sbbq 24(%2), %%r11 ;"
+ "mov $0, %%ecx ;"
+ "cmovc %%eax, %%ecx ;"
+ "subq %%rcx, %%r8 ;"
+ "sbbq $0, %%r9 ;"
+ "movq %%r9, 8(%0) ;"
+ "sbbq $0, %%r10 ;"
+ "movq %%r10, 16(%0) ;"
+ "sbbq $0, %%r11 ;"
+ "movq %%r11, 24(%0) ;"
+ "mov $0, %%ecx ;"
+ "cmovc %%eax, %%ecx ;"
+ "subq %%rcx, %%r8 ;"
+ "movq %%r8, (%0) ;"
+ :
+ : "r"(c), "r"(a), "r"(b)
+ : "memory", "cc", "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11");
+}
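+
+/* Illustrative only, not part of the patch: subtraction is the mirror
+ * image; a borrow out of the top means a - b + 2^256 was computed, and
+ * since 2^256 = 38 (mod 2^255-19) the repair is -38. This sketch relies
+ * on arithmetic right shift of the (assumed) signed __int128 type:
+ */
+static inline void ref_sub_25519(u64 *c, const u64 *a, const u64 *b)
+{
+ __int128 t = 0;
+ int i;
+
+ for (i = 0; i < 4; ++i) {
+ t += (__int128)a[i] - b[i];
+ c[i] = (u64)t;
+ t >>= 64; /* 0, or -1 on borrow */
+ }
+ t = t ? -38 : 0;
+ for (i = 0; i < 4; ++i) {
+ t += c[i];
+ c[i] = (u64)t;
+ t >>= 64;
+ }
+ c[0] -= t ? 38 : 0; /* mirrors the asm's second cmovc */
+}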
+
+/* Multiplication by a24 = (A+2)/4 = (486662+2)/4 = 121666 */
+static __always_inline void mul_a24_eltfp25519_1w(u64 *const c, u64 *const a)
+{
+ const u64 a24 = 121666;
+ asm volatile(
+ "movq %2, %%rdx ;"
+ "mulx (%1), %%r8, %%r10 ;"
+ "mulx 8(%1), %%r9, %%r11 ;"
+ "addq %%r10, %%r9 ;"
+ "mulx 16(%1), %%r10, %%rax ;"
+ "adcq %%r11, %%r10 ;"
+ "mulx 24(%1), %%r11, %%rcx ;"
+ "adcq %%rax, %%r11 ;"
+ /**************************/
+ "adcq $0, %%rcx ;"
+ "movl $38, %%edx ;" /* 2*c = 38 = 2^256 mod 2^255-19*/
+ "mulx %%rcx, %%rax, %%rcx ;"
+ "addq %%rax, %%r8 ;"
+ "adcq %%rcx, %%r9 ;"
+ "movq %%r9, 8(%0) ;"
+ "adcq $0, %%r10 ;"
+ "movq %%r10, 16(%0) ;"
+ "adcq $0, %%r11 ;"
+ "movq %%r11, 24(%0) ;"
+ "mov $0, %%ecx ;"
+ "cmovc %%edx, %%ecx ;"
+ "addq %%rcx, %%r8 ;"
+ "movq %%r8, (%0) ;"
+ :
+ : "r"(c), "r"(a), "r"(a24)
+ : "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11");
+}
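+
+/* In other words, the routine above computes
+ *
+ *   c = (121666 * a) mod 2^256  +  38 * floor(121666 * a / 2^256),
+ *
+ * with one more +38 folded in if that sum itself carries; the result is
+ * congruent to 121666*a mod 2^255-19, by the same 2^256 = 38 fold as the
+ * reduction routines. */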
+
+static void inv_eltfp25519_1w_adx(u64 *const c, u64 *const a)
+{
+ eltfp25519_1w_buffer buffer_1w;
+ eltfp25519_1w x0, x1, x2;
+ u64 *T[5];
+
+ T[0] = x0;
+ T[1] = c; /* x^(-1) */
+ T[2] = x1;
+ T[3] = x2;
+ T[4] = a; /* x */
+
+ copy_eltfp25519_1w(T[1], a);
+ sqrn_eltfp25519_1w_adx(T[1], 1);
+ copy_eltfp25519_1w(T[2], T[1]);
+ sqrn_eltfp25519_1w_adx(T[2], 2);
+ mul_eltfp25519_1w_adx(T[0], a, T[2]);
+ mul_eltfp25519_1w_adx(T[1], T[1], T[0]);
+ copy_eltfp25519_1w(T[2], T[1]);
+ sqrn_eltfp25519_1w_adx(T[2], 1);
+ mul_eltfp25519_1w_adx(T[0], T[0], T[2]);
+ copy_eltfp25519_1w(T[2], T[0]);
+ sqrn_eltfp25519_1w_adx(T[2], 5);
+ mul_eltfp25519_1w_adx(T[0], T[0], T[2]);
+ copy_eltfp25519_1w(T[2], T[0]);
+ sqrn_eltfp25519_1w_adx(T[2], 10);
+ mul_eltfp25519_1w_adx(T[2], T[2], T[0]);
+ copy_eltfp25519_1w(T[3], T[2]);
+ sqrn_eltfp25519_1w_adx(T[3], 20);
+ mul_eltfp25519_1w_adx(T[3], T[3], T[2]);
+ sqrn_eltfp25519_1w_adx(T[3], 10);
+ mul_eltfp25519_1w_adx(T[3], T[3], T[0]);
+ copy_eltfp25519_1w(T[0], T[3]);
+ sqrn_eltfp25519_1w_adx(T[0], 50);
+ mul_eltfp25519_1w_adx(T[0], T[0], T[3]);
+ copy_eltfp25519_1w(T[2], T[0]);
+ sqrn_eltfp25519_1w_adx(T[2], 100);
+ mul_eltfp25519_1w_adx(T[2], T[2], T[0]);
+ sqrn_eltfp25519_1w_adx(T[2], 50);
+ mul_eltfp25519_1w_adx(T[2], T[2], T[3]);
+ sqrn_eltfp25519_1w_adx(T[2], 5);
+ mul_eltfp25519_1w_adx(T[1], T[1], T[2]);
+
+ memzero_explicit(&buffer_1w, sizeof(buffer_1w));
+ memzero_explicit(&x0, sizeof(x0));
+ memzero_explicit(&x1, sizeof(x1));
+ memzero_explicit(&x2, sizeof(x2));
+}
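+
+/* Worked out, this is the classic Curve25519 inversion ladder (the bmi2
+ * variant below is identical): it builds a^(2^5-1), a^(2^10-1),
+ * a^(2^20-1), a^(2^40-1), a^(2^50-1), a^(2^100-1), a^(2^200-1) and
+ * a^(2^250-1), then squares five more times and multiplies by a^11:
+ *
+ *   a^(2^255 - 2^5 + 11) = a^(2^255 - 21) = a^(p - 2), p = 2^255 - 19,
+ *
+ * which is a^-1 (mod p) by Fermat's little theorem; 254 squarings and
+ * 11 multiplications in total. */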
+
+static void inv_eltfp25519_1w_bmi2(u64 *const c, u64 *const a)
+{
+ eltfp25519_1w_buffer buffer_1w;
+ eltfp25519_1w x0, x1, x2;
+ u64 *T[5];
+
+ T[0] = x0;
+ T[1] = c; /* x^(-1) */
+ T[2] = x1;
+ T[3] = x2;
+ T[4] = a; /* x */
+
+ copy_eltfp25519_1w(T[1], a);
+ sqrn_eltfp25519_1w_bmi2(T[1], 1);
+ copy_eltfp25519_1w(T[2], T[1]);
+ sqrn_eltfp25519_1w_bmi2(T[2], 2);
+ mul_eltfp25519_1w_bmi2(T[0], a, T[2]);
+ mul_eltfp25519_1w_bmi2(T[1], T[1], T[0]);
+ copy_eltfp25519_1w(T[2], T[1]);
+ sqrn_eltfp25519_1w_bmi2(T[2], 1);
+ mul_eltfp25519_1w_bmi2(T[0], T[0], T[2]);
+ copy_eltfp25519_1w(T[2], T[0]);
+ sqrn_eltfp25519_1w_bmi2(T[2], 5);
+ mul_eltfp25519_1w_bmi2(T[0], T[0], T[2]);
+ copy_eltfp25519_1w(T[2], T[0]);
+ sqrn_eltfp25519_1w_bmi2(T[2], 10);
+ mul_eltfp25519_1w_bmi2(T[2], T[2], T[0]);
+ copy_eltfp25519_1w(T[3], T[2]);
+ sqrn_eltfp25519_1w_bmi2(T[3], 20);
+ mul_eltfp25519_1w_bmi2(T[3], T[3], T[2]);
+ sqrn_eltfp25519_1w_bmi2(T[3], 10);
+ mul_eltfp25519_1w_bmi2(T[3], T[3], T[0]);
+ copy_eltfp25519_1w(T[0], T[3]);
+ sqrn_eltfp25519_1w_bmi2(T[0], 50);
+ mul_eltfp25519_1w_bmi2(T[0], T[0], T[3]);
+ copy_eltfp25519_1w(T[2], T[0]);
+ sqrn_eltfp25519_1w_bmi2(T[2], 100);
+ mul_eltfp25519_1w_bmi2(T[2], T[2], T[0]);
+ sqrn_eltfp25519_1w_bmi2(T[2], 50);
+ mul_eltfp25519_1w_bmi2(T[2], T[2], T[3]);
+ sqrn_eltfp25519_1w_bmi2(T[2], 5);
+ mul_eltfp25519_1w_bmi2(T[1], T[1], T[2]);
+
+ memzero_explicit(&buffer_1w, sizeof(buffer_1w));
+ memzero_explicit(&x0, sizeof(x0));
+ memzero_explicit(&x1, sizeof(x1));
+ memzero_explicit(&x2, sizeof(x2));
+}
+
+/* Given c, a 256-bit number, fred_eltfp25519_1w reduces c in place
+ * so that 0 <= c < 2^255-19.
+ */
+static __always_inline void fred_eltfp25519_1w(u64 *const c)
+{
+ asm volatile(
+ /* First, obtain a number less than 2^255. */
+ "btrq $63, 24(%0) ;"
+ "sbbl %%ecx, %%ecx ;"
+ "andq $19, %%rcx ;"
+ "addq %%rcx, (%0) ;"
+ "adcq $0, 8(%0) ;"
+ "adcq $0, 16(%0) ;"
+ "adcq $0, 24(%0) ;"
+
+ "btrq $63, 24(%0) ;"
+ "sbbl %%ecx, %%ecx ;"
+ "andq $19, %%rcx ;"
+ "addq %%rcx, (%0) ;"
+ "adcq $0, 8(%0) ;"
+ "adcq $0, 16(%0) ;"
+ "adcq $0, 24(%0) ;"
+
+ /* Then, in case the number falls into [2^255-19, 2^255-1],
+ * subtract p by adding 19 and clearing bit 255. */
+ "cmpq $-19, (%0) ;"
+ "setaeb %%al ;"
+ "cmpq $-1, 8(%0) ;"
+ "setzb %%bl ;"
+ "cmpq $-1, 16(%0) ;"
+ "setzb %%cl ;"
+ "movq 24(%0), %%rdx ;"
+ "addq $1, %%rdx ;"
+ "shrq $63, %%rdx ;"
+ "andb %%bl, %%al ;"
+ "andb %%dl, %%cl ;"
+ "test %%cl, %%al ;"
+ "movl $0, %%eax ;"
+ "movl $19, %%ecx ;"
+ "cmovnz %%rcx, %%rax ;"
+ "addq %%rax, (%0) ;"
+ "adcq $0, 8(%0) ;"
+ "adcq $0, 16(%0) ;"
+ "adcq $0, 24(%0) ;"
+ "btrq $63, 24(%0) ;"
+ :
+ : "r"(c)
+ : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx");
+}
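+
+/* Illustrative only, not part of the patch: a branchy C rendering of the
+ * reduction above (the asm stays branch-free with setcc/cmov). Each btrq
+ * pass subtracts p = 2^255-19 when bit 255 is set; it runs twice because
+ * the +19 can carry back into bit 255. ref_fred_25519 is a name local to
+ * this sketch.
+ */
+static inline void ref_fred_25519(u64 *c)
+{
+ unsigned __int128 t;
+ u64 add;
+ int i, pass;
+
+ for (pass = 0; pass < 2; ++pass) {
+ add = 19 * (c[3] >> 63); /* clear bit 255, add 19 for it */
+ c[3] &= 0x7fffffffffffffffULL;
+ for (i = 0; add && i < 4; ++i) {
+ t = (unsigned __int128)c[i] + add;
+ c[i] = (u64)t;
+ add = (u64)(t >> 64);
+ }
+ }
+ /* now c < 2^255; if c is in [p, 2^255-1], subtract p once more */
+ if (c[3] == 0x7fffffffffffffffULL && c[2] == 0xffffffffffffffffULL &&
+ c[1] == 0xffffffffffffffffULL && c[0] >= 0xffffffffffffffedULL) {
+ add = 19;
+ for (i = 0; i < 4; ++i) {
+ t = (unsigned __int128)c[i] + add;
+ c[i] = (u64)t;
+ add = (u64)(t >> 64);
+ }
+ c[3] &= 0x7fffffffffffffffULL;
+ }
+}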
+
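+/* Constant-time conditional swap: bit must be 0 or 1, so mask is either
+ * all-zeros or all-ones, and the xor trick swaps the two field elements
+ * without a secret-dependent branch or memory access pattern. */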
+static __always_inline void cswap(u64 bit, u64 *const px, u64 *const py)
+{
+ int i;
+ u64 mask = 0ULL - bit;
+
+ for (i = 0; i < NUM_WORDS_ELTFP25519; ++i) {
+ u64 t = mask & (px[i] ^ py[i]);
+ px[i] = px[i] ^ t;
+ py[i] = py[i] ^ t;
+ }
+}
+
+static void curve25519_adx(u8 shared[CURVE25519_POINT_SIZE], const u8 private_key[CURVE25519_POINT_SIZE], const u8 session_key[CURVE25519_POINT_SIZE])
+{
+ __aligned(32) u64 buffer[4 * NUM_WORDS_ELTFP25519];
+ __aligned(32) u64 coordinates[4 * NUM_WORDS_ELTFP25519];
+ __aligned(32) u64 workspace[6 * NUM_WORDS_ELTFP25519];
+ __aligned(32) u8 session[CURVE25519_POINT_SIZE];
+ __aligned(32) u8 private[CURVE25519_POINT_SIZE];
+
+ int i = 0, j = 0;
+ u64 prev = 0;
+ u64 *const X1 = (u64 *)session;
+ u64 *const key = (u64 *)private;
+ u64 *const Px = coordinates + 0;
+ u64 *const Pz = coordinates + 4;
+ u64 *const Qx = coordinates + 8;
+ u64 *const Qz = coordinates + 12;
+ u64 *const X2 = Qx;
+ u64 *const Z2 = Qz;
+ u64 *const X3 = Px;
+ u64 *const Z3 = Pz;
+ u64 *const X2Z2 = Qx;
+ u64 *const X3Z3 = Px;
+
+ u64 *const A = workspace + 0;
+ u64 *const B = workspace + 4;
+ u64 *const D = workspace + 8;
+ u64 *const C = workspace + 12;
+ u64 *const DA = workspace + 16;
+ u64 *const CB = workspace + 20;
+ u64 *const AB = A;
+ u64 *const DC = D;
+ u64 *const DACB = DA;
+ u64 *const buffer_1w = buffer;
+ u64 *const buffer_2w = buffer;
+
+ memcpy(private, private_key, sizeof(private));
+ memcpy(session, session_key, sizeof(session));
+
+ normalize_secret(private);
+
+ /* As in the draft:
+ * When receiving such an array, implementations of curve25519
+ * MUST mask the most-significant bit in the final byte. This
+ * is done to preserve compatibility with point formats which
+ * reserve the sign bit for use in other protocols and to
+ * increase resistance to implementation fingerprinting.
+ */
+ session[CURVE25519_POINT_SIZE - 1] &= (1 << (255 % 8)) - 1;
+
+ copy_eltfp25519_1w(Px, X1);
+ setzero_eltfp25519_1w(Pz);
+ setzero_eltfp25519_1w(Qx);
+ setzero_eltfp25519_1w(Qz);
+
+ Pz[0] = 1;
+ Qx[0] = 1;
+
+ /* main-loop */
+ prev = 0;
+ j = 62;
+ for (i = 3; i >= 0; --i) {
+ while (j >= 0) {
+ u64 bit = (key[i] >> j) & 0x1;
+ u64 swap = bit ^ prev;
+ prev = bit;
+
+ add_eltfp25519_1w_adx(A, X2, Z2); /* A = (X2+Z2) */
+ sub_eltfp25519_1w(B, X2, Z2); /* B = (X2-Z2) */
+ add_eltfp25519_1w_adx(C, X3, Z3); /* C = (X3+Z3) */
+ sub_eltfp25519_1w(D, X3, Z3); /* D = (X3-Z3) */
+ mul_eltfp25519_2w_adx(DACB, AB, DC); /* [DA|CB] = [A|B]*[D|C] */
+
+ cswap(swap, A, C);
+ cswap(swap, B, D);
+
+ sqr_eltfp25519_2w_adx(AB); /* [AA|BB] = [A^2|B^2] */
+ add_eltfp25519_1w_adx(X3, DA, CB); /* X3 = (DA+CB) */
+ sub_eltfp25519_1w(Z3, DA, CB); /* Z3 = (DA-CB) */
+ sqr_eltfp25519_2w_adx(X3Z3); /* [X3|Z3] = [(DA+CB)|(DA-CB)]^2 */
+
+ copy_eltfp25519_1w(X2, B); /* X2 = B^2 */
+ sub_eltfp25519_1w(Z2, A, B); /* Z2 = E = AA-BB */
+
+ mul_a24_eltfp25519_1w(B, Z2); /* B = a24*E */
+ add_eltfp25519_1w_adx(B, B, X2); /* B = a24*E+B */
+ mul_eltfp25519_2w_adx(X2Z2, X2Z2, AB); /* [X2|Z2] = [B|E]*[A|a24*E+B] */
+ mul_eltfp25519_1w_adx(Z3, Z3, X1); /* Z3 = Z3*X1 */
+ --j;
+ }
+ j = 63;
+ }
+
+ inv_eltfp25519_1w_adx(A, Qz);
+ mul_eltfp25519_1w_adx((u64 *)shared, Qx, A);
+ fred_eltfp25519_1w((u64 *)shared);
+
+ memzero_explicit(buffer, sizeof(buffer));
+ memzero_explicit(coordinates, sizeof(coordinates));
+ memzero_explicit(workspace, sizeof(workspace));
+ memzero_explicit(private, sizeof(private));
+ memzero_explicit(session, sizeof(session));
+}
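+
+/* The loop above is a textbook Montgomery ladder: one combined
+ * differential add-and-double per scalar bit, 255 iterations starting at
+ * bit 254 (clamping forces bit 255 to 0 and bit 254 to 1). The bit^prev
+ * trick folds the usual pair of cswaps into one, so each iteration does
+ * a single constant-time swap-or-not. The bmi2 variant further below is
+ * identical apart from the multiplier routines it calls. */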
+
+static void curve25519_adx_base(u8 session_key[CURVE25519_POINT_SIZE], const u8 private_key[CURVE25519_POINT_SIZE])
+{
+ __aligned(32) u64 buffer[4 * NUM_WORDS_ELTFP25519];
+ __aligned(32) u64 coordinates[4 * NUM_WORDS_ELTFP25519];
+ __aligned(32) u64 workspace[4 * NUM_WORDS_ELTFP25519];
+ __aligned(32) u8 private[CURVE25519_POINT_SIZE];
+
+ const int ite[4] = { 64, 64, 64, 63 };
+ const int q = 3;
+ u64 swap = 1;
+
+ int i = 0, j = 0, k = 0;
+ u64 *const key = (u64 *)private;
+ u64 *const Ur1 = coordinates + 0;
+ u64 *const Zr1 = coordinates + 4;
+ u64 *const Ur2 = coordinates + 8;
+ u64 *const Zr2 = coordinates + 12;
+
+ u64 *const UZr1 = coordinates + 0;
+ u64 *const ZUr2 = coordinates + 8;
+
+ u64 *const A = workspace + 0;
+ u64 *const B = workspace + 4;
+ u64 *const C = workspace + 8;
+ u64 *const D = workspace + 12;
+
+ u64 *const AB = workspace + 0;
+ u64 *const CD = workspace + 8;
+
+ u64 *const buffer_1w = buffer;
+ u64 *const buffer_2w = buffer;
+ u64 *P = (u64 *)table_ladder_8k;
+
+ memcpy(private, private_key, sizeof(private));
+
+ normalize_secret(private);
+
+ setzero_eltfp25519_1w(Ur1);
+ setzero_eltfp25519_1w(Zr1);
+ setzero_eltfp25519_1w(Zr2);
+ Ur1[0] = 1;
+ Zr1[0] = 1;
+ Zr2[0] = 1;
+
+ /* G-S */
+ Ur2[3] = 0x1eaecdeee27cab34;
+ Ur2[2] = 0xadc7a0b9235d48e2;
+ Ur2[1] = 0xbbf095ae14b2edf8;
+ Ur2[0] = 0x7e94e1fec82faabd;
+
+ /* main-loop */
+ j = q;
+ for (i = 0; i < NUM_WORDS_ELTFP25519; ++i) {
+ while (j < ite[i]) {
+ u64 bit = (key[i] >> j) & 0x1;
+ k = (64 * i + j - q);
+ swap = swap ^ bit;
+ cswap(swap, Ur1, Ur2);
+ cswap(swap, Zr1, Zr2);
+ swap = bit;
+ /* Addition */
+ sub_eltfp25519_1w(B, Ur1, Zr1); /* B = Ur1-Zr1 */
+ add_eltfp25519_1w_adx(A, Ur1, Zr1); /* A = Ur1+Zr1 */
+ mul_eltfp25519_1w_adx(C, &P[4 * k], B); /* C = M0-B */
+ sub_eltfp25519_1w(B, A, C); /* B = (Ur1+Zr1) - M*(Ur1-Zr1) */
+ add_eltfp25519_1w_adx(A, A, C); /* A = (Ur1+Zr1) + M*(Ur1-Zr1) */
+ sqr_eltfp25519_2w_adx(AB); /* A = A^2 | B = B^2 */
+ mul_eltfp25519_2w_adx(UZr1, ZUr2, AB); /* Ur1 = Zr2*A | Zr1 = Ur2*B */
+ ++j;
+ }
+ j = 0;
+ }
+
+ /* Doubling */
+ for (i = 0; i < q; ++i) {
+ add_eltfp25519_1w_adx(A, Ur1, Zr1); /* A = Ur1+Zr1 */
+ sub_eltfp25519_1w(B, Ur1, Zr1); /* B = Ur1-Zr1 */
+ sqr_eltfp25519_2w_adx(AB); /* A = A^2 | B = B^2 */
+ copy_eltfp25519_1w(C, B); /* C = B */
+ sub_eltfp25519_1w(B, A, B); /* B = A-B */
+ mul_a24_eltfp25519_1w(D, B); /* D = a24*B */
+ add_eltfp25519_1w_adx(D, D, C); /* D = D+C */
+ mul_eltfp25519_2w_adx(UZr1, AB, CD); /* Ur1 = A*C | Zr1 = B*D */
+ }
+
+ /* Convert to affine coordinates */
+ inv_eltfp25519_1w_adx(A, Zr1);
+ mul_eltfp25519_1w_adx((u64 *)session_key, Ur1, A);
+ fred_eltfp25519_1w((u64 *)session_key);
+
+ memzero_explicit(buffer, sizeof(buffer));
+ memzero_explicit(coordinates, sizeof(coordinates));
+ memzero_explicit(workspace, sizeof(workspace));
+ memzero_explicit(private, sizeof(private));
}
-typedef u64 fex[10];
-typedef u64 fe51[5];
-asmlinkage void curve25519_sandy2x_ladder(fex *, const u8 *);
-asmlinkage void curve25519_sandy2x_ladder_base(fex *, const u8 *);
-asmlinkage void curve25519_sandy2x_fe51_pack(u8 *, const fe51 *);
-asmlinkage void curve25519_sandy2x_fe51_mul(fe51 *, const fe51 *, const fe51 *);
-asmlinkage void curve25519_sandy2x_fe51_nsquare(fe51 *, const fe51 *, int);
-
-static inline u32 le24_to_cpupv(const u8 *in)
-{
- return le16_to_cpup((__le16 *)in) | ((u32)in[2]) << 16;
-}
-
-static inline void fex_frombytes(fex h, const u8 *s)
-{
- u64 h0 = le32_to_cpup((__le32 *)s);
- u64 h1 = le24_to_cpupv(s + 4) << 6;
- u64 h2 = le24_to_cpupv(s + 7) << 5;
- u64 h3 = le24_to_cpupv(s + 10) << 3;
- u64 h4 = le24_to_cpupv(s + 13) << 2;
- u64 h5 = le32_to_cpup((__le32 *)(s + 16));
- u64 h6 = le24_to_cpupv(s + 20) << 7;
- u64 h7 = le24_to_cpupv(s + 23) << 5;
- u64 h8 = le24_to_cpupv(s + 26) << 4;
- u64 h9 = (le24_to_cpupv(s + 29) & 8388607) << 2;
- u64 carry0, carry1, carry2, carry3, carry4, carry5, carry6, carry7, carry8, carry9;
-
- carry9 = h9 >> 25; h0 += carry9 * 19; h9 &= 0x1FFFFFF;
- carry1 = h1 >> 25; h2 += carry1; h1 &= 0x1FFFFFF;
- carry3 = h3 >> 25; h4 += carry3; h3 &= 0x1FFFFFF;
- carry5 = h5 >> 25; h6 += carry5; h5 &= 0x1FFFFFF;
- carry7 = h7 >> 25; h8 += carry7; h7 &= 0x1FFFFFF;
-
- carry0 = h0 >> 26; h1 += carry0; h0 &= 0x3FFFFFF;
- carry2 = h2 >> 26; h3 += carry2; h2 &= 0x3FFFFFF;
- carry4 = h4 >> 26; h5 += carry4; h4 &= 0x3FFFFFF;
- carry6 = h6 >> 26; h7 += carry6; h6 &= 0x3FFFFFF;
- carry8 = h8 >> 26; h9 += carry8; h8 &= 0x3FFFFFF;
-
- h[0] = h0;
- h[1] = h1;
- h[2] = h2;
- h[3] = h3;
- h[4] = h4;
- h[5] = h5;
- h[6] = h6;
- h[7] = h7;
- h[8] = h8;
- h[9] = h9;
-}
-
-static inline void fe51_invert(fe51 *r, const fe51 *x)
-{
- fe51 z2, z9, z11, z2_5_0, z2_10_0, z2_20_0, z2_50_0, z2_100_0, t;
-
- /* 2 */ curve25519_sandy2x_fe51_nsquare(&z2, x, 1);
- /* 4 */ curve25519_sandy2x_fe51_nsquare(&t, (const fe51 *)&z2, 1);
- /* 8 */ curve25519_sandy2x_fe51_nsquare(&t, (const fe51 *)&t, 1);
- /* 9 */ curve25519_sandy2x_fe51_mul(&z9, (const fe51 *)&t, x);
- /* 11 */ curve25519_sandy2x_fe51_mul(&z11, (const fe51 *)&z9, (const fe51 *)&z2);
- /* 22 */ curve25519_sandy2x_fe51_nsquare(&t, (const fe51 *)&z11, 1);
- /* 2^5 - 2^0 = 31 */ curve25519_sandy2x_fe51_mul(&z2_5_0, (const fe51 *)&t, (const fe51 *)&z9);
-
- /* 2^10 - 2^5 */ curve25519_sandy2x_fe51_nsquare(&t, (const fe51 *)&z2_5_0, 5);
- /* 2^10 - 2^0 */ curve25519_sandy2x_fe51_mul(&z2_10_0, (const fe51 *)&t, (const fe51 *)&z2_5_0);
-
- /* 2^20 - 2^10 */ curve25519_sandy2x_fe51_nsquare(&t, (const fe51 *)&z2_10_0, 10);
- /* 2^20 - 2^0 */ curve25519_sandy2x_fe51_mul(&z2_20_0, (const fe51 *)&t, (const fe51 *)&z2_10_0);
-
- /* 2^40 - 2^20 */ curve25519_sandy2x_fe51_nsquare(&t, (const fe51 *)&z2_20_0, 20);
- /* 2^40 - 2^0 */ curve25519_sandy2x_fe51_mul(&t, (const fe51 *)&t, (const fe51 *)&z2_20_0);
-
- /* 2^50 - 2^10 */ curve25519_sandy2x_fe51_nsquare(&t, (const fe51 *)&t, 10);
- /* 2^50 - 2^0 */ curve25519_sandy2x_fe51_mul(&z2_50_0, (const fe51 *)&t, (const fe51 *)&z2_10_0);
-
- /* 2^100 - 2^50 */ curve25519_sandy2x_fe51_nsquare(&t, (const fe51 *)&z2_50_0, 50);
- /* 2^100 - 2^0 */ curve25519_sandy2x_fe51_mul(&z2_100_0, (const fe51 *)&t, (const fe51 *)&z2_50_0);
-
- /* 2^200 - 2^100 */ curve25519_sandy2x_fe51_nsquare(&t, (const fe51 *)&z2_100_0, 100);
- /* 2^200 - 2^0 */ curve25519_sandy2x_fe51_mul(&t, (const fe51 *)&t, (const fe51 *)&z2_100_0);
-
- /* 2^250 - 2^50 */ curve25519_sandy2x_fe51_nsquare(&t, (const fe51 *)&t, 50);
- /* 2^250 - 2^0 */ curve25519_sandy2x_fe51_mul(&t, (const fe51 *)&t, (const fe51 *)&z2_50_0);
-
- /* 2^255 - 2^5 */ curve25519_sandy2x_fe51_nsquare(&t, (const fe51 *)&t, 5);
- /* 2^255 - 21 */ curve25519_sandy2x_fe51_mul(r, (const fe51 *)t, (const fe51 *)&z11);
-}
-
-static void curve25519_sandy2x(u8 mypublic[CURVE25519_POINT_SIZE], const u8 secret[CURVE25519_POINT_SIZE], const u8 basepoint[CURVE25519_POINT_SIZE])
-{
- u8 e[32];
- fex var[3];
- fe51 x_51, z_51;
-
- memcpy(e, secret, 32);
- normalize_secret(e);
-#define x1 var[0]
-#define x2 var[1]
-#define z2 var[2]
- fex_frombytes(x1, basepoint);
- curve25519_sandy2x_ladder(var, e);
- z_51[0] = (z2[1] << 26) + z2[0];
- z_51[1] = (z2[3] << 26) + z2[2];
- z_51[2] = (z2[5] << 26) + z2[4];
- z_51[3] = (z2[7] << 26) + z2[6];
- z_51[4] = (z2[9] << 26) + z2[8];
- x_51[0] = (x2[1] << 26) + x2[0];
- x_51[1] = (x2[3] << 26) + x2[2];
- x_51[2] = (x2[5] << 26) + x2[4];
- x_51[3] = (x2[7] << 26) + x2[6];
- x_51[4] = (x2[9] << 26) + x2[8];
-#undef x1
-#undef x2
-#undef z2
- fe51_invert(&z_51, (const fe51 *)&z_51);
- curve25519_sandy2x_fe51_mul(&x_51, (const fe51 *)&x_51, (const fe51 *)&z_51);
- curve25519_sandy2x_fe51_pack(mypublic, (const fe51 *)&x_51);
-
- memzero_explicit(e, sizeof(e));
- memzero_explicit(var, sizeof(var));
- memzero_explicit(x_51, sizeof(x_51));
- memzero_explicit(z_51, sizeof(z_51));
-}
-
-static void curve25519_sandy2x_base(u8 pub[CURVE25519_POINT_SIZE], const u8 secret[CURVE25519_POINT_SIZE])
-{
- u8 e[32];
- fex var[3];
- fe51 x_51, z_51;
-
- memcpy(e, secret, 32);
- normalize_secret(e);
- curve25519_sandy2x_ladder_base(var, e);
-#define x2 var[0]
-#define z2 var[1]
- z_51[0] = (z2[1] << 26) + z2[0];
- z_51[1] = (z2[3] << 26) + z2[2];
- z_51[2] = (z2[5] << 26) + z2[4];
- z_51[3] = (z2[7] << 26) + z2[6];
- z_51[4] = (z2[9] << 26) + z2[8];
- x_51[0] = (x2[1] << 26) + x2[0];
- x_51[1] = (x2[3] << 26) + x2[2];
- x_51[2] = (x2[5] << 26) + x2[4];
- x_51[3] = (x2[7] << 26) + x2[6];
- x_51[4] = (x2[9] << 26) + x2[8];
-#undef x2
-#undef z2
- fe51_invert(&z_51, (const fe51 *)&z_51);
- curve25519_sandy2x_fe51_mul(&x_51, (const fe51 *)&x_51, (const fe51 *)&z_51);
- curve25519_sandy2x_fe51_pack(pub, (const fe51 *)&x_51);
-
- memzero_explicit(e, sizeof(e));
- memzero_explicit(var, sizeof(var));
- memzero_explicit(x_51, sizeof(x_51));
- memzero_explicit(z_51, sizeof(z_51));
+static void curve25519_bmi2(u8 shared[CURVE25519_POINT_SIZE], const u8 private_key[CURVE25519_POINT_SIZE], const u8 session_key[CURVE25519_POINT_SIZE])
+{
+ __aligned(32) u64 buffer[4 * NUM_WORDS_ELTFP25519];
+ __aligned(32) u64 coordinates[4 * NUM_WORDS_ELTFP25519];
+ __aligned(32) u64 workspace[6 * NUM_WORDS_ELTFP25519];
+ __aligned(32) u8 session[CURVE25519_POINT_SIZE];
+ __aligned(32) u8 private[CURVE25519_POINT_SIZE];
+
+ int i = 0, j = 0;
+ u64 prev = 0;
+ u64 *const X1 = (u64 *)session;
+ u64 *const key = (u64 *)private;
+ u64 *const Px = coordinates + 0;
+ u64 *const Pz = coordinates + 4;
+ u64 *const Qx = coordinates + 8;
+ u64 *const Qz = coordinates + 12;
+ u64 *const X2 = Qx;
+ u64 *const Z2 = Qz;
+ u64 *const X3 = Px;
+ u64 *const Z3 = Pz;
+ u64 *const X2Z2 = Qx;
+ u64 *const X3Z3 = Px;
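+ /* X2/Z2 alias Q = (Qx:Qz) and X3/Z3 alias P = (Px:Pz), giving the loop
+  * the usual Montgomery-ladder names; X2Z2 and X3Z3 name the same storage
+  * again so the 2-way (_2w) routines can treat each coordinate pair as a
+  * single two-element operand.
+  */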
+
+ u64 *const A = workspace + 0;
+ u64 *const B = workspace + 4;
+ u64 *const D = workspace + 8;
+ u64 *const C = workspace + 12;
+ u64 *const DA = workspace + 16;
+ u64 *const CB = workspace + 20;
+ u64 *const AB = A;
+ u64 *const DC = D;
+ u64 *const DACB = DA;
+ u64 *const buffer_1w = buffer;
+ u64 *const buffer_2w = buffer;
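+ /* AB, DC and DACB likewise pair adjacent workspace elements for the
+  * 2-way routines. buffer_1w and buffer_2w are scratch space; presumably
+  * the *_eltfp25519 helpers are macros that pick these names up from the
+  * enclosing scope, as nothing in this function body uses them directly.
+  */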
+
+ memcpy(private, private_key, sizeof(private));
+ memcpy(session, session_key, sizeof(session));
+
+ normalize_secret(private);
+
+ /* As in the draft:
+ * When receiving such an array, implementations of curve25519
+ * MUST mask the most-significant bit in the final byte. This
+ * is done to preserve compatibility with point formats which
+ * reserve the sign bit for use in other protocols and to
+ * increase resistance to implementation fingerprinting
+ */
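+ /* (1 << (255 % 8)) - 1 == 0x7f, so this clears only bit 255 of the
+  * little-endian 256-bit encoding.
+  */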
+ session[CURVE25519_POINT_SIZE - 1] &= (1 << (255 % 8)) - 1;
+
+ copy_eltfp25519_1w(Px, X1);
+ setzero_eltfp25519_1w(Pz);
+ setzero_eltfp25519_1w(Qx);
+ setzero_eltfp25519_1w(Qz);
+
+ Pz[0] = 1;
+ Qx[0] = 1;
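+ /* Ladder start: Q = (X2:Z2) = (1:0) is the point at infinity and
+  * P = (X3:Z3) = (X1:1) is the input point in projective form.
+  */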
+
+ /* main-loop */
+ prev = 0;
+ j = 62;
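+ /* Scan the scalar from bit 254, the highest bit set by
+  * normalize_secret(), down to bit 0; bit 255 is always clear
+  * after clamping.
+  */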
+ for (i = 3; i >= 0; --i) {
+ while (j >= 0) {
+ u64 bit = (key[i] >> j) & 0x1;
+ u64 swap = bit ^ prev;
+ prev = bit;
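+ /* Lazy conditional swap: instead of swapping before and after every
+  * step, a single branch-free cswap driven by bit ^ prev swaps the
+  * working values only when the scalar bit changes.
+  */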
+
+ add_eltfp25519_1w_bmi2(A, X2, Z2); /* A = (X2+Z2) */
+ sub_eltfp25519_1w(B, X2, Z2); /* B = (X2-Z2) */
+ add_eltfp25519_1w_bmi2(C, X3, Z3); /* C = (X3+Z3) */
+ sub_eltfp25519_1w(D, X3, Z3); /* D = (X3-Z3) */
+ mul_eltfp25519_2w_bmi2(DACB, AB, DC); /* [DA|CB] = [A|B]*[D|C] */
+
+ cswap(swap, A, C);
+ cswap(swap, B, D);
+
+ sqr_eltfp25519_2w_bmi2(AB); /* [AA|BB] = [A^2|B^2] */
+ add_eltfp25519_1w_bmi2(X3, DA, CB); /* X3 = (DA+CB) */
+ sub_eltfp25519_1w(Z3, DA, CB); /* Z3 = (DA-CB) */
+ sqr_eltfp25519_2w_bmi2(X3Z3); /* [X3|Z3] = [(DA+CB)|(DA-CB)]^2 */
+
+ copy_eltfp25519_1w(X2, B); /* X2 = B^2 */
+ sub_eltfp25519_1w(Z2, A, B); /* Z2 = E = AA-BB */
+
+ mul_a24_eltfp25519_1w(B, Z2); /* B = a24*E */
+ add_eltfp25519_1w_bmi2(B, B, X2); /* B = a24*E+B */
+ mul_eltfp25519_2w_bmi2(X2Z2, X2Z2, AB); /* [X2|Z2] = [B|E]*[A|a24*E+B] */
+ mul_eltfp25519_1w_bmi2(Z3, Z3, X1); /* Z3 = Z3*X1 */
+ --j;
+ }
+ j = 63;
+ }
+
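+ /* Convert Q to affine: shared = Qx * Qz^-1, with a final strong
+  * reduction (fred) to the canonical value below 2^255 - 19.
+  */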
+ inv_eltfp25519_1w_bmi2(A, Qz);
+ mul_eltfp25519_1w_bmi2((u64 *)shared, Qx, A);
+ fred_eltfp25519_1w((u64 *)shared);
+
+ memzero_explicit(buffer, sizeof(buffer));
+ memzero_explicit(coordinates, sizeof(coordinates));
+ memzero_explicit(workspace, sizeof(workspace));
+ memzero_explicit(private, sizeof(private));
+ memzero_explicit(session, sizeof(session));
+}
+
+static void curve25519_bmi2_base(u8 session_key[CURVE25519_POINT_SIZE], const u8 private_key[CURVE25519_POINT_SIZE])
+{
+ __aligned(32) u64 buffer[4 * NUM_WORDS_ELTFP25519];
+ __aligned(32) u64 coordinates[4 * NUM_WORDS_ELTFP25519];
+ __aligned(32) u64 workspace[4 * NUM_WORDS_ELTFP25519];
+ __aligned(32) u8 private[CURVE25519_POINT_SIZE];
+
+ const int ite[4] = { 64, 64, 64, 63 };
+ const int q = 3;
+ u64 swap = 1;
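+ /* normalize_secret() clears bits 0-2 and 255 of the scalar, so the
+  * ladder starts at bit q = 3 and the top word supplies only 63 bits
+  * (ite[] = {64, 64, 64, 63}); the factor of 2^q lost by skipping the
+  * low bits is restored by the doublings after the main loop.
+  */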
+
+ int i = 0, j = 0, k = 0;
+ u64 *const key = (u64 *)private;
+ u64 *const Ur1 = coordinates + 0;
+ u64 *const Zr1 = coordinates + 4;
+ u64 *const Ur2 = coordinates + 8;
+ u64 *const Zr2 = coordinates + 12;
+
+ u64 *const UZr1 = coordinates + 0;
+ u64 *const ZUr2 = coordinates + 8;
+
+ u64 *const A = workspace + 0;
+ u64 *const B = workspace + 4;
+ u64 *const C = workspace + 8;
+ u64 *const D = workspace + 12;
+
+ u64 *const AB = workspace + 0;
+ u64 *const CD = workspace + 8;
+
+ u64 *const buffer_1w = buffer;
+ u64 *const buffer_2w = buffer;
+ u64 *P = (u64 *)table_ladder_8k;
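+ /* P walks table_ladder_8k, the fixed-base precomputation: one 4-word
+  * (32-byte) entry per ladder step (the name suggests an 8 KiB table,
+  * i.e. 256 entries).
+  */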
+
+ memcpy(private, private_key, sizeof(private));
+
+ normalize_secret(private);
+
+ setzero_eltfp25519_1w(Ur1);
+ setzero_eltfp25519_1w(Zr1);
+ setzero_eltfp25519_1w(Zr2);
+ Ur1[0] = 1;
+ Zr1[0] = 1;
+ Zr2[0] = 1;
+
+ /* G-S */
+ Ur2[3] = 0x1eaecdeee27cab34;
+ Ur2[2] = 0xadc7a0b9235d48e2;
+ Ur2[1] = 0xbbf095ae14b2edf8;
+ Ur2[0] = 0x7e94e1fec82faabd;
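+ /* The second ladder point is seeded with a fixed constant (labeled
+  * "G-S" above) and Zr2 = 1, i.e. in affine form; it is presumably part
+  * of the same precomputation that produced table_ladder_8k.
+  */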
+
+ /* main-loop */
+ j = q;
+ for (i = 0; i < NUM_WORDS_ELTFP25519; ++i) {
+ while (j < ite[i]) {
+ u64 bit = (key[i] >> j) & 0x1;
+ k = (64 * i + j - q);
+ swap = swap ^ bit;
+ cswap(swap, Ur1, Ur2);
+ cswap(swap, Zr1, Zr2);
+ swap = bit;
+ /* Addition */
+ sub_eltfp25519_1w(B, Ur1, Zr1); /* B = Ur1-Zr1 */
+ add_eltfp25519_1w_bmi2(A, Ur1, Zr1); /* A = Ur1+Zr1 */
+ mul_eltfp25519_1w_bmi2(C, &P[4 * k], B);/* C = M*(Ur1-Zr1) */
+ sub_eltfp25519_1w(B, A, C); /* B = (Ur1+Zr1) - M*(Ur1-Zr1) */
+ add_eltfp25519_1w_bmi2(A, A, C); /* A = (Ur1+Zr1) + M*(Ur1-Zr1) */
+ sqr_eltfp25519_2w_bmi2(AB); /* A = A^2 | B = B^2 */
+ mul_eltfp25519_2w_bmi2(UZr1, ZUr2, AB); /* Ur1 = Zr2*A | Zr1 = Ur2*B */
+ ++j;
+ }
+ j = 0;
+ }
+
+ /* Doubling */
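+ /* q = 3 doublings multiply the result by 2^3 = 8, restoring the factor
+  * lost by starting the main loop at bit q of the clamped scalar.
+  */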
+ for (i = 0; i < q; ++i) {
+ add_eltfp25519_1w_bmi2(A, Ur1, Zr1); /* A = Ur1+Zr1 */
+ sub_eltfp25519_1w(B, Ur1, Zr1); /* B = Ur1-Zr1 */
+ sqr_eltfp25519_2w_bmi2(AB); /* A = A^2 | B = B^2 */
+ copy_eltfp25519_1w(C, B); /* C = B */
+ sub_eltfp25519_1w(B, A, B); /* B = A-B */
+ mul_a24_eltfp25519_1w(D, B); /* D = a24*B */
+ add_eltfp25519_1w_bmi2(D, D, C); /* D = D+C */
+ mul_eltfp25519_2w_bmi2(UZr1, AB, CD); /* Ur1 = A*C | Zr1 = B*D */
+ }
+
+ /* Convert to affine coordinates */
+ inv_eltfp25519_1w_bmi2(A, Zr1);
+ mul_eltfp25519_1w_bmi2((u64 *)session_key, Ur1, A);
+ fred_eltfp25519_1w((u64 *)session_key);
+
+ memzero_explicit(buffer, sizeof(buffer));
+ memzero_explicit(coordinates, sizeof(coordinates));
+ memzero_explicit(workspace, sizeof(workspace));
+ memzero_explicit(private, sizeof(private));
}
diff --git a/src/crypto/curve25519.c b/src/crypto/curve25519.c
index eba94cd..8de8909 100644
--- a/src/crypto/curve25519.c
+++ b/src/crypto/curve25519.c
@@ -17,7 +17,7 @@ static __always_inline void normalize_secret(u8 secret[CURVE25519_POINT_SIZE])
secret[31] |= 64;
}
-#if defined(CONFIG_X86_64) && defined(CONFIG_AS_AVX)
+#if defined(CONFIG_X86_64)
#include "curve25519-x86_64.h"
#elif IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && defined(CONFIG_ARM)
#include "curve25519-arm.h"
@@ -35,12 +35,12 @@ static const u8 null_point[CURVE25519_POINT_SIZE] = { 0 };
bool curve25519(u8 mypublic[CURVE25519_POINT_SIZE], const u8 secret[CURVE25519_POINT_SIZE], const u8 basepoint[CURVE25519_POINT_SIZE])
{
-#if defined(CONFIG_X86_64) && defined(CONFIG_AS_AVX)
- if (curve25519_use_avx && irq_fpu_usable()) {
- kernel_fpu_begin();
- curve25519_sandy2x(mypublic, secret, basepoint);
- kernel_fpu_end();
- } else
+#if defined(CONFIG_X86_64)
+ if (curve25519_use_adx)
+ curve25519_adx(mypublic, secret, basepoint);
+ else if (curve25519_use_bmi2)
+ curve25519_bmi2(mypublic, secret, basepoint);
+ else
#elif IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && defined(CONFIG_ARM)
if (curve25519_use_neon && may_use_simd()) {
kernel_neon_begin();
@@ -60,11 +60,13 @@ bool curve25519_generate_public(u8 pub[CURVE25519_POINT_SIZE], const u8 secret[C
if (unlikely(!crypto_memneq(secret, null_point, CURVE25519_POINT_SIZE)))
return false;
-#if defined(CONFIG_X86_64) && defined(CONFIG_AS_AVX)
- if (curve25519_use_avx && irq_fpu_usable()) {
- kernel_fpu_begin();
- curve25519_sandy2x_base(pub, secret);
- kernel_fpu_end();
+#if defined(CONFIG_X86_64)
+ if (curve25519_use_adx) {
+ curve25519_adx_base(pub, secret);
+ return crypto_memneq(pub, null_point, CURVE25519_POINT_SIZE);
+ }
+ if (curve25519_use_bmi2) {
+ curve25519_bmi2_base(pub, secret);
return crypto_memneq(pub, null_point, CURVE25519_POINT_SIZE);
}
#endif