From 06bab82a51b769ca96297d09ed96afcbcfb36189 Mon Sep 17 00:00:00 2001
From: Samuel Neves <sneves@dei.uc.pt>
Date: Sat, 28 Jul 2018 06:09:52 +0100
Subject: curve25519-x86_64: simplify the final reduction by adding 19
 beforehand

Correctness can be quickly verified with the following z3py script:

>>> from z3 import *
>>> x = BitVec("x", 256) # any 256-bit value
>>> ref = URem(x, 2**255 - 19) # correct value
>>> t = Extract(255, 255, x); x &= 2**255 - 1; # btrq $63, %3
>>> u = If(t != 0, BitVecVal(38, 256), BitVecVal(19, 256)) # cmovncl %k5, %k4
>>> x += u # addq %4, %0; adcq $0, %1; adcq $0, %2; adcq $0, %3;
>>> t = Extract(255, 255, x); x &= 2**255 - 1; # btrq $63, %3
>>> u = If(t != 0, BitVecVal(0, 256), BitVecVal(19, 256)) # cmovnsl %k5, %k4
>>> x -= u # subq %4, %0; sbbq $0, %1; sbbq $0, %2; sbbq $0, %3;
>>> prove(x == ref)
proved

Change inspired by Andy Polyakov's OpenSSL implementation.

Signed-off-by: Samuel Neves <sneves@dei.uc.pt>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
 src/crypto/curve25519-x86_64.h | 66 +++++++++++++++++-------------------------
 1 file changed, 26 insertions(+), 40 deletions(-)

diff --git a/src/crypto/curve25519-x86_64.h b/src/crypto/curve25519-x86_64.h
index fba4f00..29204de 100644
--- a/src/crypto/curve25519-x86_64.h
+++ b/src/crypto/curve25519-x86_64.h
@@ -1609,48 +1609,34 @@ static void inv_eltfp25519_1w_bmi2(u64 *const c, const u64 *const a)
  */
 static __always_inline void fred_eltfp25519_1w(u64 *const c)
 {
+	u64 tmp0, tmp1;
 	asm volatile(
-		/* First, obtains a number less than 2^255. */
-		"btrq   $63, 24(%0) ;"
-		"sbbl %%ecx, %%ecx  ;"
-		"andq   $19, %%rcx  ;"
-		"addq %%rcx,   (%0) ;"
-		"adcq    $0,  8(%0) ;"
-		"adcq    $0, 16(%0) ;"
-		"adcq    $0, 24(%0) ;"
-
-		"btrq   $63, 24(%0) ;"
-		"sbbl %%ecx, %%ecx  ;"
-		"andq   $19, %%rcx  ;"
-		"addq %%rcx,   (%0) ;"
-		"adcq    $0,  8(%0) ;"
-		"adcq    $0, 16(%0) ;"
-		"adcq    $0, 24(%0) ;"
-
-		/* Then, in case the number fall into [2^255-19, 2^255-1] */
-		"cmpq $-19,   (%0)   ;"
-		"setaeb %%al         ;"
-		"cmpq  $-1,  8(%0)   ;"
-		"setzb %%bl          ;"
-		"cmpq  $-1, 16(%0)   ;"
-		"setzb %%cl          ;"
-		"movq 24(%0), %%rdx  ;"
-		"addq   $1, %%rdx    ;"
-		"shrq  $63, %%rdx    ;"
-		"andb %%bl, %%al     ;"
-		"andb %%dl, %%cl     ;"
-		"test %%cl, %%al     ;"
-		"movl  $0, %%eax     ;"
-		"movl $19, %%ecx     ;"
-		"cmovnz %%rcx, %%rax ;"
-		"addq %%rax,   (%0)  ;"
-		"adcq    $0,  8(%0)  ;"
-		"adcq    $0, 16(%0)  ;"
-		"adcq    $0, 24(%0)  ;"
-		"btrq   $63, 24(%0)  ;"
+		"movl   $19,   %k5 ;"
+		"movl   $38,   %k4 ;"
+
+		"btrq   $63,    %3 ;" /* Put bit 255 in carry flag and clear */
+		"cmovncl %k5,   %k4 ;" /* c[255] ? 38 : 19 */
+
+		/* Add either 19 or 38 to c */
+		"addq    %4,   %0 ;"
+		"adcq    $0,   %1 ;"
+		"adcq    $0,   %2 ;"
+		"adcq    $0,   %3 ;"
+
+		/* Test for bit 255 again; only triggered on overflow modulo 2^255-19 */
+		"movl    $0,  %k4 ;"
+		"cmovnsl %k5,  %k4 ;" /* c[255] ? 0 : 19 */
+		"btrq   $63,   %3 ;" /* Clear bit 255 */
+
+		/* Subtract 19 if necessary */
+		"subq    %4,   %0 ;"
+		"sbbq    $0,   %1 ;"
+		"sbbq    $0,   %2 ;"
+		"sbbq    $0,   %3 ;"
+
+		: "+r"(c[0]), "+r"(c[1]), "+r"(c[2]), "+r"(c[3]), "=r"(tmp0), "=r"(tmp1)
 		:
-		: "r"(c)
-		: "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx");
+		: "memory", "cc");
 }
 
 static __always_inline void cswap(u8 bit, u64 *const px, u64 *const py)
-- 
cgit v1.2.3