From 06bab82a51b769ca96297d09ed96afcbcfb36189 Mon Sep 17 00:00:00 2001 From: Samuel Neves <sneves@dei.uc.pt> Date: Sat, 28 Jul 2018 06:09:52 +0100 Subject: curve25519-x86_64: simplify the final reduction by adding 19 beforehand Correctness can be quickly verified with the following z3py script: >>> from z3 import * >>> x = BitVec("x", 256) # any 256-bit value >>> ref = URem(x, 2**255 - 19) # correct value >>> t = Extract(255, 255, x); x &= 2**255 - 1; # btrq $63, %3 >>> u = If(t != 0, BitVecVal(38, 256), BitVecVal(19, 256)) # cmovncl %k5, %k4 >>> x += u # addq %4, %0; adcq $0, %1; adcq $0, %2; adcq $0, %3; >>> t = Extract(255, 255, x); x &= 2**255 - 1; # btrq $63, %3 >>> u = If(t != 0, BitVecVal(0, 256), BitVecVal(19, 256)) # cmovnsl %k5, %k4 >>> x -= u # subq %4, %0; sbbq $0, %1; sbbq $0, %2; sbbq $0, %3; >>> prove(x == ref) proved Change inspired by Andy Polyakov's OpenSSL implementation. Signed-off-by: Samuel Neves <sneves@dei.uc.pt> Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com> --- src/crypto/curve25519-x86_64.h | 66 +++++++++++++++++------------------------- 1 file changed, 26 insertions(+), 40 deletions(-) diff --git a/src/crypto/curve25519-x86_64.h b/src/crypto/curve25519-x86_64.h index fba4f00..29204de 100644 --- a/src/crypto/curve25519-x86_64.h +++ b/src/crypto/curve25519-x86_64.h @@ -1609,48 +1609,34 @@ static void inv_eltfp25519_1w_bmi2(u64 *const c, const u64 *const a) */ static __always_inline void fred_eltfp25519_1w(u64 *const c) { + u64 tmp0, tmp1; asm volatile( - /* First, obtains a number less than 2^255. 
*/ - "btrq $63, 24(%0) ;" - "sbbl %%ecx, %%ecx ;" - "andq $19, %%rcx ;" - "addq %%rcx, (%0) ;" - "adcq $0, 8(%0) ;" - "adcq $0, 16(%0) ;" - "adcq $0, 24(%0) ;" - - "btrq $63, 24(%0) ;" - "sbbl %%ecx, %%ecx ;" - "andq $19, %%rcx ;" - "addq %%rcx, (%0) ;" - "adcq $0, 8(%0) ;" - "adcq $0, 16(%0) ;" - "adcq $0, 24(%0) ;" - - /* Then, in case the number fall into [2^255-19, 2^255-1] */ - "cmpq $-19, (%0) ;" - "setaeb %%al ;" - "cmpq $-1, 8(%0) ;" - "setzb %%bl ;" - "cmpq $-1, 16(%0) ;" - "setzb %%cl ;" - "movq 24(%0), %%rdx ;" - "addq $1, %%rdx ;" - "shrq $63, %%rdx ;" - "andb %%bl, %%al ;" - "andb %%dl, %%cl ;" - "test %%cl, %%al ;" - "movl $0, %%eax ;" - "movl $19, %%ecx ;" - "cmovnz %%rcx, %%rax ;" - "addq %%rax, (%0) ;" - "adcq $0, 8(%0) ;" - "adcq $0, 16(%0) ;" - "adcq $0, 24(%0) ;" - "btrq $63, 24(%0) ;" + "movl $19, %k5 ;" + "movl $38, %k4 ;" + + "btrq $63, %3 ;" /* Put bit 255 in carry flag and clear */ + "cmovncl %k5, %k4 ;" /* c[255] ? 38 : 19 */ + + /* Add either 19 or 38 to c */ + "addq %4, %0 ;" + "adcq $0, %1 ;" + "adcq $0, %2 ;" + "adcq $0, %3 ;" + + /* Test for bit 255 again; only triggered on overflow modulo 2^255-19 */ + "movl $0, %k4 ;" + "cmovnsl %k5, %k4 ;" /* c[255] ? 0 : 19 */ + "btrq $63, %3 ;" /* Clear bit 255 */ + + /* Subtract 19 if necessary */ + "subq %4, %0 ;" + "sbbq $0, %1 ;" + "sbbq $0, %2 ;" + "sbbq $0, %3 ;" + + : "+r"(c[0]), "+r"(c[1]), "+r"(c[2]), "+r"(c[3]), "=r"(tmp0), "=r"(tmp1) : - : "r"(c) - : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx"); + : "memory", "cc"); } static __always_inline void cswap(u8 bit, u64 *const px, u64 *const py) -- cgit v1.2.3