summaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authorRené van Dorst <opensource@vdorst.com>2018-09-25 14:04:35 +0200
committerJason A. Donenfeld <Jason@zx2c4.com>2018-09-25 15:46:49 +0200
commit17f64817393f6f36f012da8a5b3a6bd60d4c1e1d (patch)
tree7492c76f96fbab4a5456d305a0c7ddbee85bbf15
parent5b46a8c6ae38ef48e7fcb76403e92d9ce62e7ad5 (diff)
chacha20-mips32r2: reduce stack and branches in loop, refactor jumptable handling
Signed-off-by: René van Dorst <opensource@vdorst.com> Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
-rw-r--r--src/crypto/zinc/chacha20/chacha20-mips.S173
1 files changed, 94 insertions, 79 deletions
diff --git a/src/crypto/zinc/chacha20/chacha20-mips.S b/src/crypto/zinc/chacha20/chacha20-mips.S
index 8796da3..031ee5e 100644
--- a/src/crypto/zinc/chacha20/chacha20-mips.S
+++ b/src/crypto/zinc/chacha20/chacha20-mips.S
@@ -5,9 +5,8 @@
*/
#define MASK_U32 0x3c
-#define MASK_BYTES 0x03
#define CHACHA20_BLOCK_SIZE 64
-#define STACK_SIZE 40
+#define STACK_SIZE 32
#define X0 $t0
#define X1 $t1
@@ -19,7 +18,7 @@
#define X7 $t7
#define X8 $t8
#define X9 $t9
-#define X10 $s7
+#define X10 $v1
#define X11 $s6
#define X12 $s5
#define X13 $s4
@@ -49,10 +48,9 @@
* They are used to handling the last bytes which are not multiple of 4.
*/
#define SAVED_X X15
-#define SAVED_CA $fp
+#define SAVED_CA $s7
-#define PTR_LAST_ROUND $v1
-#define IS_UNALIGNED $fp
+#define IS_UNALIGNED $s7
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
#define MSB 0
@@ -212,12 +210,9 @@ chacha20_mips:
/* Return bytes = 0. */
beqz BYTES, .Lchacha20_mips_end
- /* Calculate PTR_LAST_ROUND */
- addiu PTR_LAST_ROUND, BYTES, -1
- ins PTR_LAST_ROUND, $zero, 0, 6
- addu PTR_LAST_ROUND, OUT
+ lw NONCE_0, 48(STATE)
- /* Save s0-s7, fp */
+ /* Save s0-s7 */
sw $s0, 0($sp)
sw $s1, 4($sp)
sw $s2, 8($sp)
@@ -226,9 +221,6 @@ chacha20_mips:
sw $s5, 20($sp)
sw $s6, 24($sp)
sw $s7, 28($sp)
- sw $fp, 32($sp)
-
- lw NONCE_0, 48(STATE)
/* Test IN or OUT is unaligned.
* IS_UNALIGNED = ( IN | OUT ) & 0x00000003
@@ -236,7 +228,8 @@ chacha20_mips:
or IS_UNALIGNED, IN, OUT
andi IS_UNALIGNED, 0x3
- andi BYTES, (CHACHA20_BLOCK_SIZE-1)
+ /* Set number of rounds */
+ li $at, 20
b .Lchacha20_rounds_start
@@ -266,7 +259,6 @@ chacha20_mips:
lw X14, 56(STATE)
lw X15, 60(STATE)
- li $at, 20
.Loop_chacha20_xor_rounds:
addiu $at, -2
AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 16);
@@ -279,110 +271,107 @@ chacha20_mips:
AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 7);
bnez $at, .Loop_chacha20_xor_rounds
- andi $at, BYTES, MASK_U32
+ addiu BYTES, -(CHACHA20_BLOCK_SIZE)
/* Is data src/dst unaligned? Jump */
bnez IS_UNALIGNED, .Loop_chacha20_unaligned
- /* Load upper half of jump table addr */
- lui T0, %hi(.Lchacha20_mips_jmptbl_aligned_0)
+ /* Set number of rounds here to fill delay slot. */
+ li $at, 20
- /* Last round? No, do a full block. */
- bne OUT, PTR_LAST_ROUND, .Lchacha20_mips_xor_aligned_16_b
+ /* BYTES < 0, it has no full block. */
+ bltz BYTES, .Lchacha20_mips_no_full_block_aligned
- /* Calculate lower half jump table offset */
- ins T0, $at, 1, 6
+ FOR_EACH_WORD_REV(STORE_ALIGNED)
- /* Full block? Jump */
- beqz BYTES, .Lchacha20_mips_xor_aligned_16_b
+ /* BYTES > 0? Loop again. */
+ bgtz BYTES, .Loop_chacha20_rounds
- /* Add STATE with offset */
- addu T1, STATE, $at
+ /* Place this here to fill delay slot */
+ addiu NONCE_0, 1
- /* Add lower half jump table addr */
- addiu T0, %lo(.Lchacha20_mips_jmptbl_aligned_0)
+ /* BYTES < 0? Handle last bytes */
+ bltz BYTES, .Lchacha20_mips_xor_bytes
- /* Read value from STATE */
- lw SAVED_CA, 0(T1)
+.Lchacha20_mips_xor_done:
+ /* Restore used registers */
+ lw $s0, 0($sp)
+ lw $s1, 4($sp)
+ lw $s2, 8($sp)
+ lw $s3, 12($sp)
+ lw $s4, 16($sp)
+ lw $s5, 20($sp)
+ lw $s6, 24($sp)
+ lw $s7, 28($sp)
- jr T0
+ /* Write NONCE_0 back to right location in state */
+ sw NONCE_0, 48(STATE)
-.Loop_chacha20_unaligned:
- /* Load upper half of jump table addr */
- lui T0, %hi(.Lchacha20_mips_jmptbl_unaligned_0)
+.Lchacha20_mips_end:
+ addiu $sp, STACK_SIZE
+ jr $ra
- /* Last round? No, do a full block. */
- bne OUT, PTR_LAST_ROUND, .Lchacha20_mips_xor_unaligned_16_b
+.Lchacha20_mips_no_full_block_aligned:
+ /* Restore the offset on BYTES */
+ addiu BYTES, CHACHA20_BLOCK_SIZE
+
+ /* Get number of full WORDS */
+ andi $at, BYTES, MASK_U32
+
+ /* Load upper half of jump table addr */
+ lui T0, %hi(.Lchacha20_mips_jmptbl_aligned_0)
/* Calculate lower half jump table offset */
ins T0, $at, 1, 6
- /* Full block? Jump */
- beqz BYTES, .Lchacha20_mips_xor_unaligned_16_b
-
- /* Add STATE with offset */
+ /* Add offset to STATE */
addu T1, STATE, $at
/* Add lower half jump table addr */
- addiu T0, %lo(.Lchacha20_mips_jmptbl_unaligned_0)
+ addiu T0, %lo(.Lchacha20_mips_jmptbl_aligned_0)
/* Read value from STATE */
lw SAVED_CA, 0(T1)
- jr T0
+ /* Store remaining bytecounter as negative value */
+ subu BYTES, $at, BYTES
-/* Aligned code path
- */
-.align 4
- FOR_EACH_WORD_REV(STORE_ALIGNED)
-
- bne OUT, PTR_LAST_ROUND, .Loop_chacha20_rounds
+ jr T0
- /* Increase NONCE_0, return NONCE_0 value */
- addiu NONCE_0, 1
- bne $at, BYTES, .Lchacha20_mips_xor_bytes
+ /* Jump table */
+ FOR_EACH_WORD(JMPTBL_ALIGNED)
-.Lchacha20_mips_xor_done:
- /* Write NONCE_0 back to right location in state */
- sw NONCE_0, 48(STATE)
- /* Restore used registers */
- lw $s0, 0($sp)
- lw $s1, 4($sp)
- lw $s2, 8($sp)
- lw $s3, 12($sp)
- lw $s4, 16($sp)
- lw $s5, 20($sp)
- lw $s6, 24($sp)
- lw $s7, 28($sp)
- lw $fp, 32($sp)
-.Lchacha20_mips_end:
- addiu $sp, STACK_SIZE
- jr $ra
+.Loop_chacha20_unaligned:
+ /* Set number of rounds here to fill delay slot. */
+ li $at, 20
- /* Jump table */
- FOR_EACH_WORD(JMPTBL_ALIGNED)
+ /* BYTES < 0, it has no full block. */
+ bltz BYTES, .Lchacha20_mips_no_full_block_unaligned
- /* Unaligned code path */
FOR_EACH_WORD_REV(STORE_UNALIGNED)
- bne OUT, PTR_LAST_ROUND, .Loop_chacha20_rounds
+ /* BYTES > 0? Loop again. */
+ bgtz BYTES, .Loop_chacha20_rounds
+ /* Write NONCE_0 back to right location in state */
+ sw NONCE_0, 48(STATE)
+
+ .set noreorder
/* Fall through to byte handling */
- .set noreorder
- beq $at, BYTES, .Lchacha20_mips_xor_done
- /* Empty delayslot, increase NONCE_0, return NONCE_0 value */
+ bgez BYTES, .Lchacha20_mips_xor_done
.Lchacha20_mips_xor_unaligned_0_b:
.Lchacha20_mips_xor_aligned_0_b:
+ /* Place this here to fill delay slot */
addiu NONCE_0, 1
- .set reorder
+ .set reorder
.Lchacha20_mips_xor_bytes:
addu IN, $at
addu OUT, $at
/* First byte */
lbu T1, 0(IN)
- andi $at, BYTES, 2
+ addiu $at, BYTES, 1
CPU_TO_LE32(SAVED_X)
ROTR(SAVED_X)
xor T1, SAVED_X
@@ -390,7 +379,7 @@ chacha20_mips:
beqz $at, .Lchacha20_mips_xor_done
/* Second byte */
lbu T1, 1(IN)
- andi $at, BYTES, 1
+ addiu $at, BYTES, 2
ROTx SAVED_X, 8
xor T1, SAVED_X
sb T1, 1(OUT)
@@ -402,7 +391,33 @@ chacha20_mips:
sb T1, 2(OUT)
b .Lchacha20_mips_xor_done
-.Lchacha20_mips_jmptbl_unaligned:
+.Lchacha20_mips_no_full_block_unaligned:
+ /* Restore the offset on BYTES */
+ addiu BYTES, CHACHA20_BLOCK_SIZE
+
+ /* Get number of full WORDS */
+ andi $at, BYTES, MASK_U32
+
+ /* Load upper half of jump table addr */
+ lui T0, %hi(.Lchacha20_mips_jmptbl_unaligned_0)
+
+ /* Calculate lower half jump table offset */
+ ins T0, $at, 1, 6
+
+ /* Add offset to STATE */
+ addu T1, STATE, $at
+
+ /* Add lower half jump table addr */
+ addiu T0, %lo(.Lchacha20_mips_jmptbl_unaligned_0)
+
+ /* Read value from STATE */
+ lw SAVED_CA, 0(T1)
+
+ /* Store remaining bytecounter as negative value */
+ subu BYTES, $at, BYTES
+
+ jr T0
+
/* Jump table */
FOR_EACH_WORD(JMPTBL_UNALIGNED)
.end chacha20_mips