diff options
-rw-r--r-- | src/crypto/zinc/chacha20/chacha20-mips-glue.h | 7 | ||||
-rw-r--r-- | src/crypto/zinc/chacha20/chacha20-mips.S | 315 |
2 files changed, 143 insertions, 179 deletions
diff --git a/src/crypto/zinc/chacha20/chacha20-mips-glue.h b/src/crypto/zinc/chacha20/chacha20-mips-glue.h index e38098e..929ca12 100644 --- a/src/crypto/zinc/chacha20/chacha20-mips-glue.h +++ b/src/crypto/zinc/chacha20/chacha20-mips-glue.h @@ -3,8 +3,8 @@ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. */ -asmlinkage void chacha20_mips(u8 *out, const u8 *in, const size_t len, - const u32 key[8], const u32 counter[4]); +asmlinkage void chacha20_mips(u32 state[16], u8 *out, const u8 *in, + const size_t len); static void __init chacha20_fpu_init(void) { } @@ -13,8 +13,7 @@ static inline bool chacha20_arch(struct chacha20_ctx *state, u8 *dst, const u8 *src, const size_t len, simd_context_t *simd_context) { - chacha20_mips(dst, src, len, state->key, state->counter); - state->counter[0] += (len + 63) / 64; + chacha20_mips((u32 *)state, dst, src, len); return true; } diff --git a/src/crypto/zinc/chacha20/chacha20-mips.S b/src/crypto/zinc/chacha20/chacha20-mips.S index 2b82ebf..7e2b5e8 100644 --- a/src/crypto/zinc/chacha20/chacha20-mips.S +++ b/src/crypto/zinc/chacha20/chacha20-mips.S @@ -7,7 +7,7 @@ #define MASK_U32 0x3c #define MASK_BYTES 0x03 #define CHACHA20_BLOCK_SIZE 64 -#define STACK_SIZE 64 +#define STACK_SIZE 40 #define X0 $t0 #define X1 $t1 @@ -17,8 +17,8 @@ #define X5 $t5 #define X6 $t6 #define X7 $t7 -#define X8 $v1 -#define X9 $fp +#define X8 $t8 +#define X9 $t9 #define X10 $s7 #define X11 $s6 #define X12 $s5 @@ -32,13 +32,10 @@ #define X(n) X ## n /* Input arguments */ -#define OUT $a0 -#define IN $a1 -#define BYTES $a2 -/* KEY and NONCE argument must be u32 aligned */ -#define KEY $a3 -/* NONCE pointer is given via stack, must be u32 aligned */ -#define NONCE $t9 +#define STATE $a0 +#define OUT $a1 +#define IN $a2 +#define BYTES $a3 /* Output argument */ /* NONCE[0] is kept in a register and not in memory. @@ -54,16 +51,8 @@ #define SAVED_X X15 #define SAVED_CA $ra -#define PTR_LAST_ROUND $t8 - -/* ChaCha20 constants and stack location */ -#define CONSTANT_OFS_SP 48 -#define UNALIGNED_OFS_SP 40 - -#define CONSTANT_1 0x61707865 -#define CONSTANT_2 0x3320646e -#define CONSTANT_3 0x79622d32 -#define CONSTANT_4 0x6b206574 +#define PTR_LAST_ROUND $v1 +#define IS_UNALIGNED $fp #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ #define MSB 0 @@ -81,65 +70,121 @@ #define ROTR(n) #endif -#define STORE_UNALIGNED(x, a, s, o) \ -.Lchacha20_mips_xor_unaligned_ ## x ## _b: ; \ - .if ((s != NONCE) || (o != 0)); \ - lw T0, o(s); \ +#define FOR_EACH_WORD(x) \ + x( 0); \ + x( 1); \ + x( 2); \ + x( 3); \ + x( 4); \ + x( 5); \ + x( 6); \ + x( 7); \ + x( 8); \ + x( 9); \ + x(10); \ + x(11); \ + x(12); \ + x(13); \ + x(14); \ + x(15); + +#define FOR_EACH_WORD_REV(x) \ + x(15); \ + x(14); \ + x(13); \ + x(12); \ + x(11); \ + x(10); \ + x( 9); \ + x( 8); \ + x( 7); \ + x( 6); \ + x( 5); \ + x( 4); \ + x( 3); \ + x( 2); \ + x( 1); \ + x( 0); + +#define PLUS_ONE_0 1 +#define PLUS_ONE_1 2 +#define PLUS_ONE_2 3 +#define PLUS_ONE_3 4 +#define PLUS_ONE_4 5 +#define PLUS_ONE_5 6 +#define PLUS_ONE_6 7 +#define PLUS_ONE_7 8 +#define PLUS_ONE_8 9 +#define PLUS_ONE_9 10 +#define PLUS_ONE_10 11 +#define PLUS_ONE_11 12 +#define PLUS_ONE_12 13 +#define PLUS_ONE_13 14 +#define PLUS_ONE_14 15 +#define PLUS_ONE_15 16 +#define PLUS_ONE(x) PLUS_ONE_ ## x +#define _CONCAT3(a,b,c) a ## b ## c +#define CONCAT3(a,b,c) _CONCAT3(a,b,c) + +#define STORE_UNALIGNED(x) \ +CONCAT3(.Lchacha20_mips_xor_unaligned_, PLUS_ONE(x), _b: ;) \ + .if (x != 12); \ + lw T0, (x*4)(STATE); \ .endif; \ - lwl T1, x-4+MSB ## (IN); \ - lwr T1, x-4+LSB ## (IN); \ - .if ((s == NONCE) && (o == 0)); \ - addu X ## a, NONCE_0; \ + lwl T1, (x*4)+MSB ## (IN); \ + lwr T1, (x*4)+LSB ## (IN); \ + .if (x == 12); \ + addu X ## x, NONCE_0; \ .else; \ - addu X ## a, T0; \ + addu X ## x, T0; \ .endif; \ - CPU_TO_LE32(X ## a); \ - xor X ## a, T1; \ - swl X ## a, x-4+MSB ## (OUT); \ - swr X ## a, x-4+LSB ## (OUT); - -#define STORE_ALIGNED(x, a, s, o) \ -.Lchacha20_mips_xor_aligned_ ## x ## _b: ; \ - .if ((s != NONCE) || (o != 0)); \ - lw T0, o(s); \ + CPU_TO_LE32(X ## x); \ + xor X ## x, T1; \ + swl X ## x, (x*4)+MSB ## (OUT); \ + swr X ## x, (x*4)+LSB ## (OUT); + +#define STORE_ALIGNED(x) \ +CONCAT3(.Lchacha20_mips_xor_aligned_, PLUS_ONE(x), _b: ;) \ + .if (x != 12); \ + lw T0, (x*4)(STATE); \ .endif; \ - lw T1, x-4 ## (IN); \ - .if ((s == NONCE) && (o == 0)); \ - addu X ## a, NONCE_0; \ + lw T1, (x*4) ## (IN); \ + .if (x == 12); \ + addu X ## x, NONCE_0; \ .else; \ - addu X ## a, T0; \ + addu X ## x, T0; \ .endif; \ - CPU_TO_LE32(X ## a); \ - xor X ## a, T1; \ - sw X ## a, x-4 ## (OUT); + CPU_TO_LE32(X ## x); \ + xor X ## x, T1; \ + sw X ## x, (x*4) ## (OUT); /* Jump table macro. * Used for setup and handling the last bytes, which are not multiple of 4. * X15 is free to store Xn * Every jumptable entry must be equal in size. */ -#define JMPTBL_ALIGNED(x, a, s, o) \ -.Lchacha20_mips_jmptbl_aligned_ ## a: ; \ +#define JMPTBL_ALIGNED(x) \ +.Lchacha20_mips_jmptbl_aligned_ ## x: ; \ .set noreorder; \ - .if ((s == NONCE) && (o == 0)); \ + .if (x == 12); \ move SAVED_CA, NONCE_0; \ .else; \ - lw SAVED_CA, o(s);\ + lw SAVED_CA, (x*4)(STATE); \ .endif; \ b .Lchacha20_mips_xor_aligned_ ## x ## _b; \ - move SAVED_X, X ## a; \ + move SAVED_X, X ## x; \ .set reorder -#define JMPTBL_UNALIGNED(x, a, s, o) \ -.Lchacha20_mips_jmptbl_unaligned_ ## a: ; \ +#define JMPTBL_UNALIGNED(x) \ +.Lchacha20_mips_jmptbl_unaligned_ ## x: ; \ .set noreorder; \ - .if ((s == NONCE) && (o == 0)); \ + .if (x == 12); \ move SAVED_CA, NONCE_0; \ .else; \ - lw SAVED_CA, o(s);\ + lw SAVED_CA, (x*4)(STATE);\ .endif; \ b .Lchacha20_mips_xor_unaligned_ ## x ## _b; \ - move SAVED_X, X ## a; \ + move SAVED_X, X ## x; \ .set reorder #define AXR(A, B, C, D, K, L, M, N, V, W, Y, Z, S) \ @@ -163,8 +208,6 @@ .ent chacha20_mips chacha20_mips: .frame $sp, STACK_SIZE, $ra - /* This is in the fifth argument */ - lw NONCE, 16($sp) addiu $sp, -STACK_SIZE @@ -176,7 +219,7 @@ chacha20_mips: ins PTR_LAST_ROUND, $zero, 0, 6 addu PTR_LAST_ROUND, OUT - /* Save s0-s7, fp, ra. */ + /* Save s0-s7, ra, fp */ sw $ra, 0($sp) sw $fp, 4($sp) sw $s0, 8($sp) @@ -188,30 +231,13 @@ chacha20_mips: sw $s6, 32($sp) sw $s7, 36($sp) - lw NONCE_0, 0(NONCE) + lw NONCE_0, 48(STATE) + /* Test IN or OUT is unaligned. - * UNALIGNED (T1) = ( IN | OUT ) & 0x00000003 + * IS_UNALIGNED = ( IN | OUT ) & 0x00000003 */ - or T1, IN, OUT - andi T1, 0x3 - - /* Load constant */ - lui X0, %hi(CONSTANT_1) - lui X1, %hi(CONSTANT_2) - lui X2, %hi(CONSTANT_3) - lui X3, %hi(CONSTANT_4) - ori X0, %lo(CONSTANT_1) - ori X1, %lo(CONSTANT_2) - ori X2, %lo(CONSTANT_3) - ori X3, %lo(CONSTANT_4) - - /* Store constant on stack. */ - sw X0, 0+CONSTANT_OFS_SP($sp) - sw X1, 4+CONSTANT_OFS_SP($sp) - sw X2, 8+CONSTANT_OFS_SP($sp) - sw X3, 12+CONSTANT_OFS_SP($sp) - - sw T1, UNALIGNED_OFS_SP($sp) + or IS_UNALIGNED, IN, OUT + andi IS_UNALIGNED, 0x3 andi BYTES, (CHACHA20_BLOCK_SIZE-1) @@ -223,26 +249,25 @@ chacha20_mips: addiu OUT, CHACHA20_BLOCK_SIZE addiu NONCE_0, 1 - lw X0, 0+CONSTANT_OFS_SP($sp) - lw X1, 4+CONSTANT_OFS_SP($sp) - lw X2, 8+CONSTANT_OFS_SP($sp) - lw X3, 12+CONSTANT_OFS_SP($sp) - lw T1, UNALIGNED_OFS_SP($sp) - .Lchacha20_rounds_start: - lw X4, 0(KEY) - lw X5, 4(KEY) - lw X6, 8(KEY) - lw X7, 12(KEY) - lw X8, 16(KEY) - lw X9, 20(KEY) - lw X10, 24(KEY) - lw X11, 28(KEY) + lw X0, 0(STATE) + lw X1, 4(STATE) + lw X2, 8(STATE) + lw X3, 12(STATE) + + lw X4, 16(STATE) + lw X5, 20(STATE) + lw X6, 24(STATE) + lw X7, 28(STATE) + lw X8, 32(STATE) + lw X9, 36(STATE) + lw X10, 40(STATE) + lw X11, 44(STATE) move X12, NONCE_0 - lw X13, 4(NONCE) - lw X14, 8(NONCE) - lw X15, 12(NONCE) + lw X13, 52(STATE) + lw X14, 56(STATE) + lw X15, 60(STATE) li $at, 20 .Loop_chacha20_xor_rounds: @@ -259,20 +284,20 @@ chacha20_mips: andi $at, BYTES, MASK_U32 - /* Unaligned? Jump */ - bnez T1, .Loop_chacha20_unaligned + /* Is data src/dst unaligned? Jump */ + bnez IS_UNALIGNED, .Loop_chacha20_unaligned /* Load upper half of jump table addr */ lui T0, %hi(.Lchacha20_mips_jmptbl_aligned_0) - /* Last round? No jump */ - bne OUT, PTR_LAST_ROUND, .Lchacha20_mips_xor_aligned_64_b + /* Last round? No, do a full block. */ + bne OUT, PTR_LAST_ROUND, .Lchacha20_mips_xor_aligned_16_b /* Calculate lower half jump table addr and offset */ ins T0, $at, 2, 6 /* Full block? Jump */ - beqz BYTES, .Lchacha20_mips_xor_aligned_64_b + beqz BYTES, .Lchacha20_mips_xor_aligned_16_b subu T0, $at addiu T0, %lo(.Lchacha20_mips_jmptbl_aligned_0) @@ -283,13 +308,13 @@ chacha20_mips: lui T0, %hi(.Lchacha20_mips_jmptbl_unaligned_0) /* Last round? no jump */ - bne OUT, PTR_LAST_ROUND, .Lchacha20_mips_xor_unaligned_64_b + bne OUT, PTR_LAST_ROUND, .Lchacha20_mips_xor_unaligned_16_b /* Calculate lower half jump table addr and offset */ ins T0, $at, 2, 6 /* Full block? Jump */ - beqz BYTES, .Lchacha20_mips_xor_unaligned_64_b + beqz BYTES, .Lchacha20_mips_xor_unaligned_16_b subu T0, $at addiu T0, %lo(.Lchacha20_mips_jmptbl_unaligned_0) @@ -298,22 +323,7 @@ chacha20_mips: /* Aligned code path */ .align 4 - STORE_ALIGNED(64, 15, NONCE,12) - STORE_ALIGNED(60, 14, NONCE, 8) - STORE_ALIGNED(56, 13, NONCE, 4) - STORE_ALIGNED(52, 12, NONCE, 0) - STORE_ALIGNED(48, 11, KEY, 28) - STORE_ALIGNED(44, 10, KEY, 24) - STORE_ALIGNED(40, 9, KEY, 20) - STORE_ALIGNED(36, 8, KEY, 16) - STORE_ALIGNED(32, 7, KEY, 12) - STORE_ALIGNED(28, 6, KEY, 8) - STORE_ALIGNED(24, 5, KEY, 4) - STORE_ALIGNED(20, 4, KEY, 0) - STORE_ALIGNED(16, 3, $sp, 12+CONSTANT_OFS_SP) - STORE_ALIGNED(12, 2, $sp, 8+CONSTANT_OFS_SP) - STORE_ALIGNED( 8, 1, $sp, 4+CONSTANT_OFS_SP) - STORE_ALIGNED( 4, 0, $sp, 0+CONSTANT_OFS_SP) + FOR_EACH_WORD_REV(STORE_ALIGNED) bne OUT, PTR_LAST_ROUND, .Loop_chacha20_rounds @@ -322,6 +332,9 @@ chacha20_mips: bne $at, BYTES, .Lchacha20_mips_xor_bytes .Lchacha20_mips_xor_done: + /* Write NONCE_0 back to right location in state */ + sw NONCE_0, 48(STATE) + /* Restore used registers */ lw $ra, 0($sp) lw $fp, 4($sp) @@ -337,43 +350,11 @@ chacha20_mips: addiu $sp, STACK_SIZE jr $ra - /* Start jump table */ - JMPTBL_ALIGNED( 0, 0, $sp, 0+CONSTANT_OFS_SP) - JMPTBL_ALIGNED( 4, 1, $sp, 4+CONSTANT_OFS_SP) - JMPTBL_ALIGNED( 8, 2, $sp, 8+CONSTANT_OFS_SP) - JMPTBL_ALIGNED(12, 3, $sp, 12+CONSTANT_OFS_SP) - JMPTBL_ALIGNED(16, 4, KEY, 0) - JMPTBL_ALIGNED(20, 5, KEY, 4) - JMPTBL_ALIGNED(24, 6, KEY, 8) - JMPTBL_ALIGNED(28, 7, KEY, 12) - JMPTBL_ALIGNED(32, 8, KEY, 16) - JMPTBL_ALIGNED(36, 9, KEY, 20) - JMPTBL_ALIGNED(40, 10, KEY, 24) - JMPTBL_ALIGNED(44, 11, KEY, 28) - JMPTBL_ALIGNED(48, 12, NONCE, 0) - JMPTBL_ALIGNED(52, 13, NONCE, 4) - JMPTBL_ALIGNED(56, 14, NONCE, 8) - JMPTBL_ALIGNED(60, 15, NONCE,12) - /* End jump table */ - -/* Unaligned code path - */ - STORE_UNALIGNED(64, 15, NONCE,12) - STORE_UNALIGNED(60, 14, NONCE, 8) - STORE_UNALIGNED(56, 13, NONCE, 4) - STORE_UNALIGNED(52, 12, NONCE, 0) - STORE_UNALIGNED(48, 11, KEY, 28) - STORE_UNALIGNED(44, 10, KEY, 24) - STORE_UNALIGNED(40, 9, KEY, 20) - STORE_UNALIGNED(36, 8, KEY, 16) - STORE_UNALIGNED(32, 7, KEY, 12) - STORE_UNALIGNED(28, 6, KEY, 8) - STORE_UNALIGNED(24, 5, KEY, 4) - STORE_UNALIGNED(20, 4, KEY, 0) - STORE_UNALIGNED(16, 3, $sp, 12+CONSTANT_OFS_SP) - STORE_UNALIGNED(12, 2, $sp, 8+CONSTANT_OFS_SP) - STORE_UNALIGNED( 8, 1, $sp, 4+CONSTANT_OFS_SP) - STORE_UNALIGNED( 4, 0, $sp, 0+CONSTANT_OFS_SP) + /* Jump table */ + FOR_EACH_WORD(JMPTBL_ALIGNED) + + /* Unaligned code path */ + FOR_EACH_WORD_REV(STORE_UNALIGNED) bne OUT, PTR_LAST_ROUND, .Loop_chacha20_rounds @@ -413,23 +394,7 @@ chacha20_mips: b .Lchacha20_mips_xor_done .Lchacha20_mips_jmptbl_unaligned: - /* Start jump table */ - JMPTBL_UNALIGNED( 0, 0, $sp, 0+CONSTANT_OFS_SP) - JMPTBL_UNALIGNED( 4, 1, $sp, 4+CONSTANT_OFS_SP) - JMPTBL_UNALIGNED( 8, 2, $sp, 8+CONSTANT_OFS_SP) - JMPTBL_UNALIGNED(12, 3, $sp, 12+CONSTANT_OFS_SP) - JMPTBL_UNALIGNED(16, 4, KEY, 0) - JMPTBL_UNALIGNED(20, 5, KEY, 4) - JMPTBL_UNALIGNED(24, 6, KEY, 8) - JMPTBL_UNALIGNED(28, 7, KEY, 12) - JMPTBL_UNALIGNED(32, 8, KEY, 16) - JMPTBL_UNALIGNED(36, 9, KEY, 20) - JMPTBL_UNALIGNED(40, 10, KEY, 24) - JMPTBL_UNALIGNED(44, 11, KEY, 28) - JMPTBL_UNALIGNED(48, 12, NONCE, 0) - JMPTBL_UNALIGNED(52, 13, NONCE, 4) - JMPTBL_UNALIGNED(56, 14, NONCE, 8) - JMPTBL_UNALIGNED(60, 15, NONCE,12) - /* End jump table */ + /* Jump table */ + FOR_EACH_WORD(JMPTBL_UNALIGNED) .end chacha20_mips .set at |