diff options
Diffstat (limited to 'src/crypto')
-rw-r--r-- | src/crypto/chacha20-mips.S | 474 | ||||
-rw-r--r-- | src/crypto/chacha20poly1305.c | 17 | ||||
-rw-r--r-- | src/crypto/poly1305-mips.S | 426 |
3 files changed, 912 insertions, 5 deletions
diff --git a/src/crypto/chacha20-mips.S b/src/crypto/chacha20-mips.S new file mode 100644 index 0000000..77da2c2 --- /dev/null +++ b/src/crypto/chacha20-mips.S @@ -0,0 +1,474 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * Copyright (C) 2016-2018 René van Dorst <opensource@vdorst.com>. All Rights Reserved. + * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. + */ + +#define MASK_U32 0x3c +#define MASK_BYTES 0x03 +#define CHACHA20_BLOCK_SIZE 64 +#define STACK_SIZE 4*16 + +#define X0 $t0 +#define X1 $t1 +#define X2 $t2 +#define X3 $t3 +#define X4 $t4 +#define X5 $t5 +#define X6 $t6 +#define X7 $t7 +#define X8 $v1 +#define X9 $fp +#define X10 $s7 +#define X11 $s6 +#define X12 $s5 +#define X13 $s4 +#define X14 $s3 +#define X15 $s2 +/* Use regs which are overwritten on exit for Tx so we don't leak clear data. */ +#define T0 $s1 +#define T1 $s0 +#define T(n) T ## n +#define X(n) X ## n + +/* Input arguments */ +#define OUT $a0 +#define IN $a1 +#define BYTES $a2 +/* KEY and NONCE argument must be u32 aligned */ +#define KEY $a3 +/* NONCE pointer is given via stack */ +#define NONCE $t9 + +/* Output argument */ +/* NONCE[0] is kept in a register and not in memory. + * We don't want to touch original value in memory. + * Must be incremented every loop iteration. + */ +#define NONCE_0 $v0 + +/* SAVED_X and SAVED_CA are set in the jump table. + * Use regs which are overwritten on exit else we don't leak clear data. + * They are used to handling the last bytes which are not multiple of 4. + */ +#define SAVED_X X15 +#define SAVED_CA $ra + +#define PTR_LAST_ROUND $t8 + +/* ChaCha20 constants and stack location */ +#define CONSTANT_OFS_SP 48 +#define UNALIGNED_OFS_SP 40 + +#define CONSTANT_1 0x61707865 +#define CONSTANT_2 0x3320646e +#define CONSTANT_3 0x79622d32 +#define CONSTANT_4 0x6b206574 + +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ +#define MSB 0 +#define LSB 3 +#define ROTx rotl +#define ROTR(n) rotr n, 24 +#define CPU_TO_LE32(n) \ + wsbh n; \ + rotr n, 16; +#else +#define MSB 3 +#define LSB 0 +#define ROTx rotr +#define CPU_TO_LE32(n) +#define ROTR(n) +#endif + +#define STORE_UNALIGNED(x, a, s, o) \ +.Lchacha20_mips_xor_unaligned_ ## x ## _b: ; \ + .if ((s != NONCE) || (o != 0)); \ + lw T0, o(s); \ + .endif; \ + lwl T1, x-4+MSB ## (IN); \ + lwr T1, x-4+LSB ## (IN); \ + .if ((s == NONCE) && (o == 0)); \ + addu X ## a, NONCE_0; \ + .else; \ + addu X ## a, T0; \ + .endif; \ + CPU_TO_LE32(X ## a); \ + xor X ## a, T1; \ + swl X ## a, x-4+MSB ## (OUT); \ + swr X ## a, x-4+LSB ## (OUT); + +#define STORE_ALIGNED(x, a, s, o) \ +.Lchacha20_mips_xor_aligned_ ## x ## _b: ; \ + .if ((s != NONCE) || (o != 0)); \ + lw T0, o(s); \ + .endif; \ + lw T1, x-4 ## (IN); \ + .if ((s == NONCE) && (o == 0)); \ + addu X ## a, NONCE_0; \ + .else; \ + addu X ## a, T0; \ + .endif; \ + CPU_TO_LE32(X ## a); \ + xor X ## a, T1; \ + sw X ## a, x-4 ## (OUT); + +/* Jump table macro. + * Used for setup and handling the last bytes, which are not multiple of 4. + * X15 is free to store Xn + * Every jumptable entry must be equal in size. + */ +#define JMPTBL_ALIGNED(x, a, s, o) \ +.Lchacha20_mips_jmptbl_aligned_ ## a: ; \ + .if ((s == NONCE) && (o == 0)); \ + move SAVED_CA, NONCE_0; \ + .else; \ + lw SAVED_CA, o(s);\ + .endif; \ + b .Lchacha20_mips_xor_aligned_ ## x ## _b; \ + move SAVED_X, X ## a; + +#define JMPTBL_UNALIGNED(x, a, s, o) \ +.Lchacha20_mips_jmptbl_unaligned_ ## a: ; \ + .if ((s == NONCE) && (o == 0)); \ + move SAVED_CA, NONCE_0; \ + .else; \ + lw SAVED_CA, o(s);\ + .endif; \ + b .Lchacha20_mips_xor_unaligned_ ## x ## _b; \ + move SAVED_X, X ## a; + +#define AXR(A, B, C, D, K, L, M, N, V, W, Y, Z, S) \ + addu X(A), X(K); \ + addu X(B), X(L); \ + addu X(C), X(M); \ + addu X(D), X(N); \ + xor X(V), X(A); \ + xor X(W), X(B); \ + xor X(Y), X(C); \ + xor X(Z), X(D); \ + rotl X(V), S; \ + rotl X(W), S; \ + rotl X(Y), S; \ + rotl X(Z), S; + +.text +.set reorder +.set noat +.globl chacha20_mips +.ent chacha20_mips +chacha20_mips: + .frame $sp, STACK_SIZE, $ra + /* This is in the fifth argument */ + lw NONCE, 16($sp) + + /* Return bytes = 0. */ + .set noreorder + beqz BYTES, .Lchacha20_mips_end + addiu $sp, -STACK_SIZE + .set reorder + + /* Calculate PTR_LAST_ROUND */ + addiu PTR_LAST_ROUND, BYTES, -1 + ins PTR_LAST_ROUND, $zero, 0, 6 + addu PTR_LAST_ROUND, OUT + + /* Save s0-s7, fp, ra. */ + sw $ra, 0($sp) + sw $fp, 4($sp) + sw $s0, 8($sp) + sw $s1, 12($sp) + sw $s2, 16($sp) + sw $s3, 20($sp) + sw $s4, 24($sp) + sw $s5, 28($sp) + sw $s6, 32($sp) + sw $s7, 36($sp) + + lw NONCE_0, 0(NONCE) + /* Test IN or OUT is unaligned. + * UNALIGNED (T1) = ( IN | OUT ) & 0x00000003 + */ + or T1, IN, OUT + andi T1, 0x3 + + /* Load constant */ + lui X0, %hi(CONSTANT_1) + lui X1, %hi(CONSTANT_2) + lui X2, %hi(CONSTANT_3) + lui X3, %hi(CONSTANT_4) + ori X0, %lo(CONSTANT_1) + ori X1, %lo(CONSTANT_2) + ori X2, %lo(CONSTANT_3) + ori X3, %lo(CONSTANT_4) + + /* Store constant on stack. */ + sw X0, 0+CONSTANT_OFS_SP($sp) + sw X1, 4+CONSTANT_OFS_SP($sp) + sw X2, 8+CONSTANT_OFS_SP($sp) + sw X3, 12+CONSTANT_OFS_SP($sp) + + sw T1, UNALIGNED_OFS_SP($sp) + + .set noreorder + b .Lchacha20_rounds_start + andi BYTES, (CHACHA20_BLOCK_SIZE-1) + .set reorder + +.align 4 +.Loop_chacha20_rounds: + addiu IN, CHACHA20_BLOCK_SIZE + addiu OUT, CHACHA20_BLOCK_SIZE + addiu NONCE_0, 1 + + lw X0, 0+CONSTANT_OFS_SP($sp) + lw X1, 4+CONSTANT_OFS_SP($sp) + lw X2, 8+CONSTANT_OFS_SP($sp) + lw X3, 12+CONSTANT_OFS_SP($sp) + lw T1, UNALIGNED_OFS_SP($sp) + +.Lchacha20_rounds_start: + lw X4, 0(KEY) + lw X5, 4(KEY) + lw X6, 8(KEY) + lw X7, 12(KEY) + lw X8, 16(KEY) + lw X9, 20(KEY) + lw X10, 24(KEY) + lw X11, 28(KEY) + + move X12, NONCE_0 + lw X13, 4(NONCE) + lw X14, 8(NONCE) + lw X15, 12(NONCE) + + li $at, 9 +.Loop_chacha20_xor_rounds: + AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 16); + AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 12); + AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 8); + AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 7); + AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 16); + AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 12); + AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 8); + AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 7); + .set noreorder + bnez $at, .Loop_chacha20_xor_rounds + addiu $at, -1 + + /* Unaligned? Jump */ + bnez T1, .Loop_chacha20_unaligned + andi $at, BYTES, MASK_U32 + + /* Last round? No jump */ + bne OUT, PTR_LAST_ROUND, .Lchacha20_mips_xor_aligned_64_b + /* Load upper half of jump table addr */ + lui T0, %hi(.Lchacha20_mips_jmptbl_aligned_0) + + /* Full block? Jump */ + beqz BYTES, .Lchacha20_mips_xor_aligned_64_b + /* Calculate lower half jump table addr and offset */ + ins T0, $at, 2, 6 + + subu T0, $at + addiu T0, %lo(.Lchacha20_mips_jmptbl_aligned_0) + + jr T0 + /* Delay slot */ + nop + + .set reorder + +.Loop_chacha20_unaligned: + .set noreorder + + /* Last round? no jump */ + bne OUT, PTR_LAST_ROUND, .Lchacha20_mips_xor_unaligned_64_b + /* Load upper half of jump table addr */ + lui T0, %hi(.Lchacha20_mips_jmptbl_unaligned_0) + + /* Full block? Jump */ + beqz BYTES, .Lchacha20_mips_xor_unaligned_64_b + + /* Calculate lower half jump table addr and offset */ + ins T0, $at, 2, 6 + subu T0, $at + addiu T0, %lo(.Lchacha20_mips_jmptbl_unaligned_0) + + jr T0 + /* Delay slot */ + nop + + .set reorder + +/* Aligned code path + */ +.align 4 + STORE_ALIGNED(64, 15, NONCE,12) + STORE_ALIGNED(60, 14, NONCE, 8) + STORE_ALIGNED(56, 13, NONCE, 4) + STORE_ALIGNED(52, 12, NONCE, 0) + STORE_ALIGNED(48, 11, KEY, 28) + STORE_ALIGNED(44, 10, KEY, 24) + STORE_ALIGNED(40, 9, KEY, 20) + STORE_ALIGNED(36, 8, KEY, 16) + STORE_ALIGNED(32, 7, KEY, 12) + STORE_ALIGNED(28, 6, KEY, 8) + STORE_ALIGNED(24, 5, KEY, 4) + STORE_ALIGNED(20, 4, KEY, 0) + STORE_ALIGNED(16, 3, $sp, 12+CONSTANT_OFS_SP) + STORE_ALIGNED(12, 2, $sp, 8+CONSTANT_OFS_SP) + STORE_ALIGNED( 8, 1, $sp, 4+CONSTANT_OFS_SP) +.Lchacha20_mips_xor_aligned_4_b: + /* STORE_ALIGNED( 4, 0, $sp, 0+CONSTANT_OFS_SP) */ + lw T0, 0+CONSTANT_OFS_SP($sp) + lw T1, 0(IN) + addu X0, T0 + CPU_TO_LE32(X0) + xor X0, T1 + .set noreorder + bne OUT, PTR_LAST_ROUND, .Loop_chacha20_rounds + sw X0, 0(OUT) + .set reorder + + .set noreorder + bne $at, BYTES, .Lchacha20_mips_xor_bytes + /* Empty delayslot, Increase NONCE_0, return NONCE_0 value */ + addiu NONCE_0, 1 + .set noreorder + +.Lchacha20_mips_xor_done: + /* Restore used registers */ + lw $ra, 0($sp) + lw $fp, 4($sp) + lw $s0, 8($sp) + lw $s1, 12($sp) + lw $s2, 16($sp) + lw $s3, 20($sp) + lw $s4, 24($sp) + lw $s5, 28($sp) + lw $s6, 32($sp) + lw $s7, 36($sp) +.Lchacha20_mips_end: + .set noreorder + jr $ra + addiu $sp, STACK_SIZE + .set reorder + + .set noreorder + /* Start jump table */ + JMPTBL_ALIGNED( 0, 0, $sp, 0+CONSTANT_OFS_SP) + JMPTBL_ALIGNED( 4, 1, $sp, 4+CONSTANT_OFS_SP) + JMPTBL_ALIGNED( 8, 2, $sp, 8+CONSTANT_OFS_SP) + JMPTBL_ALIGNED(12, 3, $sp, 12+CONSTANT_OFS_SP) + JMPTBL_ALIGNED(16, 4, KEY, 0) + JMPTBL_ALIGNED(20, 5, KEY, 4) + JMPTBL_ALIGNED(24, 6, KEY, 8) + JMPTBL_ALIGNED(28, 7, KEY, 12) + JMPTBL_ALIGNED(32, 8, KEY, 16) + JMPTBL_ALIGNED(36, 9, KEY, 20) + JMPTBL_ALIGNED(40, 10, KEY, 24) + JMPTBL_ALIGNED(44, 11, KEY, 28) + JMPTBL_ALIGNED(48, 12, NONCE, 0) + JMPTBL_ALIGNED(52, 13, NONCE, 4) + JMPTBL_ALIGNED(56, 14, NONCE, 8) + JMPTBL_ALIGNED(60, 15, NONCE,12) + /* End jump table */ + .set reorder + +/* Unaligned code path + */ + STORE_UNALIGNED(64, 15, NONCE,12) + STORE_UNALIGNED(60, 14, NONCE, 8) + STORE_UNALIGNED(56, 13, NONCE, 4) + STORE_UNALIGNED(52, 12, NONCE, 0) + STORE_UNALIGNED(48, 11, KEY, 28) + STORE_UNALIGNED(44, 10, KEY, 24) + STORE_UNALIGNED(40, 9, KEY, 20) + STORE_UNALIGNED(36, 8, KEY, 16) + STORE_UNALIGNED(32, 7, KEY, 12) + STORE_UNALIGNED(28, 6, KEY, 8) + STORE_UNALIGNED(24, 5, KEY, 4) + STORE_UNALIGNED(20, 4, KEY, 0) + STORE_UNALIGNED(16, 3, $sp, 12+CONSTANT_OFS_SP) + STORE_UNALIGNED(12, 2, $sp, 8+CONSTANT_OFS_SP) + STORE_UNALIGNED( 8, 1, $sp, 4+CONSTANT_OFS_SP) +.Lchacha20_mips_xor_unaligned_4_b: + /* STORE_UNALIGNED( 4, 0, $sp, 0+CONSTANT_OFS_SP) */ + lw T0, 0+CONSTANT_OFS_SP($sp) + lwl T1, 0+MSB(IN) + lwr T1, 0+LSB(IN) + addu X0, T0 + CPU_TO_LE32(X0) + xor X0, T1 + swl X0, 0+MSB(OUT) + .set noreorder + bne OUT, PTR_LAST_ROUND, .Loop_chacha20_rounds + swr X0, 0+LSB(OUT) + .set reorder + + /* Fall through to byte handling */ + .set noreorder + beq $at, BYTES, .Lchacha20_mips_xor_done + /* Empty delayslot, increase NONCE_0, return NONCE_0 value */ +.Lchacha20_mips_xor_unaligned_0_b: +.Lchacha20_mips_xor_aligned_0_b: + addiu NONCE_0, 1 + .set reorder + +.Lchacha20_mips_xor_bytes: + addu OUT, $at + addu IN, $at + addu SAVED_X, SAVED_CA + /* First byte */ + lbu T1, 0(IN) + andi $at, BYTES, 2 + CPU_TO_LE32(SAVED_X) + ROTR(SAVED_X) + xor T1, SAVED_X + .set noreorder + beqz $at, .Lchacha20_mips_xor_done + sb T1, 0(OUT) + .set reorder + /* Second byte */ + lbu T1, 1(IN) + andi $at, BYTES, 1 + ROTx SAVED_X, 8 + xor T1, SAVED_X + .set noreorder + beqz $at, .Lchacha20_mips_xor_done + sb T1, 1(OUT) + .set reorder + /* Third byte */ + lbu T1, 2(IN) + ROTx SAVED_X, 8 + xor T1, SAVED_X + .set noreorder + b .Lchacha20_mips_xor_done + sb T1, 2(OUT) + .set reorder +.set noreorder + +.Lchacha20_mips_jmptbl_unaligned: + /* Start jump table */ + JMPTBL_UNALIGNED( 0, 0, $sp, 0+CONSTANT_OFS_SP) + JMPTBL_UNALIGNED( 4, 1, $sp, 4+CONSTANT_OFS_SP) + JMPTBL_UNALIGNED( 8, 2, $sp, 8+CONSTANT_OFS_SP) + JMPTBL_UNALIGNED(12, 3, $sp, 12+CONSTANT_OFS_SP) + JMPTBL_UNALIGNED(16, 4, KEY, 0) + JMPTBL_UNALIGNED(20, 5, KEY, 4) + JMPTBL_UNALIGNED(24, 6, KEY, 8) + JMPTBL_UNALIGNED(28, 7, KEY, 12) + JMPTBL_UNALIGNED(32, 8, KEY, 16) + JMPTBL_UNALIGNED(36, 9, KEY, 20) + JMPTBL_UNALIGNED(40, 10, KEY, 24) + JMPTBL_UNALIGNED(44, 11, KEY, 28) + JMPTBL_UNALIGNED(48, 12, NONCE, 0) + JMPTBL_UNALIGNED(52, 13, NONCE, 4) + JMPTBL_UNALIGNED(56, 14, NONCE, 8) + JMPTBL_UNALIGNED(60, 15, NONCE,12) + /* End jump table */ +.set reorder + +.end chacha20_mips +.set at diff --git a/src/crypto/chacha20poly1305.c b/src/crypto/chacha20poly1305.c index c066d93..13b5ec6 100644 --- a/src/crypto/chacha20poly1305.c +++ b/src/crypto/chacha20poly1305.c @@ -83,10 +83,13 @@ void __init chacha20poly1305_fpu_init(void) chacha20poly1305_use_neon = elf_hwcap & HWCAP_NEON; #endif } -#elif defined(CONFIG_MIPS) && defined(CONFIG_64BIT) +#elif defined(CONFIG_MIPS) && (defined(CONFIG_64BIT) || defined(CONFIG_CPU_MIPS32_R2)) asmlinkage void poly1305_init_mips(void *ctx, const u8 key[16]); asmlinkage void poly1305_blocks_mips(void *ctx, const u8 *inp, size_t len, u32 padbit); asmlinkage void poly1305_emit_mips(void *ctx, u8 mac[16], const u32 nonce[4]); +#if defined(CONFIG_CPU_MIPS32_R2) +asmlinkage void chacha20_mips(u8 *out, const u8 *in, size_t len, const u32 key[8], const u32 counter[4]); +#endif void __init chacha20poly1305_fpu_init(void) { } #else void __init chacha20poly1305_fpu_init(void) { } @@ -263,6 +266,10 @@ no_simd: chacha20_arm(dst, src, bytes, &ctx->state[4], &ctx->state[12]); ctx->state[12] += (bytes + 63) / 64; return; +#elif defined(CONFIG_MIPS) && defined(CONFIG_CPU_MIPS32_R2) + chacha20_mips(dst, src, bytes, &ctx->state[4], &ctx->state[12]); + ctx->state[12] += (bytes + 63) / 64; + return; #endif if (dst != src) @@ -287,7 +294,7 @@ struct poly1305_ctx { size_t num; } __aligned(8); -#if !(defined(CONFIG_X86_64) || defined(CONFIG_ARM) || defined(CONFIG_ARM64) || (defined(CONFIG_MIPS) && defined(CONFIG_64BIT))) +#if !(defined(CONFIG_X86_64) || defined(CONFIG_ARM) || defined(CONFIG_ARM64) || (defined(CONFIG_MIPS) && (defined(CONFIG_64BIT) || defined(CONFIG_CPU_MIPS32_R2)))) struct poly1305_internal { u32 h[5]; u32 r[4]; @@ -460,7 +467,7 @@ static void poly1305_init(struct poly1305_ctx *ctx, const u8 key[POLY1305_KEY_SI poly1305_init_x86_64(ctx->opaque, key); #elif defined(CONFIG_ARM) || defined(CONFIG_ARM64) poly1305_init_arm(ctx->opaque, key); -#elif defined(CONFIG_MIPS) && defined(CONFIG_64BIT) +#elif defined(CONFIG_MIPS) && (defined(CONFIG_64BIT) || defined(CONFIG_CPU_MIPS32_R2)) poly1305_init_mips(ctx->opaque, key); #else poly1305_init_generic(ctx->opaque, key); @@ -494,7 +501,7 @@ static inline void poly1305_blocks(void *ctx, const u8 *inp, size_t len, u32 pad else #endif poly1305_blocks_arm(ctx, inp, len, padbit); -#elif defined(CONFIG_MIPS) && defined(CONFIG_64BIT) +#elif defined(CONFIG_MIPS) && (defined(CONFIG_64BIT) || defined(CONFIG_CPU_MIPS32_R2)) poly1305_blocks_mips(ctx, inp, len, padbit); #else poly1305_blocks_generic(ctx, inp, len, padbit); @@ -527,7 +534,7 @@ static inline void poly1305_emit(void *ctx, u8 mac[16], const u32 nonce[4], bool else #endif poly1305_emit_arm(ctx, mac, nonce); -#elif defined(CONFIG_MIPS) && defined(CONFIG_64BIT) +#elif defined(CONFIG_MIPS) && (defined(CONFIG_64BIT) || defined(CONFIG_CPU_MIPS32_R2)) poly1305_emit_mips(ctx, mac, nonce); #else poly1305_emit_generic(ctx, mac, nonce); diff --git a/src/crypto/poly1305-mips.S b/src/crypto/poly1305-mips.S new file mode 100644 index 0000000..cd62d9b --- /dev/null +++ b/src/crypto/poly1305-mips.S @@ -0,0 +1,426 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * Copyright (C) 2016-2018 René van Dorst <opensource@vdorst.com> All Rights Reserved. + * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. + */ + +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ +#define MSB 0 +#define LSB 3 +#else +#define MSB 3 +#define LSB 0 +#endif + +#define POLY1305_BLOCK_SIZE 16 +.text +#define H0 $t0 +#define H1 $t1 +#define H2 $t2 +#define H3 $t3 +#define H4 $t4 + +#define R0 $t5 +#define R1 $t6 +#define R2 $t7 +#define R3 $t8 + +#define O0 $s0 +#define O1 $s4 +#define O2 $v1 +#define O3 $t9 +#define O4 $s5 + +#define S1 $s1 +#define S2 $s2 +#define S3 $s3 + +#define SC $at +#define CA $v0 + +/* Input arguments */ +#define poly $a0 +#define src $a1 +#define srclen $a2 +#define hibit $a3 + +#define PTR_POLY1305_R(n) ( 0 + (n*4)) ## ($a0) +#define PTR_POLY1305_S(n) (16 + (n*4)) ## ($a0) +#define PTR_POLY1305_CA (32 ) ## ($a0) +#define PTR_POLY1305_H(n) (36 + (n*4)) ## ($a0) + +#define POLY1305_BLOCK_SIZE 16 +#define POLY1305_STACK_SIZE 8 * 4 + +.set reorder +.set noat +.align 4 +.globl poly1305_blocks_mips +.ent poly1305_blocks_mips +poly1305_blocks_mips: + .frame $sp,POLY1305_STACK_SIZE,$31 + /* srclen &= 0xFFFFFFF0 */ + ins srclen, $zero, 0, 4 + + .set noreorder + /* check srclen >= 16 bytes */ + beqz srclen, .Lpoly1305_blocks_mips_end + addiu $sp, -(POLY1305_STACK_SIZE) + .set reorder + + /* Calculate last round based on src address pointer. + * last round src ptr (srclen) = src + (srclen & 0xFFFFFFF0) + */ + addu srclen, src + + lw R0, PTR_POLY1305_R(0) + lw R1, PTR_POLY1305_R(1) + lw R2, PTR_POLY1305_R(2) + lw R3, PTR_POLY1305_R(3) + + /* store the used save registers. */ + sw $s0, 0($sp) + sw $s1, 4($sp) + sw $s2, 8($sp) + sw $s3, 12($sp) + sw $s4, 16($sp) + sw $s5, 20($sp) + + lw S1, PTR_POLY1305_S(1) + lw S2, PTR_POLY1305_S(2) + lw S3, PTR_POLY1305_S(3) + + /* load Hx and Carry */ + lw CA, PTR_POLY1305_CA + lw H0, PTR_POLY1305_H(0) + lw H1, PTR_POLY1305_H(1) + lw H2, PTR_POLY1305_H(2) + lw H3, PTR_POLY1305_H(3) + lw H4, PTR_POLY1305_H(4) + + addiu SC, $zero, 1 + +.Lpoly1305_loop: + lwl O0, 0+MSB(src) + lwl O1, 4+MSB(src) + lwl O2, 8+MSB(src) + lwl O3,12+MSB(src) + lwr O0, 0+LSB(src) + lwr O1, 4+LSB(src) + lwr O2, 8+LSB(src) + lwr O3,12+LSB(src) + +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + wsbh O0 + wsbh O1 + wsbh O2 + wsbh O3 + rotr O0, 16 + rotr O1, 16 + rotr O2, 16 + rotr O3, 16 +#endif + + /* h0 = (u32)(d0 = (u64)h0 + inp[0] + c 'Carry_previous cycle'); */ + addu H0, CA + sltu CA, H0, CA + addu O0, H0 + sltu H0, O0, H0 + addu CA, H0 + + /* h1 = (u32)(d1 = (u64)h1 + (d0 >> 32) + inp[4]); */ + addu H1, CA + sltu CA, H1, CA + addu O1, H1 + sltu H1, O1, H1 + addu CA, H1 + + /* h2 = (u32)(d2 = (u64)h2 + (d1 >> 32) + inp[8]); */ + addu H2, CA + sltu CA, H2, CA + addu O2, H2 + sltu H2, O2, H2 + addu CA, H2 + + /* h3 = (u32)(d3 = (u64)h3 + (d2 >> 32) + inp[12]); */ + addu H3, CA + sltu CA, H3, CA + addu O3, H3 + sltu H3, O3, H3 + addu CA, H3 + + /* h4 += (u32)(d3 >> 32) + padbit; */ + addu H4, hibit + addu O4, H4, CA + + /* D0 */ + multu O0, R0 + maddu O1, S3 + maddu O2, S2 + maddu O3, S1 + mfhi CA + mflo H0 + + /* D1 */ + multu O0, R1 + maddu O1, R0 + maddu O2, S3 + maddu O3, S2 + maddu O4, S1 + maddu CA, SC + mfhi CA + mflo H1 + + /* D2 */ + multu O0, R2 + maddu O1, R1 + maddu O2, R0 + maddu O3, S3 + maddu O4, S2 + maddu CA, SC + mfhi CA + mflo H2 + + /* D4 */ + mul H4, O4, R0 + + /* D3 */ + multu O0, R3 + maddu O1, R2 + maddu O2, R1 + maddu O3, R0 + maddu O4, S3 + maddu CA, SC + mfhi CA + mflo H3 + + addiu src, POLY1305_BLOCK_SIZE + + /* h4 += (u32)(d3 >> 32); */ + addu O4, H4, CA + /* h4 &= 3 */ + andi H4, O4, 3 + /* c = (h4 >> 2) + (h4 & ~3U); */ + srl CA, O4, 2 + ins O4, $zero, 0, 2 + + /* able to do a 16 byte block. */ + .set noreorder + bne src, srclen, .Lpoly1305_loop + /* Delay slot is always executed. */ + addu CA, O4 + .set reorder + + /* restore the used save registers. */ + lw $s0, 0($sp) + lw $s1, 4($sp) + lw $s2, 8($sp) + lw $s3, 12($sp) + lw $s4, 16($sp) + lw $s5, 20($sp) + + /* store Hx and Carry */ + sw CA, PTR_POLY1305_CA + sw H0, PTR_POLY1305_H(0) + sw H1, PTR_POLY1305_H(1) + sw H2, PTR_POLY1305_H(2) + sw H3, PTR_POLY1305_H(3) + sw H4, PTR_POLY1305_H(4) + +.Lpoly1305_blocks_mips_end: + /* Jump Back */ + .set noreorder + jr $ra + addiu $sp, POLY1305_STACK_SIZE + .set reorder +.end poly1305_blocks_mips +.set at +.set reorder + +/* Input arguments CTX=$a0, MAC=$a1, NONCE=$a2 */ +#define MAC $a1 +#define NONCE $a2 + +#define G0 $t5 +#define G1 $t6 +#define G2 $t7 +#define G3 $t8 +#define G4 $t9 + +.set reorder +.set noat +.align 4 +.globl poly1305_emit_mips +.ent poly1305_emit_mips +poly1305_emit_mips: + /* load Hx and Carry */ + lw CA, PTR_POLY1305_CA + lw H0, PTR_POLY1305_H(0) + lw H1, PTR_POLY1305_H(1) + lw H2, PTR_POLY1305_H(2) + lw H3, PTR_POLY1305_H(3) + lw H4, PTR_POLY1305_H(4) + + /* Add left over carry */ + addu H0, CA + sltu CA, H0, CA + addu H1, CA + sltu CA, H1, CA + addu H2, CA + sltu CA, H2, CA + addu H3, CA + sltu CA, H3, CA + addu H4, CA + + /* compare to modulus by computing h + -p */ + addiu G0, H0, 5 + sltu CA, G0, H0 + addu G1, H1, CA + sltu CA, G1, H1 + addu G2, H2, CA + sltu CA, G2, H2 + addu G3, H3, CA + sltu CA, G3, H3 + addu G4, H4, CA + + srl SC, G4, 2 + + /* if there was carry into 131st bit, h3:h0 = g3:g0 */ + movn H0, G0, SC + movn H1, G1, SC + movn H2, G2, SC + movn H3, G3, SC + + lwl G0, 0+MSB(NONCE) + lwl G1, 4+MSB(NONCE) + lwl G2, 8+MSB(NONCE) + lwl G3,12+MSB(NONCE) + lwr G0, 0+LSB(NONCE) + lwr G1, 4+LSB(NONCE) + lwr G2, 8+LSB(NONCE) + lwr G3,12+LSB(NONCE) + + /* mac = (h + nonce) % (2^128) */ + addu H0, G0 + sltu CA, H0, G0 + + /* H1 */ + addu H1, CA + sltu CA, H1, CA + addu H1, G1 + sltu G1, H1, G1 + addu CA, G1 + + /* H2 */ + addu H2, CA + sltu CA, H2, CA + addu H2, G2 + sltu G2, H2, G2 + addu CA, G2 + + /* H3 */ + addu H3, CA + addu H3, G3 + +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + wsbh H0 + wsbh H1 + wsbh H2 + wsbh H3 + rotr H0, 16 + rotr H1, 16 + rotr H2, 16 + rotr H3, 16 +#endif + + /* store MAC */ + swl H0, 0+MSB(MAC) + swl H1, 4+MSB(MAC) + swl H2, 8+MSB(MAC) + swl H3,12+MSB(MAC) + swr H0, 0+LSB(MAC) + swr H1, 4+LSB(MAC) + swr H2, 8+LSB(MAC) + .set noreorder + jr $ra + swr H3,12+LSB(MAC) + .set reorder +.end poly1305_emit_mips + +#define PR0 $t0 +#define PR1 $t1 +#define PR2 $t2 +#define PR3 $t3 +#define PT0 $t4 +#define PS1 $t5 +#define PS2 $t6 +#define PS3 $t7 + +/* Input arguments CTX=$a0, KEY=$a1 */ + +.align 4 +.globl poly1305_init_mips +.ent poly1305_init_mips +poly1305_init_mips: + lwl PR0, 0+MSB($a1) + lwl PR1, 4+MSB($a1) + lwl PR2, 8+MSB($a1) + lwl PR3,12+MSB($a1) + lwr PR0, 0+LSB($a1) + lwr PR1, 4+LSB($a1) + lwr PR2, 8+LSB($a1) + lwr PR3,12+LSB($a1) + +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + wsbh PR0 + wsbh PR1 + wsbh PR2 + wsbh PR3 + rotr PR0, 16 + rotr PR1, 16 + rotr PR2, 16 + rotr PR3, 16 +#endif + + /* store Hx and Carry */ + sw $zero, PTR_POLY1305_CA + sw $zero, PTR_POLY1305_H(0) + sw $zero, PTR_POLY1305_H(1) + sw $zero, PTR_POLY1305_H(2) + sw $zero, PTR_POLY1305_H(3) + sw $zero, PTR_POLY1305_H(4) + + lui PT0, 0x0FFF + ori PT0, 0xFFFC + + /* AND 0x0fffffff; */ + ext PR0, PR0, 0, (32-4) + + /* AND 0x0ffffffc; */ + and PR1, PT0 + and PR2, PT0 + and PR3, PT0 + + srl PS1, PR1, 2 + srl PS2, PR2, 2 + srl PS3, PR3, 2 + addu PS1, PR1 + addu PS2, PR2 + addu PS3, PR3 + + /* store Rx */ + sw PR0, PTR_POLY1305_R(0) + sw PR1, PTR_POLY1305_R(1) + sw PR2, PTR_POLY1305_R(2) + sw PR3, PTR_POLY1305_R(3) + + /* store Sx */ + sw PS1, PTR_POLY1305_S(1) + sw PS2, PTR_POLY1305_S(2) + + .set noreorder + /* Jump Back */ + jr $ra + sw PS3, PTR_POLY1305_S(3) + .set reorder +.end poly1305_init_mips |