1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
|
/* SPDX-License-Identifier: GPL-2.0
*
* Copyright (C) 2016-2018 René van Dorst <opensource@vdorst.com> All Rights Reserved.
* Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
*/
/*
 * Byte offsets used by the lwl/lwr and swl/swr pairs in this file: MSB is
 * added to the address given to the "left" instruction and LSB to the one
 * given to the "right" instruction. The two values swap with the byte order
 * of the target.
 */
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
#define MSB 0
#define LSB 3
#else
#define MSB 3
#define LSB 0
#endif
/* Number of input bytes consumed per iteration of the block loop. */
#define POLY1305_BLOCK_SIZE 16
.text
/* H0..H4: the five 32-bit words of the accumulator h. */
#define H0 $t0
#define H1 $t1
#define H2 $t2
#define H3 $t3
#define H4 $t4
/* R0..R3: key words loaded from PTR_POLY1305_R(0..3). */
#define R0 $t5
#define R1 $t6
#define R2 $t7
#define R3 $t8
/*
 * O0..O4: per-block working words (the message words, then h + message).
 * O0, O1 and O4 live in $s registers, which poly1305_blocks_mips saves on
 * its stack frame before use.
 */
#define O0 $s0
#define O1 $s4
#define O2 $v1
#define O3 $t9
#define O4 $s5
/*
 * S1..S3: loaded from PTR_POLY1305_S(1..3). poly1305_init_mips stores
 * R[i] + (R[i] >> 2) in those slots. Also $s registers, saved by the user.
 */
#define S1 $s1
#define S2 $s2
#define S3 $s3
/* SC: scratch. It aliases $at, so code using it is assembled under ".set noat". */
#define SC $at
/* CA: carry passed between words and from one block to the next. */
#define CA $v0
/* Input arguments */
#define poly $a0
#define src $a1
#define srclen $a2
#define hibit $a3
/*
 * Context layout, as byte offsets from $a0:
 *    0..15  R[0..3]
 *   16..31  S[0..3]  (the code visible in this file only touches S[1..3])
 *   32      carry left over after the last processed block
 *   36..55  H[0..4]
 * The index argument is parenthesised so that an expression passed as n
 * cannot regroup with the "* 4".
 */
#define PTR_POLY1305_R(n) ( 0 + ((n)*4)) ## ($a0)
#define PTR_POLY1305_S(n) (16 + ((n)*4)) ## ($a0)
#define PTR_POLY1305_CA (32 ) ## ($a0)
#define PTR_POLY1305_H(n) (36 + ((n)*4)) ## ($a0)
/*
 * Stack frame of poly1305_blocks_mips: eight 32-bit slots. Parenthesised so
 * that the expansion stays a single term wherever the macro is used.
 */
#define POLY1305_STACK_SIZE (8 * 4)
/*
 * poly1305_blocks_mips - absorb whole 16-byte blocks into the accumulator.
 *
 * In:   $a0 (poly)   context; R, S, the pending carry and H are read through
 *                    the PTR_POLY1305_* offsets, carry and H are written back
 *       $a1 (src)    input bytes; read with lwl/lwr pairs
 *       $a2 (srclen) byte count; its low four bits are cleared first, so a
 *                    trailing partial block is ignored
 *       $a3 (hibit)  value added to h4 for every block
 * Uses: $s0-$s5 (saved and restored on a POLY1305_STACK_SIZE byte frame),
 *       $t0-$t9, $v0, $v1 and $at (hence ".set noat").
 *
 * When srclen holds no whole block the function returns without reading or
 * writing the context.
 */
.set reorder
.set noat
.align 4
.globl poly1305_blocks_mips
.ent poly1305_blocks_mips
poly1305_blocks_mips:
.frame $sp,POLY1305_STACK_SIZE,$31
/* srclen &= 0xFFFFFFF0 */
ins srclen, $zero, 0, 4
.set noreorder
/* check srclen >= 16 bytes */
beqz srclen, .Lpoly1305_blocks_mips_end
/* Delay slot: the frame is allocated on both paths; the exit code at
 * .Lpoly1305_blocks_mips_end releases it again on both paths.
 */
addiu $sp, -(POLY1305_STACK_SIZE)
.set reorder
/* Calculate last round based on src address pointer.
* last round src ptr (srclen) = src + (srclen & 0xFFFFFFF0)
* From here on srclen is the end pointer the loop compares src against.
*/
addu srclen, src
lw R0, PTR_POLY1305_R(0)
lw R1, PTR_POLY1305_R(1)
lw R2, PTR_POLY1305_R(2)
lw R3, PTR_POLY1305_R(3)
/* store the used save registers. */
sw $s0, 0($sp)
sw $s1, 4($sp)
sw $s2, 8($sp)
sw $s3, 12($sp)
sw $s4, 16($sp)
sw $s5, 20($sp)
/* S1..S3 may only be loaded after $s1-$s3 have been saved above. */
lw S1, PTR_POLY1305_S(1)
lw S2, PTR_POLY1305_S(2)
lw S3, PTR_POLY1305_S(3)
/* load Hx and Carry */
lw CA, PTR_POLY1305_CA
lw H0, PTR_POLY1305_H(0)
lw H1, PTR_POLY1305_H(1)
lw H2, PTR_POLY1305_H(2)
lw H3, PTR_POLY1305_H(3)
lw H4, PTR_POLY1305_H(4)
/* SC = 1, so that "maddu CA, SC" below adds CA to the HI/LO accumulator. */
addiu SC, $zero, 1
.Lpoly1305_loop:
/* Load the four message words; each lwl/lwr pair assembles one word. */
lwl O0, 0+MSB(src)
lwl O1, 4+MSB(src)
lwl O2, 8+MSB(src)
lwl O3,12+MSB(src)
lwr O0, 0+LSB(src)
lwr O1, 4+LSB(src)
lwr O2, 8+LSB(src)
lwr O3,12+LSB(src)
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
/* Big-endian only: wsbh followed by a 16-bit rotate reverses the four bytes
 * of each word, so the message words are interpreted as little-endian.
 */
wsbh O0
wsbh O1
wsbh O2
wsbh O3
rotr O0, 16
rotr O1, 16
rotr O2, 16
rotr O3, 16
#endif
/* Each group below adds three values with two addu/sltu pairs. The sum ends
 * up in Ox; the Hx register is reused for the second carry flag, and CA
 * leaves the group holding the combined carry (at most 2) for the next word.
 */
/* h0 = (u32)(d0 = (u64)h0 + inp[0] + c 'Carry_previous cycle'); */
addu H0, CA
sltu CA, H0, CA
addu O0, H0
sltu H0, O0, H0
addu CA, H0
/* h1 = (u32)(d1 = (u64)h1 + (d0 >> 32) + inp[4]); */
addu H1, CA
sltu CA, H1, CA
addu O1, H1
sltu H1, O1, H1
addu CA, H1
/* h2 = (u32)(d2 = (u64)h2 + (d1 >> 32) + inp[8]); */
addu H2, CA
sltu CA, H2, CA
addu O2, H2
sltu H2, O2, H2
addu CA, H2
/* h3 = (u32)(d3 = (u64)h3 + (d2 >> 32) + inp[12]); */
addu H3, CA
sltu CA, H3, CA
addu O3, H3
sltu H3, O3, H3
addu CA, H3
/* h4 += (u32)(d3 >> 32) + padbit; */
addu H4, hibit
addu O4, H4, CA
/* Multiply O0..O4 by the key. Each Dn is accumulated in HI/LO with
 * multu/maddu; its low word becomes the new Hn and its high word is carried
 * into the next Dn through "maddu CA, SC" (SC == 1).
 */
/* D0 */
multu O0, R0
maddu O1, S3
maddu O2, S2
maddu O3, S1
mfhi CA
mflo H0
/* D1 */
multu O0, R1
maddu O1, R0
maddu O2, S3
maddu O3, S2
maddu O4, S1
maddu CA, SC
mfhi CA
mflo H1
/* D2 */
multu O0, R2
maddu O1, R1
maddu O2, R0
maddu O3, S3
maddu O4, S2
maddu CA, SC
mfhi CA
mflo H2
/* D4 */
/* Only the low 32 bits of O4 * R0 are kept.
 * NOTE(review): on MIPS32 "mul" leaves HI/LO unpredictable, which is
 * presumably why it sits between the D2 reads and the multu that opens D3;
 * confirm before moving it.
 */
mul H4, O4, R0
/* D3 */
multu O0, R3
maddu O1, R2
maddu O2, R1
maddu O3, R0
maddu O4, S3
maddu CA, SC
mfhi CA
mflo H3
addiu src, POLY1305_BLOCK_SIZE
/* h4 += (u32)(d3 >> 32); */
addu O4, H4, CA
/* h4 &= 3 */
andi H4, O4, 3
/* c = (h4 >> 2) + (h4 & ~3U); */
/* With q = h4 >> 2 this is q + 4*q = 5*q. It is not added to h0 here: it is
 * kept in CA and added at the top of the next iteration, or stored to the
 * context after the last one.
 */
srl CA, O4, 2
ins O4, $zero, 0, 2
/* able to do a 16 byte block. */
.set noreorder
bne src, srclen, .Lpoly1305_loop
/* Delay slot is always executed. */
addu CA, O4
.set reorder
/* restore the used save registers. */
lw $s0, 0($sp)
lw $s1, 4($sp)
lw $s2, 8($sp)
lw $s3, 12($sp)
lw $s4, 16($sp)
lw $s5, 20($sp)
/* store Hx and Carry */
/* CA and H0..H4 live in $v0 and $t0-$t4, so the restores above do not
 * disturb them.
 */
sw CA, PTR_POLY1305_CA
sw H0, PTR_POLY1305_H(0)
sw H1, PTR_POLY1305_H(1)
sw H2, PTR_POLY1305_H(2)
sw H3, PTR_POLY1305_H(3)
sw H4, PTR_POLY1305_H(4)
.Lpoly1305_blocks_mips_end:
/* Jump Back */
.set noreorder
jr $ra
/* Delay slot: release the frame allocated at entry. */
addiu $sp, POLY1305_STACK_SIZE
.set reorder
.end poly1305_blocks_mips
/* $at is handed back to the assembler for whatever follows. */
.set at
.set reorder
/*
 * poly1305_emit_mips - finish the accumulator and write the 16-byte MAC.
 *
 * Input arguments CTX=$a0, MAC=$a1, NONCE=$a2
 *   CTX    the pending carry and H[0..4] are read; nothing is written back
 *   MAC    receives 16 bytes through swl/swr pairs
 *   NONCE  four words read through lwl/lwr pairs
 * Uses $t0-$t9, $v0 and $at (hence ".set noat"); no stack is used.
 * The function ends without a matching ".set at".
 */
#define MAC $a1
#define NONCE $a2
/* G0..G4 hold h + 5 and later the nonce words. They reuse the registers
 * named R0..R3 and O3 elsewhere in this file.
 */
#define G0 $t5
#define G1 $t6
#define G2 $t7
#define G3 $t8
#define G4 $t9
.set reorder
.set noat
.align 4
.globl poly1305_emit_mips
.ent poly1305_emit_mips
poly1305_emit_mips:
/* load Hx and Carry */
lw CA, PTR_POLY1305_CA
lw H0, PTR_POLY1305_H(0)
lw H1, PTR_POLY1305_H(1)
lw H2, PTR_POLY1305_H(2)
lw H3, PTR_POLY1305_H(3)
lw H4, PTR_POLY1305_H(4)
/* Add left over carry */
/* Each sltu turns CA into the carry out of the addu before it, so the
 * stored carry ripples from H0 up to H4.
 */
addu H0, CA
sltu CA, H0, CA
addu H1, CA
sltu CA, H1, CA
addu H2, CA
sltu CA, H2, CA
addu H3, CA
sltu CA, H3, CA
addu H4, CA
/* compare to modulus by computing h + -p */
/* G4:G0 = h + 5, with the carry rippling upwards. */
addiu G0, H0, 5
sltu CA, G0, H0
addu G1, H1, CA
sltu CA, G1, H1
addu G2, H2, CA
sltu CA, G2, H2
addu G3, H3, CA
sltu CA, G3, H3
addu G4, H4, CA
/* SC != 0 exactly when G4 >= 4, i.e. when h + 5 reaches 2^130. */
srl SC, G4, 2
/* if there was carry into 131st bit, h3:h0 = g3:g0 */
/* movn selects without branching; H4 is not needed from here on. */
movn H0, G0, SC
movn H1, G1, SC
movn H2, G2, SC
movn H3, G3, SC
/* G0..G3 are reused for the nonce words.
 * NOTE(review): unlike the result below, the nonce is not byte-swapped on
 * big-endian builds - presumably the caller passes native-endian words;
 * confirm against the caller.
 */
lwl G0, 0+MSB(NONCE)
lwl G1, 4+MSB(NONCE)
lwl G2, 8+MSB(NONCE)
lwl G3,12+MSB(NONCE)
lwr G0, 0+LSB(NONCE)
lwr G1, 4+LSB(NONCE)
lwr G2, 8+LSB(NONCE)
lwr G3,12+LSB(NONCE)
/* mac = (h + nonce) % (2^128) */
addu H0, G0
sltu CA, H0, G0
/* H1 */
/* Two additions per word (carry in, then nonce word); G1 and G2 are reused
 * for the second carry flag.
 */
addu H1, CA
sltu CA, H1, CA
addu H1, G1
sltu G1, H1, G1
addu CA, G1
/* H2 */
addu H2, CA
sltu CA, H2, CA
addu H2, G2
sltu G2, H2, G2
addu CA, G2
/* H3 */
/* The carry out of the top word is dropped: that is the "% 2^128". */
addu H3, CA
addu H3, G3
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
/* Big-endian only: reverse the bytes of each word so that the MAC is stored
 * in little-endian byte order.
 */
wsbh H0
wsbh H1
wsbh H2
wsbh H3
rotr H0, 16
rotr H1, 16
rotr H2, 16
rotr H3, 16
#endif
/* store MAC */
/* Each swl/swr pair writes one word. */
swl H0, 0+MSB(MAC)
swl H1, 4+MSB(MAC)
swl H2, 8+MSB(MAC)
swl H3,12+MSB(MAC)
swr H0, 0+LSB(MAC)
swr H1, 4+LSB(MAC)
swr H2, 8+LSB(MAC)
.set noreorder
jr $ra
/* Delay slot: the last store completes the MAC on the way out. */
swr H3,12+LSB(MAC)
.set reorder
.end poly1305_emit_mips
/*
 * poly1305_init_mips - set up the context from 16 key bytes.
 *
 * Input arguments CTX=$a0, KEY=$a1
 *   KEY  16 bytes, read through lwl/lwr pairs
 *   CTX  written through the PTR_POLY1305_* offsets:
 *          R[0..3]  key words, byte-reversed on big-endian builds, then masked
 *          S[1..3]  R[i] + (R[i] >> 2)
 *          carry word and H[0..4] cleared
 * Clobbers $t0-$t7; no stack is used. All key loads happen before the first
 * store to the context.
 */
#define PR0 $t0
#define PR1 $t1
#define PR2 $t2
#define PR3 $t3
#define PT0 $t4
#define PS1 $t5
#define PS2 $t6
#define PS3 $t7
.align 4
.globl poly1305_init_mips
.ent poly1305_init_mips
poly1305_init_mips:
/* Fetch key words 0..3. */
lwl PR0, 0+MSB($a1)
lwl PR1, 4+MSB($a1)
lwl PR2, 8+MSB($a1)
lwl PR3,12+MSB($a1)
lwr PR0, 0+LSB($a1)
lwr PR1, 4+LSB($a1)
lwr PR2, 8+LSB($a1)
lwr PR3,12+LSB($a1)
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
/* Reverse the bytes of each word: swap within halfwords, then swap halves. */
wsbh PR0
wsbh PR1
wsbh PR2
wsbh PR3
rotr PR0, 16
rotr PR1, 16
rotr PR2, 16
rotr PR3, 16
#endif
/* PT0 = 0x0ffffffc, the mask for key words 1..3. */
lui PT0, 0x0FFF
ori PT0, PT0, 0xFFFC
/* r0 &= 0x0fffffff: keep the low 28 bits. */
ext PR0, PR0, 0, 28
/* r1..r3 &= 0x0ffffffc */
and PR1, PR1, PT0
and PR2, PR2, PT0
and PR3, PR3, PT0
/* s[i] = r[i] + (r[i] >> 2) for i = 1..3 */
srl PS1, PR1, 2
srl PS2, PR2, 2
srl PS3, PR3, 2
addu PS1, PS1, PR1
addu PS2, PS2, PR2
addu PS3, PS3, PR3
/* Write r and s to the context. */
sw PR0, PTR_POLY1305_R(0)
sw PR1, PTR_POLY1305_R(1)
sw PR2, PTR_POLY1305_R(2)
sw PR3, PTR_POLY1305_R(3)
sw PS1, PTR_POLY1305_S(1)
sw PS2, PTR_POLY1305_S(2)
sw PS3, PTR_POLY1305_S(3)
/* Start with no pending carry and h = 0. */
sw $zero, PTR_POLY1305_CA
sw $zero, PTR_POLY1305_H(0)
sw $zero, PTR_POLY1305_H(1)
sw $zero, PTR_POLY1305_H(2)
sw $zero, PTR_POLY1305_H(3)
.set noreorder
/* Return; the last word of h is cleared in the delay slot. */
jr $ra
sw $zero, PTR_POLY1305_H(4)
.set reorder
.end poly1305_init_mips
|