#!/usr/bin/env perl
# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause
#
# ====================================================================
# Written by Andy Polyakov, @dot-asm, originally for the OpenSSL
# project.
# ====================================================================

# Poly1305 hash for MIPS.
#
# May 2016
#
# Numbers are cycles per processed byte with poly1305_blocks alone.
#
#		IALU/gcc
# R1x000	~5.5/+130%	(big-endian)
# Octeon II	2.50/+70%	(little-endian)
#
# March 2019
#
# Add 32-bit code path.
#
# October 2019
#
# Modulo-scheduling reduction allows to omit dependency chain at the
# end of inner loop and improve performance. Also optimize MIPS32R2
# code path for MIPS 1004K core. Per René von Dorst's suggestions.
#
#		IALU/gcc
# R1x000	~9.8/?		(big-endian)
# Octeon II	3.65/+140%	(little-endian)
# MT7621/1004K	4.75/?		(little-endian)
#
######################################################################
# There is a number of MIPS ABI in use, O32 and N32/64 are most
# widely used. Then there is a new contender: NUBI. It appears that if
# one picks the latter, it's possible to arrange code in ABI neutral
# manner. Therefore let's stick to NUBI register layout:
#
# NUBI register names mapped onto raw register numbers; the generated
# assembly interpolates these Perl scalars inside the heredocs below.
($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25));
($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23));
($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31));
#
# The return value is placed in $a0. Following coding rules facilitate
# interoperability:
#
# - never ever touch $tp, "thread pointer", former $gp [o32 can be
#   excluded from the rule, because it's specified volatile];
# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
#   old code];
# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
#
# For reference here is register layout for N32/64 MIPS ABIs:
#
# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
#
# <appro@openssl.org>
#
######################################################################
# Target ABI flavour, taken from the first command-line argument.
$flavour = shift || "64";	# supported flavours are o32,n32,64,nubi32,nubi64

# Register that carries the return value: $a0 under NUBI, $t0 otherwise.
$v0 = ($flavour =~ /nubi/i) ? $a0 : $t0;

if ($flavour =~ /64|n32/i) {{{
######################################################################
# 64-bit code path
#

my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3);
my ($in0,$in1,$tmp0,$tmp1,$tmp2,$tmp3,$tmp4) = ($a4,$a5,$a6,$a7,$at,$t0,$t1);

$code.=<<___;
#if (defined(_MIPS_ARCH_MIPS64R3) || defined(_MIPS_ARCH_MIPS64R5) || \\
     defined(_MIPS_ARCH_MIPS64R6)) \\
     && !defined(_MIPS_ARCH_MIPS64R2)
# define _MIPS_ARCH_MIPS64R2
#endif

#if defined(_MIPS_ARCH_MIPS64R6)
# define dmultu(rs,rt)
# define mflo(rd,rs,rt)	dmulu	rd,rs,rt
# define mfhi(rd,rs,rt)	dmuhu	rd,rs,rt
#else
# define dmultu(rs,rt)	dmultu	rs,rt
# define mflo(rd,rs,rt)	mflo	rd
# define mfhi(rd,rs,rt)	mfhi	rd
#endif

#ifdef	__KERNEL__
# define poly1305_init poly1305_init_mips
# define poly1305_blocks poly1305_blocks_mips
# define poly1305_emit poly1305_emit_mips
#endif

#if defined(__MIPSEB__) && !defined(MIPSEB)
# define MIPSEB
#endif

#ifdef MIPSEB
# define MSB 0
# define LSB 7
#else
# define MSB 7
# define LSB 0
#endif

.text
.set	noat
.set	noreorder

.align	5
.globl	poly1305_init
.ent	poly1305_init
poly1305_init:
	.frame	$sp,0,$ra
	.set	reorder

	sd	$zero,0($ctx)
	sd	$zero,8($ctx)
	sd	$zero,16($ctx)

	beqz	$inp,.Lno_key

#if defined(_MIPS_ARCH_MIPS64R6)
	andi	$tmp0,$inp,7		# $inp % 8
	dsubu	$inp,$inp,$tmp0		# align $inp
	sll	$tmp0,$tmp0,3		# byte to bit offset
	ld	$in0,0($inp)
	ld	$in1,8($inp)
	beqz	$tmp0,.Laligned_key
	ld	$tmp2,16($inp)

	subu	$tmp1,$zero,$tmp0
# ifdef	MIPSEB
	dsllv	$in0,$in0,$tmp0
	dsrlv	$tmp3,$in1,$tmp1
	dsllv	$in1,$in1,$tmp0
	dsrlv	$tmp2,$tmp2,$tmp1
# else
	dsrlv	$in0,$in0,$tmp0
	dsllv	$tmp3,$in1,$tmp1
	dsrlv	$in1,$in1,$tmp0
	dsllv	$tmp2,$tmp2,$tmp1
# endif
	or	$in0,$in0,$tmp3
	or	$in1,$in1,$tmp2
.Laligned_key:
#else
	ldl	$in0,0+MSB($inp)
	ldl	$in1,8+MSB($inp)
	ldr	$in0,0+LSB($inp)
	ldr	$in1,8+LSB($inp)
#endif
#ifdef	MIPSEB
# if defined(_MIPS_ARCH_MIPS64R2)
	dsbh	$in0,$in0		# byte swap
	dsbh	$in1,$in1
	dshd	$in0,$in0
	dshd	$in1,$in1
# else
	ori	$tmp0,$zero,0xFF
	dsll	$tmp2,$tmp0,32
	or	$tmp0,$tmp2		# 0x000000FF000000FF

	and	$tmp1,$in0,$tmp0	# byte swap
	and	$tmp3,$in1,$tmp0
	dsrl	$tmp2,$in0,24
	dsrl	$tmp4,$in1,24
	dsll	$tmp1,24
	dsll	$tmp3,24
	and	$tmp2,$tmp0
	and	$tmp4,$tmp0
	dsll	$tmp0,8			# 0x0000FF000000FF00
	or	$tmp1,$tmp2
	or	$tmp3,$tmp4
	and	$tmp2,$in0,$tmp0
	and	$tmp4,$in1,$tmp0
	dsrl	$in0,8
	dsrl	$in1,8
	dsll	$tmp2,8
	dsll	$tmp4,8
	and	$in0,$tmp0
	and	$in1,$tmp0
	or	$tmp1,$tmp2
	or	$tmp3,$tmp4
	or	$in0,$tmp1
	or	$in1,$tmp3
	dsrl	$tmp1,$in0,32
	dsrl	$tmp3,$in1,32
	dsll	$in0,32
	dsll	$in1,32
	or	$in0,$tmp1
	or	$in1,$tmp3
# endif
#endif
	li	$tmp0,1
	dsll	$tmp0,32		# 0x0000000100000000
	daddiu	$tmp0,-63		# 0x00000000ffffffc1
	dsll	$tmp0,28		# 0x0ffffffc10000000
	daddiu	$tmp0,-1		# 0x0ffffffc0fffffff

	and	$in0,$tmp0
	daddiu	$tmp0,-3		# 0x0ffffffc0ffffffc
	and	$in1,$tmp0

	sd	$in0,24($ctx)
	dsrl	$tmp0,$in1,2
	sd	$in1,32($ctx)
	daddu	$tmp0,$in1		# s1 = r1 + (r1 >> 2)
	sd	$tmp0,40($ctx)

.Lno_key:
	li	$v0,0			# return 0
	jr	$ra
.end	poly1305_init
___
{
my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0x0003f000" : "0x00030000";

my ($h0,$h1,$h2,$r0,$r1,$rs1,$d0,$d1,$d2) =
   ($s0,$s1,$s2,$s3,$s4,$s5,$in0,$in1,$t2);
my ($shr,$shl) = ($s6,$s7);		# used on R6

$code.=<<___;
.align	5
.globl	poly1305_blocks
.ent	poly1305_blocks
poly1305_blocks:
	.set	noreorder
	dsrl	$len,4			# number of complete blocks
	bnez	$len,poly1305_blocks_internal
	nop
	jr	$ra
	nop
.end	poly1305_blocks

.align	5
.ent	poly1305_blocks_internal
poly1305_blocks_internal:
	.set	noreorder
#if defined(_MIPS_ARCH_MIPS64R6)
	.frame	$sp,8*8,$ra
	.mask	$SAVED_REGS_MASK|0x000c0000,-8
	dsubu	$sp,8*8
	sd	$s7,56($sp)
	sd	$s6,48($sp)
#else
	.frame	$sp,6*8,$ra
	.mask	$SAVED_REGS_MASK,-8
	dsubu	$sp,6*8
#endif
	sd	$s5,40($sp)
	sd	$s4,32($sp)
___
$code.=<<___ if ($flavour =~ /nubi/i);	# optimize non-nubi prologue
	sd	$s3,24($sp)
	sd	$s2,16($sp)
	sd	$s1,8($sp)
	sd	$s0,0($sp)
___
$code.=<<___;
	.set	reorder

#if defined(_MIPS_ARCH_MIPS64R6)
	andi	$shr,$inp,7
	dsubu	$inp,$inp,$shr		# align $inp
	sll	$shr,$shr,3		# byte to bit offset
	subu	$shl,$zero,$shr
#endif

	ld	$h0,0($ctx)		# load hash value
	ld	$h1,8($ctx)
	ld	$h2,16($ctx)

	ld	$r0,24($ctx)		# load key
	ld	$r1,32($ctx)
	ld	$rs1,40($ctx)

	dsll	$len,4
	daddu	$len,$inp		# end of buffer
	b	.Loop

.align	4
.Loop:
#if defined(_MIPS_ARCH_MIPS64R6)
	ld	$in0,0($inp)		# load input
	ld	$in1,8($inp)
	beqz	$shr,.Laligned_inp

	ld	$tmp2,16($inp)
# ifdef	MIPSEB
	dsllv	$in0,$in0,$shr
	dsrlv	$tmp3,$in1,$shl
	dsllv	$in1,$in1,$shr
	dsrlv	$tmp2,$tmp2,$shl
# else
	dsrlv	$in0,$in0,$shr
	dsllv	$tmp3,$in1,$shl
	dsrlv	$in1,$in1,$shr
	dsllv	$tmp2,$tmp2,$shl
# endif
	or	$in0,$in0,$tmp3
	or	$in1,$in1,$tmp2
.Laligned_inp:
#else
	ldl	$in0,0+MSB($inp)	# load input
	ldl	$in1,8+MSB($inp)
	ldr	$in0,0+LSB($inp)
	ldr	$in1,8+LSB($inp)
#endif
	daddiu	$inp,16
#ifdef	MIPSEB
# if defined(_MIPS_ARCH_MIPS64R2)
	dsbh	$in0,$in0		# byte swap
	dsbh	$in1,$in1
	dshd	$in0,$in0
	dshd	$in1,$in1
# else
	ori	$tmp0,$zero,0xFF
	dsll	$tmp2,$tmp0,32
	or	$tmp0,$tmp2		# 0x000000FF000000FF

	and	$tmp1,$in0,$tmp0	# byte swap
	and	$tmp3,$in1,$tmp0
	dsrl	$tmp2,$in0,24
	dsrl	$tmp4,$in1,24
	dsll	$tmp1,24
	dsll	$tmp3,24
	and	$tmp2,$tmp0
	and	$tmp4,$tmp0
	dsll	$tmp0,8			# 0x0000FF000000FF00
	or	$tmp1,$tmp2
	or	$tmp3,$tmp4
	and	$tmp2,$in0,$tmp0
	and	$tmp4,$in1,$tmp0
	dsrl	$in0,8
	dsrl	$in1,8
	dsll	$tmp2,8
	dsll	$tmp4,8
	and	$in0,$tmp0
	and	$in1,$tmp0
	or	$tmp1,$tmp2
	or	$tmp3,$tmp4
	or	$in0,$tmp1
	or	$in1,$tmp3
	dsrl	$tmp1,$in0,32
	dsrl	$tmp3,$in1,32
	dsll	$in0,32
	dsll	$in1,32
	or	$in0,$tmp1
	or	$in1,$tmp3
# endif
#endif
	dsrl	$tmp1,$h2,2		# modulo-scheduled reduction
	andi	$h2,$h2,3
	dsll	$tmp0,$tmp1,2

	daddu	$d0,$h0,$in0		# accumulate input
	daddu	$tmp1,$tmp0
	sltu	$tmp0,$d0,$h0
	daddu	$d0,$d0,$tmp1		# ... and residue
	sltu	$tmp1,$d0,$tmp1
	daddu	$d1,$h1,$in1
	daddu	$tmp0,$tmp1
	sltu	$tmp1,$d1,$h1
	daddu	$d1,$tmp0

	dmultu	($r0,$d0)		# h0*r0
	daddu	$d2,$h2,$padbit
	sltu	$tmp0,$d1,$tmp0
	mflo	($h0,$r0,$d0)
	mfhi	($h1,$r0,$d0)

	dmultu	($rs1,$d1)		# h1*5*r1
	daddu	$d2,$tmp1
	daddu	$d2,$tmp0
	mflo	($tmp0,$rs1,$d1)
	mfhi	($tmp1,$rs1,$d1)

	dmultu	($r1,$d0)		# h0*r1
	mflo	($tmp2,$r1,$d0)
	mfhi	($h2,$r1,$d0)
	daddu	$h0,$tmp0
	daddu	$h1,$tmp1
	sltu	$tmp0,$h0,$tmp0

	dmultu	($r0,$d1)		# h1*r0
	daddu	$h1,$tmp0
	daddu	$h1,$tmp2
	mflo	($tmp0,$r0,$d1)
	mfhi	($tmp1,$r0,$d1)

	dmultu	($rs1,$d2)		# h2*5*r1
	sltu	$tmp2,$h1,$tmp2
	daddu	$h2,$tmp2
	mflo	($tmp2,$rs1,$d2)

	dmultu	($r0,$d2)		# h2*r0
	daddu	$h1,$tmp0
	daddu	$h2,$tmp1
	mflo	($tmp3,$r0,$d2)
	sltu	$tmp0,$h1,$tmp0
	daddu	$h2,$tmp0

	daddu	$h1,$tmp2
	sltu	$tmp2,$h1,$tmp2
	daddu	$h2,$tmp2
	daddu	$h2,$tmp3

	bne	$inp,$len,.Loop

	sd	$h0,0($ctx)		# store hash value
	sd	$h1,8($ctx)
	sd	$h2,16($ctx)

	.set	noreorder
#if defined(_MIPS_ARCH_MIPS64R6)
	ld	$s7,56($sp)
	ld	$s6,48($sp)
#endif
	ld	$s5,40($sp)		# epilogue
	ld	$s4,32($sp)
___
$code.=<<___ if ($flavour =~ /nubi/i);	# optimize non-nubi epilogue
	ld	$s3,24($sp)
	ld	$s2,16($sp)
	ld	$s1,8($sp)
	ld	$s0,0($sp)
___
$code.=<<___;
	jr	$ra
#if defined(_MIPS_ARCH_MIPS64R6)
	daddu	$sp,8*8
#else
	daddu	$sp,6*8
#endif
.end	poly1305_blocks_internal
___
}
{
my ($ctx,$mac,$nonce) = ($a0,$a1,$a2);

$code.=<<___;
.align	5
.globl	poly1305_emit
.ent	poly1305_emit
poly1305_emit:
	.frame	$sp,0,$ra
	.set	reorder

	ld	$tmp2,16($ctx)
	ld	$tmp0,0($ctx)
	ld	$tmp1,8($ctx)

	li	$in0,-4			# final reduction
	dsrl	$in1,$tmp2,2
	and	$in0,$tmp2
	andi	$tmp2,$tmp2,3
	daddu	$in0,$in1

	daddu	$tmp0,$tmp0,$in0
	sltu	$in1,$tmp0,$in0
	daddiu	$in0,$tmp0,5		# compare to modulus
	daddu	$tmp1,$tmp1,$in1
	sltiu	$tmp3,$in0,5
	sltu	$tmp4,$tmp1,$in1
	daddu	$in1,$tmp1,$tmp3
	daddu	$tmp2,$tmp2,$tmp4
	sltu	$tmp3,$in1,$tmp3
	daddu	$tmp2,$tmp2,$tmp3

	dsrl	$tmp2,2			# see if it carried/borrowed
	dsubu	$tmp2,$zero,$tmp2

	xor	$in0,$tmp0
	xor	$in1,$tmp1
	and	$in0,$tmp2
	and	$in1,$tmp2
	xor	$in0,$tmp0
	xor	$in1,$tmp1

	lwu	$tmp0,0($nonce)		# load nonce
	lwu	$tmp1,4($nonce)
	lwu	$tmp2,8($nonce)
	lwu	$tmp3,12($nonce)
	dsll	$tmp1,32
	dsll	$tmp3,32
	or	$tmp0,$tmp1
	or	$tmp2,$tmp3

	daddu	$in0,$tmp0		# accumulate nonce
	daddu	$in1,$tmp2
	sltu	$tmp0,$in0,$tmp0
	daddu	$in1,$tmp0

	dsrl	$tmp0,$in0,8		# write mac value
	dsrl	$tmp1,$in0,16
	dsrl	$tmp2,$in0,24
	sb	$in0,0($mac)
	dsrl	$tmp3,$in0,32
	sb	$tmp0,1($mac)
	dsrl	$tmp0,$in0,40
	sb	$tmp1,2($mac)
	dsrl	$tmp1,$in0,48
	sb	$tmp2,3($mac)
	dsrl	$tmp2,$in0,56
	sb	$tmp3,4($mac)
	dsrl	$tmp3,$in1,8
	sb	$tmp0,5($mac)
	dsrl	$tmp0,$in1,16
	sb	$tmp1,6($mac)
	dsrl	$tmp1,$in1,24
	sb	$tmp2,7($mac)

	sb	$in1,8($mac)
	dsrl	$tmp2,$in1,32
	sb	$tmp3,9($mac)
	dsrl	$tmp3,$in1,40
	sb	$tmp0,10($mac)
	dsrl	$tmp0,$in1,48
	sb	$tmp1,11($mac)
	dsrl	$tmp1,$in1,56
	sb	$tmp2,12($mac)
	sb	$tmp3,13($mac)
	sb	$tmp0,14($mac)
	sb	$tmp1,15($mac)

	jr	$ra
.end	poly1305_emit
.rdata
.asciiz	"Poly1305 for MIPS64, CRYPTOGAMS by \@dot-asm"
.align	2
___
}
}}} else {{{
######################################################################
# 32-bit code path
#

my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3);
my ($in0,$in1,$in2,$in3,$tmp0,$tmp1,$tmp2,$tmp3) =
   ($a4,$a5,$a6,$a7,$at,$t0,$t1,$t2);

$code.=<<___;
#if (defined(_MIPS_ARCH_MIPS32R3) || defined(_MIPS_ARCH_MIPS32R5) || \\
     defined(_MIPS_ARCH_MIPS32R6)) \\
     && !defined(_MIPS_ARCH_MIPS32R2)
# define _MIPS_ARCH_MIPS32R2
#endif

#if defined(_MIPS_ARCH_MIPS32R6)
# define multu(rs,rt)
# define mflo(rd,rs,rt)	mulu	rd,rs,rt
# define mfhi(rd,rs,rt)	muhu	rd,rs,rt
#else
# define multu(rs,rt)	multu	rs,rt
# define mflo(rd,rs,rt)	mflo	rd
# define mfhi(rd,rs,rt)	mfhi	rd
#endif

#ifdef	__KERNEL__
# define poly1305_init poly1305_init_mips
# define poly1305_blocks poly1305_blocks_mips
# define poly1305_emit poly1305_emit_mips
#endif

#if defined(__MIPSEB__) && !defined(MIPSEB)
# define MIPSEB
#endif

#ifdef MIPSEB
# define MSB 0
# define LSB 3
#else
# define MSB 3
# define LSB 0
#endif

.text
.set	noat
.set	noreorder

.align	5
.globl	poly1305_init
.ent	poly1305_init
poly1305_init:
	.frame	$sp,0,$ra
	.set	reorder

	sw	$zero,0($ctx)
	sw	$zero,4($ctx)
	sw	$zero,8($ctx)
	sw	$zero,12($ctx)
	sw	$zero,16($ctx)

	beqz	$inp,.Lno_key

#if defined(_MIPS_ARCH_MIPS32R6)
	andi	$tmp0,$inp,3		# $inp % 4
	subu	$inp,$inp,$tmp0		# align $inp
	sll	$tmp0,$tmp0,3		# byte to bit offset
	lw	$in0,0($inp)
	lw	$in1,4($inp)
	lw	$in2,8($inp)
	lw	$in3,12($inp)
	beqz	$tmp0,.Laligned_key

	lw	$tmp2,16($inp)
	subu	$tmp1,$zero,$tmp0
# ifdef	MIPSEB
	sllv	$in0,$in0,$tmp0
	srlv	$tmp3,$in1,$tmp1
	sllv	$in1,$in1,$tmp0
	or	$in0,$in0,$tmp3
	srlv	$tmp3,$in2,$tmp1
	sllv	$in2,$in2,$tmp0
	or	$in1,$in1,$tmp3
	srlv	$tmp3,$in3,$tmp1
	sllv	$in3,$in3,$tmp0
	or	$in2,$in2,$tmp3
	srlv	$tmp2,$tmp2,$tmp1
	or	$in3,$in3,$tmp2
# else
	srlv	$in0,$in0,$tmp0
	sllv	$tmp3,$in1,$tmp1
	srlv	$in1,$in1,$tmp0
	or	$in0,$in0,$tmp3
	sllv	$tmp3,$in2,$tmp1
	srlv	$in2,$in2,$tmp0
	or	$in1,$in1,$tmp3
	sllv	$tmp3,$in3,$tmp1
	srlv	$in3,$in3,$tmp0
	or	$in2,$in2,$tmp3
	sllv	$tmp2,$tmp2,$tmp1
	or	$in3,$in3,$tmp2
# endif
.Laligned_key:
#else
	lwl	$in0,0+MSB($inp)
	lwl	$in1,4+MSB($inp)
	lwl	$in2,8+MSB($inp)
	lwl	$in3,12+MSB($inp)
	lwr	$in0,0+LSB($inp)
	lwr	$in1,4+LSB($inp)
	lwr	$in2,8+LSB($inp)
	lwr	$in3,12+LSB($inp)
#endif
#ifdef	MIPSEB
# if defined(_MIPS_ARCH_MIPS32R2)
	wsbh	$in0,$in0		# byte swap
	wsbh	$in1,$in1
	wsbh	$in2,$in2
	wsbh	$in3,$in3
	rotr	$in0,$in0,16
	rotr	$in1,$in1,16
	rotr	$in2,$in2,16
	rotr	$in3,$in3,16
# else
	srl	$tmp0,$in0,24		# byte swap
	srl	$tmp1,$in0,8
	andi	$tmp2,$in0,0xFF00
	sll	$in0,$in0,24
	andi	$tmp1,0xFF00
	sll	$tmp2,$tmp2,8
	or	$in0,$tmp0
	srl	$tmp0,$in1,24
	or	$tmp1,$tmp2
	srl	$tmp2,$in1,8
	or	$in0,$tmp1
	andi	$tmp1,$in1,0xFF00
	sll	$in1,$in1,24
	andi	$tmp2,0xFF00
	sll	$tmp1,$tmp1,8
	or	$in1,$tmp0
	srl	$tmp0,$in2,24
	or	$tmp2,$tmp1
	srl	$tmp1,$in2,8
	or	$in1,$tmp2
	andi	$tmp2,$in2,0xFF00
	sll	$in2,$in2,24
	andi	$tmp1,0xFF00
	sll	$tmp2,$tmp2,8
	or	$in2,$tmp0
	srl	$tmp0,$in3,24
	or	$tmp1,$tmp2
	srl	$tmp2,$in3,8
	or	$in2,$tmp1
	andi	$tmp1,$in3,0xFF00
	sll	$in3,$in3,24
	andi	$tmp2,0xFF00
	sll	$tmp1,$tmp1,8
	or	$in3,$tmp0
	or	$tmp2,$tmp1
	or	$in3,$tmp2
# endif
#endif
	lui	$tmp0,0x0fff
	ori	$tmp0,0xffff		# 0x0fffffff
	and	$in0,$in0,$tmp0
	subu	$tmp0,3			# 0x0ffffffc
	and	$in1,$in1,$tmp0
	and	$in2,$in2,$tmp0
	and	$in3,$in3,$tmp0

	sw	$in0,20($ctx)
	sw	$in1,24($ctx)
	sw	$in2,28($ctx)
	sw	$in3,32($ctx)

	srl	$tmp1,$in1,2
	srl	$tmp2,$in2,2
	srl	$tmp3,$in3,2
	addu	$in1,$in1,$tmp1		# s1 = r1 + (r1 >> 2)
	addu	$in2,$in2,$tmp2
	addu	$in3,$in3,$tmp3
	sw	$in1,36($ctx)
	sw	$in2,40($ctx)
	sw	$in3,44($ctx)
.Lno_key:
	li	$v0,0
	jr	$ra
.end	poly1305_init
___
{
my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0x00fff000" : "0x00ff0000";

my ($h0,$h1,$h2,$h3,$h4, $r0,$r1,$r2,$r3, $rs1,$rs2,$rs3) =
   ($s0,$s1,$s2,$s3,$s4, $s5,$s6,$s7,$s8, $s9,$s10,$s11);
my ($d0,$d1,$d2,$d3) =
   ($a4,$a5,$a6,$a7);
my $shr = $t2;		# used on R6
my $one = $t2;		# used on R2

$code.=<<___;
.globl	poly1305_blocks
.align	5
.ent	poly1305_blocks
poly1305_blocks:
	.frame	$sp,16*4,$ra
	.mask	$SAVED_REGS_MASK,-4
	.set	noreorder
	subu	$sp, $sp,4*12
	sw	$s11,4*11($sp)
	sw	$s10,4*10($sp)
	sw	$s9, 4*9($sp)
	sw	$s8, 4*8($sp)
	sw	$s7, 4*7($sp)
	sw	$s6, 4*6($sp)
	sw	$s5, 4*5($sp)
	sw	$s4, 4*4($sp)
___
$code.=<<___ if ($flavour =~ /nubi/i);	# optimize non-nubi prologue
	sw	$s3, 4*3($sp)
	sw	$s2, 4*2($sp)
	sw	$s1, 4*1($sp)
	sw	$s0, 4*0($sp)
___
$code.=<<___;
	.set	reorder

	srl	$len,4			# number of complete blocks
	li	$one,1
	beqz	$len,.Labort

#if defined(_MIPS_ARCH_MIPS32R6)
	andi	$shr,$inp,3
	subu	$inp,$inp,$shr		# align $inp
	sll	$shr,$shr,3		# byte to bit offset
#endif

	lw	$h0,0($ctx)		# load hash value
	lw	$h1,4($ctx)
	lw	$h2,8($ctx)
	lw	$h3,12($ctx)
	lw	$h4,16($ctx)

	lw	$r0,20($ctx)		# load key
	lw	$r1,24($ctx)
	lw	$r2,28($ctx)
	lw	$r3,32($ctx)
	lw	$rs1,36($ctx)
	lw	$rs2,40($ctx)
	lw	$rs3,44($ctx)

	sll	$len,4
	addu	$len,$len,$inp		# end of buffer
	b	.Loop

.align	4
.Loop:
#if defined(_MIPS_ARCH_MIPS32R6)
	lw	$d0,0($inp)		# load input
	lw	$d1,4($inp)
	lw	$d2,8($inp)
	lw	$d3,12($inp)
	beqz	$shr,.Laligned_inp

	lw	$t0,16($inp)
	subu	$t1,$zero,$shr
# ifdef	MIPSEB
	sllv	$d0,$d0,$shr
	srlv	$at,$d1,$t1
	sllv	$d1,$d1,$shr
	or	$d0,$d0,$at
	srlv	$at,$d2,$t1
	sllv	$d2,$d2,$shr
	or	$d1,$d1,$at
	srlv	$at,$d3,$t1
	sllv	$d3,$d3,$shr
	or	$d2,$d2,$at
	srlv	$t0,$t0,$t1
	or	$d3,$d3,$t0
# else
	srlv	$d0,$d0,$shr
	sllv	$at,$d1,$t1
	srlv	$d1,$d1,$shr
	or	$d0,$d0,$at
	sllv	$at,$d2,$t1
	srlv	$d2,$d2,$shr
	or	$d1,$d1,$at
	sllv	$at,$d3,$t1
	srlv	$d3,$d3,$shr
	or	$d2,$d2,$at
	sllv	$t0,$t0,$t1
	or	$d3,$d3,$t0
# endif
.Laligned_inp:
#else
	lwl	$d0,0+MSB($inp)		# load input
	lwl	$d1,4+MSB($inp)
	lwl	$d2,8+MSB($inp)
	lwl	$d3,12+MSB($inp)
	lwr	$d0,0+LSB($inp)
	lwr	$d1,4+LSB($inp)
	lwr	$d2,8+LSB($inp)
	lwr	$d3,12+LSB($inp)
#endif
#ifdef	MIPSEB
# if defined(_MIPS_ARCH_MIPS32R2)
	wsbh	$d0,$d0			# byte swap
	wsbh	$d1,$d1
	wsbh	$d2,$d2
	wsbh	$d3,$d3
	rotr	$d0,$d0,16
	rotr	$d1,$d1,16
	rotr	$d2,$d2,16
	rotr	$d3,$d3,16
# else
	srl	$at,$d0,24		# byte swap
	srl	$t0,$d0,8
	andi	$t1,$d0,0xFF00
	sll	$d0,$d0,24
	andi	$t0,0xFF00
	sll	$t1,$t1,8
	or	$d0,$at
	srl	$at,$d1,24
	or	$t0,$t1
	srl	$t1,$d1,8
	or	$d0,$t0
	andi	$t0,$d1,0xFF00
	sll	$d1,$d1,24
	andi	$t1,0xFF00
	sll	$t0,$t0,8
	or	$d1,$at
	srl	$at,$d2,24
	or	$t1,$t0
	srl	$t0,$d2,8
	or	$d1,$t1
	andi	$t1,$d2,0xFF00
	sll	$d2,$d2,24
	andi	$t0,0xFF00
	sll	$t1,$t1,8
	or	$d2,$at
	srl	$at,$d3,24
	or	$t0,$t1
	srl	$t1,$d3,8
	or	$d2,$t0
	andi	$t0,$d3,0xFF00
	sll	$d3,$d3,24
	andi	$t1,0xFF00
	sll	$t0,$t0,8
	or	$d3,$at
	or	$t1,$t0
	or	$d3,$t1
# endif
#endif
	srl	$t0,$h4,2		# modulo-scheduled reduction
	andi	$h4,$h4,3
	sll	$at,$t0,2

	addu	$d0,$d0,$h0		# accumulate input
	addu	$t0,$t0,$at
	sltu	$h0,$d0,$h0
	addu	$d0,$d0,$t0		# ... and residue
	sltu	$at,$d0,$t0

	addu	$d1,$d1,$h1
	addu	$h0,$h0,$at		# carry
	sltu	$h1,$d1,$h1
	addu	$d1,$d1,$h0
	sltu	$h0,$d1,$h0

	addu	$d2,$d2,$h2
	addu	$h1,$h1,$h0		# carry
	sltu	$h2,$d2,$h2
	addu	$d2,$d2,$h1
	sltu	$h1,$d2,$h1

	addu	$d3,$d3,$h3
	addu	$h2,$h2,$h1		# carry
	sltu	$h3,$d3,$h3
	addu	$d3,$d3,$h2

#if defined(_MIPS_ARCH_MIPS32R2) && !defined(_MIPS_ARCH_MIPS32R6)
	multu	$r0,$d0			# d0*r0
	sltu	$h2,$d3,$h2
	maddu	$rs3,$d1		# d1*s3
	addu	$h3,$h3,$h2		# carry
	maddu	$rs2,$d2		# d2*s2
	addu	$h4,$h4,$padbit
	maddu	$rs1,$d3		# d3*s1
	addu	$h4,$h4,$h3
	mfhi	$at
	mflo	$h0

	multu	$r1,$d0			# d0*r1
	maddu	$r0,$d1			# d1*r0
	maddu	$rs3,$d2		# d2*s3
	maddu	$rs2,$d3		# d3*s2
	maddu	$rs1,$h4		# h4*s1
	maddu	$at,$one		# hi*1
	mfhi	$at
	mflo	$h1

	multu	$r2,$d0			# d0*r2
	maddu	$r1,$d1			# d1*r1
	maddu	$r0,$d2			# d2*r0
	maddu	$rs3,$d3		# d3*s3
	maddu	$rs2,$h4		# h4*s2
	maddu	$at,$one		# hi*1
	mfhi	$at
	mflo	$h2

	mul	$t0,$r0,$h4		# h4*r0

	multu	$r3,$d0			# d0*r3
	maddu	$r2,$d1			# d1*r2
	maddu	$r1,$d2			# d2*r1
	maddu	$r0,$d3			# d3*r0
	maddu	$rs3,$h4		# h4*s3
	maddu	$at,$one		# hi*1
	mfhi	$at
	mflo	$h3

	addiu	$inp,$inp,16

	addu	$h4,$t0,$at
#else
	multu	($r0,$d0)		# d0*r0
	mflo	($h0,$r0,$d0)
	mfhi	($h1,$r0,$d0)

	sltu	$h2,$d3,$h2
	addu	$h3,$h3,$h2		# carry

	multu	($rs3,$d1)		# d1*s3
	mflo	($at,$rs3,$d1)
	mfhi	($t0,$rs3,$d1)

	addu	$h4,$h4,$padbit
	addiu	$inp,$inp,16
	addu	$h4,$h4,$h3

	multu	($rs2,$d2)		# d2*s2
	mflo	($a3,$rs2,$d2)
	mfhi	($t1,$rs2,$d2)
	addu	$h0,$h0,$at
	addu	$h1,$h1,$t0
	multu	($rs1,$d3)		# d3*s1
	sltu	$at,$h0,$at
	addu	$h1,$h1,$at

	mflo	($at,$rs1,$d3)
	mfhi	($t0,$rs1,$d3)
	addu	$h0,$h0,$a3
	addu	$h1,$h1,$t1
	multu	($r1,$d0)		# d0*r1
	sltu	$a3,$h0,$a3
	addu	$h1,$h1,$a3


	mflo	($a3,$r1,$d0)
	mfhi	($h2,$r1,$d0)
	addu	$h0,$h0,$at
	addu	$h1,$h1,$t0
	multu	($r0,$d1)		# d1*r0
	sltu	$at,$h0,$at
	addu	$h1,$h1,$at

	mflo	($at,$r0,$d1)
	mfhi	($t0,$r0,$d1)
	addu	$h1,$h1,$a3
	sltu	$a3,$h1,$a3
	multu	($rs3,$d2)		# d2*s3
	addu	$h2,$h2,$a3

	mflo	($a3,$rs3,$d2)
	mfhi	($t1,$rs3,$d2)
	addu	$h1,$h1,$at
	addu	$h2,$h2,$t0
	multu	($rs2,$d3)		# d3*s2
	sltu	$at,$h1,$at
	addu	$h2,$h2,$at

	mflo	($at,$rs2,$d3)
	mfhi	($t0,$rs2,$d3)
	addu	$h1,$h1,$a3
	addu	$h2,$h2,$t1
	multu	($rs1,$h4)		# h4*s1
	sltu	$a3,$h1,$a3
	addu	$h2,$h2,$a3

	mflo	($a3,$rs1,$h4)
	addu	$h1,$h1,$at
	addu	$h2,$h2,$t0
	multu	($r2,$d0)		# d0*r2
	sltu	$at,$h1,$at
	addu	$h2,$h2,$at


	mflo	($at,$r2,$d0)
	mfhi	($h3,$r2,$d0)
	addu	$h1,$h1,$a3
	sltu	$a3,$h1,$a3
	multu	($r1,$d1)		# d1*r1
	addu	$h2,$h2,$a3

	mflo	($a3,$r1,$d1)
	mfhi	($t1,$r1,$d1)
	addu	$h2,$h2,$at
	sltu	$at,$h2,$at
	multu	($r0,$d2)		# d2*r0
	addu	$h3,$h3,$at

	mflo	($at,$r0,$d2)
	mfhi	($t0,$r0,$d2)
	addu	$h2,$h2,$a3
	addu	$h3,$h3,$t1
	multu	($rs3,$d3)		# d3*s3
	sltu	$a3,$h2,$a3
	addu	$h3,$h3,$a3

	mflo	($a3,$rs3,$d3)
	mfhi	($t1,$rs3,$d3)
	addu	$h2,$h2,$at
	addu	$h3,$h3,$t0
	multu	($rs2,$h4)		# h4*s2
	sltu	$at,$h2,$at
	addu	$h3,$h3,$at

	mflo	($at,$rs2,$h4)
	addu	$h2,$h2,$a3
	addu	$h3,$h3,$t1
	multu	($r3,$d0)		# d0*r3
	sltu	$a3,$h2,$a3
	addu	$h3,$h3,$a3


	mflo	($a3,$r3,$d0)
	mfhi	($t1,$r3,$d0)
	addu	$h2,$h2,$at
	sltu	$at,$h2,$at
	multu	($r2,$d1)		# d1*r2
	addu	$h3,$h3,$at

	mflo	($at,$r2,$d1)
	mfhi	($t0,$r2,$d1)
	addu	$h3,$h3,$a3
	sltu	$a3,$h3,$a3
	multu	($r0,$d3)		# d3*r0
	addu	$t1,$t1,$a3

	mflo	($a3,$r0,$d3)
	mfhi	($d3,$r0,$d3)
	addu	$h3,$h3,$at
	addu	$t1,$t1,$t0
	multu	($r1,$d2)		# d2*r1
	sltu	$at,$h3,$at
	addu	$t1,$t1,$at

	mflo	($at,$r1,$d2)
	mfhi	($t0,$r1,$d2)
	addu	$h3,$h3,$a3
	addu	$t1,$t1,$d3
	multu	($rs3,$h4)		# h4*s3
	sltu	$a3,$h3,$a3
	addu	$t1,$t1,$a3

	mflo	($a3,$rs3,$h4)
	addu	$h3,$h3,$at
	addu	$t1,$t1,$t0
	multu	($r0,$h4)		# h4*r0
	sltu	$at,$h3,$at
	addu	$t1,$t1,$at


	mflo	($h4,$r0,$h4)
	addu	$h3,$h3,$a3
	sltu	$a3,$h3,$a3
	addu	$t1,$t1,$a3
	addu	$h4,$h4,$t1

	li	$padbit,1		# if we loop, padbit is 1
#endif
	bne	$inp,$len,.Loop

	sw	$h0,0($ctx)		# store hash value
	sw	$h1,4($ctx)
	sw	$h2,8($ctx)
	sw	$h3,12($ctx)
	sw	$h4,16($ctx)

	.set	noreorder
.Labort:
	lw	$s11,4*11($sp)
	lw	$s10,4*10($sp)
	lw	$s9, 4*9($sp)
	lw	$s8, 4*8($sp)
	lw	$s7, 4*7($sp)
	lw	$s6, 4*6($sp)
	lw	$s5, 4*5($sp)
	lw	$s4, 4*4($sp)
___
$code.=<<___ if ($flavour =~ /nubi/i);	# optimize non-nubi prologue
	lw	$s3, 4*3($sp)
	lw	$s2, 4*2($sp)
	lw	$s1, 4*1($sp)
	lw	$s0, 4*0($sp)
___
$code.=<<___;
	jr	$ra
	addu	$sp,$sp,4*12
.end	poly1305_blocks
___
}
{
my ($ctx,$mac,$nonce,$tmp4) = ($a0,$a1,$a2,$a3);

$code.=<<___;
.align	5
.globl	poly1305_emit
.ent	poly1305_emit
poly1305_emit:
	.frame	$sp,0,$ra
	.set	reorder

	lw	$tmp4,16($ctx)
	lw	$tmp0,0($ctx)
	lw	$tmp1,4($ctx)
	lw	$tmp2,8($ctx)
	lw	$tmp3,12($ctx)

	li	$in0,-4			# final reduction
	srl	$ctx,$tmp4,2
	and	$in0,$in0,$tmp4
	andi	$tmp4,$tmp4,3
	addu	$ctx,$ctx,$in0

	addu	$tmp0,$tmp0,$ctx
	sltu	$ctx,$tmp0,$ctx
	addiu	$in0,$tmp0,5		# compare to modulus
	addu	$tmp1,$tmp1,$ctx
	sltiu	$in1,$in0,5
	sltu	$ctx,$tmp1,$ctx
	addu	$in1,$in1,$tmp1
	addu	$tmp2,$tmp2,$ctx
	sltu	$in2,$in1,$tmp1
	sltu	$ctx,$tmp2,$ctx
	addu	$in2,$in2,$tmp2
	addu	$tmp3,$tmp3,$ctx
	sltu	$in3,$in2,$tmp2
	sltu	$ctx,$tmp3,$ctx
	addu	$in3,$in3,$tmp3
	addu	$tmp4,$tmp4,$ctx
	sltu	$ctx,$in3,$tmp3
	addu	$ctx,$tmp4

	srl	$ctx,2			# see if it carried/borrowed
	subu	$ctx,$zero,$ctx

	xor	$in0,$tmp0
	xor	$in1,$tmp1
	xor	$in2,$tmp2
	xor	$in3,$tmp3
	and	$in0,$ctx
	and	$in1,$ctx
	and	$in2,$ctx
	and	$in3,$ctx
	xor	$in0,$tmp0
	xor	$in1,$tmp1
	xor	$in2,$tmp2
	xor	$in3,$tmp3

	lw	$tmp0,0($nonce)		# load nonce
	lw	$tmp1,4($nonce)
	lw	$tmp2,8($nonce)
	lw	$tmp3,12($nonce)

	addu	$in0,$tmp0		# accumulate nonce
	sltu	$ctx,$in0,$tmp0

	addu	$in1,$tmp1
	sltu	$tmp1,$in1,$tmp1
	addu	$in1,$ctx
	sltu	$ctx,$in1,$ctx
	addu	$ctx,$tmp1

	addu	$in2,$tmp2
	sltu	$tmp2,$in2,$tmp2
	addu	$in2,$ctx
	sltu	$ctx,$in2,$ctx
	addu	$ctx,$tmp2

	addu	$in3,$tmp3
	addu	$in3,$ctx

	srl	$tmp0,$in0,8		# write mac value
	srl	$tmp1,$in0,16
	srl	$tmp2,$in0,24
	sb	$in0, 0($mac)
	sb	$tmp0,1($mac)
	srl	$tmp0,$in1,8
	sb	$tmp1,2($mac)
	srl	$tmp1,$in1,16
	sb	$tmp2,3($mac)
	srl	$tmp2,$in1,24
	sb	$in1, 4($mac)
	sb	$tmp0,5($mac)
	srl	$tmp0,$in2,8
	sb	$tmp1,6($mac)
	srl	$tmp1,$in2,16
	sb	$tmp2,7($mac)
	srl	$tmp2,$in2,24
	sb	$in2, 8($mac)
	sb	$tmp0,9($mac)
	srl	$tmp0,$in3,8
	sb	$tmp1,10($mac)
	srl	$tmp1,$in3,16
	sb	$tmp2,11($mac)
	srl	$tmp2,$in3,24
	sb	$in3, 12($mac)
	sb	$tmp0,13($mac)
	sb	$tmp1,14($mac)
	sb	$tmp2,15($mac)

	jr	$ra
.end	poly1305_emit
.rdata
.asciiz	"Poly1305 for MIPS32, CRYPTOGAMS by \@dot-asm"
.align	2
___
}
}}}

# Emit the generated assembly.  If an output path was given as the last
# command-line argument, redirect STDOUT to it; otherwise write to the
# existing STDOUT.  Use 3-arg open and check for failure so a bad path
# is reported instead of silently producing output in the wrong place.
if ($output = pop @ARGV) {
	open STDOUT, '>', $output or die "can't open $output: $!";
}
print $code;
close STDOUT or die "error closing STDOUT: $!";