Linux Audio

Check our new training course

Loading...
Note: File does not exist in v3.1.
   1#!/usr/bin/env perl
   2# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause
   3#
   4# ====================================================================
   5# Written by Andy Polyakov, @dot-asm, originally for the OpenSSL
   6# project.
   7# ====================================================================
   8
   9# Poly1305 hash for MIPS.
  10#
  11# May 2016
  12#
  13# Numbers are cycles per processed byte with poly1305_blocks alone.
  14#
  15#		IALU/gcc
  16# R1x000	~5.5/+130%	(big-endian)
  17# Octeon II	2.50/+70%	(little-endian)
  18#
  19# March 2019
  20#
  21# Add 32-bit code path.
  22#
  23# October 2019
  24#
  25# Modulo-scheduling the reduction allows omitting the dependency chain at
  26# the end of the inner loop, which improves performance. Also optimize the
  27# MIPS32R2 code path for the MIPS 1004K core, per René von Dorst's suggestions.
  28#
  29#		IALU/gcc
  30# R1x000	~9.8/?		(big-endian)
  31# Octeon II	3.65/+140%	(little-endian)
  32# MT7621/1004K	4.75/?		(little-endian)
  33#
  34######################################################################
  35# There are a number of MIPS ABIs in use; O32 and N32/64 are the most
  36# widely used. Then there is a new contender: NUBI. It appears that if
  37# one picks the latter, it's possible to arrange code in an ABI-neutral
  38# manner. Therefore let's stick to the NUBI register layout:
  39#
  40($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25));
  41($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
  42($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23));
  43($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31));
  44#
  45# The return value is placed in $a0. The following coding rules facilitate
  46# interoperability:
  47#
  48# - never ever touch $tp, "thread pointer", former $gp [o32 can be
  49#   excluded from the rule, because it's specified volatile];
  50# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
  51#   old code];
  52# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
  53#
  54# For reference here is register layout for N32/64 MIPS ABIs:
  55#
  56# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
  57# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
  58# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
  59# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
  60# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
  61#
  62# <appro@openssl.org>
  63#
  64######################################################################
  65
  66$flavour = shift || "64"; # supported flavours are o32,n32,64,nubi32,nubi64
  67
  68$v0 = ($flavour =~ /nubi/i) ? $a0 : $t0;
  69
  70if ($flavour =~ /64|n32/i) {{{
  71######################################################################
  72# 64-bit code path
  73#
  74
  75my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3);
  76my ($in0,$in1,$tmp0,$tmp1,$tmp2,$tmp3,$tmp4) = ($a4,$a5,$a6,$a7,$at,$t0,$t1);
  77
  78$code.=<<___;
  79#if (defined(_MIPS_ARCH_MIPS64R3) || defined(_MIPS_ARCH_MIPS64R5) || \\
  80     defined(_MIPS_ARCH_MIPS64R6)) \\
  81     && !defined(_MIPS_ARCH_MIPS64R2)
  82# define _MIPS_ARCH_MIPS64R2
  83#endif
  84
  85#if defined(_MIPS_ARCH_MIPS64R6)
  86# define dmultu(rs,rt)
  87# define mflo(rd,rs,rt)	dmulu	rd,rs,rt
  88# define mfhi(rd,rs,rt)	dmuhu	rd,rs,rt
  89#else
  90# define dmultu(rs,rt)		dmultu	rs,rt
  91# define mflo(rd,rs,rt)	mflo	rd
  92# define mfhi(rd,rs,rt)	mfhi	rd
  93#endif
  94
  95#ifdef	__KERNEL__
  96# define poly1305_init   poly1305_init_mips
  97# define poly1305_blocks poly1305_blocks_mips
  98# define poly1305_emit   poly1305_emit_mips
  99#endif
 100
 101#if defined(__MIPSEB__) && !defined(MIPSEB)
 102# define MIPSEB
 103#endif
 104
 105#ifdef MIPSEB
 106# define MSB 0
 107# define LSB 7
 108#else
 109# define MSB 7
 110# define LSB 0
 111#endif
 112
 113.text
 114.set	noat
 115.set	noreorder
 116
 117.align	5
 118.globl	poly1305_init
 119.ent	poly1305_init
 120poly1305_init:
 121	.frame	$sp,0,$ra
 122	.set	reorder
 123
 124	sd	$zero,0($ctx)
 125	sd	$zero,8($ctx)
 126	sd	$zero,16($ctx)
 127
 128	beqz	$inp,.Lno_key
 129
 130#if defined(_MIPS_ARCH_MIPS64R6)
 131	andi	$tmp0,$inp,7		# $inp % 8
 132	dsubu	$inp,$inp,$tmp0		# align $inp
 133	sll	$tmp0,$tmp0,3		# byte to bit offset
 134	ld	$in0,0($inp)
 135	ld	$in1,8($inp)
 136	beqz	$tmp0,.Laligned_key
 137	ld	$tmp2,16($inp)
 138
 139	subu	$tmp1,$zero,$tmp0
 140# ifdef	MIPSEB
 141	dsllv	$in0,$in0,$tmp0
 142	dsrlv	$tmp3,$in1,$tmp1
 143	dsllv	$in1,$in1,$tmp0
 144	dsrlv	$tmp2,$tmp2,$tmp1
 145# else
 146	dsrlv	$in0,$in0,$tmp0
 147	dsllv	$tmp3,$in1,$tmp1
 148	dsrlv	$in1,$in1,$tmp0
 149	dsllv	$tmp2,$tmp2,$tmp1
 150# endif
 151	or	$in0,$in0,$tmp3
 152	or	$in1,$in1,$tmp2
 153.Laligned_key:
 154#else
 155	ldl	$in0,0+MSB($inp)
 156	ldl	$in1,8+MSB($inp)
 157	ldr	$in0,0+LSB($inp)
 158	ldr	$in1,8+LSB($inp)
 159#endif
 160#ifdef	MIPSEB
 161# if defined(_MIPS_ARCH_MIPS64R2)
 162	dsbh	$in0,$in0		# byte swap
 163	 dsbh	$in1,$in1
 164	dshd	$in0,$in0
 165	 dshd	$in1,$in1
 166# else
 167	ori	$tmp0,$zero,0xFF
 168	dsll	$tmp2,$tmp0,32
 169	or	$tmp0,$tmp2		# 0x000000FF000000FF
 170
 171	and	$tmp1,$in0,$tmp0	# byte swap
 172	 and	$tmp3,$in1,$tmp0
 173	dsrl	$tmp2,$in0,24
 174	 dsrl	$tmp4,$in1,24
 175	dsll	$tmp1,24
 176	 dsll	$tmp3,24
 177	and	$tmp2,$tmp0
 178	 and	$tmp4,$tmp0
 179	dsll	$tmp0,8			# 0x0000FF000000FF00
 180	or	$tmp1,$tmp2
 181	 or	$tmp3,$tmp4
 182	and	$tmp2,$in0,$tmp0
 183	 and	$tmp4,$in1,$tmp0
 184	dsrl	$in0,8
 185	 dsrl	$in1,8
 186	dsll	$tmp2,8
 187	 dsll	$tmp4,8
 188	and	$in0,$tmp0
 189	 and	$in1,$tmp0
 190	or	$tmp1,$tmp2
 191	 or	$tmp3,$tmp4
 192	or	$in0,$tmp1
 193	 or	$in1,$tmp3
 194	dsrl	$tmp1,$in0,32
 195	 dsrl	$tmp3,$in1,32
 196	dsll	$in0,32
 197	 dsll	$in1,32
 198	or	$in0,$tmp1
 199	 or	$in1,$tmp3
 200# endif
 201#endif
 202	li	$tmp0,1
 203	dsll	$tmp0,32		# 0x0000000100000000
 204	daddiu	$tmp0,-63		# 0x00000000ffffffc1
 205	dsll	$tmp0,28		# 0x0ffffffc10000000
 206	daddiu	$tmp0,-1		# 0x0ffffffc0fffffff
 207
 208	and	$in0,$tmp0
 209	daddiu	$tmp0,-3		# 0x0ffffffc0ffffffc
 210	and	$in1,$tmp0
 211
 212	sd	$in0,24($ctx)
 213	dsrl	$tmp0,$in1,2
 214	sd	$in1,32($ctx)
 215	daddu	$tmp0,$in1		# s1 = r1 + (r1 >> 2)
 216	sd	$tmp0,40($ctx)
 217
 218.Lno_key:
 219	li	$v0,0			# return 0
 220	jr	$ra
 221.end	poly1305_init
 222___
 223{
 224my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0x0003f000" : "0x00030000";
 225
 226my ($h0,$h1,$h2,$r0,$r1,$rs1,$d0,$d1,$d2) =
 227   ($s0,$s1,$s2,$s3,$s4,$s5,$in0,$in1,$t2);
 228my ($shr,$shl) = ($s6,$s7);		# used on R6
 229
 230$code.=<<___;
 231.align	5
 232.globl	poly1305_blocks
 233.ent	poly1305_blocks
 234poly1305_blocks:
 235	.set	noreorder
 236	dsrl	$len,4			# number of complete blocks
 237	bnez	$len,poly1305_blocks_internal
 238	nop
 239	jr	$ra
 240	nop
 241.end	poly1305_blocks
 242
 243.align	5
 244.ent	poly1305_blocks_internal
 245poly1305_blocks_internal:
 246	.set	noreorder
 247#if defined(_MIPS_ARCH_MIPS64R6)
 248	.frame	$sp,8*8,$ra
 249	.mask	$SAVED_REGS_MASK|0x000c0000,-8
 250	dsubu	$sp,8*8
 251	sd	$s7,56($sp)
 252	sd	$s6,48($sp)
 253#else
 254	.frame	$sp,6*8,$ra
 255	.mask	$SAVED_REGS_MASK,-8
 256	dsubu	$sp,6*8
 257#endif
 258	sd	$s5,40($sp)
 259	sd	$s4,32($sp)
 260___
 261$code.=<<___ if ($flavour =~ /nubi/i);	# optimize non-nubi prologue
 262	sd	$s3,24($sp)
 263	sd	$s2,16($sp)
 264	sd	$s1,8($sp)
 265	sd	$s0,0($sp)
 266___
 267$code.=<<___;
 268	.set	reorder
 269
 270#if defined(_MIPS_ARCH_MIPS64R6)
 271	andi	$shr,$inp,7
 272	dsubu	$inp,$inp,$shr		# align $inp
 273	sll	$shr,$shr,3		# byte to bit offset
 274	subu	$shl,$zero,$shr
 275#endif
 276
 277	ld	$h0,0($ctx)		# load hash value
 278	ld	$h1,8($ctx)
 279	ld	$h2,16($ctx)
 280
 281	ld	$r0,24($ctx)		# load key
 282	ld	$r1,32($ctx)
 283	ld	$rs1,40($ctx)
 284
 285	dsll	$len,4
 286	daddu	$len,$inp		# end of buffer
 287	b	.Loop
 288
 289.align	4
 290.Loop:
 291#if defined(_MIPS_ARCH_MIPS64R6)
 292	ld	$in0,0($inp)		# load input
 293	ld	$in1,8($inp)
 294	beqz	$shr,.Laligned_inp
 295
 296	ld	$tmp2,16($inp)
 297# ifdef	MIPSEB
 298	dsllv	$in0,$in0,$shr
 299	dsrlv	$tmp3,$in1,$shl
 300	dsllv	$in1,$in1,$shr
 301	dsrlv	$tmp2,$tmp2,$shl
 302# else
 303	dsrlv	$in0,$in0,$shr
 304	dsllv	$tmp3,$in1,$shl
 305	dsrlv	$in1,$in1,$shr
 306	dsllv	$tmp2,$tmp2,$shl
 307# endif
 308	or	$in0,$in0,$tmp3
 309	or	$in1,$in1,$tmp2
 310.Laligned_inp:
 311#else
 312	ldl	$in0,0+MSB($inp)	# load input
 313	ldl	$in1,8+MSB($inp)
 314	ldr	$in0,0+LSB($inp)
 315	ldr	$in1,8+LSB($inp)
 316#endif
 317	daddiu	$inp,16
 318#ifdef	MIPSEB
 319# if defined(_MIPS_ARCH_MIPS64R2)
 320	dsbh	$in0,$in0		# byte swap
 321	 dsbh	$in1,$in1
 322	dshd	$in0,$in0
 323	 dshd	$in1,$in1
 324# else
 325	ori	$tmp0,$zero,0xFF
 326	dsll	$tmp2,$tmp0,32
 327	or	$tmp0,$tmp2		# 0x000000FF000000FF
 328
 329	and	$tmp1,$in0,$tmp0	# byte swap
 330	 and	$tmp3,$in1,$tmp0
 331	dsrl	$tmp2,$in0,24
 332	 dsrl	$tmp4,$in1,24
 333	dsll	$tmp1,24
 334	 dsll	$tmp3,24
 335	and	$tmp2,$tmp0
 336	 and	$tmp4,$tmp0
 337	dsll	$tmp0,8			# 0x0000FF000000FF00
 338	or	$tmp1,$tmp2
 339	 or	$tmp3,$tmp4
 340	and	$tmp2,$in0,$tmp0
 341	 and	$tmp4,$in1,$tmp0
 342	dsrl	$in0,8
 343	 dsrl	$in1,8
 344	dsll	$tmp2,8
 345	 dsll	$tmp4,8
 346	and	$in0,$tmp0
 347	 and	$in1,$tmp0
 348	or	$tmp1,$tmp2
 349	 or	$tmp3,$tmp4
 350	or	$in0,$tmp1
 351	 or	$in1,$tmp3
 352	dsrl	$tmp1,$in0,32
 353	 dsrl	$tmp3,$in1,32
 354	dsll	$in0,32
 355	 dsll	$in1,32
 356	or	$in0,$tmp1
 357	 or	$in1,$tmp3
 358# endif
 359#endif
 360	dsrl	$tmp1,$h2,2		# modulo-scheduled reduction
 361	andi	$h2,$h2,3
 362	dsll	$tmp0,$tmp1,2
 363
 364	daddu	$d0,$h0,$in0		# accumulate input
 365	 daddu	$tmp1,$tmp0
 366	sltu	$tmp0,$d0,$h0
 367	daddu	$d0,$d0,$tmp1		# ... and residue
 368	sltu	$tmp1,$d0,$tmp1
 369	daddu	$d1,$h1,$in1
 370	daddu	$tmp0,$tmp1
 371	sltu	$tmp1,$d1,$h1
 372	daddu	$d1,$tmp0
 373
 374	dmultu	($r0,$d0)		# h0*r0
 375	 daddu	$d2,$h2,$padbit
 376	 sltu	$tmp0,$d1,$tmp0
 377	mflo	($h0,$r0,$d0)
 378	mfhi	($h1,$r0,$d0)
 379
 380	dmultu	($rs1,$d1)		# h1*5*r1
 381	 daddu	$d2,$tmp1
 382	 daddu	$d2,$tmp0
 383	mflo	($tmp0,$rs1,$d1)
 384	mfhi	($tmp1,$rs1,$d1)
 385
 386	dmultu	($r1,$d0)		# h0*r1
 387	mflo	($tmp2,$r1,$d0)
 388	mfhi	($h2,$r1,$d0)
 389	 daddu	$h0,$tmp0
 390	 daddu	$h1,$tmp1
 391	 sltu	$tmp0,$h0,$tmp0
 392
 393	dmultu	($r0,$d1)		# h1*r0
 394	 daddu	$h1,$tmp0
 395	 daddu	$h1,$tmp2
 396	mflo	($tmp0,$r0,$d1)
 397	mfhi	($tmp1,$r0,$d1)
 398
 399	dmultu	($rs1,$d2)		# h2*5*r1
 400	 sltu	$tmp2,$h1,$tmp2
 401	 daddu	$h2,$tmp2
 402	mflo	($tmp2,$rs1,$d2)
 403
 404	dmultu	($r0,$d2)		# h2*r0
 405	 daddu	$h1,$tmp0
 406	 daddu	$h2,$tmp1
 407	mflo	($tmp3,$r0,$d2)
 408	 sltu	$tmp0,$h1,$tmp0
 409	 daddu	$h2,$tmp0
 410
 411	daddu	$h1,$tmp2
 412	sltu	$tmp2,$h1,$tmp2
 413	daddu	$h2,$tmp2
 414	daddu	$h2,$tmp3
 415
 416	bne	$inp,$len,.Loop
 417
 418	sd	$h0,0($ctx)		# store hash value
 419	sd	$h1,8($ctx)
 420	sd	$h2,16($ctx)
 421
 422	.set	noreorder
 423#if defined(_MIPS_ARCH_MIPS64R6)
 424	ld	$s7,56($sp)
 425	ld	$s6,48($sp)
 426#endif
 427	ld	$s5,40($sp)		# epilogue
 428	ld	$s4,32($sp)
 429___
 430$code.=<<___ if ($flavour =~ /nubi/i);	# optimize non-nubi epilogue
 431	ld	$s3,24($sp)
 432	ld	$s2,16($sp)
 433	ld	$s1,8($sp)
 434	ld	$s0,0($sp)
 435___
 436$code.=<<___;
 437	jr	$ra
 438#if defined(_MIPS_ARCH_MIPS64R6)
 439	daddu	$sp,8*8
 440#else
 441	daddu	$sp,6*8
 442#endif
 443.end	poly1305_blocks_internal
 444___
 445}
 446{
 447my ($ctx,$mac,$nonce) = ($a0,$a1,$a2);
 448
 449$code.=<<___;
 450.align	5
 451.globl	poly1305_emit
 452.ent	poly1305_emit
 453poly1305_emit:
 454	.frame	$sp,0,$ra
 455	.set	reorder
 456
 457	ld	$tmp2,16($ctx)
 458	ld	$tmp0,0($ctx)
 459	ld	$tmp1,8($ctx)
 460
 461	li	$in0,-4			# final reduction
 462	dsrl	$in1,$tmp2,2
 463	and	$in0,$tmp2
 464	andi	$tmp2,$tmp2,3
 465	daddu	$in0,$in1
 466
 467	daddu	$tmp0,$tmp0,$in0
 468	sltu	$in1,$tmp0,$in0
 469	 daddiu	$in0,$tmp0,5		# compare to modulus
 470	daddu	$tmp1,$tmp1,$in1
 471	 sltiu	$tmp3,$in0,5
 472	sltu	$tmp4,$tmp1,$in1
 473	 daddu	$in1,$tmp1,$tmp3
 474	daddu	$tmp2,$tmp2,$tmp4
 475	 sltu	$tmp3,$in1,$tmp3
 476	 daddu	$tmp2,$tmp2,$tmp3
 477
 478	dsrl	$tmp2,2			# see if it carried/borrowed
 479	dsubu	$tmp2,$zero,$tmp2
 480
 481	xor	$in0,$tmp0
 482	xor	$in1,$tmp1
 483	and	$in0,$tmp2
 484	and	$in1,$tmp2
 485	xor	$in0,$tmp0
 486	xor	$in1,$tmp1
 487
 488	lwu	$tmp0,0($nonce)		# load nonce
 489	lwu	$tmp1,4($nonce)
 490	lwu	$tmp2,8($nonce)
 491	lwu	$tmp3,12($nonce)
 492	dsll	$tmp1,32
 493	dsll	$tmp3,32
 494	or	$tmp0,$tmp1
 495	or	$tmp2,$tmp3
 496
 497	daddu	$in0,$tmp0		# accumulate nonce
 498	daddu	$in1,$tmp2
 499	sltu	$tmp0,$in0,$tmp0
 500	daddu	$in1,$tmp0
 501
 502	dsrl	$tmp0,$in0,8		# write mac value
 503	dsrl	$tmp1,$in0,16
 504	dsrl	$tmp2,$in0,24
 505	sb	$in0,0($mac)
 506	dsrl	$tmp3,$in0,32
 507	sb	$tmp0,1($mac)
 508	dsrl	$tmp0,$in0,40
 509	sb	$tmp1,2($mac)
 510	dsrl	$tmp1,$in0,48
 511	sb	$tmp2,3($mac)
 512	dsrl	$tmp2,$in0,56
 513	sb	$tmp3,4($mac)
 514	dsrl	$tmp3,$in1,8
 515	sb	$tmp0,5($mac)
 516	dsrl	$tmp0,$in1,16
 517	sb	$tmp1,6($mac)
 518	dsrl	$tmp1,$in1,24
 519	sb	$tmp2,7($mac)
 520
 521	sb	$in1,8($mac)
 522	dsrl	$tmp2,$in1,32
 523	sb	$tmp3,9($mac)
 524	dsrl	$tmp3,$in1,40
 525	sb	$tmp0,10($mac)
 526	dsrl	$tmp0,$in1,48
 527	sb	$tmp1,11($mac)
 528	dsrl	$tmp1,$in1,56
 529	sb	$tmp2,12($mac)
 530	sb	$tmp3,13($mac)
 531	sb	$tmp0,14($mac)
 532	sb	$tmp1,15($mac)
 533
 534	jr	$ra
 535.end	poly1305_emit
 536.rdata
 537.asciiz	"Poly1305 for MIPS64, CRYPTOGAMS by \@dot-asm"
 538.align	2
 539___
 540}
 541}}} else {{{
 542######################################################################
 543# 32-bit code path
 544#
 545
 546my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3);
 547my ($in0,$in1,$in2,$in3,$tmp0,$tmp1,$tmp2,$tmp3) =
 548   ($a4,$a5,$a6,$a7,$at,$t0,$t1,$t2);
 549
 550$code.=<<___;
 551#if (defined(_MIPS_ARCH_MIPS32R3) || defined(_MIPS_ARCH_MIPS32R5) || \\
 552     defined(_MIPS_ARCH_MIPS32R6)) \\
 553     && !defined(_MIPS_ARCH_MIPS32R2)
 554# define _MIPS_ARCH_MIPS32R2
 555#endif
 556
 557#if defined(_MIPS_ARCH_MIPS32R6)
 558# define multu(rs,rt)
 559# define mflo(rd,rs,rt)	mulu	rd,rs,rt
 560# define mfhi(rd,rs,rt)	muhu	rd,rs,rt
 561#else
 562# define multu(rs,rt)	multu	rs,rt
 563# define mflo(rd,rs,rt)	mflo	rd
 564# define mfhi(rd,rs,rt)	mfhi	rd
 565#endif
 566
 567#ifdef	__KERNEL__
 568# define poly1305_init   poly1305_init_mips
 569# define poly1305_blocks poly1305_blocks_mips
 570# define poly1305_emit   poly1305_emit_mips
 571#endif
 572
 573#if defined(__MIPSEB__) && !defined(MIPSEB)
 574# define MIPSEB
 575#endif
 576
 577#ifdef MIPSEB
 578# define MSB 0
 579# define LSB 3
 580#else
 581# define MSB 3
 582# define LSB 0
 583#endif
 584
 585.text
 586.set	noat
 587.set	noreorder
 588
 589.align	5
 590.globl	poly1305_init
 591.ent	poly1305_init
 592poly1305_init:
 593	.frame	$sp,0,$ra
 594	.set	reorder
 595
 596	sw	$zero,0($ctx)
 597	sw	$zero,4($ctx)
 598	sw	$zero,8($ctx)
 599	sw	$zero,12($ctx)
 600	sw	$zero,16($ctx)
 601
 602	beqz	$inp,.Lno_key
 603
 604#if defined(_MIPS_ARCH_MIPS32R6)
 605	andi	$tmp0,$inp,3		# $inp % 4
 606	subu	$inp,$inp,$tmp0		# align $inp
 607	sll	$tmp0,$tmp0,3		# byte to bit offset
 608	lw	$in0,0($inp)
 609	lw	$in1,4($inp)
 610	lw	$in2,8($inp)
 611	lw	$in3,12($inp)
 612	beqz	$tmp0,.Laligned_key
 613
 614	lw	$tmp2,16($inp)
 615	subu	$tmp1,$zero,$tmp0
 616# ifdef	MIPSEB
 617	sllv	$in0,$in0,$tmp0
 618	srlv	$tmp3,$in1,$tmp1
 619	sllv	$in1,$in1,$tmp0
 620	or	$in0,$in0,$tmp3
 621	srlv	$tmp3,$in2,$tmp1
 622	sllv	$in2,$in2,$tmp0
 623	or	$in1,$in1,$tmp3
 624	srlv	$tmp3,$in3,$tmp1
 625	sllv	$in3,$in3,$tmp0
 626	or	$in2,$in2,$tmp3
 627	srlv	$tmp2,$tmp2,$tmp1
 628	or	$in3,$in3,$tmp2
 629# else
 630	srlv	$in0,$in0,$tmp0
 631	sllv	$tmp3,$in1,$tmp1
 632	srlv	$in1,$in1,$tmp0
 633	or	$in0,$in0,$tmp3
 634	sllv	$tmp3,$in2,$tmp1
 635	srlv	$in2,$in2,$tmp0
 636	or	$in1,$in1,$tmp3
 637	sllv	$tmp3,$in3,$tmp1
 638	srlv	$in3,$in3,$tmp0
 639	or	$in2,$in2,$tmp3
 640	sllv	$tmp2,$tmp2,$tmp1
 641	or	$in3,$in3,$tmp2
 642# endif
 643.Laligned_key:
 644#else
 645	lwl	$in0,0+MSB($inp)
 646	lwl	$in1,4+MSB($inp)
 647	lwl	$in2,8+MSB($inp)
 648	lwl	$in3,12+MSB($inp)
 649	lwr	$in0,0+LSB($inp)
 650	lwr	$in1,4+LSB($inp)
 651	lwr	$in2,8+LSB($inp)
 652	lwr	$in3,12+LSB($inp)
 653#endif
 654#ifdef	MIPSEB
 655# if defined(_MIPS_ARCH_MIPS32R2)
 656	wsbh	$in0,$in0		# byte swap
 657	wsbh	$in1,$in1
 658	wsbh	$in2,$in2
 659	wsbh	$in3,$in3
 660	rotr	$in0,$in0,16
 661	rotr	$in1,$in1,16
 662	rotr	$in2,$in2,16
 663	rotr	$in3,$in3,16
 664# else
 665	srl	$tmp0,$in0,24		# byte swap
 666	srl	$tmp1,$in0,8
 667	andi	$tmp2,$in0,0xFF00
 668	sll	$in0,$in0,24
 669	andi	$tmp1,0xFF00
 670	sll	$tmp2,$tmp2,8
 671	or	$in0,$tmp0
 672	 srl	$tmp0,$in1,24
 673	or	$tmp1,$tmp2
 674	 srl	$tmp2,$in1,8
 675	or	$in0,$tmp1
 676	 andi	$tmp1,$in1,0xFF00
 677	 sll	$in1,$in1,24
 678	 andi	$tmp2,0xFF00
 679	 sll	$tmp1,$tmp1,8
 680	 or	$in1,$tmp0
 681	srl	$tmp0,$in2,24
 682	 or	$tmp2,$tmp1
 683	srl	$tmp1,$in2,8
 684	 or	$in1,$tmp2
 685	andi	$tmp2,$in2,0xFF00
 686	sll	$in2,$in2,24
 687	andi	$tmp1,0xFF00
 688	sll	$tmp2,$tmp2,8
 689	or	$in2,$tmp0
 690	 srl	$tmp0,$in3,24
 691	or	$tmp1,$tmp2
 692	 srl	$tmp2,$in3,8
 693	or	$in2,$tmp1
 694	 andi	$tmp1,$in3,0xFF00
 695	 sll	$in3,$in3,24
 696	 andi	$tmp2,0xFF00
 697	 sll	$tmp1,$tmp1,8
 698	 or	$in3,$tmp0
 699	 or	$tmp2,$tmp1
 700	 or	$in3,$tmp2
 701# endif
 702#endif
 703	lui	$tmp0,0x0fff
 704	ori	$tmp0,0xffff		# 0x0fffffff
 705	and	$in0,$in0,$tmp0
 706	subu	$tmp0,3			# 0x0ffffffc
 707	and	$in1,$in1,$tmp0
 708	and	$in2,$in2,$tmp0
 709	and	$in3,$in3,$tmp0
 710
 711	sw	$in0,20($ctx)
 712	sw	$in1,24($ctx)
 713	sw	$in2,28($ctx)
 714	sw	$in3,32($ctx)
 715
 716	srl	$tmp1,$in1,2
 717	srl	$tmp2,$in2,2
 718	srl	$tmp3,$in3,2
 719	addu	$in1,$in1,$tmp1		# s1 = r1 + (r1 >> 2)
 720	addu	$in2,$in2,$tmp2
 721	addu	$in3,$in3,$tmp3
 722	sw	$in1,36($ctx)
 723	sw	$in2,40($ctx)
 724	sw	$in3,44($ctx)
 725.Lno_key:
 726	li	$v0,0
 727	jr	$ra
 728.end	poly1305_init
 729___
 730{
 731my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0x00fff000" : "0x00ff0000";
 732
 733my ($h0,$h1,$h2,$h3,$h4, $r0,$r1,$r2,$r3, $rs1,$rs2,$rs3) =
 734   ($s0,$s1,$s2,$s3,$s4, $s5,$s6,$s7,$s8, $s9,$s10,$s11);
 735my ($d0,$d1,$d2,$d3) =
 736   ($a4,$a5,$a6,$a7);
 737my $shr = $t2;		# used on R6
 738my $one = $t2;		# used on R2
 739
 740$code.=<<___;
 741.globl	poly1305_blocks
 742.align	5
 743.ent	poly1305_blocks
 744poly1305_blocks:
 745	.frame	$sp,16*4,$ra
 746	.mask	$SAVED_REGS_MASK,-4
 747	.set	noreorder
 748	subu	$sp, $sp,4*12
 749	sw	$s11,4*11($sp)
 750	sw	$s10,4*10($sp)
 751	sw	$s9, 4*9($sp)
 752	sw	$s8, 4*8($sp)
 753	sw	$s7, 4*7($sp)
 754	sw	$s6, 4*6($sp)
 755	sw	$s5, 4*5($sp)
 756	sw	$s4, 4*4($sp)
 757___
 758$code.=<<___ if ($flavour =~ /nubi/i);	# optimize non-nubi prologue
 759	sw	$s3, 4*3($sp)
 760	sw	$s2, 4*2($sp)
 761	sw	$s1, 4*1($sp)
 762	sw	$s0, 4*0($sp)
 763___
 764$code.=<<___;
 765	.set	reorder
 766
 767	srl	$len,4			# number of complete blocks
 768	li	$one,1
 769	beqz	$len,.Labort
 770
 771#if defined(_MIPS_ARCH_MIPS32R6)
 772	andi	$shr,$inp,3
 773	subu	$inp,$inp,$shr		# align $inp
 774	sll	$shr,$shr,3		# byte to bit offset
 775#endif
 776
 777	lw	$h0,0($ctx)		# load hash value
 778	lw	$h1,4($ctx)
 779	lw	$h2,8($ctx)
 780	lw	$h3,12($ctx)
 781	lw	$h4,16($ctx)
 782
 783	lw	$r0,20($ctx)		# load key
 784	lw	$r1,24($ctx)
 785	lw	$r2,28($ctx)
 786	lw	$r3,32($ctx)
 787	lw	$rs1,36($ctx)
 788	lw	$rs2,40($ctx)
 789	lw	$rs3,44($ctx)
 790
 791	sll	$len,4
 792	addu	$len,$len,$inp		# end of buffer
 793	b	.Loop
 794
 795.align	4
 796.Loop:
 797#if defined(_MIPS_ARCH_MIPS32R6)
 798	lw	$d0,0($inp)		# load input
 799	lw	$d1,4($inp)
 800	lw	$d2,8($inp)
 801	lw	$d3,12($inp)
 802	beqz	$shr,.Laligned_inp
 803
 804	lw	$t0,16($inp)
 805	subu	$t1,$zero,$shr
 806# ifdef	MIPSEB
 807	sllv	$d0,$d0,$shr
 808	srlv	$at,$d1,$t1
 809	sllv	$d1,$d1,$shr
 810	or	$d0,$d0,$at
 811	srlv	$at,$d2,$t1
 812	sllv	$d2,$d2,$shr
 813	or	$d1,$d1,$at
 814	srlv	$at,$d3,$t1
 815	sllv	$d3,$d3,$shr
 816	or	$d2,$d2,$at
 817	srlv	$t0,$t0,$t1
 818	or	$d3,$d3,$t0
 819# else
 820	srlv	$d0,$d0,$shr
 821	sllv	$at,$d1,$t1
 822	srlv	$d1,$d1,$shr
 823	or	$d0,$d0,$at
 824	sllv	$at,$d2,$t1
 825	srlv	$d2,$d2,$shr
 826	or	$d1,$d1,$at
 827	sllv	$at,$d3,$t1
 828	srlv	$d3,$d3,$shr
 829	or	$d2,$d2,$at
 830	sllv	$t0,$t0,$t1
 831	or	$d3,$d3,$t0
 832# endif
 833.Laligned_inp:
 834#else
 835	lwl	$d0,0+MSB($inp)		# load input
 836	lwl	$d1,4+MSB($inp)
 837	lwl	$d2,8+MSB($inp)
 838	lwl	$d3,12+MSB($inp)
 839	lwr	$d0,0+LSB($inp)
 840	lwr	$d1,4+LSB($inp)
 841	lwr	$d2,8+LSB($inp)
 842	lwr	$d3,12+LSB($inp)
 843#endif
 844#ifdef	MIPSEB
 845# if defined(_MIPS_ARCH_MIPS32R2)
 846	wsbh	$d0,$d0			# byte swap
 847	wsbh	$d1,$d1
 848	wsbh	$d2,$d2
 849	wsbh	$d3,$d3
 850	rotr	$d0,$d0,16
 851	rotr	$d1,$d1,16
 852	rotr	$d2,$d2,16
 853	rotr	$d3,$d3,16
 854# else
 855	srl	$at,$d0,24		# byte swap
 856	srl	$t0,$d0,8
 857	andi	$t1,$d0,0xFF00
 858	sll	$d0,$d0,24
 859	andi	$t0,0xFF00
 860	sll	$t1,$t1,8
 861	or	$d0,$at
 862	 srl	$at,$d1,24
 863	or	$t0,$t1
 864	 srl	$t1,$d1,8
 865	or	$d0,$t0
 866	 andi	$t0,$d1,0xFF00
 867	 sll	$d1,$d1,24
 868	 andi	$t1,0xFF00
 869	 sll	$t0,$t0,8
 870	 or	$d1,$at
 871	srl	$at,$d2,24
 872	 or	$t1,$t0
 873	srl	$t0,$d2,8
 874	 or	$d1,$t1
 875	andi	$t1,$d2,0xFF00
 876	sll	$d2,$d2,24
 877	andi	$t0,0xFF00
 878	sll	$t1,$t1,8
 879	or	$d2,$at
 880	 srl	$at,$d3,24
 881	or	$t0,$t1
 882	 srl	$t1,$d3,8
 883	or	$d2,$t0
 884	 andi	$t0,$d3,0xFF00
 885	 sll	$d3,$d3,24
 886	 andi	$t1,0xFF00
 887	 sll	$t0,$t0,8
 888	 or	$d3,$at
 889	 or	$t1,$t0
 890	 or	$d3,$t1
 891# endif
 892#endif
 893	srl	$t0,$h4,2		# modulo-scheduled reduction
 894	andi	$h4,$h4,3
 895	sll	$at,$t0,2
 896
 897	addu	$d0,$d0,$h0		# accumulate input
 898	 addu	$t0,$t0,$at
 899	sltu	$h0,$d0,$h0
 900	addu	$d0,$d0,$t0		# ... and residue
 901	sltu	$at,$d0,$t0
 902
 903	addu	$d1,$d1,$h1
 904	 addu	$h0,$h0,$at		# carry
 905	sltu	$h1,$d1,$h1
 906	addu	$d1,$d1,$h0
 907	sltu	$h0,$d1,$h0
 908
 909	addu	$d2,$d2,$h2
 910	 addu	$h1,$h1,$h0		# carry
 911	sltu	$h2,$d2,$h2
 912	addu	$d2,$d2,$h1
 913	sltu	$h1,$d2,$h1
 914
 915	addu	$d3,$d3,$h3
 916	 addu	$h2,$h2,$h1		# carry
 917	sltu	$h3,$d3,$h3
 918	addu	$d3,$d3,$h2
 919
 920#if defined(_MIPS_ARCH_MIPS32R2) && !defined(_MIPS_ARCH_MIPS32R6)
 921	multu	$r0,$d0			# d0*r0
 922	 sltu	$h2,$d3,$h2
 923	maddu	$rs3,$d1		# d1*s3
 924	 addu	$h3,$h3,$h2		# carry
 925	maddu	$rs2,$d2		# d2*s2
 926	 addu	$h4,$h4,$padbit
 927	maddu	$rs1,$d3		# d3*s1
 928	 addu	$h4,$h4,$h3
 929	mfhi	$at
 930	mflo	$h0
 931
 932	multu	$r1,$d0			# d0*r1
 933	maddu	$r0,$d1			# d1*r0
 934	maddu	$rs3,$d2		# d2*s3
 935	maddu	$rs2,$d3		# d3*s2
 936	maddu	$rs1,$h4		# h4*s1
 937	maddu	$at,$one		# hi*1
 938	mfhi	$at
 939	mflo	$h1
 940
 941	multu	$r2,$d0			# d0*r2
 942	maddu	$r1,$d1			# d1*r1
 943	maddu	$r0,$d2			# d2*r0
 944	maddu	$rs3,$d3		# d3*s3
 945	maddu	$rs2,$h4		# h4*s2
 946	maddu	$at,$one		# hi*1
 947	mfhi	$at
 948	mflo	$h2
 949
 950	mul	$t0,$r0,$h4		# h4*r0
 951
 952	multu	$r3,$d0			# d0*r3
 953	maddu	$r2,$d1			# d1*r2
 954	maddu	$r1,$d2			# d2*r1
 955	maddu	$r0,$d3			# d3*r0
 956	maddu	$rs3,$h4		# h4*s3
 957	maddu	$at,$one		# hi*1
 958	mfhi	$at
 959	mflo	$h3
 960
 961	 addiu	$inp,$inp,16
 962
 963	addu	$h4,$t0,$at
 964#else
 965	multu	($r0,$d0)		# d0*r0
 966	mflo	($h0,$r0,$d0)
 967	mfhi	($h1,$r0,$d0)
 968
 969	 sltu	$h2,$d3,$h2
 970	 addu	$h3,$h3,$h2		# carry
 971
 972	multu	($rs3,$d1)		# d1*s3
 973	mflo	($at,$rs3,$d1)
 974	mfhi	($t0,$rs3,$d1)
 975
 976	 addu	$h4,$h4,$padbit
 977	 addiu	$inp,$inp,16
 978	 addu	$h4,$h4,$h3
 979
 980	multu	($rs2,$d2)		# d2*s2
 981	mflo	($a3,$rs2,$d2)
 982	mfhi	($t1,$rs2,$d2)
 983	 addu	$h0,$h0,$at
 984	 addu	$h1,$h1,$t0
 985	multu	($rs1,$d3)		# d3*s1
 986	 sltu	$at,$h0,$at
 987	 addu	$h1,$h1,$at
 988
 989	mflo	($at,$rs1,$d3)
 990	mfhi	($t0,$rs1,$d3)
 991	 addu	$h0,$h0,$a3
 992	 addu	$h1,$h1,$t1
 993	multu	($r1,$d0)		# d0*r1
 994	 sltu	$a3,$h0,$a3
 995	 addu	$h1,$h1,$a3
 996
 997
 998	mflo	($a3,$r1,$d0)
 999	mfhi	($h2,$r1,$d0)
1000	 addu	$h0,$h0,$at
1001	 addu	$h1,$h1,$t0
1002	multu	($r0,$d1)		# d1*r0
1003	 sltu	$at,$h0,$at
1004	 addu	$h1,$h1,$at
1005
1006	mflo	($at,$r0,$d1)
1007	mfhi	($t0,$r0,$d1)
1008	 addu	$h1,$h1,$a3
1009	 sltu	$a3,$h1,$a3
1010	multu	($rs3,$d2)		# d2*s3
1011	 addu	$h2,$h2,$a3
1012
1013	mflo	($a3,$rs3,$d2)
1014	mfhi	($t1,$rs3,$d2)
1015	 addu	$h1,$h1,$at
1016	 addu	$h2,$h2,$t0
1017	multu	($rs2,$d3)		# d3*s2
1018	 sltu	$at,$h1,$at
1019	 addu	$h2,$h2,$at
1020
1021	mflo	($at,$rs2,$d3)
1022	mfhi	($t0,$rs2,$d3)
1023	 addu	$h1,$h1,$a3
1024	 addu	$h2,$h2,$t1
1025	multu	($rs1,$h4)		# h4*s1
1026	 sltu	$a3,$h1,$a3
1027	 addu	$h2,$h2,$a3
1028
1029	mflo	($a3,$rs1,$h4)
1030	 addu	$h1,$h1,$at
1031	 addu	$h2,$h2,$t0
1032	multu	($r2,$d0)		# d0*r2
1033	 sltu	$at,$h1,$at
1034	 addu	$h2,$h2,$at
1035
1036
1037	mflo	($at,$r2,$d0)
1038	mfhi	($h3,$r2,$d0)
1039	 addu	$h1,$h1,$a3
1040	 sltu	$a3,$h1,$a3
1041	multu	($r1,$d1)		# d1*r1
1042	 addu	$h2,$h2,$a3
1043
1044	mflo	($a3,$r1,$d1)
1045	mfhi	($t1,$r1,$d1)
1046	 addu	$h2,$h2,$at
1047	 sltu	$at,$h2,$at
1048	multu	($r0,$d2)		# d2*r0
1049	 addu	$h3,$h3,$at
1050
1051	mflo	($at,$r0,$d2)
1052	mfhi	($t0,$r0,$d2)
1053	 addu	$h2,$h2,$a3
1054	 addu	$h3,$h3,$t1
1055	multu	($rs3,$d3)		# d3*s3
1056	 sltu	$a3,$h2,$a3
1057	 addu	$h3,$h3,$a3
1058
1059	mflo	($a3,$rs3,$d3)
1060	mfhi	($t1,$rs3,$d3)
1061	 addu	$h2,$h2,$at
1062	 addu	$h3,$h3,$t0
1063	multu	($rs2,$h4)		# h4*s2
1064	 sltu	$at,$h2,$at
1065	 addu	$h3,$h3,$at
1066
1067	mflo	($at,$rs2,$h4)
1068	 addu	$h2,$h2,$a3
1069	 addu	$h3,$h3,$t1
1070	multu	($r3,$d0)		# d0*r3
1071	 sltu	$a3,$h2,$a3
1072	 addu	$h3,$h3,$a3
1073
1074
1075	mflo	($a3,$r3,$d0)
1076	mfhi	($t1,$r3,$d0)
1077	 addu	$h2,$h2,$at
1078	 sltu	$at,$h2,$at
1079	multu	($r2,$d1)		# d1*r2
1080	 addu	$h3,$h3,$at
1081
1082	mflo	($at,$r2,$d1)
1083	mfhi	($t0,$r2,$d1)
1084	 addu	$h3,$h3,$a3
1085	 sltu	$a3,$h3,$a3
1086	multu	($r0,$d3)		# d3*r0
1087	 addu	$t1,$t1,$a3
1088
1089	mflo	($a3,$r0,$d3)
1090	mfhi	($d3,$r0,$d3)
1091	 addu	$h3,$h3,$at
1092	 addu	$t1,$t1,$t0
1093	multu	($r1,$d2)		# d2*r1
1094	 sltu	$at,$h3,$at
1095	 addu	$t1,$t1,$at
1096
1097	mflo	($at,$r1,$d2)
1098	mfhi	($t0,$r1,$d2)
1099	 addu	$h3,$h3,$a3
1100	 addu	$t1,$t1,$d3
1101	multu	($rs3,$h4)		# h4*s3
1102	 sltu	$a3,$h3,$a3
1103	 addu	$t1,$t1,$a3
1104
1105	mflo	($a3,$rs3,$h4)
1106	 addu	$h3,$h3,$at
1107	 addu	$t1,$t1,$t0
1108	multu	($r0,$h4)		# h4*r0
1109	 sltu	$at,$h3,$at
1110	 addu	$t1,$t1,$at
1111
1112
1113	mflo	($h4,$r0,$h4)
1114	 addu	$h3,$h3,$a3
1115	 sltu	$a3,$h3,$a3
1116	 addu	$t1,$t1,$a3
1117	addu	$h4,$h4,$t1
1118
1119	li	$padbit,1		# if we loop, padbit is 1
1120#endif
1121	bne	$inp,$len,.Loop
1122
1123	sw	$h0,0($ctx)		# store hash value
1124	sw	$h1,4($ctx)
1125	sw	$h2,8($ctx)
1126	sw	$h3,12($ctx)
1127	sw	$h4,16($ctx)
1128
1129	.set	noreorder
1130.Labort:
1131	lw	$s11,4*11($sp)
1132	lw	$s10,4*10($sp)
1133	lw	$s9, 4*9($sp)
1134	lw	$s8, 4*8($sp)
1135	lw	$s7, 4*7($sp)
1136	lw	$s6, 4*6($sp)
1137	lw	$s5, 4*5($sp)
1138	lw	$s4, 4*4($sp)
1139___
# Only NUBI flavours reload $s0-$s3 here; every other flavour skips these
# four loads, which keeps the non-NUBI epilogue shorter.
if ($flavour =~ /nubi/i) {
    $code .= <<___;
	lw	$s3, 4*3($sp)
	lw	$s2, 4*2($sp)
	lw	$s1, 4*1($sp)
	lw	$s0, 4*0($sp)
___
}
1146$code.=<<___;
1147	jr	$ra
1148	addu	$sp,$sp,4*12
1149.end	poly1305_blocks
1150___
1151}
1152{
# Symbolic register names for the 32-bit poly1305_emit: the context that is
# read, the buffer the MAC bytes are stored to, the nonce that is added,
# and one extra scratch register.
my $ctx   = $a0;
my $mac   = $a1;
my $nonce = $a2;
my $tmp4  = $a3;
1154
1155$code.=<<___;
1156.align	5
1157.globl	poly1305_emit
1158.ent	poly1305_emit
1159poly1305_emit:
1160	.frame	$sp,0,$ra
1161	.set	reorder
1162
1163	lw	$tmp4,16($ctx)
1164	lw	$tmp0,0($ctx)
1165	lw	$tmp1,4($ctx)
1166	lw	$tmp2,8($ctx)
1167	lw	$tmp3,12($ctx)
1168
1169	li	$in0,-4			# final reduction
1170	srl	$ctx,$tmp4,2
1171	and	$in0,$in0,$tmp4
1172	andi	$tmp4,$tmp4,3
1173	addu	$ctx,$ctx,$in0
1174
1175	addu	$tmp0,$tmp0,$ctx
1176	sltu	$ctx,$tmp0,$ctx
1177	 addiu	$in0,$tmp0,5		# compare to modulus
1178	addu	$tmp1,$tmp1,$ctx
1179	 sltiu	$in1,$in0,5
1180	sltu	$ctx,$tmp1,$ctx
1181	 addu	$in1,$in1,$tmp1
1182	addu	$tmp2,$tmp2,$ctx
1183	 sltu	$in2,$in1,$tmp1
1184	sltu	$ctx,$tmp2,$ctx
1185	 addu	$in2,$in2,$tmp2
1186	addu	$tmp3,$tmp3,$ctx
1187	 sltu	$in3,$in2,$tmp2
1188	sltu	$ctx,$tmp3,$ctx
1189	 addu	$in3,$in3,$tmp3
1190	addu	$tmp4,$tmp4,$ctx
1191	 sltu	$ctx,$in3,$tmp3
1192	 addu	$ctx,$tmp4
1193
1194	srl	$ctx,2			# see if it carried/borrowed
1195	subu	$ctx,$zero,$ctx
1196
1197	xor	$in0,$tmp0
1198	xor	$in1,$tmp1
1199	xor	$in2,$tmp2
1200	xor	$in3,$tmp3
1201	and	$in0,$ctx
1202	and	$in1,$ctx
1203	and	$in2,$ctx
1204	and	$in3,$ctx
1205	xor	$in0,$tmp0
1206	xor	$in1,$tmp1
1207	xor	$in2,$tmp2
1208	xor	$in3,$tmp3
1209
1210	lw	$tmp0,0($nonce)		# load nonce
1211	lw	$tmp1,4($nonce)
1212	lw	$tmp2,8($nonce)
1213	lw	$tmp3,12($nonce)
1214
1215	addu	$in0,$tmp0		# accumulate nonce
1216	sltu	$ctx,$in0,$tmp0
1217
1218	addu	$in1,$tmp1
1219	sltu	$tmp1,$in1,$tmp1
1220	addu	$in1,$ctx
1221	sltu	$ctx,$in1,$ctx
1222	addu	$ctx,$tmp1
1223
1224	addu	$in2,$tmp2
1225	sltu	$tmp2,$in2,$tmp2
1226	addu	$in2,$ctx
1227	sltu	$ctx,$in2,$ctx
1228	addu	$ctx,$tmp2
1229
1230	addu	$in3,$tmp3
1231	addu	$in3,$ctx
1232
1233	srl	$tmp0,$in0,8		# write mac value
1234	srl	$tmp1,$in0,16
1235	srl	$tmp2,$in0,24
1236	sb	$in0, 0($mac)
1237	sb	$tmp0,1($mac)
1238	srl	$tmp0,$in1,8
1239	sb	$tmp1,2($mac)
1240	srl	$tmp1,$in1,16
1241	sb	$tmp2,3($mac)
1242	srl	$tmp2,$in1,24
1243	sb	$in1, 4($mac)
1244	sb	$tmp0,5($mac)
1245	srl	$tmp0,$in2,8
1246	sb	$tmp1,6($mac)
1247	srl	$tmp1,$in2,16
1248	sb	$tmp2,7($mac)
1249	srl	$tmp2,$in2,24
1250	sb	$in2, 8($mac)
1251	sb	$tmp0,9($mac)
1252	srl	$tmp0,$in3,8
1253	sb	$tmp1,10($mac)
1254	srl	$tmp1,$in3,16
1255	sb	$tmp2,11($mac)
1256	srl	$tmp2,$in3,24
1257	sb	$in3, 12($mac)
1258	sb	$tmp0,13($mac)
1259	sb	$tmp1,14($mac)
1260	sb	$tmp2,15($mac)
1261
1262	jr	$ra
1263.end	poly1305_emit
1264.rdata
1265.asciiz	"Poly1305 for MIPS32, CRYPTOGAMS by \@dot-asm"
1266.align	2
1267___
1268}
1269}}}
1270
# Write the generated assembly to the file named by the last remaining
# command-line argument, or to the inherited STDOUT when none is given.
$output = pop @ARGV;
if ($output) {
    # Three-argument open keeps characters in the file name from being
    # interpreted as part of the open mode, and a failed open is fatal
    # instead of silently falling through to the inherited STDOUT.
    open(STDOUT, '>', $output)
        or die "can't open $output: $!";
}

print STDOUT $code
    or die "error writing output: $!";

# Buffered write errors only surface at close, so report them.
close(STDOUT)
    or die "error closing STDOUT: $!";