/*
 * Implement fast SHA-1 with AVX2 instructions. (x86_64)
 *
 * This file is provided under a dual BSD/GPLv2 license. When using or
 * redistributing this file, you may do so under either license.
 *
 * GPL LICENSE SUMMARY
 *
 * Copyright(c) 2014 Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * Contact Information:
 * Ilya Albrekht <ilya.albrekht@intel.com>
 * Maxim Locktyukhin <maxim.locktyukhin@intel.com>
 * Ronen Zohar <ronen.zohar@intel.com>
 * Chandramouli Narayanan <mouli@linux.intel.com>
 *
 * BSD LICENSE
 *
 * Copyright(c) 2014 Intel Corporation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 * Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in
 * the documentation and/or other materials provided with the
 * distribution.
 * Neither the name of Intel Corporation nor the names of its
 * contributors may be used to endorse or promote products derived
 * from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */

/*
 * SHA-1 implementation with Intel(R) AVX2 instruction set extensions.
 *
 * This implementation is based on the previous SSSE3 release:
 * Visit http://software.intel.com/en-us/articles/
 * and refer to improving-the-performance-of-the-secure-hash-algorithm-1/
 *
 * Updates the 20-byte SHA-1 record at the start of 'state', from 'input',
 * for an even number of consecutive 64-byte 'blocks'.
 *
 * extern "C" void sha1_transform_avx2(
 *	struct sha1_state *state, const u8* input, int blocks );
 */
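
/*
 * Illustrative call sketch only (hypothetical snippet; SHA1_H0..SHA1_H4
 * and struct sha1_state come from the kernel's crypto headers, and the
 * real caller is the kernel's SHA-1 glue code):
 *
 *	struct sha1_state st = {
 *		.state = { SHA1_H0, SHA1_H1, SHA1_H2, SHA1_H3, SHA1_H4 },
 *	};
 *	sha1_transform_avx2(&st, data, blocks);	// 'blocks' 64-byte blocks
 */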

#include <linux/linkage.h>

#define	CTX	%rdi	/* arg1 */
#define	BUF	%rsi	/* arg2 */
#define	CNT	%rdx	/* arg3 */

#define	REG_A	%ecx
#define	REG_B	%esi
#define	REG_C	%edi
#define	REG_D	%eax
#define	REG_E	%edx
#define	REG_TB	%ebx
#define	REG_TA	%r12d
#define	REG_RA	%rcx
#define	REG_RB	%rsi
#define	REG_RC	%rdi
#define	REG_RD	%rax
#define	REG_RE	%rdx
#define	REG_RTA	%r12
#define	REG_RTB	%rbx
#define	REG_T1	%r11d
#define	xmm_mov	vmovups
#define	avx2_zeroupper	vzeroupper
#define	RND_F1	1
#define	RND_F2	2
#define	RND_F3	3

.macro REGALLOC
	.set A, REG_A
	.set B, REG_B
	.set C, REG_C
	.set D, REG_D
	.set E, REG_E
	.set TB, REG_TB
	.set TA, REG_TA

	.set RA, REG_RA
	.set RB, REG_RB
	.set RC, REG_RC
	.set RD, REG_RD
	.set RE, REG_RE

	.set RTA, REG_RTA
	.set RTB, REG_RTB

	.set T1, REG_T1
.endm

#define HASH_PTR	%r9
#define BLOCKS_CTR	%r8
#define BUFFER_PTR	%r10
#define BUFFER_PTR2	%r13

#define PRECALC_BUF	%r14
#define WK_BUF		%r15

#define W_TMP		%xmm0
#define WY_TMP		%ymm0
#define WY_TMP2		%ymm9

# AVX2 variables
#define WY0		%ymm3
#define WY4		%ymm5
#define WY08		%ymm7
#define WY12		%ymm8
#define WY16		%ymm12
#define WY20		%ymm13
#define WY24		%ymm14
#define WY28		%ymm15

#define YMM_SHUFB_BSWAP	%ymm10

/*
 * Keep 2 iterations precalculated at a time:
 *    - 80 DWORDs per iteration * 2
 */
#define W_SIZE		(80*2*2 +16)

#define WK(t)	((((t) % 80) / 4)*32 + ( (t) % 4)*4 + ((t)/80)*16 )(WK_BUF)
#define PRECALC_WK(t)	((t)*2*2)(PRECALC_BUF)
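
/*
 * WK(t) addressing example (illustrative): the precomputed W+K values
 * for the two pipelined blocks are interleaved in 16-byte groups, so
 * for t = 5 (first block) the offset is (5/4)*32 + (5%4)*4 = 36, while
 * the same round of the second block, t = 85, lands 16 bytes later at
 * (5/4)*32 + (5%4)*4 + 16 = 52.
 */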


.macro UPDATE_HASH hash, val
	add	\hash, \val
	mov	\val, \hash
.endm
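
/*
 * With AT&T operand order (source, destination) the macro body reads,
 * in C-like terms:
 *
 *	val  += *hash;		// add	\hash, \val
 *	*hash = val;		// mov	\val, \hash
 *
 * so the digest word in memory is updated and the register keeps the
 * new value, which the main loop continues to use as the next block's
 * starting state.
 */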

.macro PRECALC_RESET_WY
	.set WY_00, WY0
	.set WY_04, WY4
	.set WY_08, WY08
	.set WY_12, WY12
	.set WY_16, WY16
	.set WY_20, WY20
	.set WY_24, WY24
	.set WY_28, WY28
	.set WY_32, WY_00
.endm

.macro PRECALC_ROTATE_WY
	/* Rotate macros */
	.set WY_32, WY_28
	.set WY_28, WY_24
	.set WY_24, WY_20
	.set WY_20, WY_16
	.set WY_16, WY_12
	.set WY_12, WY_08
	.set WY_08, WY_04
	.set WY_04, WY_00
	.set WY_00, WY_32

	/* Define register aliases */
	.set WY, WY_00
	.set WY_minus_04, WY_04
	.set WY_minus_08, WY_08
	.set WY_minus_12, WY_12
	.set WY_minus_16, WY_16
	.set WY_minus_20, WY_20
	.set WY_minus_24, WY_24
	.set WY_minus_28, WY_28
	.set WY_minus_32, WY
.endm
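
/*
 * Neither macro above moves any data: they only rebind assembler
 * symbols.  After each PRECALC_ROTATE_WY, WY names the register that
 * receives the newest schedule words and WY_minus_NN names the register
 * holding the words produced NN rounds earlier, giving the effect of a
 * circular buffer of eight YMM registers with no register-to-register
 * copies.
 */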

.macro PRECALC_00_15
	.if (i == 0) # Initialize and rotate registers
		PRECALC_RESET_WY
		PRECALC_ROTATE_WY
	.endif

	/* message scheduling pre-compute for rounds 0-15 */
	.if ((i & 7) == 0)
		/*
		 * blended AVX2 and ALU instruction scheduling
		 * 1 vector iteration per 8 rounds
		 */
		vmovdqu	(i * 2)(BUFFER_PTR), W_TMP
	.elseif ((i & 7) == 1)
		vinsertf128 $1, ((i-1) * 2)(BUFFER_PTR2),\
			 WY_TMP, WY_TMP
	.elseif ((i & 7) == 2)
		vpshufb	YMM_SHUFB_BSWAP, WY_TMP, WY
	.elseif ((i & 7) == 4)
		vpaddd	K_XMM + K_XMM_AR(%rip), WY, WY_TMP
	.elseif ((i & 7) == 7)
		vmovdqu	WY_TMP, PRECALC_WK(i&~7)

		PRECALC_ROTATE_WY
	.endif
.endm
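
/*
 * For rounds 0-15 the schedule is just the byte-swapped message, so the
 * steps above amount to the scalar reference
 *
 *	w[i]  = be32_to_cpu(input[i]);	// vmovdqu/vinsertf128 + vpshufb
 *	wk[i] = w[i] + K;		// vpaddd, stored via PRECALC_WK
 *
 * computed four words at a time for each of the two blocks (one block
 * per 128-bit lane of the YMM registers).
 */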

.macro PRECALC_16_31
	/*
	 * message scheduling pre-compute for rounds 16-31
	 * calculating last 32 w[i] values in 8 XMM registers
	 * pre-calculate K+w[i] values and store to mem
	 * for later load by ALU add instruction
	 *
	 * "brute force" vectorization for rounds 16-31 only
	 * due to w[i]->w[i-3] dependency
	 */
	.if ((i & 7) == 0)
		/*
		 * blended AVX2 and ALU instruction scheduling
		 * 1 vector iteration per 8 rounds
		 */
		/* w[i-14] */
		vpalignr	$8, WY_minus_16, WY_minus_12, WY
		vpsrldq	$4, WY_minus_04, WY_TMP		/* w[i-3] */
	.elseif ((i & 7) == 1)
		vpxor	WY_minus_08, WY, WY
		vpxor	WY_minus_16, WY_TMP, WY_TMP
	.elseif ((i & 7) == 2)
		vpxor	WY_TMP, WY, WY
		vpslldq	$12, WY, WY_TMP2
	.elseif ((i & 7) == 3)
		vpslld	$1, WY, WY_TMP
		vpsrld	$31, WY, WY
	.elseif ((i & 7) == 4)
		vpor	WY, WY_TMP, WY_TMP
		vpslld	$2, WY_TMP2, WY
	.elseif ((i & 7) == 5)
		vpsrld	$30, WY_TMP2, WY_TMP2
		vpxor	WY, WY_TMP, WY_TMP
	.elseif ((i & 7) == 7)
		vpxor	WY_TMP2, WY_TMP, WY
		vpaddd	K_XMM + K_XMM_AR(%rip), WY, WY_TMP
		vmovdqu	WY_TMP, PRECALC_WK(i&~7)

		PRECALC_ROTATE_WY
	.endif
.endm
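
/*
 * Scalar reference for rounds 16..31 (the FIPS 180 schedule):
 *
 *	w[i]  = rol32(w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16], 1);
 *	wk[i] = w[i] + K;
 *
 * The extra shift/or steps above deal with the fact that, within one
 * four-word group, w[i] depends on w[i-3], which is being computed in
 * the same vector.
 */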

.macro PRECALC_32_79
	/*
	 * in SHA-1 specification:
	 * w[i] = (w[i-3] ^ w[i-8]  ^ w[i-14] ^ w[i-16]) rol 1
	 * instead we do equal:
	 * w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
	 * allows more efficient vectorization
	 * since w[i]=>w[i-3] dependency is broken
	 */

	.if ((i & 7) == 0)
		/*
		 * blended AVX2 and ALU instruction scheduling
		 * 1 vector iteration per 8 rounds
		 */
		vpalignr	$8, WY_minus_08, WY_minus_04, WY_TMP
	.elseif ((i & 7) == 1)
		/* W is W_minus_32 before xor */
		vpxor	WY_minus_28, WY, WY
	.elseif ((i & 7) == 2)
		vpxor	WY_minus_16, WY_TMP, WY_TMP
	.elseif ((i & 7) == 3)
		vpxor	WY_TMP, WY, WY
	.elseif ((i & 7) == 4)
		vpslld	$2, WY, WY_TMP
	.elseif ((i & 7) == 5)
		vpsrld	$30, WY, WY
		vpor	WY, WY_TMP, WY
	.elseif ((i & 7) == 7)
		vpaddd	K_XMM + K_XMM_AR(%rip), WY, WY_TMP
		vmovdqu	WY_TMP, PRECALC_WK(i&~7)

		PRECALC_ROTATE_WY
	.endif
.endm
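
/*
 * The rewritten recurrence follows from the original one: expanding
 * each of the four terms w[i-3], w[i-8], w[i-14], w[i-16] one more step
 * with the standard recurrence and cancelling the duplicated words
 * (rotation distributes over xor) gives, for i >= 32,
 *
 *	w[i] = rol32(w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32], 2)
 *
 * Now the newest input is w[i-6], so four schedule words per block can
 * be produced in one vector pass without an intra-vector dependency.
 */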

.macro PRECALC r, s
	.set i, \r

	.if (i < 40)
		.set K_XMM, 32*0
	.elseif (i < 80)
		.set K_XMM, 32*1
	.elseif (i < 120)
		.set K_XMM, 32*2
	.else
		.set K_XMM, 32*3
	.endif

	.if (i<32)
		PRECALC_00_15	\s
	.elseif (i<64)
		PRECALC_16_31	\s
	.elseif (i < 160)
		PRECALC_32_79	\s
	.endif
.endm

.macro ROTATE_STATE
	.set T_REG, E
	.set E, D
	.set D, C
	.set C, B
	.set B, TB
	.set TB, A
	.set A, T_REG

	.set T_REG, RE
	.set RE, RD
	.set RD, RC
	.set RC, RB
	.set RB, RTB
	.set RTB, RA
	.set RA, T_REG
.endm
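
/*
 * Like PRECALC_ROTATE_WY, ROTATE_STATE moves no data: it only renames
 * which physical register the symbols A/B/C/D/E/TB (and their 64-bit
 * RA..RE/RTB twins) refer to, so the usual end-of-round shuffle of the
 * five SHA-1 working variables costs no mov instructions.
 */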

/* Macro relies on saved ROUND_Fx */

.macro RND_FUN f, r
	.if (\f == RND_F1)
		ROUND_F1	\r
	.elseif (\f == RND_F2)
		ROUND_F2	\r
	.elseif (\f == RND_F3)
		ROUND_F3	\r
	.endif
.endm

.macro RR r
	.set round_id, (\r % 80)

	.if (round_id == 0)	/* Precalculate F for first round */
		.set ROUND_FUNC, RND_F1
		mov	B, TB

		rorx	$(32-30), B, B	/* b>>>2 */
		andn	D, TB, T1
		and	C, TB
		xor	T1, TB
	.endif

	RND_FUN ROUND_FUNC, \r
	ROTATE_STATE

	.if (round_id == 18)
		.set ROUND_FUNC, RND_F2
	.elseif (round_id == 38)
		.set ROUND_FUNC, RND_F3
	.elseif (round_id == 58)
		.set ROUND_FUNC, RND_F2
	.endif

	.set round_id, ( (\r+1) % 80)

	RND_FUN ROUND_FUNC, (\r+1)
	ROTATE_STATE
.endm
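
/*
 * RR emits two SHA-1 rounds per invocation.  ROUND_FUNC selects the F
 * function that gets precomputed for the *following* round, which is
 * why it is switched at round ids 18, 38 and 58 rather than at the
 * usual 20/40/60 boundaries: the F value produced while round 19
 * (39, 59) executes is the one that round 20 (40, 60) consumes.
 */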

.macro ROUND_F1 r
	add	WK(\r), E

	andn	C, A, T1			/* ~b&d */
	lea	(RE,RTB), E		/* Add F from the previous round */

	rorx	$(32-5), A, TA		/* T2 = A >>> 5 */
	rorx	$(32-30),A, TB		/* b>>>2 for next round */

	PRECALC	(\r)			/* msg scheduling for next 2 blocks */

	/*
	 * Calculate F for the next round
	 * (b & c) ^ andn[b, d]
	 */
	and	B, A			/* b&c */
	xor	T1, A			/* F1 = (b&c) ^ (~b&d) */

	lea	(RE,RTA), E		/* E += A >>> 5 */
.endm
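
/*
 * Scalar reference for an F1 round (rounds 0..19), for comparison:
 *
 *	f = (b & c) ^ (~b & d);
 *	tmp = rol32(a, 5) + f + e + K1 + w[i];
 *	e = d; d = c; c = rol32(b, 30); b = a; a = tmp;
 *
 * In the macro, w[i]+K1 comes pre-added from WK(\r), f was computed one
 * round earlier, and the lea/add steps fold the additions together.
 */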

.macro ROUND_F2 r
	add	WK(\r), E
	lea	(RE,RTB), E		/* Add F from the previous round */

	/* Calculate F for the next round */
	rorx	$(32-5), A, TA		/* T2 = A >>> 5 */
	.if ((round_id) < 79)
		rorx	$(32-30), A, TB	/* b>>>2 for next round */
	.endif
	PRECALC	(\r)			/* msg scheduling for next 2 blocks */

	.if ((round_id) < 79)
		xor	B, A
	.endif

	add	TA, E			/* E += A >>> 5 */

	.if ((round_id) < 79)
		xor	C, A
	.endif
.endm
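
/*
 * Scalar reference for an F2 round (rounds 20..39 and 60..79):
 *
 *	f = b ^ c ^ d;
 *
 * The next-round F is skipped for round 79 since there is no round 80;
 * everything else matches the F1 comment above.
 */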

.macro ROUND_F3 r
	add	WK(\r), E
	PRECALC	(\r)			/* msg scheduling for next 2 blocks */

	lea	(RE,RTB), E		/* Add F from the previous round */

	mov	B, T1
	or	A, T1

	rorx	$(32-5), A, TA		/* T2 = A >>> 5 */
	rorx	$(32-30), A, TB		/* b>>>2 for next round */

	/* Calculate F for the next round
	 * (b and c) or (d and (b or c))
	 */
	and	C, T1
	and	B, A
	or	T1, A

	add	TA, E			/* E += A >>> 5 */

.endm
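
/*
 * Scalar reference for an F3 round (rounds 40..59), using the factored
 * form of the majority function:
 *
 *	f = (b & c) | (d & (b | c));	// same as (b&c)|(b&d)|(c&d)
 */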

/*
 * Add a constant only if the (\b >= \c) condition is met
 * (uses RTA as a temporary):
 *	\a += (\b >= \c) ? \d : 0
 */
.macro ADD_IF_GE a, b, c, d
	mov	\a, RTA
	add	$\d, RTA
	cmp	$\c, \b
	cmovge	RTA, \a
.endm
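
/*
 * Example: "ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 2, 64" advances
 * BUFFER_PTR2 by 64 bytes only while at least two blocks remain, which
 * keeps the pointers from stepping past the data actually passed in.
 */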

/*
 * macro implements 80 rounds of SHA-1, for multiple blocks with s/w pipelining
 */
.macro SHA1_PIPELINED_MAIN_BODY

	REGALLOC

	mov	(HASH_PTR), A
	mov	4(HASH_PTR), B
	mov	8(HASH_PTR), C
	mov	12(HASH_PTR), D
	mov	16(HASH_PTR), E

	mov	%rsp, PRECALC_BUF
	lea	(2*4*80+32)(%rsp), WK_BUF

	# Precalc WK for first 2 blocks
	ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 2, 64
	.set i, 0
	.rept 160
		PRECALC i
		.set i, i + 1
	.endr

	/* Go to next block if needed */
	ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 3, 128
	ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128
	xchg	WK_BUF, PRECALC_BUF

	.align 32
.L_loop:
	/*
	 * code loops through more than one block
	 * BLOCKS_CTR reaching zero signals that the last
	 * block has been processed
	 */
	test	BLOCKS_CTR, BLOCKS_CTR
	jnz	.L_begin
	.align 32
	jmp	.L_end
	.align 32
.L_begin:

	/*
	 * Do first block
	 * rounds: 0,2,4,6,8
	 */
	.set j, 0
	.rept 5
		RR	j
		.set j, j+2
	.endr

	/*
	 * rounds:
	 * 10,12,14,16,18
	 * 20,22,24,26,28
	 * 30,32,34,36,38
	 * 40,42,44,46,48
	 * 50,52,54,56,58
	 */
	.rept 25
		RR	j
		.set j, j+2
	.endr

	/* Update Counter */
	sub	$1, BLOCKS_CTR
	/* Move to the next block only if needed */
	ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 4, 128
	/*
	 * rounds
	 * 60,62,64,66,68
	 * 70,72,74,76,78
	 */
	.rept 10
		RR	j
		.set j, j+2
	.endr

	UPDATE_HASH	(HASH_PTR), A
	UPDATE_HASH	4(HASH_PTR), TB
	UPDATE_HASH	8(HASH_PTR), C
	UPDATE_HASH	12(HASH_PTR), D
	UPDATE_HASH	16(HASH_PTR), E

	test	BLOCKS_CTR, BLOCKS_CTR
	jz	.L_loop

	mov	TB, B

	/* Process second block */
	/*
	 * rounds
	 *  0+80, 2+80, 4+80, 6+80, 8+80
	 * 10+80,12+80,14+80,16+80,18+80
	 */

	.set j, 0
	.rept 10
		RR	j+80
		.set j, j+2
	.endr

	/*
	 * rounds
	 * 20+80,22+80,24+80,26+80,28+80
	 * 30+80,32+80,34+80,36+80,38+80
	 */
	.rept 10
		RR	j+80
		.set j, j+2
	.endr

	/*
	 * rounds
	 * 40+80,42+80,44+80,46+80,48+80
	 * 50+80,52+80,54+80,56+80,58+80
	 */
	.rept 10
		RR	j+80
		.set j, j+2
	.endr

	/* update counter */
	sub	$1, BLOCKS_CTR
	/* Move to the next block only if needed */
	ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128

	/*
	 * rounds
	 * 60+80,62+80,64+80,66+80,68+80
	 * 70+80,72+80,74+80,76+80,78+80
	 */
	.rept 10
		RR	j+80
		.set j, j+2
	.endr

	UPDATE_HASH	(HASH_PTR), A
	UPDATE_HASH	4(HASH_PTR), TB
	UPDATE_HASH	8(HASH_PTR), C
	UPDATE_HASH	12(HASH_PTR), D
	UPDATE_HASH	16(HASH_PTR), E

	/* Reset state for AVX2 reg permutation */
	mov	A, TA
	mov	TB, A
	mov	C, TB
	mov	E, C
	mov	D, B
	mov	TA, D

	REGALLOC

	xchg	WK_BUF, PRECALC_BUF

	jmp	.L_loop

	.align 32
.L_end:

.endm
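
/*
 * Software-pipelining note: each pass through .L_loop runs the ALU
 * rounds for up to two blocks while the PRECALC vector code, invoked
 * from inside the round macros, prepares W[]+K for the following pair
 * of blocks.  The xchg of WK_BUF and PRECALC_BUF at the bottom of the
 * loop flips which half of the stack area is being consumed and which
 * is being filled.
 */
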
/*
 * macro implements SHA-1 function's body for several 64-byte blocks
 * param: function's name
 */
.macro SHA1_VECTOR_ASM name
	SYM_FUNC_START(\name)

	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	RESERVE_STACK = (W_SIZE*4 + 8+24)

	/* Align stack */
	push	%rbp
	mov	%rsp, %rbp
	and	$~(0x20-1), %rsp
	sub	$RESERVE_STACK, %rsp

	avx2_zeroupper

	/* Setup initial values */
	mov	CTX, HASH_PTR
	mov	BUF, BUFFER_PTR

	mov	BUF, BUFFER_PTR2
	mov	CNT, BLOCKS_CTR

	xmm_mov	BSWAP_SHUFB_CTL(%rip), YMM_SHUFB_BSWAP

	SHA1_PIPELINED_MAIN_BODY

	avx2_zeroupper

	mov	%rbp, %rsp
	pop	%rbp

	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbx

	RET

	SYM_FUNC_END(\name)
.endm

.section .rodata

#define K1 0x5a827999
#define K2 0x6ed9eba1
#define K3 0x8f1bbcdc
#define K4 0xca62c1d6

.align 128
K_XMM_AR:
	.long K1, K1, K1, K1
	.long K1, K1, K1, K1
	.long K2, K2, K2, K2
	.long K2, K2, K2, K2
	.long K3, K3, K3, K3
	.long K3, K3, K3, K3
	.long K4, K4, K4, K4
	.long K4, K4, K4, K4

BSWAP_SHUFB_CTL:
	.long 0x00010203
	.long 0x04050607
	.long 0x08090a0b
	.long 0x0c0d0e0f
	.long 0x00010203
	.long 0x04050607
	.long 0x08090a0b
	.long 0x0c0d0e0f
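
/*
 * Shuffle-control note: each dword 0x00010203 assembles (little endian)
 * to the byte pattern 03 02 01 00, so vpshufb with this mask reverses
 * the bytes inside every 32-bit word, converting the big-endian message
 * words to the CPU's native order.
 */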
.text

SHA1_VECTOR_ASM	sha1_transform_avx2