/* SPDX-License-Identifier: GPL-2.0 OR MIT */
/*
 * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
 * Copyright (C) 2017-2019 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
 */

#include <linux/linkage.h>

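/*
 * Constants shared by the two implementations below:
 *
 *  IV      the eight 32-bit BLAKE2s initialization words, stored as two
 *          little-endian 128-bit values
 *  ROT16   pshufb byte-shuffle mask rotating each 32-bit lane right by 16
 *  ROR328  pshufb byte-shuffle mask rotating each 32-bit lane right by 8
 *  SIGMA   the ten BLAKE2s message-schedule rows, pre-permuted to match the
 *          word-gathering order of the SSSE3 round loop
 *  SIGMA2  the schedule as 32-bit indices for the AVX-512 vpermi2d-based
 *          in-register message permutation
 */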
.section .rodata.cst32.BLAKE2S_IV, "aM", @progbits, 32
.align 32
IV:	.octa 0xA54FF53A3C6EF372BB67AE856A09E667
	.octa 0x5BE0CD191F83D9AB9B05688C510E527F
.section .rodata.cst16.ROT16, "aM", @progbits, 16
.align 16
ROT16:	.octa 0x0D0C0F0E09080B0A0504070601000302
.section .rodata.cst16.ROR328, "aM", @progbits, 16
.align 16
ROR328:	.octa 0x0C0F0E0D080B0A090407060500030201
.section .rodata.cst64.BLAKE2S_SIGMA, "aM", @progbits, 160
.align 64
SIGMA:
.byte  0,  2,  4,  6,  1,  3,  5,  7, 14,  8, 10, 12, 15,  9, 11, 13
.byte 14,  4,  9, 13, 10,  8, 15,  6,  5,  1,  0, 11,  3, 12,  2,  7
.byte 11, 12,  5, 15,  8,  0,  2, 13,  9, 10,  3,  7,  4, 14,  6,  1
.byte  7,  3, 13, 11,  9,  1, 12, 14, 15,  2,  5,  4,  8,  6, 10,  0
.byte  9,  5,  2, 10,  0,  7,  4, 15,  3, 14, 11,  6, 13,  1, 12,  8
.byte  2,  6,  0,  8, 12, 10, 11,  3,  1,  4,  7, 15,  9, 13,  5, 14
.byte 12,  1, 14,  4,  5, 15, 13, 10,  8,  0,  6,  9, 11,  7,  3,  2
.byte 13,  7, 12,  3, 11, 14,  1,  9,  2,  5, 15,  8, 10,  0,  4,  6
.byte  6, 14, 11,  0, 15,  9,  3,  8, 10, 12, 13,  1,  5,  2,  7,  4
.byte 10,  8,  7,  1,  2,  4,  6,  5, 13, 15,  9,  3,  0, 11, 14, 12
#ifdef CONFIG_AS_AVX512
.section .rodata.cst64.BLAKE2S_SIGMA2, "aM", @progbits, 640
.align 64
SIGMA2:
.long  0,  2,  4,  6,  1,  3,  5,  7, 14,  8, 10, 12, 15,  9, 11, 13
.long  8,  2, 13, 15, 10,  9, 12,  3,  6,  4,  0, 14,  5, 11,  1,  7
.long 11, 13,  8,  6,  5, 10, 14,  3,  2,  4, 12, 15,  1,  0,  7,  9
.long 11, 10,  7,  0,  8, 15,  1, 13,  3,  6,  2, 12,  4, 14,  9,  5
.long  4, 10,  9, 14, 15,  0, 11,  8,  1,  7,  3, 13,  2,  5,  6, 12
.long  2, 11,  4, 15, 14,  3, 10,  8, 13,  6,  5,  7,  0, 12,  1,  9
.long  4,  8, 15,  9, 14, 11, 13,  5,  3,  2,  1, 12,  6, 10,  7,  0
.long  6, 13,  0, 14, 12,  2,  1, 11, 15,  4,  5,  8,  7,  9,  3, 10
.long 15,  5,  4, 13, 10,  7,  3, 11, 12,  2,  0,  6,  9,  8,  1, 14
.long  8,  7, 14, 11, 13, 15,  0, 12, 10,  4,  5,  6,  3,  2,  1,  9
#endif /* CONFIG_AS_AVX512 */

.text
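/*
 * blake2s_compress_ssse3(state, block, nblocks, inc)
 *
 * Register usage (x86-64 SysV ABI):
 *   %rdi  state pointer: eight 32-bit chaining words, followed at offset
 *         0x20 by the 64-bit block counter and the finalization flags
 *   %rsi  message blocks, 64 bytes each
 *   %rdx  number of blocks to compress
 *   %rcx  increment added to the block counter for every block
 */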
SYM_FUNC_START(blake2s_compress_ssse3)
	testq		%rdx,%rdx
	je		.Lendofloop
	movdqu		(%rdi),%xmm0
	movdqu		0x10(%rdi),%xmm1
	movdqa		ROT16(%rip),%xmm12
	movdqa		ROR328(%rip),%xmm13
	movdqu		0x20(%rdi),%xmm14
	movq		%rcx,%xmm15
	leaq		SIGMA+0xa0(%rip),%r8
	jmp		.Lbeginofloop
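	/*
	 * One iteration per 64-byte block: save the incoming chaining value
	 * in %xmm10/%xmm11, add the increment to the counter in %xmm14, and
	 * build the lower rows of the working state from the IV, folding the
	 * counter and flag words into the last four lanes.
	 */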
	.align		32
.Lbeginofloop:
	movdqa		%xmm0,%xmm10
	movdqa		%xmm1,%xmm11
	paddq		%xmm15,%xmm14
	movdqa		IV(%rip),%xmm2
	movdqa		%xmm14,%xmm3
	pxor		IV+0x10(%rip),%xmm3
	leaq		SIGMA(%rip),%rcx
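	/*
	 * Ten rounds, one 16-byte SIGMA row per iteration (%r8 marks the end
	 * of the table).  Each iteration gathers the scheduled message words,
	 * applies the G mixing to the four columns, shuffles into diagonal
	 * form with pshufd, applies G to the diagonals, and shuffles back.
	 * The 16- and 8-bit rotations use the ROT16/ROR328 pshufb masks, the
	 * 12- and 7-bit rotations use paired shifts and por.  For reference,
	 * each G step computes:
	 *
	 *	a += b + m0;  d = ror32(d ^ a, 16);
	 *	c += d;       b = ror32(b ^ c, 12);
	 *	a += b + m1;  d = ror32(d ^ a, 8);
	 *	c += d;       b = ror32(b ^ c, 7);
	 */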
.Lroundloop:
	movzbl		(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm4
	movzbl		0x1(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm5
	movzbl		0x2(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm6
	movzbl		0x3(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm7
	punpckldq	%xmm5,%xmm4
	punpckldq	%xmm7,%xmm6
	punpcklqdq	%xmm6,%xmm4
	paddd		%xmm4,%xmm0
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm12,%xmm3
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm8
	psrld		$0xc,%xmm1
	pslld		$0x14,%xmm8
	por		%xmm8,%xmm1
	movzbl		0x4(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm5
	movzbl		0x5(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm6
	movzbl		0x6(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm7
	movzbl		0x7(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm4
	punpckldq	%xmm6,%xmm5
	punpckldq	%xmm4,%xmm7
	punpcklqdq	%xmm7,%xmm5
	paddd		%xmm5,%xmm0
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm13,%xmm3
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm8
	psrld		$0x7,%xmm1
	pslld		$0x19,%xmm8
	por		%xmm8,%xmm1
	pshufd		$0x93,%xmm0,%xmm0
	pshufd		$0x4e,%xmm3,%xmm3
	pshufd		$0x39,%xmm2,%xmm2
	movzbl		0x8(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm6
	movzbl		0x9(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm7
	movzbl		0xa(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm4
	movzbl		0xb(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm5
	punpckldq	%xmm7,%xmm6
	punpckldq	%xmm5,%xmm4
	punpcklqdq	%xmm4,%xmm6
	paddd		%xmm6,%xmm0
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm12,%xmm3
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm8
	psrld		$0xc,%xmm1
	pslld		$0x14,%xmm8
	por		%xmm8,%xmm1
	movzbl		0xc(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm7
	movzbl		0xd(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm4
	movzbl		0xe(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm5
	movzbl		0xf(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm6
	punpckldq	%xmm4,%xmm7
	punpckldq	%xmm6,%xmm5
	punpcklqdq	%xmm5,%xmm7
	paddd		%xmm7,%xmm0
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm13,%xmm3
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm8
	psrld		$0x7,%xmm1
	pslld		$0x19,%xmm8
	por		%xmm8,%xmm1
	pshufd		$0x39,%xmm0,%xmm0
	pshufd		$0x4e,%xmm3,%xmm3
	pshufd		$0x93,%xmm2,%xmm2
	addq		$0x10,%rcx
	cmpq		%r8,%rcx
	jnz		.Lroundloop
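	/*
	 * Feed-forward: fold the two halves of the working state and the
	 * saved input chaining value back into the chaining words.
	 */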
	pxor		%xmm2,%xmm0
	pxor		%xmm3,%xmm1
	pxor		%xmm10,%xmm0
	pxor		%xmm11,%xmm1
	addq		$0x40,%rsi
	decq		%rdx
	jnz		.Lbeginofloop
	movdqu		%xmm0,(%rdi)
	movdqu		%xmm1,0x10(%rdi)
	movdqu		%xmm14,0x20(%rdi)
.Lendofloop:
	RET
SYM_FUNC_END(blake2s_compress_ssse3)

#ifdef CONFIG_AS_AVX512
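/*
 * blake2s_compress_avx512 takes the same arguments as the SSSE3 version
 * above.  It keeps the whole 64-byte message block in %ymm6/%ymm7 and
 * reschedules it in-register with vpermi2d and the SIGMA2 index rows, and
 * it uses vprord for all four rotation amounts.
 */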
SYM_FUNC_START(blake2s_compress_avx512)
	vmovdqu		(%rdi),%xmm0
	vmovdqu		0x10(%rdi),%xmm1
	vmovdqu		0x20(%rdi),%xmm4
	vmovq		%rcx,%xmm5
	vmovdqa		IV(%rip),%xmm14
	vmovdqa		IV+16(%rip),%xmm15
	jmp		.Lblake2s_compress_avx512_mainloop
.align 32
.Lblake2s_compress_avx512_mainloop:
	vmovdqa		%xmm0,%xmm10
	vmovdqa		%xmm1,%xmm11
	vpaddq		%xmm5,%xmm4,%xmm4
	vmovdqa		%xmm14,%xmm2
	vpxor		%xmm15,%xmm4,%xmm3
	vmovdqu		(%rsi),%ymm6
	vmovdqu		0x20(%rsi),%ymm7
	addq		$0x40,%rsi
	leaq		SIGMA2(%rip),%rax
	movb		$0xa,%cl
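	/*
	 * %cl counts the ten rounds; %rax walks SIGMA2, 64 bytes (two
	 * 8-dword index vectors) per round.  The permuted words become the
	 * input of the next round, so each SIGMA2 row encodes the schedule
	 * relative to the previous round's word order rather than to the
	 * original block.
	 */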
.Lblake2s_compress_avx512_roundloop:
	addq		$0x40,%rax
	vmovdqa		-0x40(%rax),%ymm8
	vmovdqa		-0x20(%rax),%ymm9
	vpermi2d	%ymm7,%ymm6,%ymm8
	vpermi2d	%ymm7,%ymm6,%ymm9
	vmovdqa		%ymm8,%ymm6
	vmovdqa		%ymm9,%ymm7
	vpaddd		%xmm8,%xmm0,%xmm0
	vpaddd		%xmm1,%xmm0,%xmm0
	vpxor		%xmm0,%xmm3,%xmm3
	vprord		$0x10,%xmm3,%xmm3
	vpaddd		%xmm3,%xmm2,%xmm2
	vpxor		%xmm2,%xmm1,%xmm1
	vprord		$0xc,%xmm1,%xmm1
	vextracti128	$0x1,%ymm8,%xmm8
	vpaddd		%xmm8,%xmm0,%xmm0
	vpaddd		%xmm1,%xmm0,%xmm0
	vpxor		%xmm0,%xmm3,%xmm3
	vprord		$0x8,%xmm3,%xmm3
	vpaddd		%xmm3,%xmm2,%xmm2
	vpxor		%xmm2,%xmm1,%xmm1
	vprord		$0x7,%xmm1,%xmm1
	vpshufd		$0x93,%xmm0,%xmm0
	vpshufd		$0x4e,%xmm3,%xmm3
	vpshufd		$0x39,%xmm2,%xmm2
	vpaddd		%xmm9,%xmm0,%xmm0
	vpaddd		%xmm1,%xmm0,%xmm0
	vpxor		%xmm0,%xmm3,%xmm3
	vprord		$0x10,%xmm3,%xmm3
	vpaddd		%xmm3,%xmm2,%xmm2
	vpxor		%xmm2,%xmm1,%xmm1
	vprord		$0xc,%xmm1,%xmm1
	vextracti128	$0x1,%ymm9,%xmm9
	vpaddd		%xmm9,%xmm0,%xmm0
	vpaddd		%xmm1,%xmm0,%xmm0
	vpxor		%xmm0,%xmm3,%xmm3
	vprord		$0x8,%xmm3,%xmm3
	vpaddd		%xmm3,%xmm2,%xmm2
	vpxor		%xmm2,%xmm1,%xmm1
	vprord		$0x7,%xmm1,%xmm1
	vpshufd		$0x39,%xmm0,%xmm0
	vpshufd		$0x4e,%xmm3,%xmm3
	vpshufd		$0x93,%xmm2,%xmm2
	decb		%cl
	jne		.Lblake2s_compress_avx512_roundloop
	vpxor		%xmm10,%xmm0,%xmm0
	vpxor		%xmm11,%xmm1,%xmm1
	vpxor		%xmm2,%xmm0,%xmm0
	vpxor		%xmm3,%xmm1,%xmm1
	decq		%rdx
	jne		.Lblake2s_compress_avx512_mainloop
	vmovdqu		%xmm0,(%rdi)
	vmovdqu		%xmm1,0x10(%rdi)
	vmovdqu		%xmm4,0x20(%rdi)
	vzeroupper
	RET
SYM_FUNC_END(blake2s_compress_avx512)
#endif /* CONFIG_AS_AVX512 */