chacha-neon-core.S - arch/arm/crypto/chacha-neon-core.S - Linux diff v6.13.7

  1/*
  2 * ChaCha/XChaCha NEON helper functions
  3 *
  4 * Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org>
  5 *
  6 * This program is free software; you can redistribute it and/or modify
  7 * it under the terms of the GNU General Public License version 2 as
  8 * published by the Free Software Foundation.
  9 *
 10 * Based on:
 11 * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSE3 functions
 12 *
 13 * Copyright (C) 2015 Martin Willi
 14 *
 15 * This program is free software; you can redistribute it and/or modify
 16 * it under the terms of the GNU General Public License as published by
 17 * the Free Software Foundation; either version 2 of the License, or
 18 * (at your option) any later version.
 19 */
 20
 21 /*
 22  * NEON doesn't have a rotate instruction.  The alternatives are, more or less:
 23  *
 24  * (a)  vshl.u32 + vsri.u32		(needs temporary register)
 25  * (b)  vshl.u32 + vshr.u32 + vorr	(needs temporary register)
 26  * (c)  vrev32.16			(16-bit rotations only)
 27  * (d)  vtbl.8 + vtbl.8		(multiple of 8 bits rotations only,
 28  *					 needs index vector)
 29  *
 30  * ChaCha has 16, 12, 8, and 7-bit rotations.  For the 12 and 7-bit rotations,
 31  * the only choices are (a) and (b).  We use (a) since it takes two-thirds the
 32  * cycles of (b) on both Cortex-A7 and Cortex-A53.
 33  *
 34  * For the 16-bit rotation, we use vrev32.16 since it's consistently fastest
 35  * and doesn't need a temporary register.
 36  *
 37  * For the 8-bit rotation, we use vtbl.8 + vtbl.8.  On Cortex-A7, this sequence
 38  * is twice as fast as (a), even when doing (a) on multiple registers
 39  * simultaneously to eliminate the stall between vshl and vsri.  Also, it
 40  * parallelizes better when temporary registers are scarce.
 41  *
 42  * A disadvantage is that on Cortex-A53, the vtbl sequence is the same speed as
 43  * (a), so the need to load the rotation table actually makes the vtbl method
 44  * slightly slower overall on that CPU (~1.3% slower ChaCha20).  Still, it
 45  * seems to be a good compromise to get a more significant speed boost on some
 46  * CPUs, e.g. ~4.8% faster ChaCha20 on Cortex-A7.
 47  */
 48
 49#include <linux/linkage.h>
 50#include <asm/cache.h>
 51
 52	.text
 53	.fpu		neon
 54	.align		5
 55
 56/*
 57 * chacha_permute - permute one block
 58 *
 59 * Permute one 64-byte block where the state matrix is stored in the four NEON
 60 * registers q0-q3.  It performs matrix operations on four words in parallel,
 61 * but requires shuffling to rearrange the words after each round.
 62 *
 63 * The round count is given in r3.
 * Each pass of the loop below performs two rounds and subtracts 2 from r3;
 * the loop only exits when r3 reaches exactly zero, so callers are expected
 * to pass a positive, even round count.
 64 *
 * In:       q0-q3 = the four state rows, r3 = round count, lr = return address
 * Out:      q0-q3 = permuted state (the original state is NOT added back here)
 65 * Clobbers: r3, ip, q4-q5
 *           (the condition flags are modified as well, by the subs below)
 *
 * This label has no ENTRY() annotation; within this file it is reached with
 * "bl chacha_permute" and returns with "bx lr".
 66 */
 67chacha_permute:
 68
	// d10 = byte-index vector for vtbl.8; used for the rotate-by-8 steps.
 69	adr		ip, .Lrol8_table
 70	vld1.8		{d10}, [ip, :64]
 71
 72.Ldoubleround:
	// ---- first round of the pair ----
 73	// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
 74	vadd.i32	q0, q0, q1
 75	veor		q3, q3, q0
 76	vrev32.16	q3, q3
 77
 78	// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	// q4 is the temporary: vshl writes the high part into q1, then vsri
	// inserts the bits shifted out on the right, completing the rotate.
 79	vadd.i32	q2, q2, q3
 80	veor		q4, q1, q2
 81	vshl.u32	q1, q4, #12
 82	vsri.u32	q1, q4, #20
 83
 84	// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	// The rotate is done as a byte permutation of each half of q3 (d6, d7).
 85	vadd.i32	q0, q0, q1
 86	veor		q3, q3, q0
 87	vtbl.8		d6, {d6}, d10
 88	vtbl.8		d7, {d7}, d10
 89
 90	// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
 91	vadd.i32	q2, q2, q3
 92	veor		q4, q1, q2
 93	vshl.u32	q1, q4, #7
 94	vsri.u32	q1, q4, #25
 95
	// Rotate rows 1, 2 and 3 by one, two and three 32-bit words (4, 8 and
	// 12 bytes) so the same instruction sequence works on re-aligned words.
 96	// x1 = shuffle32(x1, MASK(0, 3, 2, 1))
 97	vext.8		q1, q1, q1, #4
 98	// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
 99	vext.8		q2, q2, q2, #8
100	// x3 = shuffle32(x3, MASK(2, 1, 0, 3))
101	vext.8		q3, q3, q3, #12
102
	// ---- second round of the pair (same operations, shuffled rows) ----
103	// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
104	vadd.i32	q0, q0, q1
105	veor		q3, q3, q0
106	vrev32.16	q3, q3
107
108	// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
109	vadd.i32	q2, q2, q3
110	veor		q4, q1, q2
111	vshl.u32	q1, q4, #12
112	vsri.u32	q1, q4, #20
113
114	// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
115	vadd.i32	q0, q0, q1
116	veor		q3, q3, q0
117	vtbl.8		d6, {d6}, d10
118	vtbl.8		d7, {d7}, d10
119
120	// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
121	vadd.i32	q2, q2, q3
122	veor		q4, q1, q2
123	vshl.u32	q1, q4, #7
124	vsri.u32	q1, q4, #25
125
	// Undo the earlier shuffle: the byte offsets 12/8/4 are the complements
	// of 4/8/12, so every row is back in its original word order.
126	// x1 = shuffle32(x1, MASK(2, 1, 0, 3))
127	vext.8		q1, q1, q1, #12
128	// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
129	vext.8		q2, q2, q2, #8
130	// x3 = shuffle32(x3, MASK(0, 3, 2, 1))
131	vext.8		q3, q3, q3, #4
132
	// Two rounds done; repeat until the round counter hits zero.
133	subs		r3, r3, #2
134	bne		.Ldoubleround
135
136	bx		lr
137ENDPROC(chacha_permute)
138
139ENTRY(chacha_block_xor_neon)
140	// r0: Input state matrix, s
141	// r1: 1 data block output, o
142	// r2: 1 data block input, i
143	// r3: nrounds
	//
	// Computes o = i ^ (permute(s) + s) for one 64-byte block.  The state
	// at [r0] is only read; r3 is handed to chacha_permute unchanged.
	//
	// NOTE(review): q4-q7 (d8-d15) are callee-saved under the AAPCS but are
	// overwritten here without being preserved; presumably the callers run
	// this inside a section where the NEON register file is already saved
	// - confirm against the C glue code.

	// lr must be saved because the bl below overwrites it.
144	push		{lr}
145
146	// x0..3 = s0..3
147	add		ip, r0, #0x20
148	vld1.32		{q0-q1}, [r0]
149	vld1.32		{q2-q3}, [ip]
150
	// Keep a copy of the original state in q8-q11; chacha_permute rewrites
	// q0-q3 in place and the original rows are added back afterwards.
151	vmov		q8, q0
152	vmov		q9, q1
153	vmov		q10, q2
154	vmov		q11, q3
155
156	bl		chacha_permute
157
	// Load the 64 input bytes only now: q4-q5 are clobbered by
	// chacha_permute, so they could not hold input across the call.
158	add		ip, r2, #0x20
159	vld1.8		{q4-q5}, [r2]
160	vld1.8		{q6-q7}, [ip]
161
162	// o0 = i0 ^ (x0 + s0)
163	vadd.i32	q0, q0, q8
164	veor		q0, q0, q4
165
166	// o1 = i1 ^ (x1 + s1)
167	vadd.i32	q1, q1, q9
168	veor		q1, q1, q5
169
170	// o2 = i2 ^ (x2 + s2)
171	vadd.i32	q2, q2, q10
172	veor		q2, q2, q6
173
174	// o3 = i3 ^ (x3 + s3)
175	vadd.i32	q3, q3, q11
176	veor		q3, q3, q7
177
	// Write the 64 output bytes as two 32-byte stores.
178	add		ip, r1, #0x20
179	vst1.8		{q0-q1}, [r1]
180	vst1.8		{q2-q3}, [ip]
181
	// Return by popping the saved lr straight into pc.
182	pop		{pc}
183ENDPROC(chacha_block_xor_neon)
184
185ENTRY(hchacha_block_neon)
186	// r0: Input state matrix, s
187	// r1: output (8 32-bit words)
188	// r2: nrounds
	//
	// Permutes the 64-byte state and writes rows 0 and 3 of the result
	// (words 0-3 and 12-15).  Unlike chacha_block_xor_neon, the original
	// state is not added to the permuted state and nothing is XORed in.

	// lr must be saved because the bl below overwrites it.
189	push		{lr}
190
	// Load all four rows; the first load post-increments r0 by 32 bytes.
191	vld1.32		{q0-q1}, [r0]!
192	vld1.32		{q2-q3}, [r0]
193
	// chacha_permute expects the round count in r3.
194	mov		r3, r2
195	bl		chacha_permute
196
	// Output = row 0 (q0) followed by row 3 (q3); rows 1-2 are discarded.
197	vst1.32		{q0}, [r1]!
198	vst1.32		{q3}, [r1]
199
200	pop		{pc}
201ENDPROC(hchacha_block_neon)
202
203	.align		4
	// Constant tables, addressed PC-relative with adr from the functions in
	// this file.  The 16-byte alignment requested above matters: .Lctrinc
	// is loaded with a :128 alignment qualifier and, being exactly 16 bytes
	// long, leaves .Lrol8_table 8-byte aligned for its :64 loads.
	//
	// .Lctrinc: per-lane offsets added to x12 (the block counter word) so
	// that the four lanes of the 4-block routine work on consecutive blocks.
204.Lctrinc:	.word	0, 1, 2, 3
	// .Lrol8_table: vtbl.8 index vector.  Within each 32-bit word, result
	// byte n is taken from source byte (n + 3) % 4, i.e. a rotate left by 8
	// bits of each little-endian word in a d register.
205.Lrol8_table:	.byte	3, 0, 1, 2, 7, 4, 5, 6
206
207	.align		5
208ENTRY(chacha_4block_xor_neon)
	// r4 keeps the post-push stack pointer so that sp can be re-aligned
	// for the 32-byte spill buffer and restored later.  lr is saved because
	// it is reused below as a scratch pointer.
209	push		{r4, lr}
210	mov		r4, sp			// preserve the stack pointer
211	sub		ip, sp, #0x20		// allocate a 32 byte buffer
212	bic		ip, ip, #0x1f		// aligned to 32 bytes
213	mov		sp, ip
214
215	// r0: Input state matrix, s
216	// r1: 4 data blocks output, o
217	// r2: 4 data blocks input, i
218	// r3: nrounds
	// first stack word on entry: number of bytes to process (it is read
	//     further down with "ldr r4, [r4, #8]", i.e. just above the two
	//     registers pushed here)
	//
	// NOTE(review): the tail code loads the first 64 input bytes without
	// any length check and never processes more than 256 bytes, so the
	// byte count is presumably expected to be in the range 65..256 -
	// confirm against the C caller.
	// NOTE(review): q4-q7 (d8-d15) are callee-saved under the AAPCS but are
	// not preserved here - presumably acceptable in the calling context;
	// confirm.
219
220	//
221	// This function encrypts four consecutive ChaCha blocks by loading
222	// the state matrix in NEON registers four times. The algorithm performs
223	// each operation on the corresponding word of each state matrix, hence
224	// requires no word shuffling. The words are re-interleaved before the
225	// final addition of the original state and the XORing step.
226	//
227
228	// x0..15[0-3] = s0..15[0-3]
229	add		ip, r0, #0x20
230	vld1.32		{q0-q1}, [r0]
231	vld1.32		{q2-q3}, [ip]
232
	// Broadcast state word n into all four lanes of qn (lane k belongs to
	// block k).  The broadcasts run from q15 down to q0 because q0-q3 still
	// hold the source rows.  lr is free as a scratch pointer (its value was
	// pushed above).  q4 briefly holds the counter increments; it is
	// consumed by the vadd before being overwritten with x4.
233	adr		lr, .Lctrinc
234	vdup.32		q15, d7[1]
235	vdup.32		q14, d7[0]
236	vld1.32		{q4}, [lr, :128]
237	vdup.32		q13, d6[1]
238	vdup.32		q12, d6[0]
239	vdup.32		q11, d5[1]
240	vdup.32		q10, d5[0]
241	vadd.u32	q12, q12, q4		// x12 += counter values 0-3
242	vdup.32		q9, d4[1]
243	vdup.32		q8, d4[0]
244	vdup.32		q7, d3[1]
245	vdup.32		q6, d3[0]
246	vdup.32		q5, d2[1]
247	vdup.32		q4, d2[0]
248	vdup.32		q3, d1[1]
249	vdup.32		q2, d1[0]
250	vdup.32		q1, d0[1]
251	vdup.32		q0, d0[0]
252
	// ip = rotate-by-8 index table for the whole loop.  On the first pass
	// x8/x9 are still live in q8-q9, so the reload at the loop head is
	// skipped.
253	adr		ip, .Lrol8_table
254	b		1f
255
256.Ldoubleround4:
	// All 16 q registers hold state, so q8-q9 double as temporaries; x8/x9
	// live in the stack buffer in between and are reloaded here.
257	vld1.32		{q8-q9}, [sp, :256]
2581:
259	// x0 += x4, x12 = rotl32(x12 ^ x0, 16)
260	// x1 += x5, x13 = rotl32(x13 ^ x1, 16)
261	// x2 += x6, x14 = rotl32(x14 ^ x2, 16)
262	// x3 += x7, x15 = rotl32(x15 ^ x3, 16)
263	vadd.i32	q0, q0, q4
264	vadd.i32	q1, q1, q5
265	vadd.i32	q2, q2, q6
266	vadd.i32	q3, q3, q7
267
268	veor		q12, q12, q0
269	veor		q13, q13, q1
270	veor		q14, q14, q2
271	veor		q15, q15, q3
272
273	vrev32.16	q12, q12
274	vrev32.16	q13, q13
275	vrev32.16	q14, q14
276	vrev32.16	q15, q15
277
278	// x8 += x12, x4 = rotl32(x4 ^ x8, 12)
279	// x9 += x13, x5 = rotl32(x5 ^ x9, 12)
280	// x10 += x14, x6 = rotl32(x6 ^ x10, 12)
281	// x11 += x15, x7 = rotl32(x7 ^ x11, 12)
282	vadd.i32	q8, q8, q12
283	vadd.i32	q9, q9, q13
284	vadd.i32	q10, q10, q14
285	vadd.i32	q11, q11, q15
286
	// Spill the updated x8/x9 so q8-q9 can serve as rotate temporaries.
287	vst1.32		{q8-q9}, [sp, :256]
288
289	veor		q8, q4, q8
290	veor		q9, q5, q9
291	vshl.u32	q4, q8, #12
292	vshl.u32	q5, q9, #12
293	vsri.u32	q4, q8, #20
294	vsri.u32	q5, q9, #20
295
296	veor		q8, q6, q10
297	veor		q9, q7, q11
298	vshl.u32	q6, q8, #12
299	vshl.u32	q7, q9, #12
300	vsri.u32	q6, q8, #20
301	vsri.u32	q7, q9, #20
302
303	// x0 += x4, x12 = rotl32(x12 ^ x0, 8)
304	// x1 += x5, x13 = rotl32(x13 ^ x1, 8)
305	// x2 += x6, x14 = rotl32(x14 ^ x2, 8)
306	// x3 += x7, x15 = rotl32(x15 ^ x3, 8)
	// d16 (low half of the currently free q8) receives the rotate-by-8
	// index vector for the vtbl.8 instructions below.
307	vld1.8		{d16}, [ip, :64]
308	vadd.i32	q0, q0, q4
309	vadd.i32	q1, q1, q5
310	vadd.i32	q2, q2, q6
311	vadd.i32	q3, q3, q7
312
313	veor		q12, q12, q0
314	veor		q13, q13, q1
315	veor		q14, q14, q2
316	veor		q15, q15, q3
317
318	vtbl.8		d24, {d24}, d16
319	vtbl.8		d25, {d25}, d16
320	vtbl.8		d26, {d26}, d16
321	vtbl.8		d27, {d27}, d16
322	vtbl.8		d28, {d28}, d16
323	vtbl.8		d29, {d29}, d16
324	vtbl.8		d30, {d30}, d16
325	vtbl.8		d31, {d31}, d16
326
	// Bring x8/x9 back; this also discards the index vector in d16.
327	vld1.32		{q8-q9}, [sp, :256]
328
329	// x8 += x12, x4 = rotl32(x4 ^ x8, 7)
330	// x9 += x13, x5 = rotl32(x5 ^ x9, 7)
331	// x10 += x14, x6 = rotl32(x6 ^ x10, 7)
332	// x11 += x15, x7 = rotl32(x7 ^ x11, 7)
333	vadd.i32	q8, q8, q12
334	vadd.i32	q9, q9, q13
335	vadd.i32	q10, q10, q14
336	vadd.i32	q11, q11, q15
337
338	vst1.32		{q8-q9}, [sp, :256]
339
340	veor		q8, q4, q8
341	veor		q9, q5, q9
342	vshl.u32	q4, q8, #7
343	vshl.u32	q5, q9, #7
344	vsri.u32	q4, q8, #25
345	vsri.u32	q5, q9, #25
346
347	veor		q8, q6, q10
348	veor		q9, q7, q11
349	vshl.u32	q6, q8, #7
350	vshl.u32	q7, q9, #7
351	vsri.u32	q6, q8, #25
352	vsri.u32	q7, q9, #25
353
354	vld1.32		{q8-q9}, [sp, :256]
355
	// Second round of the pair: the same four steps, but pairing the words
	// as (0,5,10,15) (1,6,11,12) (2,7,8,13) (3,4,9,14).  Only the register
	// operands change; no data is shuffled.
356	// x0 += x5, x15 = rotl32(x15 ^ x0, 16)
357	// x1 += x6, x12 = rotl32(x12 ^ x1, 16)
358	// x2 += x7, x13 = rotl32(x13 ^ x2, 16)
359	// x3 += x4, x14 = rotl32(x14 ^ x3, 16)
360	vadd.i32	q0, q0, q5
361	vadd.i32	q1, q1, q6
362	vadd.i32	q2, q2, q7
363	vadd.i32	q3, q3, q4
364
365	veor		q15, q15, q0
366	veor		q12, q12, q1
367	veor		q13, q13, q2
368	veor		q14, q14, q3
369
370	vrev32.16	q15, q15
371	vrev32.16	q12, q12
372	vrev32.16	q13, q13
373	vrev32.16	q14, q14
374
375	// x10 += x15, x5 = rotl32(x5 ^ x10, 12)
376	// x11 += x12, x6 = rotl32(x6 ^ x11, 12)
377	// x8 += x13, x7 = rotl32(x7 ^ x8, 12)
378	// x9 += x14, x4 = rotl32(x4 ^ x9, 12)
379	vadd.i32	q10, q10, q15
380	vadd.i32	q11, q11, q12
381	vadd.i32	q8, q8, q13
382	vadd.i32	q9, q9, q14
383
384	vst1.32		{q8-q9}, [sp, :256]
385
386	veor		q8, q7, q8
387	veor		q9, q4, q9
388	vshl.u32	q7, q8, #12
389	vshl.u32	q4, q9, #12
390	vsri.u32	q7, q8, #20
391	vsri.u32	q4, q9, #20
392
393	veor		q8, q5, q10
394	veor		q9, q6, q11
395	vshl.u32	q5, q8, #12
396	vshl.u32	q6, q9, #12
397	vsri.u32	q5, q8, #20
398	vsri.u32	q6, q9, #20
399
400	// x0 += x5, x15 = rotl32(x15 ^ x0, 8)
401	// x1 += x6, x12 = rotl32(x12 ^ x1, 8)
402	// x2 += x7, x13 = rotl32(x13 ^ x2, 8)
403	// x3 += x4, x14 = rotl32(x14 ^ x3, 8)
404	vld1.8		{d16}, [ip, :64]
405	vadd.i32	q0, q0, q5
406	vadd.i32	q1, q1, q6
407	vadd.i32	q2, q2, q7
408	vadd.i32	q3, q3, q4
409
410	veor		q15, q15, q0
411	veor		q12, q12, q1
412	veor		q13, q13, q2
413	veor		q14, q14, q3
414
415	vtbl.8		d30, {d30}, d16
416	vtbl.8		d31, {d31}, d16
417	vtbl.8		d24, {d24}, d16
418	vtbl.8		d25, {d25}, d16
419	vtbl.8		d26, {d26}, d16
420	vtbl.8		d27, {d27}, d16
421	vtbl.8		d28, {d28}, d16
422	vtbl.8		d29, {d29}, d16
423
424	vld1.32		{q8-q9}, [sp, :256]
425
426	// x10 += x15, x5 = rotl32(x5 ^ x10, 7)
427	// x11 += x12, x6 = rotl32(x6 ^ x11, 7)
428	// x8 += x13, x7 = rotl32(x7 ^ x8, 7)
429	// x9 += x14, x4 = rotl32(x4 ^ x9, 7)
430	vadd.i32	q10, q10, q15
431	vadd.i32	q11, q11, q12
432	vadd.i32	q8, q8, q13
433	vadd.i32	q9, q9, q14
434
435	vst1.32		{q8-q9}, [sp, :256]
436
437	veor		q8, q7, q8
438	veor		q9, q4, q9
439	vshl.u32	q7, q8, #7
440	vshl.u32	q4, q9, #7
441	vsri.u32	q7, q8, #25
442	vsri.u32	q4, q9, #25
443
444	veor		q8, q5, q10
445	veor		q9, q6, q11
446	vshl.u32	q5, q8, #7
447	vshl.u32	q6, q9, #7
448	vsri.u32	q5, q8, #25
449	vsri.u32	q6, q9, #25
450
	// Two rounds per pass; the loop exits only when r3 reaches exactly 0.
451	subs		r3, r3, #2
452	bne		.Ldoubleround4
453
454	// x0..7[0-3] are in q0-q7, x10..15[0-3] are in q10-q15.
455	// x8..9[0-3] are on the stack.
456
457	// Re-interleave the words in the first two rows of each block (x0..7).
458	// Also add the counter values 0-3 to x12[0-3].
	// (The state row added further down comes straight from memory and so
	// carries the same counter for all four blocks; adding 0-3 here makes
	// up for the per-block counter used at the start.)
	// Lines indented by two extra spaces belong to a different logical
	// step than their neighbours - presumably interleaved for scheduling.
459	  vld1.32	{q8}, [lr, :128]	// load counter values 0-3
460	vzip.32		q0, q1			// => (0 1 0 1) (0 1 0 1)
461	vzip.32		q2, q3			// => (2 3 2 3) (2 3 2 3)
462	vzip.32		q4, q5			// => (4 5 4 5) (4 5 4 5)
463	vzip.32		q6, q7			// => (6 7 6 7) (6 7 6 7)
464	  vadd.u32	q12, q8			// x12 += counter values 0-3
465	vswp		d1, d4
466	vswp		d3, d6
467	  vld1.32	{q8-q9}, [r0]!		// load s0..7
468	vswp		d9, d12
469	vswp		d11, d14
470
471	// Swap q1 and q4 so that we'll free up consecutive registers (q0-q1)
472	// after XORing the first 32 bytes.
473	vswp		q1, q4
474
475	// First two rows of each block are (q0 q1) (q2 q6) (q4 q5) (q3 q7)
476
477	// x0..3[0-3] += s0..3[0-3]	(add orig state to 1st row of each block)
478	vadd.u32	q0, q0, q8
479	vadd.u32	q2, q2, q8
480	vadd.u32	q4, q4, q8
481	vadd.u32	q3, q3, q8
482
483	// x4..7[0-3] += s4..7[0-3]	(add orig state to 2nd row of each block)
484	vadd.u32	q1, q1, q9
485	vadd.u32	q6, q6, q9
486	vadd.u32	q5, q5, q9
487	vadd.u32	q7, q7, q9
488
489	// XOR first 32 bytes using keystream from first two rows of first block
490	vld1.8		{q8-q9}, [r2]!
491	veor		q8, q8, q0
492	veor		q9, q9, q1
493	vst1.8		{q8-q9}, [r1]!
494
495	// Re-interleave the words in the last two rows of each block (x8..15).
	// The spill buffer is read for the last time here, so sp can be
	// restored.  r4 (the post-push sp) is then reused for the byte count:
	// [r4, #8] is the first word above the two registers pushed on entry.
496	vld1.32		{q8-q9}, [sp, :256]
497	  mov		sp, r4		// restore original stack pointer
498	  ldr		r4, [r4, #8]	// load number of bytes
499	vzip.32		q12, q13	// => (12 13 12 13) (12 13 12 13)
500	vzip.32		q14, q15	// => (14 15 14 15) (14 15 14 15)
501	vzip.32		q8, q9		// => (8 9 8 9) (8 9 8 9)
502	vzip.32		q10, q11	// => (10 11 10 11) (10 11 10 11)
503	  vld1.32	{q0-q1}, [r0]	// load s8..15
504	vswp		d25, d28
505	vswp		d27, d30
506	vswp		d17, d20
507	vswp		d19, d22
508
509	// Last two rows of each block are (q8 q12) (q10 q14) (q9 q13) (q11 q15)
510
511	// x8..11[0-3] += s8..11[0-3]	(add orig state to 3rd row of each block)
512	vadd.u32	q8,  q8,  q0
513	vadd.u32	q10, q10, q0
514	vadd.u32	q9,  q9,  q0
515	vadd.u32	q11, q11, q0
516
517	// x12..15[0-3] += s12..15[0-3] (add orig state to 4th row of each block)
518	vadd.u32	q12, q12, q1
519	vadd.u32	q14, q14, q1
520	vadd.u32	q13, q13, q1
521	vadd.u32	q15, q15, q1
522
523	// XOR the rest of the data with the keystream
	//
	// Each step below loads 32 input bytes, XORs them into q0-q1, and then
	// checks whether the NEXT 32-byte chunk is the last one (ble) before
	// storing.  On such a branch q0-q1 are still unwritten and are stored
	// by the tail code.  r4 first becomes (bytes - 96) and drops by 32 per
	// step; the veor instructions between subs and the branch leave the
	// ARM condition flags untouched.
524
525	vld1.8		{q0-q1}, [r2]!
526	subs		r4, r4, #96
527	veor		q0, q0, q8
528	veor		q1, q1, q12
529	ble		.Lle96
530	vst1.8		{q0-q1}, [r1]!
531
532	vld1.8		{q0-q1}, [r2]!
533	subs		r4, r4, #32
534	veor		q0, q0, q2
535	veor		q1, q1, q6
536	ble		.Lle128
537	vst1.8		{q0-q1}, [r1]!
538
539	vld1.8		{q0-q1}, [r2]!
540	subs		r4, r4, #32
541	veor		q0, q0, q10
542	veor		q1, q1, q14
543	ble		.Lle160
544	vst1.8		{q0-q1}, [r1]!
545
546	vld1.8		{q0-q1}, [r2]!
547	subs		r4, r4, #32
548	veor		q0, q0, q4
549	veor		q1, q1, q5
550	ble		.Lle192
551	vst1.8		{q0-q1}, [r1]!
552
553	vld1.8		{q0-q1}, [r2]!
554	subs		r4, r4, #32
555	veor		q0, q0, q9
556	veor		q1, q1, q13
557	ble		.Lle224
558	vst1.8		{q0-q1}, [r1]!
559
	// r4 becomes (bytes - 256) here.  Only a strictly shorter request takes
	// the partial path; otherwise fall through and emit the last 32 bytes.
560	vld1.8		{q0-q1}, [r2]!
561	subs		r4, r4, #32
562	veor		q0, q0, q3
563	veor		q1, q1, q7
564	blt		.Llt256
565.Lout:
	// Store the pending chunk, then XOR and store one final full 32-byte
	// chunk with the keystream in q11/q15.
566	vst1.8		{q0-q1}, [r1]!
567
568	vld1.8		{q0-q1}, [r2]
 
569	veor		q0, q0, q11
570	veor		q1, q1, q15
571	vst1.8		{q0-q1}, [r1]
572
573	pop		{r4, pc}
574
	// The .Lle*/.Llt256 stubs copy the keystream for the final chunk into
	// q4-q5, which is where the tail code below expects it.
575.Lle192:
576	vmov		q4, q9
577	vmov		q5, q13
578
579.Lle160:
580	// nothing to do
	// (the keystream for this chunk already is q4-q5)
581
582.Lfinalblock:
583	// Process the final block if processing less than 4 full blocks.
584	// Entered with 32 bytes of ChaCha cipher stream in q4-q5, and the
585	// previous 32 byte output block that still needs to be written at
586	// [r1] in q0-q1.
	// The flags are still those of the last subs (vmov does not change
	// them): Z set means the final chunk is a complete 32 bytes.
587	beq		.Lfullblock
588
589.Lpartialblock:
	// r4 is negative here: minus the number of bytes missing from the
	// final chunk.  A 32-byte window ending exactly at the end of the data
	// is processed instead, overlapping the previous chunk:
	//   r2 = start of that window in the input
	//   lr = .Lpermute + 32 + r4, so the 32 index bytes loaded from there
	//        select the keystream bytes rotated to line up with the window
	//   r4 = start of that window in the output (r1 + 32 + r4)
590	adr		lr, .Lpermute + 32
591	add		r2, r2, r4
592	add		lr, lr, r4
593	add		r4, r4, r1
594
595	vld1.8		{q2-q3}, [lr]
596	vld1.8		{q6-q7}, [r2]
597
598	add		r4, r4, #32
599
	// Permute the 32 keystream bytes in q4-q5 with the indices in q2-q3.
600	vtbl.8		d4, {q4-q5}, d4
601	vtbl.8		d5, {q4-q5}, d5
602	vtbl.8		d6, {q4-q5}, d6
603	vtbl.8		d7, {q4-q5}, d7
604
605	veor		q6, q6, q2
606	veor		q7, q7, q3
607
	// Order matters: the window store comes first and its leading bytes,
	// which belong to the previous chunk, are then overwritten by the
	// correct output of that chunk held in q0-q1.
608	vst1.8		{q6-q7}, [r4]	// overlapping stores
609	vst1.8		{q0-q1}, [r1]
610	pop		{r4, pc}
611
612.Lfullblock:
	// The final chunk is complete: move its keystream to the registers
	// .Lout uses and share that code.
613	vmov		q11, q4
614	vmov		q15, q5
615	b		.Lout
616.Lle96:
617	vmov		q4, q2
618	vmov		q5, q6
619	b		.Lfinalblock
620.Lle128:
621	vmov		q4, q10
622	vmov		q5, q14
623	b		.Lfinalblock
624.Lle224:
625	vmov		q4, q3
626	vmov		q5, q7
627	b		.Lfinalblock
628.Llt256:
	// Reached with blt, so the chunk is known to be partial and the
	// beq test in .Lfinalblock is skipped.
629	vmov		q4, q11
630	vmov		q5, q15
631	b		.Lpartialblock
632ENDPROC(chacha_4block_xor_neon)
633
634	.align		L1_CACHE_SHIFT
	// Byte-index table for the partial-block tail of chacha_4block_xor_neon:
	// the identity permutation 0x00..0x1f, stored twice back to back.
	// .Lpartialblock loads 32 bytes starting at .Lpermute + 32 + r4 with r4
	// negative, which yields the identity rotated by that amount; it is
	// used as the vtbl.8 index vector over the 32 keystream bytes.
	// The table is addressed with adr, so it has to stay within adr range
	// of that code.
	// NOTE(review): the L1_CACHE_SHIFT alignment presumably keeps the
	// 64-byte table within as few cache lines as possible - confirm.
635.Lpermute:
636	.byte		0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
637	.byte		0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
638	.byte		0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
639	.byte		0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f
640	.byte		0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
641	.byte		0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
642	.byte		0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
643	.byte		0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f

  1/*
  2 * ChaCha/XChaCha NEON helper functions
  3 *
  4 * Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org>
  5 *
  6 * This program is free software; you can redistribute it and/or modify
  7 * it under the terms of the GNU General Public License version 2 as
  8 * published by the Free Software Foundation.
  9 *
 10 * Based on:
 11 * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSE3 functions
 12 *
 13 * Copyright (C) 2015 Martin Willi
 14 *
 15 * This program is free software; you can redistribute it and/or modify
 16 * it under the terms of the GNU General Public License as published by
 17 * the Free Software Foundation; either version 2 of the License, or
 18 * (at your option) any later version.
 19 */
 20
 21 /*
 22  * NEON doesn't have a rotate instruction.  The alternatives are, more or less:
 23  *
 24  * (a)  vshl.u32 + vsri.u32		(needs temporary register)
 25  * (b)  vshl.u32 + vshr.u32 + vorr	(needs temporary register)
 26  * (c)  vrev32.16			(16-bit rotations only)
 27  * (d)  vtbl.8 + vtbl.8		(multiple of 8 bits rotations only,
 28  *					 needs index vector)
 29  *
 30  * ChaCha has 16, 12, 8, and 7-bit rotations.  For the 12 and 7-bit rotations,
 31  * the only choices are (a) and (b).  We use (a) since it takes two-thirds the
 32  * cycles of (b) on both Cortex-A7 and Cortex-A53.
 33  *
 34  * For the 16-bit rotation, we use vrev32.16 since it's consistently fastest
 35  * and doesn't need a temporary register.
 36  *
 37  * For the 8-bit rotation, we use vtbl.8 + vtbl.8.  On Cortex-A7, this sequence
 38  * is twice as fast as (a), even when doing (a) on multiple registers
 39  * simultaneously to eliminate the stall between vshl and vsri.  Also, it
 40  * parallelizes better when temporary registers are scarce.
 41  *
 42  * A disadvantage is that on Cortex-A53, the vtbl sequence is the same speed as
 43  * (a), so the need to load the rotation table actually makes the vtbl method
 44  * slightly slower overall on that CPU (~1.3% slower ChaCha20).  Still, it
 45  * seems to be a good compromise to get a more significant speed boost on some
 46  * CPUs, e.g. ~4.8% faster ChaCha20 on Cortex-A7.
 47  */
 48
 49#include <linux/linkage.h>
 
 50
 51	.text
 52	.fpu		neon
 53	.align		5
 54
 55/*
 56 * chacha_permute - permute one block
 57 *
 58 * Permute one 64-byte block where the state matrix is stored in the four NEON
 59 * registers q0-q3.  It performs matrix operations on four words in parallel,
 60 * but requires shuffling to rearrange the words after each round.
 61 *
 62 * The round count is given in r3.
 * Each pass of the loop below performs two rounds and subtracts 2 from r3;
 * the loop only exits when r3 reaches exactly zero, so callers are expected
 * to pass a positive, even round count.
 63 *
 * In:       q0-q3 = the four state rows, r3 = round count, lr = return address
 * Out:      q0-q3 = permuted state (the original state is NOT added back here)
 64 * Clobbers: r3, ip, q4-q5
 *           (the condition flags are modified as well, by the subs below)
 *
 * This label has no ENTRY() annotation; within this file it is reached with
 * "bl chacha_permute" and returns with "bx lr".
 65 */
 66chacha_permute:
 67
	// d10 = byte-index vector for vtbl.8; used for the rotate-by-8 steps.
 68	adr		ip, .Lrol8_table
 69	vld1.8		{d10}, [ip, :64]
 70
 71.Ldoubleround:
	// ---- first round of the pair ----
 72	// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
 73	vadd.i32	q0, q0, q1
 74	veor		q3, q3, q0
 75	vrev32.16	q3, q3
 76
 77	// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	// q4 is the temporary: vshl writes the high part into q1, then vsri
	// inserts the bits shifted out on the right, completing the rotate.
 78	vadd.i32	q2, q2, q3
 79	veor		q4, q1, q2
 80	vshl.u32	q1, q4, #12
 81	vsri.u32	q1, q4, #20
 82
 83	// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	// The rotate is done as a byte permutation of each half of q3 (d6, d7).
 84	vadd.i32	q0, q0, q1
 85	veor		q3, q3, q0
 86	vtbl.8		d6, {d6}, d10
 87	vtbl.8		d7, {d7}, d10
 88
 89	// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
 90	vadd.i32	q2, q2, q3
 91	veor		q4, q1, q2
 92	vshl.u32	q1, q4, #7
 93	vsri.u32	q1, q4, #25
 94
	// Rotate rows 1, 2 and 3 by one, two and three 32-bit words (4, 8 and
	// 12 bytes) so the same instruction sequence works on re-aligned words.
 95	// x1 = shuffle32(x1, MASK(0, 3, 2, 1))
 96	vext.8		q1, q1, q1, #4
 97	// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
 98	vext.8		q2, q2, q2, #8
 99	// x3 = shuffle32(x3, MASK(2, 1, 0, 3))
100	vext.8		q3, q3, q3, #12
101
	// ---- second round of the pair (same operations, shuffled rows) ----
102	// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
103	vadd.i32	q0, q0, q1
104	veor		q3, q3, q0
105	vrev32.16	q3, q3
106
107	// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
108	vadd.i32	q2, q2, q3
109	veor		q4, q1, q2
110	vshl.u32	q1, q4, #12
111	vsri.u32	q1, q4, #20
112
113	// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
114	vadd.i32	q0, q0, q1
115	veor		q3, q3, q0
116	vtbl.8		d6, {d6}, d10
117	vtbl.8		d7, {d7}, d10
118
119	// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
120	vadd.i32	q2, q2, q3
121	veor		q4, q1, q2
122	vshl.u32	q1, q4, #7
123	vsri.u32	q1, q4, #25
124
	// Undo the earlier shuffle: the byte offsets 12/8/4 are the complements
	// of 4/8/12, so every row is back in its original word order.
125	// x1 = shuffle32(x1, MASK(2, 1, 0, 3))
126	vext.8		q1, q1, q1, #12
127	// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
128	vext.8		q2, q2, q2, #8
129	// x3 = shuffle32(x3, MASK(0, 3, 2, 1))
130	vext.8		q3, q3, q3, #4
131
	// Two rounds done; repeat until the round counter hits zero.
132	subs		r3, r3, #2
133	bne		.Ldoubleround
134
135	bx		lr
136ENDPROC(chacha_permute)
137
138ENTRY(chacha_block_xor_neon)
139	// r0: Input state matrix, s
140	// r1: 1 data block output, o
141	// r2: 1 data block input, i
142	// r3: nrounds
	//
	// Computes o = i ^ (permute(s) + s) for one 64-byte block.  The state
	// at [r0] is only read; r3 is handed to chacha_permute unchanged.
	//
	// NOTE(review): q4-q7 (d8-d15) are callee-saved under the AAPCS but are
	// overwritten here without being preserved; presumably the callers run
	// this inside a section where the NEON register file is already saved
	// - confirm against the C glue code.

	// lr must be saved because the bl below overwrites it.
143	push		{lr}
144
145	// x0..3 = s0..3
146	add		ip, r0, #0x20
147	vld1.32		{q0-q1}, [r0]
148	vld1.32		{q2-q3}, [ip]
149
	// Keep a copy of the original state in q8-q11; chacha_permute rewrites
	// q0-q3 in place and the original rows are added back afterwards.
150	vmov		q8, q0
151	vmov		q9, q1
152	vmov		q10, q2
153	vmov		q11, q3
154
155	bl		chacha_permute
156
	// Load the 64 input bytes only now: q4-q5 are clobbered by
	// chacha_permute, so they could not hold input across the call.
157	add		ip, r2, #0x20
158	vld1.8		{q4-q5}, [r2]
159	vld1.8		{q6-q7}, [ip]
160
161	// o0 = i0 ^ (x0 + s0)
162	vadd.i32	q0, q0, q8
163	veor		q0, q0, q4
164
165	// o1 = i1 ^ (x1 + s1)
166	vadd.i32	q1, q1, q9
167	veor		q1, q1, q5
168
169	// o2 = i2 ^ (x2 + s2)
170	vadd.i32	q2, q2, q10
171	veor		q2, q2, q6
172
173	// o3 = i3 ^ (x3 + s3)
174	vadd.i32	q3, q3, q11
175	veor		q3, q3, q7
176
	// Write the 64 output bytes as two 32-byte stores.
177	add		ip, r1, #0x20
178	vst1.8		{q0-q1}, [r1]
179	vst1.8		{q2-q3}, [ip]
180
	// Return by popping the saved lr straight into pc.
181	pop		{pc}
182ENDPROC(chacha_block_xor_neon)
183
184ENTRY(hchacha_block_neon)
185	// r0: Input state matrix, s
186	// r1: output (8 32-bit words)
187	// r2: nrounds
	//
	// Permutes the 64-byte state and writes rows 0 and 3 of the result
	// (words 0-3 and 12-15).  Unlike chacha_block_xor_neon, the original
	// state is not added to the permuted state and nothing is XORed in.

	// lr must be saved because the bl below overwrites it.
188	push		{lr}
189
	// Load all four rows; the first load post-increments r0 by 32 bytes.
190	vld1.32		{q0-q1}, [r0]!
191	vld1.32		{q2-q3}, [r0]
192
	// chacha_permute expects the round count in r3.
193	mov		r3, r2
194	bl		chacha_permute
195
	// Output = row 0 (q0) followed by row 3 (q3); rows 1-2 are discarded.
196	vst1.32		{q0}, [r1]!
197	vst1.32		{q3}, [r1]
198
199	pop		{pc}
200ENDPROC(hchacha_block_neon)
201
202	.align		4
	// Constant tables, addressed PC-relative with adr from the functions in
	// this file.  The 16-byte alignment requested above matters: .Lctrinc
	// is loaded with a :128 alignment qualifier and, being exactly 16 bytes
	// long, leaves .Lrol8_table 8-byte aligned for its :64 loads.
	//
	// .Lctrinc: per-lane offsets added to x12 (the block counter word) so
	// that the four lanes of the 4-block routine work on consecutive blocks.
203.Lctrinc:	.word	0, 1, 2, 3
	// .Lrol8_table: vtbl.8 index vector.  Within each 32-bit word, result
	// byte n is taken from source byte (n + 3) % 4, i.e. a rotate left by 8
	// bits of each little-endian word in a d register.
204.Lrol8_table:	.byte	3, 0, 1, 2, 7, 4, 5, 6
205
206	.align		5
207ENTRY(chacha_4block_xor_neon)
	// r4 and r5 are callee-saved and used below: r4 keeps the post-push
	// stack pointer so sp can be re-aligned for the 32-byte spill buffer,
	// r5 points at the counter-increment table.  lr is never modified, so
	// the function returns with a plain "bx lr".
208	push		{r4-r5}
209	mov		r4, sp			// preserve the stack pointer
210	sub		ip, sp, #0x20		// allocate a 32 byte buffer
211	bic		ip, ip, #0x1f		// aligned to 32 bytes
212	mov		sp, ip
213
214	// r0: Input state matrix, s
215	// r1: 4 data blocks output, o
216	// r2: 4 data blocks input, i
217	// r3: nrounds
	//
	// Always processes exactly four 64-byte blocks (256 bytes); there is
	// no length argument in this version.
	// NOTE(review): q4-q7 (d8-d15) are callee-saved under the AAPCS but are
	// not preserved here - presumably acceptable in the calling context;
	// confirm.
218
219	//
220	// This function encrypts four consecutive ChaCha blocks by loading
221	// the state matrix in NEON registers four times. The algorithm performs
222	// each operation on the corresponding word of each state matrix, hence
223	// requires no word shuffling. The words are re-interleaved before the
224	// final addition of the original state and the XORing step.
225	//
226
227	// x0..15[0-3] = s0..15[0-3]
228	add		ip, r0, #0x20
229	vld1.32		{q0-q1}, [r0]
230	vld1.32		{q2-q3}, [ip]
231
	// Broadcast state word n into all four lanes of qn (lane k belongs to
	// block k).  The broadcasts run from q15 down to q0 because q0-q3 still
	// hold the source rows.  q4 briefly holds the counter increments; it is
	// consumed by the vadd before being overwritten with x4.
232	adr		r5, .Lctrinc
233	vdup.32		q15, d7[1]
234	vdup.32		q14, d7[0]
235	vld1.32		{q4}, [r5, :128]
236	vdup.32		q13, d6[1]
237	vdup.32		q12, d6[0]
238	vdup.32		q11, d5[1]
239	vdup.32		q10, d5[0]
240	vadd.u32	q12, q12, q4		// x12 += counter values 0-3
241	vdup.32		q9, d4[1]
242	vdup.32		q8, d4[0]
243	vdup.32		q7, d3[1]
244	vdup.32		q6, d3[0]
245	vdup.32		q5, d2[1]
246	vdup.32		q4, d2[0]
247	vdup.32		q3, d1[1]
248	vdup.32		q2, d1[0]
249	vdup.32		q1, d0[1]
250	vdup.32		q0, d0[0]
251
	// ip = rotate-by-8 index table for the whole loop.  On the first pass
	// x8/x9 are still live in q8-q9, so the reload at the loop head is
	// skipped.
252	adr		ip, .Lrol8_table
253	b		1f
254
255.Ldoubleround4:
	// All 16 q registers hold state, so q8-q9 double as temporaries; x8/x9
	// live in the stack buffer in between and are reloaded here.
256	vld1.32		{q8-q9}, [sp, :256]
2571:
258	// x0 += x4, x12 = rotl32(x12 ^ x0, 16)
259	// x1 += x5, x13 = rotl32(x13 ^ x1, 16)
260	// x2 += x6, x14 = rotl32(x14 ^ x2, 16)
261	// x3 += x7, x15 = rotl32(x15 ^ x3, 16)
262	vadd.i32	q0, q0, q4
263	vadd.i32	q1, q1, q5
264	vadd.i32	q2, q2, q6
265	vadd.i32	q3, q3, q7
266
267	veor		q12, q12, q0
268	veor		q13, q13, q1
269	veor		q14, q14, q2
270	veor		q15, q15, q3
271
272	vrev32.16	q12, q12
273	vrev32.16	q13, q13
274	vrev32.16	q14, q14
275	vrev32.16	q15, q15
276
277	// x8 += x12, x4 = rotl32(x4 ^ x8, 12)
278	// x9 += x13, x5 = rotl32(x5 ^ x9, 12)
279	// x10 += x14, x6 = rotl32(x6 ^ x10, 12)
280	// x11 += x15, x7 = rotl32(x7 ^ x11, 12)
281	vadd.i32	q8, q8, q12
282	vadd.i32	q9, q9, q13
283	vadd.i32	q10, q10, q14
284	vadd.i32	q11, q11, q15
285
	// Spill the updated x8/x9 so q8-q9 can serve as rotate temporaries.
286	vst1.32		{q8-q9}, [sp, :256]
287
288	veor		q8, q4, q8
289	veor		q9, q5, q9
290	vshl.u32	q4, q8, #12
291	vshl.u32	q5, q9, #12
292	vsri.u32	q4, q8, #20
293	vsri.u32	q5, q9, #20
294
295	veor		q8, q6, q10
296	veor		q9, q7, q11
297	vshl.u32	q6, q8, #12
298	vshl.u32	q7, q9, #12
299	vsri.u32	q6, q8, #20
300	vsri.u32	q7, q9, #20
301
302	// x0 += x4, x12 = rotl32(x12 ^ x0, 8)
303	// x1 += x5, x13 = rotl32(x13 ^ x1, 8)
304	// x2 += x6, x14 = rotl32(x14 ^ x2, 8)
305	// x3 += x7, x15 = rotl32(x15 ^ x3, 8)
	// d16 (low half of the currently free q8) receives the rotate-by-8
	// index vector for the vtbl.8 instructions below.
306	vld1.8		{d16}, [ip, :64]
307	vadd.i32	q0, q0, q4
308	vadd.i32	q1, q1, q5
309	vadd.i32	q2, q2, q6
310	vadd.i32	q3, q3, q7
311
312	veor		q12, q12, q0
313	veor		q13, q13, q1
314	veor		q14, q14, q2
315	veor		q15, q15, q3
316
317	vtbl.8		d24, {d24}, d16
318	vtbl.8		d25, {d25}, d16
319	vtbl.8		d26, {d26}, d16
320	vtbl.8		d27, {d27}, d16
321	vtbl.8		d28, {d28}, d16
322	vtbl.8		d29, {d29}, d16
323	vtbl.8		d30, {d30}, d16
324	vtbl.8		d31, {d31}, d16
325
	// Bring x8/x9 back; this also discards the index vector in d16.
326	vld1.32		{q8-q9}, [sp, :256]
327
328	// x8 += x12, x4 = rotl32(x4 ^ x8, 7)
329	// x9 += x13, x5 = rotl32(x5 ^ x9, 7)
330	// x10 += x14, x6 = rotl32(x6 ^ x10, 7)
331	// x11 += x15, x7 = rotl32(x7 ^ x11, 7)
332	vadd.i32	q8, q8, q12
333	vadd.i32	q9, q9, q13
334	vadd.i32	q10, q10, q14
335	vadd.i32	q11, q11, q15
336
337	vst1.32		{q8-q9}, [sp, :256]
338
339	veor		q8, q4, q8
340	veor		q9, q5, q9
341	vshl.u32	q4, q8, #7
342	vshl.u32	q5, q9, #7
343	vsri.u32	q4, q8, #25
344	vsri.u32	q5, q9, #25
345
346	veor		q8, q6, q10
347	veor		q9, q7, q11
348	vshl.u32	q6, q8, #7
349	vshl.u32	q7, q9, #7
350	vsri.u32	q6, q8, #25
351	vsri.u32	q7, q9, #25
352
353	vld1.32		{q8-q9}, [sp, :256]
354
	// Second round of the pair: the same four steps, but pairing the words
	// as (0,5,10,15) (1,6,11,12) (2,7,8,13) (3,4,9,14).  Only the register
	// operands change; no data is shuffled.
355	// x0 += x5, x15 = rotl32(x15 ^ x0, 16)
356	// x1 += x6, x12 = rotl32(x12 ^ x1, 16)
357	// x2 += x7, x13 = rotl32(x13 ^ x2, 16)
358	// x3 += x4, x14 = rotl32(x14 ^ x3, 16)
359	vadd.i32	q0, q0, q5
360	vadd.i32	q1, q1, q6
361	vadd.i32	q2, q2, q7
362	vadd.i32	q3, q3, q4
363
364	veor		q15, q15, q0
365	veor		q12, q12, q1
366	veor		q13, q13, q2
367	veor		q14, q14, q3
368
369	vrev32.16	q15, q15
370	vrev32.16	q12, q12
371	vrev32.16	q13, q13
372	vrev32.16	q14, q14
373
374	// x10 += x15, x5 = rotl32(x5 ^ x10, 12)
375	// x11 += x12, x6 = rotl32(x6 ^ x11, 12)
376	// x8 += x13, x7 = rotl32(x7 ^ x8, 12)
377	// x9 += x14, x4 = rotl32(x4 ^ x9, 12)
378	vadd.i32	q10, q10, q15
379	vadd.i32	q11, q11, q12
380	vadd.i32	q8, q8, q13
381	vadd.i32	q9, q9, q14
382
383	vst1.32		{q8-q9}, [sp, :256]
384
385	veor		q8, q7, q8
386	veor		q9, q4, q9
387	vshl.u32	q7, q8, #12
388	vshl.u32	q4, q9, #12
389	vsri.u32	q7, q8, #20
390	vsri.u32	q4, q9, #20
391
392	veor		q8, q5, q10
393	veor		q9, q6, q11
394	vshl.u32	q5, q8, #12
395	vshl.u32	q6, q9, #12
396	vsri.u32	q5, q8, #20
397	vsri.u32	q6, q9, #20
398
399	// x0 += x5, x15 = rotl32(x15 ^ x0, 8)
400	// x1 += x6, x12 = rotl32(x12 ^ x1, 8)
401	// x2 += x7, x13 = rotl32(x13 ^ x2, 8)
402	// x3 += x4, x14 = rotl32(x14 ^ x3, 8)
403	vld1.8		{d16}, [ip, :64]
404	vadd.i32	q0, q0, q5
405	vadd.i32	q1, q1, q6
406	vadd.i32	q2, q2, q7
407	vadd.i32	q3, q3, q4
408
409	veor		q15, q15, q0
410	veor		q12, q12, q1
411	veor		q13, q13, q2
412	veor		q14, q14, q3
413
414	vtbl.8		d30, {d30}, d16
415	vtbl.8		d31, {d31}, d16
416	vtbl.8		d24, {d24}, d16
417	vtbl.8		d25, {d25}, d16
418	vtbl.8		d26, {d26}, d16
419	vtbl.8		d27, {d27}, d16
420	vtbl.8		d28, {d28}, d16
421	vtbl.8		d29, {d29}, d16
422
423	vld1.32		{q8-q9}, [sp, :256]
424
425	// x10 += x15, x5 = rotl32(x5 ^ x10, 7)
426	// x11 += x12, x6 = rotl32(x6 ^ x11, 7)
427	// x8 += x13, x7 = rotl32(x7 ^ x8, 7)
428	// x9 += x14, x4 = rotl32(x4 ^ x9, 7)
429	vadd.i32	q10, q10, q15
430	vadd.i32	q11, q11, q12
431	vadd.i32	q8, q8, q13
432	vadd.i32	q9, q9, q14
433
434	vst1.32		{q8-q9}, [sp, :256]
435
436	veor		q8, q7, q8
437	veor		q9, q4, q9
438	vshl.u32	q7, q8, #7
439	vshl.u32	q4, q9, #7
440	vsri.u32	q7, q8, #25
441	vsri.u32	q4, q9, #25
442
443	veor		q8, q5, q10
444	veor		q9, q6, q11
445	vshl.u32	q5, q8, #7
446	vshl.u32	q6, q9, #7
447	vsri.u32	q5, q8, #25
448	vsri.u32	q6, q9, #25
449
	// Two rounds per pass; the loop exits only when r3 reaches exactly 0.
450	subs		r3, r3, #2
451	bne		.Ldoubleround4
452
453	// x0..7[0-3] are in q0-q7, x10..15[0-3] are in q10-q15.
454	// x8..9[0-3] are on the stack.
455
456	// Re-interleave the words in the first two rows of each block (x0..7).
457	// Also add the counter values 0-3 to x12[0-3].
	// (The state row added further down comes straight from memory and so
	// carries the same counter for all four blocks; adding 0-3 here makes
	// up for the per-block counter used at the start.)
	// Lines indented by two extra spaces belong to a different logical
	// step than their neighbours - presumably interleaved for scheduling.
458	  vld1.32	{q8}, [r5, :128]	// load counter values 0-3
459	vzip.32		q0, q1			// => (0 1 0 1) (0 1 0 1)
460	vzip.32		q2, q3			// => (2 3 2 3) (2 3 2 3)
461	vzip.32		q4, q5			// => (4 5 4 5) (4 5 4 5)
462	vzip.32		q6, q7			// => (6 7 6 7) (6 7 6 7)
463	  vadd.u32	q12, q8			// x12 += counter values 0-3
464	vswp		d1, d4
465	vswp		d3, d6
466	  vld1.32	{q8-q9}, [r0]!		// load s0..7
467	vswp		d9, d12
468	vswp		d11, d14
469
470	// Swap q1 and q4 so that we'll free up consecutive registers (q0-q1)
471	// after XORing the first 32 bytes.
472	vswp		q1, q4
473
474	// First two rows of each block are (q0 q1) (q2 q6) (q4 q5) (q3 q7)
475
476	// x0..3[0-3] += s0..3[0-3]	(add orig state to 1st row of each block)
477	vadd.u32	q0, q0, q8
478	vadd.u32	q2, q2, q8
479	vadd.u32	q4, q4, q8
480	vadd.u32	q3, q3, q8
481
482	// x4..7[0-3] += s4..7[0-3]	(add orig state to 2nd row of each block)
483	vadd.u32	q1, q1, q9
484	vadd.u32	q6, q6, q9
485	vadd.u32	q5, q5, q9
486	vadd.u32	q7, q7, q9
487
488	// XOR first 32 bytes using keystream from first two rows of first block
489	vld1.8		{q8-q9}, [r2]!
490	veor		q8, q8, q0
491	veor		q9, q9, q1
492	vst1.8		{q8-q9}, [r1]!
493
494	// Re-interleave the words in the last two rows of each block (x8..15).
	// This is the last read of the spill buffer.
495	vld1.32		{q8-q9}, [sp, :256]
 
 
496	vzip.32		q12, q13	// => (12 13 12 13) (12 13 12 13)
497	vzip.32		q14, q15	// => (14 15 14 15) (14 15 14 15)
498	vzip.32		q8, q9		// => (8 9 8 9) (8 9 8 9)
499	vzip.32		q10, q11	// => (10 11 10 11) (10 11 10 11)
500	  vld1.32	{q0-q1}, [r0]	// load s8..15
501	vswp		d25, d28
502	vswp		d27, d30
503	vswp		d17, d20
504	vswp		d19, d22
505
506	// Last two rows of each block are (q8 q12) (q10 q14) (q9 q13) (q11 q15)
507
508	// x8..11[0-3] += s8..11[0-3]	(add orig state to 3rd row of each block)
509	vadd.u32	q8,  q8,  q0
510	vadd.u32	q10, q10, q0
511	vadd.u32	q9,  q9,  q0
512	vadd.u32	q11, q11, q0
513
514	// x12..15[0-3] += s12..15[0-3] (add orig state to 4th row of each block)
515	vadd.u32	q12, q12, q1
516	vadd.u32	q14, q14, q1
517	vadd.u32	q13, q13, q1
518	vadd.u32	q15, q15, q1
519
520	// XOR the rest of the data with the keystream
	// Seven more 32-byte chunks follow unconditionally, each one loaded
	// into q0-q1, XORed with the matching pair of keystream registers and
	// stored.
521
522	vld1.8		{q0-q1}, [r2]!
 
523	veor		q0, q0, q8
524	veor		q1, q1, q12
 
525	vst1.8		{q0-q1}, [r1]!
526
527	vld1.8		{q0-q1}, [r2]!
 
528	veor		q0, q0, q2
529	veor		q1, q1, q6
 
530	vst1.8		{q0-q1}, [r1]!
531
532	vld1.8		{q0-q1}, [r2]!
 
533	veor		q0, q0, q10
534	veor		q1, q1, q14
 
535	vst1.8		{q0-q1}, [r1]!
536
537	vld1.8		{q0-q1}, [r2]!
 
538	veor		q0, q0, q4
539	veor		q1, q1, q5
 
540	vst1.8		{q0-q1}, [r1]!
541
542	vld1.8		{q0-q1}, [r2]!
 
543	veor		q0, q0, q9
544	veor		q1, q1, q13
 
545	vst1.8		{q0-q1}, [r1]!
546
547	vld1.8		{q0-q1}, [r2]!
 
548	veor		q0, q0, q3
549	veor		q1, q1, q7
 
 
550	vst1.8		{q0-q1}, [r1]!
551
	// Last chunk; sp is put back before r4/r5 are popped below.
552	vld1.8		{q0-q1}, [r2]
553	  mov		sp, r4		// restore original stack pointer
554	veor		q0, q0, q11
555	veor		q1, q1, q15
556	vst1.8		{q0-q1}, [r1]
557
558	pop		{r4-r5}
559	bx		lr
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
560ENDPROC(chacha_4block_xor_neon)