/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Copyright (C) 2018 Google, Inc.
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

/*
 * Design notes:
 *
 * 16 registers would be needed to hold the state matrix, but only 14 are
 * available because 'sp' and 'pc' cannot be used. So we spill the elements
 * (x8, x9) to the stack and swap them out with (x10, x11). This adds one
 * 'ldrd' and one 'strd' instruction per round.
 *
 * All rotates are performed using the implicit rotate operand accepted by the
 * 'add' and 'eor' instructions. This is faster than using explicit rotate
 * instructions. To make this work, we allow the values in the second and last
 * rows of the ChaCha state matrix (rows 'b' and 'd') to temporarily have the
 * wrong rotation amount. The rotation amount is then fixed up just in time
 * when the values are used. 'brot' is the number of bits the values in row 'b'
 * need to be rotated right to arrive at the correct values, and 'drot'
 * similarly for row 'd'. (brot, drot) start out as (0, 0) but we make it such
 * that they end up as (25, 24) after every round.
 */
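
/*
 * For reference, one ChaCha quarter-round on (a, b, c, d) is, in rough C
 * (a sketch only; the macros below interleave two of these at a time and
 * fold the rotations into the shifted operands of 'add' and 'eor'):
 *
 *	a += b; d ^= a; d = rol32(d, 16);
 *	c += d; b ^= c; b = rol32(b, 12);
 *	a += b; d ^= a; d = rol32(d, 8);
 *	c += d; b ^= c; b = rol32(b, 7);
 *
 * Instead of rotating d left by 16, the code leaves the result rotated right
 * by drot = 32 - 16 and applies 'ror #drot' the next time d is read, and
 * likewise for b with brot.
 */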

	// ChaCha state registers
	X0	.req	r0
	X1	.req	r1
	X2	.req	r2
	X3	.req	r3
	X4	.req	r4
	X5	.req	r5
	X6	.req	r6
	X7	.req	r7
	X8_X10	.req	r8	// shared by x8 and x10
	X9_X11	.req	r9	// shared by x9 and x11
	X12	.req	r10
	X13	.req	r11
	X14	.req	r12
	X15	.req	r14
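
/*
 * These aliases map onto the 4x4 ChaCha state matrix, with row 'a' = x0-x3,
 * row 'b' = x4-x7, row 'c' = x8-x11 and row 'd' = x12-x15; x8/x10 and x9/x11
 * time-share r8/r9 as described in the design notes above:
 *
 *	x0  x1  x2  x3
 *	x4  x5  x6  x7
 *	x8  x9  x10 x11
 *	x12 x13 x14 x15
 */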

.macro __rev		out, in,  t0, t1, t2
.if __LINUX_ARM_ARCH__ >= 6
	rev		\out, \in
.else
	lsl		\t0, \in, #24
	and		\t1, \in, #0xff00
	and		\t2, \in, #0xff0000
	orr		\out, \t0, \in, lsr #24
	orr		\out, \out, \t1, lsl #8
	orr		\out, \out, \t2, lsr #8
.endif
.endm
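
/*
 * ChaCha's keystream is defined in little-endian byte order.  _le32_bswap is
 * a no-op on little-endian builds; on big-endian (__ARMEB__) builds it
 * byte-swaps the word using __rev above, which uses 'rev' on ARMv6+ and the
 * shift-and-mask fallback on older CPUs.
 */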

.macro _le32_bswap	x, t0, t1, t2
#ifdef __ARMEB__
	__rev		\x, \x, \t0, \t1, \t2
#endif
.endm

.macro _le32_bswap_4x	a, b, c, d, t0, t1, t2
	_le32_bswap	\a, \t0, \t1, \t2
	_le32_bswap	\b, \t0, \t1, \t2
	_le32_bswap	\c, \t0, \t1, \t2
	_le32_bswap	\d, \t0, \t1, \t2
.endm

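/*
 * Load/store a pair of adjacent state words: a single ldrd/strd when building
 * for ARMv6 or later, or two ldr/str instructions otherwise.
 */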
.macro __ldrd		a, b, src, offset
#if __LINUX_ARM_ARCH__ >= 6
	ldrd		\a, \b, [\src, #\offset]
#else
	ldr		\a, [\src, #\offset]
	ldr		\b, [\src, #\offset + 4]
#endif
.endm

.macro __strd		a, b, dst, offset
#if __LINUX_ARM_ARCH__ >= 6
	strd		\a, \b, [\dst, #\offset]
#else
	str		\a, [\dst, #\offset]
	str		\b, [\dst, #\offset + 4]
#endif
.endm

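/*
 * _halfround performs half of a column or diagonal round: two independent
 * quarter-rounds, with their instructions interleaved.  Rows 'b' and 'd'
 * enter rotated right by (brot, drot) and leave rotated right by (25, 24);
 * all rotations are folded into the shifted operands of 'add' and 'eor'.
 */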
.macro _halfround	a1, b1, c1, d1,  a2, b2, c2, d2

	// a += b; d ^= a; d = rol(d, 16);
	add		\a1, \a1, \b1, ror #brot
	add		\a2, \a2, \b2, ror #brot
	eor		\d1, \a1, \d1, ror #drot
	eor		\d2, \a2, \d2, ror #drot
	// drot == 32 - 16 == 16

	// c += d; b ^= c; b = rol(b, 12);
	add		\c1, \c1, \d1, ror #16
	add		\c2, \c2, \d2, ror #16
	eor		\b1, \c1, \b1, ror #brot
	eor		\b2, \c2, \b2, ror #brot
	// brot == 32 - 12 == 20

	// a += b; d ^= a; d = rol(d, 8);
	add		\a1, \a1, \b1, ror #20
	add		\a2, \a2, \b2, ror #20
	eor		\d1, \a1, \d1, ror #16
	eor		\d2, \a2, \d2, ror #16
	// drot == 32 - 8 == 24

	// c += d; b ^= c; b = rol(b, 7);
	add		\c1, \c1, \d1, ror #24
	add		\c2, \c2, \d2, ror #24
	eor		\b1, \c1, \b1, ror #20
	eor		\b2, \c2, \b2, ror #20
	// brot == 32 - 7 == 25
.endm

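/*
 * One ChaCha double round: a column round (quarter-rounds down the four
 * columns) followed by a diagonal round (quarter-rounds along the four
 * diagonals).  Because x8-x11 share two registers, each half of a round
 * processes the quarter-rounds involving (x8, x9) first, then swaps in
 * (x10, x11) from the stack for the remaining two.
 */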
.macro _doubleround

	// column round

	// quarterrounds: (x0, x4, x8, x12) and (x1, x5, x9, x13)
	_halfround	X0, X4, X8_X10, X12,  X1, X5, X9_X11, X13

	// save (x8, x9); restore (x10, x11)
	__strd		X8_X10, X9_X11, sp, 0
	__ldrd		X8_X10, X9_X11, sp, 8

	// quarterrounds: (x2, x6, x10, x14) and (x3, x7, x11, x15)
	_halfround	X2, X6, X8_X10, X14,  X3, X7, X9_X11, X15

	.set brot, 25
	.set drot, 24

	// diagonal round

	// quarterrounds: (x0, x5, x10, x15) and (x1, x6, x11, x12)
	_halfround	X0, X5, X8_X10, X15,  X1, X6, X9_X11, X12

	// save (x10, x11); restore (x8, x9)
	__strd		X8_X10, X9_X11, sp, 8
	__ldrd		X8_X10, X9_X11, sp, 0

	// quarterrounds: (x2, x7, x8, x13) and (x3, x4, x9, x14)
	_halfround	X2, X7, X8_X10, X13,  X3, X4, X9_X11, X14
.endm

.macro _chacha_permute	nrounds
	.set brot, 0
	.set drot, 0
	.rept \nrounds / 2
	 _doubleround
	.endr
.endm

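/*
 * _chacha processes the data in 64-byte blocks: permute a copy of the state
 * with _chacha_permute, add the original state back in, byte-swap the words
 * if needed, and XOR the resulting keystream with the input.  Because the
 * permutation leaves rows 'b' and 'd' rotated right by (brot, drot) =
 * (25, 24), the additions below apply 'ror #brot' / 'ror #drot' to fix the
 * values up as they are consumed.  Aligned, full 64-byte blocks take the
 * ldmia/stmia fast path; a final partial block, or misaligned buffers, go
 * through the slow path, which writes the keystream to the stack first.
 */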
.macro _chacha		nrounds

.Lnext_block\@:
	// Stack: unused0-unused1 x10-x11 x0-x15 OUT IN LEN
	// Registers contain x0-x9,x12-x15.

	// Do the core ChaCha permutation to update x0-x15.
	_chacha_permute	\nrounds

	add		sp, #8
	// Stack: x10-x11 orig_x0-orig_x15 OUT IN LEN
	// Registers contain x0-x9,x12-x15.
	// x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.

	// Free up some registers (r8-r12,r14) by pushing (x8-x9,x12-x15).
	push		{X8_X10, X9_X11, X12, X13, X14, X15}

	// Load (OUT, IN, LEN).
	ldr		r14, [sp, #96]		// OUT
	ldr		r12, [sp, #100]		// IN
	ldr		r11, [sp, #104]		// LEN

	orr		r10, r14, r12		// r10 = OUT | IN, for the alignment check below

	// Use slow path if fewer than 64 bytes remain.
	cmp		r11, #64
	blt		.Lxor_slowpath\@

	// Use slow path if IN and/or OUT isn't 4-byte aligned.  Needed even on
	// ARMv6+, since ldmia and stmia (used below) still require alignment.
	tst		r10, #3
	bne		.Lxor_slowpath\@

	// Fast path: XOR 64 bytes of aligned data.

	// Stack: x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN
	// Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is OUT.
	// x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.

	// x0-x3
	__ldrd		r8, r9, sp, 32
	__ldrd		r10, r11, sp, 40
	add		X0, X0, r8
	add		X1, X1, r9
	add		X2, X2, r10
	add		X3, X3, r11
	_le32_bswap_4x	X0, X1, X2, X3,  r8, r9, r10
	ldmia		r12!, {r8-r11}
	eor		X0, X0, r8
	eor		X1, X1, r9
	eor		X2, X2, r10
	eor		X3, X3, r11
	stmia		r14!, {X0-X3}

	// x4-x7
	__ldrd		r8, r9, sp, 48
	__ldrd		r10, r11, sp, 56
	add		X4, r8, X4, ror #brot
	add		X5, r9, X5, ror #brot
	ldmia		r12!, {X0-X3}
	add		X6, r10, X6, ror #brot
	add		X7, r11, X7, ror #brot
	_le32_bswap_4x	X4, X5, X6, X7,  r8, r9, r10
	eor		X4, X4, X0
	eor		X5, X5, X1
	eor		X6, X6, X2
	eor		X7, X7, X3
	stmia		r14!, {X4-X7}

	// x8-x15
	pop		{r0-r7}			// (x8-x9,x12-x15,x10-x11)
	__ldrd		r8, r9, sp, 32
	__ldrd		r10, r11, sp, 40
	add		r0, r0, r8		// x8
	add		r1, r1, r9		// x9
	add		r6, r6, r10		// x10
	add		r7, r7, r11		// x11
	_le32_bswap_4x	r0, r1, r6, r7,  r8, r9, r10
	ldmia		r12!, {r8-r11}
	eor		r0, r0, r8		// x8
	eor		r1, r1, r9		// x9
	eor		r6, r6, r10		// x10
	eor		r7, r7, r11		// x11
	stmia		r14!, {r0,r1,r6,r7}
	ldmia		r12!, {r0,r1,r6,r7}
	__ldrd		r8, r9, sp, 48
	__ldrd		r10, r11, sp, 56
	add		r2, r8, r2, ror #drot	// x12
	add		r3, r9, r3, ror #drot	// x13
	add		r4, r10, r4, ror #drot	// x14
	add		r5, r11, r5, ror #drot	// x15
	_le32_bswap_4x	r2, r3, r4, r5,  r9, r10, r11
	ldr		r9, [sp, #72]		// load LEN
	eor		r2, r2, r0		// x12
	eor		r3, r3, r1		// x13
	eor		r4, r4, r6		// x14
	eor		r5, r5, r7		// x15
	subs		r9, #64			// decrement and check LEN
	stmia		r14!, {r2-r5}

	beq		.Ldone\@

.Lprepare_for_next_block\@:

	// Stack: x0-x15 OUT IN LEN

	// Increment block counter (x12)
	add		r8, #1

	// Store updated (OUT, IN, LEN)
	str		r14, [sp, #64]
	str		r12, [sp, #68]
	str		r9, [sp, #72]

	mov		r14, sp			// r14 = &x0 (saved state)

	// Store updated block counter (x12)
	str		r8, [sp, #48]

	sub		sp, #16			// re-allocate (unused0-unused1, x10-x11) stack slots

	// Reload state and do next block
	ldmia		r14!, {r0-r11}		// load x0-x11
	__strd		r10, r11, sp, 8		// store x10-x11 before state
	ldmia		r14, {r10-r12,r14}	// load x12-x15
	b		.Lnext_block\@

.Lxor_slowpath\@:
	// Slow path: < 64 bytes remaining, or unaligned input or output buffer.
	// We handle it by storing the 64 bytes of keystream to the stack, then
	// XOR-ing the needed portion with the data.

	// Allocate keystream buffer
	sub		sp, #64
	mov		r14, sp

	// Stack: ks0-ks15 x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN
	// Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is &ks0.
	// x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.

	// Save keystream for x0-x3
	__ldrd		r8, r9, sp, 96
	__ldrd		r10, r11, sp, 104
	add		X0, X0, r8
	add		X1, X1, r9
	add		X2, X2, r10
	add		X3, X3, r11
	_le32_bswap_4x	X0, X1, X2, X3,  r8, r9, r10
	stmia		r14!, {X0-X3}

	// Save keystream for x4-x7
	__ldrd		r8, r9, sp, 112
	__ldrd		r10, r11, sp, 120
	add		X4, r8, X4, ror #brot
	add		X5, r9, X5, ror #brot
	add		X6, r10, X6, ror #brot
	add		X7, r11, X7, ror #brot
	_le32_bswap_4x	X4, X5, X6, X7,  r8, r9, r10
	add		r8, sp, #64		// r8 = &saved (x8-x9,x12-x15,x10-x11)
	stmia		r14!, {X4-X7}

	// Save keystream for x8-x15
	ldm		r8, {r0-r7}		// (x8-x9,x12-x15,x10-x11)
	__ldrd		r8, r9, sp, 128
	__ldrd		r10, r11, sp, 136
	add		r0, r0, r8		// x8
	add		r1, r1, r9		// x9
	add		r6, r6, r10		// x10
	add		r7, r7, r11		// x11
	_le32_bswap_4x	r0, r1, r6, r7,  r8, r9, r10
	stmia		r14!, {r0,r1,r6,r7}
	__ldrd		r8, r9, sp, 144
	__ldrd		r10, r11, sp, 152
	add		r2, r8, r2, ror #drot	// x12
	add		r3, r9, r3, ror #drot	// x13
	add		r4, r10, r4, ror #drot	// x14
	add		r5, r11, r5, ror #drot	// x15
	_le32_bswap_4x	r2, r3, r4, r5,  r9, r10, r11
	stmia		r14, {r2-r5}

	// Stack: ks0-ks15 unused0-unused7 x0-x15 OUT IN LEN
	// Registers: r8 is block counter, r12 is IN.

	ldr		r9, [sp, #168]		// LEN
	ldr		r14, [sp, #160]		// OUT
	cmp		r9, #64
	mov		r0, sp			// r0 = &ks0 (keystream)
	movle		r1, r9
	movgt		r1, #64
	// r1 is number of bytes to XOR, in range [1, 64]
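	// The unrolled loop below XORs the keystream with the data a word at a
	// time, dropping to the byte-at-a-time loop for any 1-3 trailing bytes,
	// or for the whole chunk on pre-ARMv6 builds with misaligned buffers.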

.if __LINUX_ARM_ARCH__ < 6
	orr		r2, r12, r14
	tst		r2, #3			// IN or OUT misaligned?
	bne		.Lxor_next_byte\@
.endif

	// XOR a word at a time
.rept 16
	subs		r1, #4
	blt		.Lxor_words_done\@
	ldr		r2, [r12], #4
	ldr		r3, [r0], #4
	eor		r2, r2, r3
	str		r2, [r14], #4
.endr
	b		.Lxor_slowpath_done\@
.Lxor_words_done\@:
	ands		r1, r1, #3
	beq		.Lxor_slowpath_done\@

	// XOR a byte at a time
.Lxor_next_byte\@:
	ldrb		r2, [r12], #1
	ldrb		r3, [r0], #1
	eor		r2, r2, r3
	strb		r2, [r14], #1
	subs		r1, #1
	bne		.Lxor_next_byte\@

.Lxor_slowpath_done\@:
	subs		r9, #64
	add		sp, #96
	bgt		.Lprepare_for_next_block\@

.Ldone\@:
.endm	// _chacha

/*
 * void chacha_doarm(u8 *dst, const u8 *src, unsigned int bytes,
 *		     const u32 *state, int nrounds);
 */
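/*
 * r0 = dst, r1 = src, r2 = length in bytes, r3 = state; the fifth argument,
 * nrounds, is passed on the stack.  nrounds == 12 selects ChaCha12; any other
 * value runs the 20-round permutation.  The state array is only read here:
 * the block counter is advanced on a stack copy, not written back to 'state'.
 */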
ENTRY(chacha_doarm)
	cmp		r2, #0			// len == 0?
	reteq		lr

	ldr		ip, [sp]		// ip = nrounds (fifth argument, on the stack)
	cmp		ip, #12			// ChaCha12 ?

	push		{r0-r2,r4-r11,lr}

	// Push state x0-x15 onto stack.
	// Also store an extra copy of x10-x11 just before the state.

	add		X12, r3, #48
	ldm		X12, {X12,X13,X14,X15}
	push		{X12,X13,X14,X15}
	sub		sp, sp, #64

	__ldrd		X8_X10, X9_X11, r3, 40
	__strd		X8_X10, X9_X11, sp, 8
	__strd		X8_X10, X9_X11, sp, 56
	ldm		r3, {X0-X9_X11}
	__strd		X0, X1, sp, 16
	__strd		X2, X3, sp, 24
	__strd		X4, X5, sp, 32
	__strd		X6, X7, sp, 40
	__strd		X8_X10, X9_X11, sp, 48

	beq		1f			// nrounds == 12?
	_chacha		20

0:	add		sp, #76
	pop		{r4-r11, pc}

1:	_chacha		12
	b		0b
ENDPROC(chacha_doarm)

/*
 * void hchacha_block_arm(const u32 state[16], u32 out[8], int nrounds);
 */
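/*
 * HChaCha runs the ChaCha permutation over the state but skips the final
 * feed-forward addition, and writes only words 0-3 and 12-15 of the result
 * to 'out' (the construction typically used to derive an XChaCha subkey).
 * r0 = state, r1 = out, r2 = nrounds (12 for ChaCha12, otherwise 20 rounds).
 */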
ENTRY(hchacha_block_arm)
	push		{r1,r4-r11,lr}

	cmp		r2, #12			// ChaCha12 ?

	mov		r14, r0
	ldmia		r14!, {r0-r11}		// load x0-x11
	push		{r10-r11}		// store x10-x11 to stack
	ldm		r14, {r10-r12,r14}	// load x12-x15
	sub		sp, #8			// make room for the x8-x9 spill slot

	beq		1f
	_chacha_permute	20

	// Skip over (unused0-unused1, x10-x11)
0:	add		sp, #16

	// Fix up rotations of x12-x15
	ror		X12, X12, #drot
	ror		X13, X13, #drot
	pop		{r4}			// load 'out'
	ror		X14, X14, #drot
	ror		X15, X15, #drot

	// Store (x0-x3,x12-x15) to 'out'
	stm		r4, {X0,X1,X2,X3,X12,X13,X14,X15}

	pop		{r4-r11,pc}

1:	_chacha_permute	12
	b		0b
ENDPROC(hchacha_block_arm)