1/* SPDX-License-Identifier: GPL-2.0-or-later */
2/*
3 * ARIA Cipher 16-way parallel algorithm (AVX)
4 *
5 * Copyright (c) 2022 Taehee Yoo <ap420073@gmail.com>
6 *
7 */
8
9#include <linux/linkage.h>
10#include <linux/cfi_types.h>
11#include <asm/asm-offsets.h>
12#include <asm/frame.h>
13
14/* register macros */
15#define CTX %rdi
16
17
18#define BV8(a0, a1, a2, a3, a4, a5, a6, a7) \
19 ( (((a0) & 1) << 0) | \
20 (((a1) & 1) << 1) | \
21 (((a2) & 1) << 2) | \
22 (((a3) & 1) << 3) | \
23 (((a4) & 1) << 4) | \
24 (((a5) & 1) << 5) | \
25 (((a6) & 1) << 6) | \
26 (((a7) & 1) << 7) )
27
28#define BM8X8(l0, l1, l2, l3, l4, l5, l6, l7) \
29 ( ((l7) << (0 * 8)) | \
30 ((l6) << (1 * 8)) | \
31 ((l5) << (2 * 8)) | \
32 ((l4) << (3 * 8)) | \
33 ((l3) << (4 * 8)) | \
34 ((l2) << (5 * 8)) | \
35 ((l1) << (6 * 8)) | \
36 ((l0) << (7 * 8)) )
37
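/*
 * inc_le128: add 1 to a 128-bit little-endian counter held in an XMM
 * register.  minus_one must hold -1 in the low qword and 0 in the high
 * qword; the vpcmpeqq/vpslldq pair propagates the carry into the high
 * qword when the low qword wraps.
 */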
38#define inc_le128(x, minus_one, tmp) \
39 vpcmpeqq minus_one, x, tmp; \
40 vpsubq minus_one, x, x; \
41 vpslldq $8, tmp, tmp; \
42 vpsubq tmp, x, x;
43
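/*
 * filter_8bit: apply a per-byte transform built from two 16-entry nibble
 * lookup tables: lo_t is indexed by the low nibble, hi_t by the high
 * nibble, and the results are XORed.  mask4bit must hold 0x0f in every
 * byte; tmp0 is clobbered.
 */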
44#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \
45 vpand x, mask4bit, tmp0; \
46 vpandn x, mask4bit, x; \
47 vpsrld $4, x, x; \
48 \
49 vpshufb tmp0, lo_t, tmp0; \
50 vpshufb x, hi_t, x; \
51 vpxor tmp0, x, x;
52
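/*
 * transpose_4x4: transpose a 4x4 matrix of 32-bit words spread over
 * x0..x3 (one row per register) using the unpack instructions; t1/t2
 * are clobbered.
 */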
53#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
54 vpunpckhdq x1, x0, t2; \
55 vpunpckldq x1, x0, x0; \
56 \
57 vpunpckldq x3, x2, t1; \
58 vpunpckhdq x3, x2, x2; \
59 \
60 vpunpckhqdq t1, x0, x1; \
61 vpunpcklqdq t1, x0, x0; \
62 \
63 vpunpckhqdq x2, t2, x3; \
64 vpunpcklqdq x2, t2, x2;
65
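/*
 * byteslice_16x16b: treat the 16 input registers as a 16x16 byte matrix
 * (one 16-byte block per register) and transpose it so that byte i of
 * every block ends up in the same register, using st0/st1 as spill
 * slots.  debyteslice_16x16b below converts back from byte-sliced form.
 */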
66#define byteslice_16x16b(a0, b0, c0, d0, \
67 a1, b1, c1, d1, \
68 a2, b2, c2, d2, \
69 a3, b3, c3, d3, \
70 st0, st1) \
71 vmovdqu d2, st0; \
72 vmovdqu d3, st1; \
73 transpose_4x4(a0, a1, a2, a3, d2, d3); \
74 transpose_4x4(b0, b1, b2, b3, d2, d3); \
75 vmovdqu st0, d2; \
76 vmovdqu st1, d3; \
77 \
78 vmovdqu a0, st0; \
79 vmovdqu a1, st1; \
80 transpose_4x4(c0, c1, c2, c3, a0, a1); \
81 transpose_4x4(d0, d1, d2, d3, a0, a1); \
82 \
83 vmovdqu .Lshufb_16x16b(%rip), a0; \
84 vmovdqu st1, a1; \
85 vpshufb a0, a2, a2; \
86 vpshufb a0, a3, a3; \
87 vpshufb a0, b0, b0; \
88 vpshufb a0, b1, b1; \
89 vpshufb a0, b2, b2; \
90 vpshufb a0, b3, b3; \
91 vpshufb a0, a1, a1; \
92 vpshufb a0, c0, c0; \
93 vpshufb a0, c1, c1; \
94 vpshufb a0, c2, c2; \
95 vpshufb a0, c3, c3; \
96 vpshufb a0, d0, d0; \
97 vpshufb a0, d1, d1; \
98 vpshufb a0, d2, d2; \
99 vpshufb a0, d3, d3; \
100 vmovdqu d3, st1; \
101 vmovdqu st0, d3; \
102 vpshufb a0, d3, a0; \
103 vmovdqu d2, st0; \
104 \
105 transpose_4x4(a0, b0, c0, d0, d2, d3); \
106 transpose_4x4(a1, b1, c1, d1, d2, d3); \
107 vmovdqu st0, d2; \
108 vmovdqu st1, d3; \
109 \
110 vmovdqu b0, st0; \
111 vmovdqu b1, st1; \
112 transpose_4x4(a2, b2, c2, d2, b0, b1); \
113 transpose_4x4(a3, b3, c3, d3, b0, b1); \
114 vmovdqu st0, b0; \
115 vmovdqu st1, b1; \
116 /* does not adjust output bytes inside vectors */
117
118#define debyteslice_16x16b(a0, b0, c0, d0, \
119 a1, b1, c1, d1, \
120 a2, b2, c2, d2, \
121 a3, b3, c3, d3, \
122 st0, st1) \
123 vmovdqu d2, st0; \
124 vmovdqu d3, st1; \
125 transpose_4x4(a0, a1, a2, a3, d2, d3); \
126 transpose_4x4(b0, b1, b2, b3, d2, d3); \
127 vmovdqu st0, d2; \
128 vmovdqu st1, d3; \
129 \
130 vmovdqu a0, st0; \
131 vmovdqu a1, st1; \
132 transpose_4x4(c0, c1, c2, c3, a0, a1); \
133 transpose_4x4(d0, d1, d2, d3, a0, a1); \
134 \
135 vmovdqu .Lshufb_16x16b(%rip), a0; \
136 vmovdqu st1, a1; \
137 vpshufb a0, a2, a2; \
138 vpshufb a0, a3, a3; \
139 vpshufb a0, b0, b0; \
140 vpshufb a0, b1, b1; \
141 vpshufb a0, b2, b2; \
142 vpshufb a0, b3, b3; \
143 vpshufb a0, a1, a1; \
144 vpshufb a0, c0, c0; \
145 vpshufb a0, c1, c1; \
146 vpshufb a0, c2, c2; \
147 vpshufb a0, c3, c3; \
148 vpshufb a0, d0, d0; \
149 vpshufb a0, d1, d1; \
150 vpshufb a0, d2, d2; \
151 vpshufb a0, d3, d3; \
152 vmovdqu d3, st1; \
153 vmovdqu st0, d3; \
154 vpshufb a0, d3, a0; \
155 vmovdqu d2, st0; \
156 \
157 transpose_4x4(c0, d0, a0, b0, d2, d3); \
158 transpose_4x4(c1, d1, a1, b1, d2, d3); \
159 vmovdqu st0, d2; \
160 vmovdqu st1, d3; \
161 \
162 vmovdqu b0, st0; \
163 vmovdqu b1, st1; \
164 transpose_4x4(c2, d2, a2, b2, b0, b1); \
165 transpose_4x4(c3, d3, a3, b3, b0, b1); \
166 vmovdqu st0, b0; \
167 vmovdqu st1, b1; \
168 /* does not adjust output bytes inside vectors */
169
170/* load blocks to registers and apply pre-whitening */
171#define inpack16_pre(x0, x1, x2, x3, \
172 x4, x5, x6, x7, \
173 y0, y1, y2, y3, \
174 y4, y5, y6, y7, \
175 rio) \
176 vmovdqu (0 * 16)(rio), x0; \
177 vmovdqu (1 * 16)(rio), x1; \
178 vmovdqu (2 * 16)(rio), x2; \
179 vmovdqu (3 * 16)(rio), x3; \
180 vmovdqu (4 * 16)(rio), x4; \
181 vmovdqu (5 * 16)(rio), x5; \
182 vmovdqu (6 * 16)(rio), x6; \
183 vmovdqu (7 * 16)(rio), x7; \
184 vmovdqu (8 * 16)(rio), y0; \
185 vmovdqu (9 * 16)(rio), y1; \
186 vmovdqu (10 * 16)(rio), y2; \
187 vmovdqu (11 * 16)(rio), y3; \
188 vmovdqu (12 * 16)(rio), y4; \
189 vmovdqu (13 * 16)(rio), y5; \
190 vmovdqu (14 * 16)(rio), y6; \
191 vmovdqu (15 * 16)(rio), y7;
192
193/* byteslice pre-whitened blocks and store to temporary memory */
194#define inpack16_post(x0, x1, x2, x3, \
195 x4, x5, x6, x7, \
196 y0, y1, y2, y3, \
197 y4, y5, y6, y7, \
198 mem_ab, mem_cd) \
199 byteslice_16x16b(x0, x1, x2, x3, \
200 x4, x5, x6, x7, \
201 y0, y1, y2, y3, \
202 y4, y5, y6, y7, \
203 (mem_ab), (mem_cd)); \
204 \
205 vmovdqu x0, 0 * 16(mem_ab); \
206 vmovdqu x1, 1 * 16(mem_ab); \
207 vmovdqu x2, 2 * 16(mem_ab); \
208 vmovdqu x3, 3 * 16(mem_ab); \
209 vmovdqu x4, 4 * 16(mem_ab); \
210 vmovdqu x5, 5 * 16(mem_ab); \
211 vmovdqu x6, 6 * 16(mem_ab); \
212 vmovdqu x7, 7 * 16(mem_ab); \
213 vmovdqu y0, 0 * 16(mem_cd); \
214 vmovdqu y1, 1 * 16(mem_cd); \
215 vmovdqu y2, 2 * 16(mem_cd); \
216 vmovdqu y3, 3 * 16(mem_cd); \
217 vmovdqu y4, 4 * 16(mem_cd); \
218 vmovdqu y5, 5 * 16(mem_cd); \
219 vmovdqu y6, 6 * 16(mem_cd); \
220 vmovdqu y7, 7 * 16(mem_cd);
221
222#define write_output(x0, x1, x2, x3, \
223 x4, x5, x6, x7, \
224 y0, y1, y2, y3, \
225 y4, y5, y6, y7, \
226 mem) \
227 vmovdqu x0, 0 * 16(mem); \
228 vmovdqu x1, 1 * 16(mem); \
229 vmovdqu x2, 2 * 16(mem); \
230 vmovdqu x3, 3 * 16(mem); \
231 vmovdqu x4, 4 * 16(mem); \
232 vmovdqu x5, 5 * 16(mem); \
233 vmovdqu x6, 6 * 16(mem); \
234 vmovdqu x7, 7 * 16(mem); \
235 vmovdqu y0, 8 * 16(mem); \
236 vmovdqu y1, 9 * 16(mem); \
237 vmovdqu y2, 10 * 16(mem); \
238 vmovdqu y3, 11 * 16(mem); \
239 vmovdqu y4, 12 * 16(mem); \
240 vmovdqu y5, 13 * 16(mem); \
241 vmovdqu y6, 14 * 16(mem); \
242 vmovdqu y7, 15 * 16(mem); \
243
244#define aria_store_state_8way(x0, x1, x2, x3, \
245 x4, x5, x6, x7, \
246 mem_tmp, idx) \
247 vmovdqu x0, ((idx + 0) * 16)(mem_tmp); \
248 vmovdqu x1, ((idx + 1) * 16)(mem_tmp); \
249 vmovdqu x2, ((idx + 2) * 16)(mem_tmp); \
250 vmovdqu x3, ((idx + 3) * 16)(mem_tmp); \
251 vmovdqu x4, ((idx + 4) * 16)(mem_tmp); \
252 vmovdqu x5, ((idx + 5) * 16)(mem_tmp); \
253 vmovdqu x6, ((idx + 6) * 16)(mem_tmp); \
254 vmovdqu x7, ((idx + 7) * 16)(mem_tmp);
255
256#define aria_load_state_8way(x0, x1, x2, x3, \
257 x4, x5, x6, x7, \
258 mem_tmp, idx) \
259 vmovdqu ((idx + 0) * 16)(mem_tmp), x0; \
260 vmovdqu ((idx + 1) * 16)(mem_tmp), x1; \
261 vmovdqu ((idx + 2) * 16)(mem_tmp), x2; \
262 vmovdqu ((idx + 3) * 16)(mem_tmp), x3; \
263 vmovdqu ((idx + 4) * 16)(mem_tmp), x4; \
264 vmovdqu ((idx + 5) * 16)(mem_tmp), x5; \
265 vmovdqu ((idx + 6) * 16)(mem_tmp), x6; \
266 vmovdqu ((idx + 7) * 16)(mem_tmp), x7;
267
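/*
 * aria_ark_8way: AddRoundKey for eight byte-sliced registers.  Each
 * vbroadcastss replicates a 32-bit word of the round key, vpsrld moves
 * the wanted key byte into byte 0, and vpshufb with t1 (which the
 * callers pre-zero) splats that byte across the register before it is
 * XORed into the state.
 */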
268#define aria_ark_8way(x0, x1, x2, x3, \
269 x4, x5, x6, x7, \
270 t0, t1, t2, rk, \
271 idx, round) \
272 /* AddRoundKey */ \
273 vbroadcastss ((round * 16) + idx + 0)(rk), t0; \
274 vpsrld $24, t0, t2; \
275 vpshufb t1, t2, t2; \
276 vpxor t2, x0, x0; \
277 vpsrld $16, t0, t2; \
278 vpshufb t1, t2, t2; \
279 vpxor t2, x1, x1; \
280 vpsrld $8, t0, t2; \
281 vpshufb t1, t2, t2; \
282 vpxor t2, x2, x2; \
283 vpshufb t1, t0, t2; \
284 vpxor t2, x3, x3; \
285 vbroadcastss ((round * 16) + idx + 4)(rk), t0; \
286 vpsrld $24, t0, t2; \
287 vpshufb t1, t2, t2; \
288 vpxor t2, x4, x4; \
289 vpsrld $16, t0, t2; \
290 vpshufb t1, t2, t2; \
291 vpxor t2, x5, x5; \
292 vpsrld $8, t0, t2; \
293 vpshufb t1, t2, t2; \
294 vpxor t2, x6, x6; \
295 vpshufb t1, t0, t2; \
296 vpxor t2, x7, x7;
297
298#ifdef CONFIG_AS_GFNI
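/*
 * GFNI variant of the S-box layer: each ARIA S-box (S1, S2 and their
 * inverses) is evaluated directly with vgf2p8affineqb/vgf2p8affineinvqb
 * using the 8x8 bit-matrices and constants defined below.
 */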
299#define aria_sbox_8way_gfni(x0, x1, x2, x3, \
300 x4, x5, x6, x7, \
301 t0, t1, t2, t3, \
302 t4, t5, t6, t7) \
303 vmovdqa .Ltf_s2_bitmatrix(%rip), t0; \
304 vmovdqa .Ltf_inv_bitmatrix(%rip), t1; \
305 vmovdqa .Ltf_id_bitmatrix(%rip), t2; \
306 vmovdqa .Ltf_aff_bitmatrix(%rip), t3; \
307 vmovdqa .Ltf_x2_bitmatrix(%rip), t4; \
308 vgf2p8affineinvqb $(tf_s2_const), t0, x1, x1; \
309 vgf2p8affineinvqb $(tf_s2_const), t0, x5, x5; \
310 vgf2p8affineqb $(tf_inv_const), t1, x2, x2; \
311 vgf2p8affineqb $(tf_inv_const), t1, x6, x6; \
312 vgf2p8affineinvqb $0, t2, x2, x2; \
313 vgf2p8affineinvqb $0, t2, x6, x6; \
314 vgf2p8affineinvqb $(tf_aff_const), t3, x0, x0; \
315 vgf2p8affineinvqb $(tf_aff_const), t3, x4, x4; \
316 vgf2p8affineqb $(tf_x2_const), t4, x3, x3; \
317 vgf2p8affineqb $(tf_x2_const), t4, x7, x7; \
318 vgf2p8affineinvqb $0, t2, x3, x3; \
319 vgf2p8affineinvqb $0, t2, x7, x7
320
321#endif /* CONFIG_AS_GFNI */
322
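/*
 * AES-NI variant of the S-box layer: vaesenclast/vaesdeclast with a zero
 * round key give AES SubBytes/InvSubBytes (t7 must be zero on entry; the
 * callers clear y7), the vpshufb with .Linv_shift_row/.Lshift_row undoes
 * the (Inv)ShiftRows step baked into those instructions, and filter_8bit
 * applies the extra affine maps that derive ARIA's S2 and X2 from the
 * AES S-box.
 */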
323#define aria_sbox_8way(x0, x1, x2, x3, \
324 x4, x5, x6, x7, \
325 t0, t1, t2, t3, \
326 t4, t5, t6, t7) \
327 vmovdqa .Linv_shift_row(%rip), t0; \
328 vmovdqa .Lshift_row(%rip), t1; \
329 vbroadcastss .L0f0f0f0f(%rip), t6; \
330 vmovdqa .Ltf_lo__inv_aff__and__s2(%rip), t2; \
331 vmovdqa .Ltf_hi__inv_aff__and__s2(%rip), t3; \
332 vmovdqa .Ltf_lo__x2__and__fwd_aff(%rip), t4; \
333 vmovdqa .Ltf_hi__x2__and__fwd_aff(%rip), t5; \
334 \
335 vaesenclast t7, x0, x0; \
336 vaesenclast t7, x4, x4; \
337 vaesenclast t7, x1, x1; \
338 vaesenclast t7, x5, x5; \
339 vaesdeclast t7, x2, x2; \
340 vaesdeclast t7, x6, x6; \
341 \
342 /* AES inverse shift rows */ \
343 vpshufb t0, x0, x0; \
344 vpshufb t0, x4, x4; \
345 vpshufb t0, x1, x1; \
346 vpshufb t0, x5, x5; \
347 vpshufb t1, x3, x3; \
348 vpshufb t1, x7, x7; \
349 vpshufb t1, x2, x2; \
350 vpshufb t1, x6, x6; \
351 \
352 /* affine transformation for S2 */ \
353 filter_8bit(x1, t2, t3, t6, t0); \
354 /* affine transformation for S2 */ \
355 filter_8bit(x5, t2, t3, t6, t0); \
356 \
357 /* affine transformation for X2 */ \
358 filter_8bit(x3, t4, t5, t6, t0); \
359 /* affine transformation for X2 */ \
360 filter_8bit(x7, t4, t5, t6, t0); \
361 vaesdeclast t7, x3, x3; \
362 vaesdeclast t7, x7, x7;
363
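/*
 * aria_diff_m: ARIA's per-word diffusion.  In byte-sliced form x0..x3
 * hold the four byte lanes of each 32-bit word, so the rotr32 by 8/16 in
 * the reference formula is just a re-indexing of the registers and the
 * whole step reduces to XORs.
 */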
364#define aria_diff_m(x0, x1, x2, x3, \
365 t0, t1, t2, t3) \
366 /* T = rotr32(X, 8); */ \
367 /* X ^= T */ \
368 vpxor x0, x3, t0; \
369 vpxor x1, x0, t1; \
370 vpxor x2, x1, t2; \
371 vpxor x3, x2, t3; \
372 /* X = T ^ rotr(X, 16); */ \
373 vpxor t2, x0, x0; \
374 vpxor x1, t3, t3; \
375 vpxor t0, x2, x2; \
376 vpxor t1, x3, x1; \
377 vmovdqu t3, x3;
378
379#define aria_diff_word(x0, x1, x2, x3, \
380 x4, x5, x6, x7, \
381 y0, y1, y2, y3, \
382 y4, y5, y6, y7) \
383 /* t1 ^= t2; */ \
384 vpxor y0, x4, x4; \
385 vpxor y1, x5, x5; \
386 vpxor y2, x6, x6; \
387 vpxor y3, x7, x7; \
388 \
389 /* t2 ^= t3; */ \
390 vpxor y4, y0, y0; \
391 vpxor y5, y1, y1; \
392 vpxor y6, y2, y2; \
393 vpxor y7, y3, y3; \
394 \
395 /* t0 ^= t1; */ \
396 vpxor x4, x0, x0; \
397 vpxor x5, x1, x1; \
398 vpxor x6, x2, x2; \
399 vpxor x7, x3, x3; \
400 \
401 /* t3 ^= t1; */ \
402 vpxor x4, y4, y4; \
403 vpxor x5, y5, y5; \
404 vpxor x6, y6, y6; \
405 vpxor x7, y7, y7; \
406 \
407 /* t2 ^= t0; */ \
408 vpxor x0, y0, y0; \
409 vpxor x1, y1, y1; \
410 vpxor x2, y2, y2; \
411 vpxor x3, y3, y3; \
412 \
413 /* t1 ^= t2; */ \
414 vpxor y0, x4, x4; \
415 vpxor y1, x5, x5; \
416 vpxor y2, x6, x6; \
417 vpxor y3, x7, x7;
418
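/*
 * Round macros: aria_fo/aria_fe implement ARIA's odd/even round
 * functions (AddRoundKey, S-box layer, diffusion) and aria_ff the final
 * round (S-box layer between two AddRoundKeys, no diffusion).  Each one
 * processes the 16 blocks as two halves of eight registers, spilling the
 * inactive half to mem_tmp.
 */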
419#define aria_fe(x0, x1, x2, x3, \
420 x4, x5, x6, x7, \
421 y0, y1, y2, y3, \
422 y4, y5, y6, y7, \
423 mem_tmp, rk, round) \
424 vpxor y7, y7, y7; \
425 aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
426 y0, y7, y2, rk, 8, round); \
427 \
428 aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
429 y0, y1, y2, y3, y4, y5, y6, y7); \
430 \
431 aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
432 aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
433 aria_store_state_8way(x0, x1, x2, x3, \
434 x4, x5, x6, x7, \
435 mem_tmp, 8); \
436 \
437 aria_load_state_8way(x0, x1, x2, x3, \
438 x4, x5, x6, x7, \
439 mem_tmp, 0); \
440 aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
441 y0, y7, y2, rk, 0, round); \
442 \
443 aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
444 y0, y1, y2, y3, y4, y5, y6, y7); \
445 \
446 aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
447 aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
448 aria_store_state_8way(x0, x1, x2, x3, \
449 x4, x5, x6, x7, \
450 mem_tmp, 0); \
451 aria_load_state_8way(y0, y1, y2, y3, \
452 y4, y5, y6, y7, \
453 mem_tmp, 8); \
454 aria_diff_word(x0, x1, x2, x3, \
455 x4, x5, x6, x7, \
456 y0, y1, y2, y3, \
457 y4, y5, y6, y7); \
458 /* aria_diff_byte() \
459 * T3 = ABCD -> BADC \
460 * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6 \
461 * T0 = ABCD -> CDAB \
462 * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1 \
463 * T1 = ABCD -> DCBA \
464 * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4 \
465 */ \
466 aria_diff_word(x2, x3, x0, x1, \
467 x7, x6, x5, x4, \
468 y0, y1, y2, y3, \
469 y5, y4, y7, y6); \
470 aria_store_state_8way(x3, x2, x1, x0, \
471 x6, x7, x4, x5, \
472 mem_tmp, 0);
473
474#define aria_fo(x0, x1, x2, x3, \
475 x4, x5, x6, x7, \
476 y0, y1, y2, y3, \
477 y4, y5, y6, y7, \
478 mem_tmp, rk, round) \
479 vpxor y7, y7, y7; \
480 aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
481 y0, y7, y2, rk, 8, round); \
482 \
483 aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
484 y0, y1, y2, y3, y4, y5, y6, y7); \
485 \
486 aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
487 aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
488 aria_store_state_8way(x0, x1, x2, x3, \
489 x4, x5, x6, x7, \
490 mem_tmp, 8); \
491 \
492 aria_load_state_8way(x0, x1, x2, x3, \
493 x4, x5, x6, x7, \
494 mem_tmp, 0); \
495 aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
496 y0, y7, y2, rk, 0, round); \
497 \
498 aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
499 y0, y1, y2, y3, y4, y5, y6, y7); \
500 \
501 aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
502 aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
503 aria_store_state_8way(x0, x1, x2, x3, \
504 x4, x5, x6, x7, \
505 mem_tmp, 0); \
506 aria_load_state_8way(y0, y1, y2, y3, \
507 y4, y5, y6, y7, \
508 mem_tmp, 8); \
509 aria_diff_word(x0, x1, x2, x3, \
510 x4, x5, x6, x7, \
511 y0, y1, y2, y3, \
512 y4, y5, y6, y7); \
513 /* aria_diff_byte() \
514 * T1 = ABCD -> BADC \
515 * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6 \
516 * T2 = ABCD -> CDAB \
517 * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1 \
518 * T3 = ABCD -> DCBA \
519 * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4 \
520 */ \
521 aria_diff_word(x0, x1, x2, x3, \
522 x5, x4, x7, x6, \
523 y2, y3, y0, y1, \
524 y7, y6, y5, y4); \
525 aria_store_state_8way(x3, x2, x1, x0, \
526 x6, x7, x4, x5, \
527 mem_tmp, 0);
528
529#define aria_ff(x0, x1, x2, x3, \
530 x4, x5, x6, x7, \
531 y0, y1, y2, y3, \
532 y4, y5, y6, y7, \
533 mem_tmp, rk, round, last_round) \
534 vpxor y7, y7, y7; \
535 aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
536 y0, y7, y2, rk, 8, round); \
537 \
538 aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
539 y0, y1, y2, y3, y4, y5, y6, y7); \
540 \
541 aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
542 y0, y7, y2, rk, 8, last_round); \
543 \
544 aria_store_state_8way(x0, x1, x2, x3, \
545 x4, x5, x6, x7, \
546 mem_tmp, 8); \
547 \
548 aria_load_state_8way(x0, x1, x2, x3, \
549 x4, x5, x6, x7, \
550 mem_tmp, 0); \
551 aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
552 y0, y7, y2, rk, 0, round); \
553 \
554 aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
555 y0, y1, y2, y3, y4, y5, y6, y7); \
556 \
557 aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
558 y0, y7, y2, rk, 0, last_round); \
559 \
560 aria_load_state_8way(y0, y1, y2, y3, \
561 y4, y5, y6, y7, \
562 mem_tmp, 8);
563
564#ifdef CONFIG_AS_GFNI
565#define aria_fe_gfni(x0, x1, x2, x3, \
566 x4, x5, x6, x7, \
567 y0, y1, y2, y3, \
568 y4, y5, y6, y7, \
569 mem_tmp, rk, round) \
570 vpxor y7, y7, y7; \
571 aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
572 y0, y7, y2, rk, 8, round); \
573 \
574 aria_sbox_8way_gfni(x2, x3, x0, x1, \
575 x6, x7, x4, x5, \
576 y0, y1, y2, y3, \
577 y4, y5, y6, y7); \
578 \
579 aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
580 aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
581 aria_store_state_8way(x0, x1, x2, x3, \
582 x4, x5, x6, x7, \
583 mem_tmp, 8); \
584 \
585 aria_load_state_8way(x0, x1, x2, x3, \
586 x4, x5, x6, x7, \
587 mem_tmp, 0); \
588 aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
589 y0, y7, y2, rk, 0, round); \
590 \
591 aria_sbox_8way_gfni(x2, x3, x0, x1, \
592 x6, x7, x4, x5, \
593 y0, y1, y2, y3, \
594 y4, y5, y6, y7); \
595 \
596 aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
597 aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
598 aria_store_state_8way(x0, x1, x2, x3, \
599 x4, x5, x6, x7, \
600 mem_tmp, 0); \
601 aria_load_state_8way(y0, y1, y2, y3, \
602 y4, y5, y6, y7, \
603 mem_tmp, 8); \
604 aria_diff_word(x0, x1, x2, x3, \
605 x4, x5, x6, x7, \
606 y0, y1, y2, y3, \
607 y4, y5, y6, y7); \
608 /* aria_diff_byte() \
609 * T3 = ABCD -> BADC \
610 * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6 \
611 * T0 = ABCD -> CDAB \
612 * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1 \
613 * T1 = ABCD -> DCBA \
614 * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4 \
615 */ \
616 aria_diff_word(x2, x3, x0, x1, \
617 x7, x6, x5, x4, \
618 y0, y1, y2, y3, \
619 y5, y4, y7, y6); \
620 aria_store_state_8way(x3, x2, x1, x0, \
621 x6, x7, x4, x5, \
622 mem_tmp, 0);
623
624#define aria_fo_gfni(x0, x1, x2, x3, \
625 x4, x5, x6, x7, \
626 y0, y1, y2, y3, \
627 y4, y5, y6, y7, \
628 mem_tmp, rk, round) \
629 vpxor y7, y7, y7; \
630 aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
631 y0, y7, y2, rk, 8, round); \
632 \
633 aria_sbox_8way_gfni(x0, x1, x2, x3, \
634 x4, x5, x6, x7, \
635 y0, y1, y2, y3, \
636 y4, y5, y6, y7); \
637 \
638 aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
639 aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
640 aria_store_state_8way(x0, x1, x2, x3, \
641 x4, x5, x6, x7, \
642 mem_tmp, 8); \
643 \
644 aria_load_state_8way(x0, x1, x2, x3, \
645 x4, x5, x6, x7, \
646 mem_tmp, 0); \
647 aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
648 y0, y7, y2, rk, 0, round); \
649 \
650 aria_sbox_8way_gfni(x0, x1, x2, x3, \
651 x4, x5, x6, x7, \
652 y0, y1, y2, y3, \
653 y4, y5, y6, y7); \
654 \
655 aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
656 aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
657 aria_store_state_8way(x0, x1, x2, x3, \
658 x4, x5, x6, x7, \
659 mem_tmp, 0); \
660 aria_load_state_8way(y0, y1, y2, y3, \
661 y4, y5, y6, y7, \
662 mem_tmp, 8); \
663 aria_diff_word(x0, x1, x2, x3, \
664 x4, x5, x6, x7, \
665 y0, y1, y2, y3, \
666 y4, y5, y6, y7); \
667 /* aria_diff_byte() \
668 * T1 = ABCD -> BADC \
669 * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6 \
670 * T2 = ABCD -> CDAB \
671 * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1 \
672 * T3 = ABCD -> DCBA \
673 * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4 \
674 */ \
675 aria_diff_word(x0, x1, x2, x3, \
676 x5, x4, x7, x6, \
677 y2, y3, y0, y1, \
678 y7, y6, y5, y4); \
679 aria_store_state_8way(x3, x2, x1, x0, \
680 x6, x7, x4, x5, \
681 mem_tmp, 0);
682
683#define aria_ff_gfni(x0, x1, x2, x3, \
684 x4, x5, x6, x7, \
685 y0, y1, y2, y3, \
686 y4, y5, y6, y7, \
687 mem_tmp, rk, round, last_round) \
688 vpxor y7, y7, y7; \
689 aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
690 y0, y7, y2, rk, 8, round); \
691 \
692 aria_sbox_8way_gfni(x2, x3, x0, x1, \
693 x6, x7, x4, x5, \
694 y0, y1, y2, y3, \
695 y4, y5, y6, y7); \
696 \
697 aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
698 y0, y7, y2, rk, 8, last_round); \
699 \
700 aria_store_state_8way(x0, x1, x2, x3, \
701 x4, x5, x6, x7, \
702 mem_tmp, 8); \
703 \
704 aria_load_state_8way(x0, x1, x2, x3, \
705 x4, x5, x6, x7, \
706 mem_tmp, 0); \
707 aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
708 y0, y7, y2, rk, 0, round); \
709 \
710 aria_sbox_8way_gfni(x2, x3, x0, x1, \
711 x6, x7, x4, x5, \
712 y0, y1, y2, y3, \
713 y4, y5, y6, y7); \
714 \
715 aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
716 y0, y7, y2, rk, 0, last_round); \
717 \
718 aria_load_state_8way(y0, y1, y2, y3, \
719 y4, y5, y6, y7, \
720 mem_tmp, 8);
721
722#endif /* CONFIG_AS_GFNI */
723
724/* NB: section is mergeable, all elements must be aligned 16-byte blocks */
725.section .rodata.cst16, "aM", @progbits, 16
726.align 16
727
728#define SHUFB_BYTES(idx) \
729 0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)
730
731.Lshufb_16x16b:
732 .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3);
733/* For isolating SubBytes from AESENCLAST, inverse shift row */
734.Linv_shift_row:
735 .byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
736 .byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
737.Lshift_row:
738 .byte 0x00, 0x05, 0x0a, 0x0f, 0x04, 0x09, 0x0e, 0x03
739 .byte 0x08, 0x0d, 0x02, 0x07, 0x0c, 0x01, 0x06, 0x0b
740/* For CTR-mode IV byteswap */
741.Lbswap128_mask:
742 .byte 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08
743 .byte 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00
744
745/* AES inverse affine and S2 combined:
746 * 1 1 0 0 0 0 0 1 x0 0
747 * 0 1 0 0 1 0 0 0 x1 0
748 * 1 1 0 0 1 1 1 1 x2 0
749 * 0 1 1 0 1 0 0 1 x3 1
750 * 0 1 0 0 1 1 0 0 * x4 + 0
751 * 0 1 0 1 1 0 0 0 x5 0
752 * 0 0 0 0 0 1 0 1 x6 0
753 * 1 1 1 0 0 1 1 1 x7 1
754 */
755.Ltf_lo__inv_aff__and__s2:
756 .octa 0x92172DA81A9FA520B2370D883ABF8500
757.Ltf_hi__inv_aff__and__s2:
758 .octa 0x2B15FFC1AF917B45E6D8320C625CB688
759
760/* X2 and AES forward affine combined:
761 * 1 0 1 1 0 0 0 1 x0 0
762 * 0 1 1 1 1 0 1 1 x1 0
763 * 0 0 0 1 1 0 1 0 x2 1
764 * 0 1 0 0 0 1 0 0 x3 0
765 * 0 0 1 1 1 0 1 1 * x4 + 0
766 * 0 1 0 0 1 0 0 0 x5 0
767 * 1 1 0 1 0 0 1 1 x6 0
768 * 0 1 0 0 1 0 1 0 x7 0
769 */
770.Ltf_lo__x2__and__fwd_aff:
771 .octa 0xEFAE0544FCBD1657B8F95213ABEA4100
772.Ltf_hi__x2__and__fwd_aff:
773 .octa 0x3F893781E95FE1576CDA64D2BA0CB204
774
775#ifdef CONFIG_AS_GFNI
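/*
 * Bit-matrices for the GFNI path.  Each .Ltf_*_bitmatrix entry stores the
 * same BM8X8() 8x8 matrix in both quadwords, since vgf2p8affineqb applies
 * its matrix operand per 8-byte lane; the tf_*_const values are the
 * matching affine constants passed as the instruction immediate.
 */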
776/* AES affine: */
777#define tf_aff_const BV8(1, 1, 0, 0, 0, 1, 1, 0)
778.Ltf_aff_bitmatrix:
779 .quad BM8X8(BV8(1, 0, 0, 0, 1, 1, 1, 1),
780 BV8(1, 1, 0, 0, 0, 1, 1, 1),
781 BV8(1, 1, 1, 0, 0, 0, 1, 1),
782 BV8(1, 1, 1, 1, 0, 0, 0, 1),
783 BV8(1, 1, 1, 1, 1, 0, 0, 0),
784 BV8(0, 1, 1, 1, 1, 1, 0, 0),
785 BV8(0, 0, 1, 1, 1, 1, 1, 0),
786 BV8(0, 0, 0, 1, 1, 1, 1, 1))
787 .quad BM8X8(BV8(1, 0, 0, 0, 1, 1, 1, 1),
788 BV8(1, 1, 0, 0, 0, 1, 1, 1),
789 BV8(1, 1, 1, 0, 0, 0, 1, 1),
790 BV8(1, 1, 1, 1, 0, 0, 0, 1),
791 BV8(1, 1, 1, 1, 1, 0, 0, 0),
792 BV8(0, 1, 1, 1, 1, 1, 0, 0),
793 BV8(0, 0, 1, 1, 1, 1, 1, 0),
794 BV8(0, 0, 0, 1, 1, 1, 1, 1))
795
796/* AES inverse affine: */
797#define tf_inv_const BV8(1, 0, 1, 0, 0, 0, 0, 0)
798.Ltf_inv_bitmatrix:
799 .quad BM8X8(BV8(0, 0, 1, 0, 0, 1, 0, 1),
800 BV8(1, 0, 0, 1, 0, 0, 1, 0),
801 BV8(0, 1, 0, 0, 1, 0, 0, 1),
802 BV8(1, 0, 1, 0, 0, 1, 0, 0),
803 BV8(0, 1, 0, 1, 0, 0, 1, 0),
804 BV8(0, 0, 1, 0, 1, 0, 0, 1),
805 BV8(1, 0, 0, 1, 0, 1, 0, 0),
806 BV8(0, 1, 0, 0, 1, 0, 1, 0))
807 .quad BM8X8(BV8(0, 0, 1, 0, 0, 1, 0, 1),
808 BV8(1, 0, 0, 1, 0, 0, 1, 0),
809 BV8(0, 1, 0, 0, 1, 0, 0, 1),
810 BV8(1, 0, 1, 0, 0, 1, 0, 0),
811 BV8(0, 1, 0, 1, 0, 0, 1, 0),
812 BV8(0, 0, 1, 0, 1, 0, 0, 1),
813 BV8(1, 0, 0, 1, 0, 1, 0, 0),
814 BV8(0, 1, 0, 0, 1, 0, 1, 0))
815
816/* S2: */
817#define tf_s2_const BV8(0, 1, 0, 0, 0, 1, 1, 1)
818.Ltf_s2_bitmatrix:
819 .quad BM8X8(BV8(0, 1, 0, 1, 0, 1, 1, 1),
820 BV8(0, 0, 1, 1, 1, 1, 1, 1),
821 BV8(1, 1, 1, 0, 1, 1, 0, 1),
822 BV8(1, 1, 0, 0, 0, 0, 1, 1),
823 BV8(0, 1, 0, 0, 0, 0, 1, 1),
824 BV8(1, 1, 0, 0, 1, 1, 1, 0),
825 BV8(0, 1, 1, 0, 0, 0, 1, 1),
826 BV8(1, 1, 1, 1, 0, 1, 1, 0))
827 .quad BM8X8(BV8(0, 1, 0, 1, 0, 1, 1, 1),
828 BV8(0, 0, 1, 1, 1, 1, 1, 1),
829 BV8(1, 1, 1, 0, 1, 1, 0, 1),
830 BV8(1, 1, 0, 0, 0, 0, 1, 1),
831 BV8(0, 1, 0, 0, 0, 0, 1, 1),
832 BV8(1, 1, 0, 0, 1, 1, 1, 0),
833 BV8(0, 1, 1, 0, 0, 0, 1, 1),
834 BV8(1, 1, 1, 1, 0, 1, 1, 0))
835
836/* X2: */
837#define tf_x2_const BV8(0, 0, 1, 1, 0, 1, 0, 0)
838.Ltf_x2_bitmatrix:
839 .quad BM8X8(BV8(0, 0, 0, 1, 1, 0, 0, 0),
840 BV8(0, 0, 1, 0, 0, 1, 1, 0),
841 BV8(0, 0, 0, 0, 1, 0, 1, 0),
842 BV8(1, 1, 1, 0, 0, 0, 1, 1),
843 BV8(1, 1, 1, 0, 1, 1, 0, 0),
844 BV8(0, 1, 1, 0, 1, 0, 1, 1),
845 BV8(1, 0, 1, 1, 1, 1, 0, 1),
846 BV8(1, 0, 0, 1, 0, 0, 1, 1))
847 .quad BM8X8(BV8(0, 0, 0, 1, 1, 0, 0, 0),
848 BV8(0, 0, 1, 0, 0, 1, 1, 0),
849 BV8(0, 0, 0, 0, 1, 0, 1, 0),
850 BV8(1, 1, 1, 0, 0, 0, 1, 1),
851 BV8(1, 1, 1, 0, 1, 1, 0, 0),
852 BV8(0, 1, 1, 0, 1, 0, 1, 1),
853 BV8(1, 0, 1, 1, 1, 1, 0, 1),
854 BV8(1, 0, 0, 1, 0, 0, 1, 1))
855
856/* Identity matrix: */
857.Ltf_id_bitmatrix:
858 .quad BM8X8(BV8(1, 0, 0, 0, 0, 0, 0, 0),
859 BV8(0, 1, 0, 0, 0, 0, 0, 0),
860 BV8(0, 0, 1, 0, 0, 0, 0, 0),
861 BV8(0, 0, 0, 1, 0, 0, 0, 0),
862 BV8(0, 0, 0, 0, 1, 0, 0, 0),
863 BV8(0, 0, 0, 0, 0, 1, 0, 0),
864 BV8(0, 0, 0, 0, 0, 0, 1, 0),
865 BV8(0, 0, 0, 0, 0, 0, 0, 1))
866 .quad BM8X8(BV8(1, 0, 0, 0, 0, 0, 0, 0),
867 BV8(0, 1, 0, 0, 0, 0, 0, 0),
868 BV8(0, 0, 1, 0, 0, 0, 0, 0),
869 BV8(0, 0, 0, 1, 0, 0, 0, 0),
870 BV8(0, 0, 0, 0, 1, 0, 0, 0),
871 BV8(0, 0, 0, 0, 0, 1, 0, 0),
872 BV8(0, 0, 0, 0, 0, 0, 1, 0),
873 BV8(0, 0, 0, 0, 0, 0, 0, 1))
874#endif /* CONFIG_AS_GFNI */
875
876/* 4-bit mask */
877.section .rodata.cst4.L0f0f0f0f, "aM", @progbits, 4
878.align 4
879.L0f0f0f0f:
880 .long 0x0f0f0f0f
881
882.text
883
884SYM_FUNC_START_LOCAL(__aria_aesni_avx_crypt_16way)
885 /* input:
886 * %r9: rk
887 * %rsi: dst
888 * %rdx: src
889 * %xmm0..%xmm15: 16 byte-sliced blocks
890 */
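	/*
	 * Rounds run in fo/fe pairs; ARIA_CTX_rounds (12/14/16 for
	 * 128/192/256-bit keys) selects where the final aria_ff round
	 * is taken.
	 */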
891
892 FRAME_BEGIN
893
894 movq %rsi, %rax;
895 leaq 8 * 16(%rax), %r8;
896
897 inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
898 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
899 %xmm15, %rax, %r8);
900 aria_fo(%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15,
901 %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
902 %rax, %r9, 0);
903 aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
904 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
905 %xmm15, %rax, %r9, 1);
906 aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
907 %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
908 %rax, %r9, 2);
909 aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
910 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
911 %xmm15, %rax, %r9, 3);
912 aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
913 %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
914 %rax, %r9, 4);
915 aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
916 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
917 %xmm15, %rax, %r9, 5);
918 aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
919 %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
920 %rax, %r9, 6);
921 aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
922 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
923 %xmm15, %rax, %r9, 7);
924 aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
925 %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
926 %rax, %r9, 8);
927 aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
928 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
929 %xmm15, %rax, %r9, 9);
930 aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
931 %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
932 %rax, %r9, 10);
933 cmpl $12, ARIA_CTX_rounds(CTX);
934 jne .Laria_192;
935 aria_ff(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
936 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
937 %xmm15, %rax, %r9, 11, 12);
938 jmp .Laria_end;
939.Laria_192:
940 aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
941 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
942 %xmm15, %rax, %r9, 11);
943 aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
944 %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
945 %rax, %r9, 12);
946 cmpl $14, ARIA_CTX_rounds(CTX);
947 jne .Laria_256;
948 aria_ff(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
949 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
950 %xmm15, %rax, %r9, 13, 14);
951 jmp .Laria_end;
952.Laria_256:
953 aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
954 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
955 %xmm15, %rax, %r9, 13);
956 aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
957 %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
958 %rax, %r9, 14);
959 aria_ff(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
960 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
961 %xmm15, %rax, %r9, 15, 16);
962.Laria_end:
963 debyteslice_16x16b(%xmm8, %xmm12, %xmm1, %xmm4,
964 %xmm9, %xmm13, %xmm0, %xmm5,
965 %xmm10, %xmm14, %xmm3, %xmm6,
966 %xmm11, %xmm15, %xmm2, %xmm7,
967 (%rax), (%r8));
968
969 FRAME_END
970 RET;
971SYM_FUNC_END(__aria_aesni_avx_crypt_16way)
972
973SYM_TYPED_FUNC_START(aria_aesni_avx_encrypt_16way)
974 /* input:
975 * %rdi: ctx, CTX
976 * %rsi: dst
977 * %rdx: src
978 */
979
980 FRAME_BEGIN
981
982 leaq ARIA_CTX_enc_key(CTX), %r9;
983
984 inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
985 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
986 %xmm15, %rdx);
987
988 call __aria_aesni_avx_crypt_16way;
989
990 write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
991 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
992 %xmm15, %rax);
993
994 FRAME_END
995 RET;
996SYM_FUNC_END(aria_aesni_avx_encrypt_16way)
997
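/*
 * Decryption reuses the same round pipeline as encryption; it differs
 * only in feeding the decryption key schedule (ARIA_CTX_dec_key) to the
 * shared helper.
 */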
998SYM_TYPED_FUNC_START(aria_aesni_avx_decrypt_16way)
999 /* input:
1000 * %rdi: ctx, CTX
1001 * %rsi: dst
1002 * %rdx: src
1003 */
1004
1005 FRAME_BEGIN
1006
1007 leaq ARIA_CTX_dec_key(CTX), %r9;
1008
1009 inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
1010 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
1011 %xmm15, %rdx);
1012
1013 call __aria_aesni_avx_crypt_16way;
1014
1015 write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
1016 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
1017 %xmm15, %rax);
1018
1019 FRAME_END
1020 RET;
1021SYM_FUNC_END(aria_aesni_avx_decrypt_16way)
1022
1023SYM_FUNC_START_LOCAL(__aria_aesni_avx_ctr_gen_keystream_16way)
1024 /* input:
1025 * %rdi: ctx
1026 * %rsi: dst
1027 * %rdx: src
1028 * %rcx: keystream
1029 * %r8: iv (big endian, 128bit)
1030 */
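	/*
	 * Produce 16 consecutive counter blocks (returned in
	 * %xmm0..%xmm15, with the first eight also staged in the
	 * keystream buffer) and write the incremented IV back to (%r8).
	 */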
1031
1032 FRAME_BEGIN
1033 /* load IV and byteswap */
1034 vmovdqu (%r8), %xmm8;
1035
1036 vmovdqa .Lbswap128_mask (%rip), %xmm1;
1037 vpshufb %xmm1, %xmm8, %xmm3; /* be => le */
1038
1039 vpcmpeqd %xmm0, %xmm0, %xmm0;
1040 vpsrldq $8, %xmm0, %xmm0; /* low: -1, high: 0 */
1041
1042 /* construct IVs */
1043 inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1044 vpshufb %xmm1, %xmm3, %xmm9;
1045 inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1046 vpshufb %xmm1, %xmm3, %xmm10;
1047 inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1048 vpshufb %xmm1, %xmm3, %xmm11;
1049 inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1050 vpshufb %xmm1, %xmm3, %xmm12;
1051 inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1052 vpshufb %xmm1, %xmm3, %xmm13;
1053 inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1054 vpshufb %xmm1, %xmm3, %xmm14;
1055 inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1056 vpshufb %xmm1, %xmm3, %xmm15;
1057 vmovdqu %xmm8, (0 * 16)(%rcx);
1058 vmovdqu %xmm9, (1 * 16)(%rcx);
1059 vmovdqu %xmm10, (2 * 16)(%rcx);
1060 vmovdqu %xmm11, (3 * 16)(%rcx);
1061 vmovdqu %xmm12, (4 * 16)(%rcx);
1062 vmovdqu %xmm13, (5 * 16)(%rcx);
1063 vmovdqu %xmm14, (6 * 16)(%rcx);
1064 vmovdqu %xmm15, (7 * 16)(%rcx);
1065
1066 inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1067 vpshufb %xmm1, %xmm3, %xmm8;
1068 inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1069 vpshufb %xmm1, %xmm3, %xmm9;
1070 inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1071 vpshufb %xmm1, %xmm3, %xmm10;
1072 inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1073 vpshufb %xmm1, %xmm3, %xmm11;
1074 inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1075 vpshufb %xmm1, %xmm3, %xmm12;
1076 inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1077 vpshufb %xmm1, %xmm3, %xmm13;
1078 inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1079 vpshufb %xmm1, %xmm3, %xmm14;
1080 inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1081 vpshufb %xmm1, %xmm3, %xmm15;
1082 inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1083 vpshufb %xmm1, %xmm3, %xmm4;
1084 vmovdqu %xmm4, (%r8);
1085
1086 vmovdqu (0 * 16)(%rcx), %xmm0;
1087 vmovdqu (1 * 16)(%rcx), %xmm1;
1088 vmovdqu (2 * 16)(%rcx), %xmm2;
1089 vmovdqu (3 * 16)(%rcx), %xmm3;
1090 vmovdqu (4 * 16)(%rcx), %xmm4;
1091 vmovdqu (5 * 16)(%rcx), %xmm5;
1092 vmovdqu (6 * 16)(%rcx), %xmm6;
1093 vmovdqu (7 * 16)(%rcx), %xmm7;
1094
1095 FRAME_END
1096 RET;
1097SYM_FUNC_END(__aria_aesni_avx_ctr_gen_keystream_16way)
1098
1099SYM_TYPED_FUNC_START(aria_aesni_avx_ctr_crypt_16way)
1100 /* input:
1101 * %rdi: ctx
1102 * %rsi: dst
1103 * %rdx: src
1104 * %rcx: keystream
1105 * %r8: iv (big endian, 128bit)
1106 */
1107 FRAME_BEGIN
1108
1109 call __aria_aesni_avx_ctr_gen_keystream_16way;
1110
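	/*
	 * Encrypt the 16 counter blocks in place: dst/src are redirected
	 * to the keystream buffer for the block cipher call, then the
	 * result is XORed with the original source (%r11) and written to
	 * the original dst (%r10).
	 */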
1111 leaq (%rsi), %r10;
1112 leaq (%rdx), %r11;
1113 leaq (%rcx), %rsi;
1114 leaq (%rcx), %rdx;
1115 leaq ARIA_CTX_enc_key(CTX), %r9;
1116
1117 call __aria_aesni_avx_crypt_16way;
1118
1119 vpxor (0 * 16)(%r11), %xmm1, %xmm1;
1120 vpxor (1 * 16)(%r11), %xmm0, %xmm0;
1121 vpxor (2 * 16)(%r11), %xmm3, %xmm3;
1122 vpxor (3 * 16)(%r11), %xmm2, %xmm2;
1123 vpxor (4 * 16)(%r11), %xmm4, %xmm4;
1124 vpxor (5 * 16)(%r11), %xmm5, %xmm5;
1125 vpxor (6 * 16)(%r11), %xmm6, %xmm6;
1126 vpxor (7 * 16)(%r11), %xmm7, %xmm7;
1127 vpxor (8 * 16)(%r11), %xmm8, %xmm8;
1128 vpxor (9 * 16)(%r11), %xmm9, %xmm9;
1129 vpxor (10 * 16)(%r11), %xmm10, %xmm10;
1130 vpxor (11 * 16)(%r11), %xmm11, %xmm11;
1131 vpxor (12 * 16)(%r11), %xmm12, %xmm12;
1132 vpxor (13 * 16)(%r11), %xmm13, %xmm13;
1133 vpxor (14 * 16)(%r11), %xmm14, %xmm14;
1134 vpxor (15 * 16)(%r11), %xmm15, %xmm15;
1135 write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
1136 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
1137 %xmm15, %r10);
1138
1139 FRAME_END
1140 RET;
1141SYM_FUNC_END(aria_aesni_avx_ctr_crypt_16way)
1142
1143#ifdef CONFIG_AS_GFNI
1144SYM_FUNC_START_LOCAL(__aria_aesni_avx_gfni_crypt_16way)
1145 /* input:
1146 * %r9: rk
1147 * %rsi: dst
1148 * %rdx: src
1149 * %xmm0..%xmm15: 16 byte-sliced blocks
1150 */
1151
1152 FRAME_BEGIN
1153
1154 movq %rsi, %rax;
1155 leaq 8 * 16(%rax), %r8;
1156
1157 inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3,
1158 %xmm4, %xmm5, %xmm6, %xmm7,
1159 %xmm8, %xmm9, %xmm10, %xmm11,
1160 %xmm12, %xmm13, %xmm14,
1161 %xmm15, %rax, %r8);
1162 aria_fo_gfni(%xmm8, %xmm9, %xmm10, %xmm11,
1163 %xmm12, %xmm13, %xmm14, %xmm15,
1164 %xmm0, %xmm1, %xmm2, %xmm3,
1165 %xmm4, %xmm5, %xmm6, %xmm7,
1166 %rax, %r9, 0);
1167 aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
1168 %xmm4, %xmm5, %xmm6, %xmm7,
1169 %xmm8, %xmm9, %xmm10, %xmm11,
1170 %xmm12, %xmm13, %xmm14,
1171 %xmm15, %rax, %r9, 1);
1172 aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
1173 %xmm12, %xmm13, %xmm14, %xmm15,
1174 %xmm0, %xmm1, %xmm2, %xmm3,
1175 %xmm4, %xmm5, %xmm6, %xmm7,
1176 %rax, %r9, 2);
1177 aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
1178 %xmm4, %xmm5, %xmm6, %xmm7,
1179 %xmm8, %xmm9, %xmm10, %xmm11,
1180 %xmm12, %xmm13, %xmm14,
1181 %xmm15, %rax, %r9, 3);
1182 aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
1183 %xmm12, %xmm13, %xmm14, %xmm15,
1184 %xmm0, %xmm1, %xmm2, %xmm3,
1185 %xmm4, %xmm5, %xmm6, %xmm7,
1186 %rax, %r9, 4);
1187 aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
1188 %xmm4, %xmm5, %xmm6, %xmm7,
1189 %xmm8, %xmm9, %xmm10, %xmm11,
1190 %xmm12, %xmm13, %xmm14,
1191 %xmm15, %rax, %r9, 5);
1192 aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
1193 %xmm12, %xmm13, %xmm14, %xmm15,
1194 %xmm0, %xmm1, %xmm2, %xmm3,
1195 %xmm4, %xmm5, %xmm6, %xmm7,
1196 %rax, %r9, 6);
1197 aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
1198 %xmm4, %xmm5, %xmm6, %xmm7,
1199 %xmm8, %xmm9, %xmm10, %xmm11,
1200 %xmm12, %xmm13, %xmm14,
1201 %xmm15, %rax, %r9, 7);
1202 aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
1203 %xmm12, %xmm13, %xmm14, %xmm15,
1204 %xmm0, %xmm1, %xmm2, %xmm3,
1205 %xmm4, %xmm5, %xmm6, %xmm7,
1206 %rax, %r9, 8);
1207 aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
1208 %xmm4, %xmm5, %xmm6, %xmm7,
1209 %xmm8, %xmm9, %xmm10, %xmm11,
1210 %xmm12, %xmm13, %xmm14,
1211 %xmm15, %rax, %r9, 9);
1212 aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
1213 %xmm12, %xmm13, %xmm14, %xmm15,
1214 %xmm0, %xmm1, %xmm2, %xmm3,
1215 %xmm4, %xmm5, %xmm6, %xmm7,
1216 %rax, %r9, 10);
1217 cmpl $12, ARIA_CTX_rounds(CTX);
1218 jne .Laria_gfni_192;
1219 aria_ff_gfni(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
1220 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
1221 %xmm15, %rax, %r9, 11, 12);
1222 jmp .Laria_gfni_end;
1223.Laria_gfni_192:
1224 aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
1225 %xmm4, %xmm5, %xmm6, %xmm7,
1226 %xmm8, %xmm9, %xmm10, %xmm11,
1227 %xmm12, %xmm13, %xmm14,
1228 %xmm15, %rax, %r9, 11);
1229 aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
1230 %xmm12, %xmm13, %xmm14, %xmm15,
1231 %xmm0, %xmm1, %xmm2, %xmm3,
1232 %xmm4, %xmm5, %xmm6, %xmm7,
1233 %rax, %r9, 12);
1234 cmpl $14, ARIA_CTX_rounds(CTX);
1235 jne .Laria_gfni_256;
1236 aria_ff_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
1237 %xmm4, %xmm5, %xmm6, %xmm7,
1238 %xmm8, %xmm9, %xmm10, %xmm11,
1239 %xmm12, %xmm13, %xmm14,
1240 %xmm15, %rax, %r9, 13, 14);
1241 jmp .Laria_gfni_end;
1242.Laria_gfni_256:
1243 aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
1244 %xmm4, %xmm5, %xmm6, %xmm7,
1245 %xmm8, %xmm9, %xmm10, %xmm11,
1246 %xmm12, %xmm13, %xmm14,
1247 %xmm15, %rax, %r9, 13);
1248 aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
1249 %xmm12, %xmm13, %xmm14, %xmm15,
1250 %xmm0, %xmm1, %xmm2, %xmm3,
1251 %xmm4, %xmm5, %xmm6, %xmm7,
1252 %rax, %r9, 14);
1253 aria_ff_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
1254 %xmm4, %xmm5, %xmm6, %xmm7,
1255 %xmm8, %xmm9, %xmm10, %xmm11,
1256 %xmm12, %xmm13, %xmm14,
1257 %xmm15, %rax, %r9, 15, 16);
1258.Laria_gfni_end:
1259 debyteslice_16x16b(%xmm8, %xmm12, %xmm1, %xmm4,
1260 %xmm9, %xmm13, %xmm0, %xmm5,
1261 %xmm10, %xmm14, %xmm3, %xmm6,
1262 %xmm11, %xmm15, %xmm2, %xmm7,
1263 (%rax), (%r8));
1264
1265 FRAME_END
1266 RET;
1267SYM_FUNC_END(__aria_aesni_avx_gfni_crypt_16way)
1268
1269SYM_TYPED_FUNC_START(aria_aesni_avx_gfni_encrypt_16way)
1270 /* input:
1271 * %rdi: ctx, CTX
1272 * %rsi: dst
1273 * %rdx: src
1274 */
1275
1276 FRAME_BEGIN
1277
1278 leaq ARIA_CTX_enc_key(CTX), %r9;
1279
1280 inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
1281 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
1282 %xmm15, %rdx);
1283
1284 call __aria_aesni_avx_gfni_crypt_16way;
1285
1286 write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
1287 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
1288 %xmm15, %rax);
1289
1290 FRAME_END
1291 RET;
1292SYM_FUNC_END(aria_aesni_avx_gfni_encrypt_16way)
1293
1294SYM_TYPED_FUNC_START(aria_aesni_avx_gfni_decrypt_16way)
1295 /* input:
1296 * %rdi: ctx, CTX
1297 * %rsi: dst
1298 * %rdx: src
1299 */
1300
1301 FRAME_BEGIN
1302
1303 leaq ARIA_CTX_dec_key(CTX), %r9;
1304
1305 inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
1306 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
1307 %xmm15, %rdx);
1308
1309 call __aria_aesni_avx_gfni_crypt_16way;
1310
1311 write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
1312 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
1313 %xmm15, %rax);
1314
1315 FRAME_END
1316 RET;
1317SYM_FUNC_END(aria_aesni_avx_gfni_decrypt_16way)
1318
1319SYM_TYPED_FUNC_START(aria_aesni_avx_gfni_ctr_crypt_16way)
1320 /* input:
1321 * %rdi: ctx
1322 * %rsi: dst
1323 * %rdx: src
1324 * %rcx: keystream
1325 * %r8: iv (big endian, 128bit)
1326 */
1327 FRAME_BEGIN
1328
1329 call __aria_aesni_avx_ctr_gen_keystream_16way
1330
1331 leaq (%rsi), %r10;
1332 leaq (%rdx), %r11;
1333 leaq (%rcx), %rsi;
1334 leaq (%rcx), %rdx;
1335 leaq ARIA_CTX_enc_key(CTX), %r9;
1336
1337 call __aria_aesni_avx_gfni_crypt_16way;
1338
1339 vpxor (0 * 16)(%r11), %xmm1, %xmm1;
1340 vpxor (1 * 16)(%r11), %xmm0, %xmm0;
1341 vpxor (2 * 16)(%r11), %xmm3, %xmm3;
1342 vpxor (3 * 16)(%r11), %xmm2, %xmm2;
1343 vpxor (4 * 16)(%r11), %xmm4, %xmm4;
1344 vpxor (5 * 16)(%r11), %xmm5, %xmm5;
1345 vpxor (6 * 16)(%r11), %xmm6, %xmm6;
1346 vpxor (7 * 16)(%r11), %xmm7, %xmm7;
1347 vpxor (8 * 16)(%r11), %xmm8, %xmm8;
1348 vpxor (9 * 16)(%r11), %xmm9, %xmm9;
1349 vpxor (10 * 16)(%r11), %xmm10, %xmm10;
1350 vpxor (11 * 16)(%r11), %xmm11, %xmm11;
1351 vpxor (12 * 16)(%r11), %xmm12, %xmm12;
1352 vpxor (13 * 16)(%r11), %xmm13, %xmm13;
1353 vpxor (14 * 16)(%r11), %xmm14, %xmm14;
1354 vpxor (15 * 16)(%r11), %xmm15, %xmm15;
1355 write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
1356 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
1357 %xmm15, %r10);
1358
1359 FRAME_END
1360 RET;
1361SYM_FUNC_END(aria_aesni_avx_gfni_ctr_crypt_16way)
1362#endif /* CONFIG_AS_GFNI */
1/* SPDX-License-Identifier: GPL-2.0-or-later */
2/*
3 * ARIA Cipher 16-way parallel algorithm (AVX)
4 *
5 * Copyright (c) 2022 Taehee Yoo <ap420073@gmail.com>
6 *
7 */
8
9#include <linux/linkage.h>
10#include <linux/cfi_types.h>
11#include <asm/frame.h>
12
13/* struct aria_ctx: */
14#define enc_key 0
15#define dec_key 272
16#define rounds 544
17
18/* register macros */
19#define CTX %rdi
20
21
22#define BV8(a0, a1, a2, a3, a4, a5, a6, a7) \
23 ( (((a0) & 1) << 0) | \
24 (((a1) & 1) << 1) | \
25 (((a2) & 1) << 2) | \
26 (((a3) & 1) << 3) | \
27 (((a4) & 1) << 4) | \
28 (((a5) & 1) << 5) | \
29 (((a6) & 1) << 6) | \
30 (((a7) & 1) << 7) )
31
32#define BM8X8(l0, l1, l2, l3, l4, l5, l6, l7) \
33 ( ((l7) << (0 * 8)) | \
34 ((l6) << (1 * 8)) | \
35 ((l5) << (2 * 8)) | \
36 ((l4) << (3 * 8)) | \
37 ((l3) << (4 * 8)) | \
38 ((l2) << (5 * 8)) | \
39 ((l1) << (6 * 8)) | \
40 ((l0) << (7 * 8)) )
41
42#define inc_le128(x, minus_one, tmp) \
43 vpcmpeqq minus_one, x, tmp; \
44 vpsubq minus_one, x, x; \
45 vpslldq $8, tmp, tmp; \
46 vpsubq tmp, x, x;
47
48#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \
49 vpand x, mask4bit, tmp0; \
50 vpandn x, mask4bit, x; \
51 vpsrld $4, x, x; \
52 \
53 vpshufb tmp0, lo_t, tmp0; \
54 vpshufb x, hi_t, x; \
55 vpxor tmp0, x, x;
56
57#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
58 vpunpckhdq x1, x0, t2; \
59 vpunpckldq x1, x0, x0; \
60 \
61 vpunpckldq x3, x2, t1; \
62 vpunpckhdq x3, x2, x2; \
63 \
64 vpunpckhqdq t1, x0, x1; \
65 vpunpcklqdq t1, x0, x0; \
66 \
67 vpunpckhqdq x2, t2, x3; \
68 vpunpcklqdq x2, t2, x2;
69
70#define byteslice_16x16b(a0, b0, c0, d0, \
71 a1, b1, c1, d1, \
72 a2, b2, c2, d2, \
73 a3, b3, c3, d3, \
74 st0, st1) \
75 vmovdqu d2, st0; \
76 vmovdqu d3, st1; \
77 transpose_4x4(a0, a1, a2, a3, d2, d3); \
78 transpose_4x4(b0, b1, b2, b3, d2, d3); \
79 vmovdqu st0, d2; \
80 vmovdqu st1, d3; \
81 \
82 vmovdqu a0, st0; \
83 vmovdqu a1, st1; \
84 transpose_4x4(c0, c1, c2, c3, a0, a1); \
85 transpose_4x4(d0, d1, d2, d3, a0, a1); \
86 \
87 vmovdqu .Lshufb_16x16b, a0; \
88 vmovdqu st1, a1; \
89 vpshufb a0, a2, a2; \
90 vpshufb a0, a3, a3; \
91 vpshufb a0, b0, b0; \
92 vpshufb a0, b1, b1; \
93 vpshufb a0, b2, b2; \
94 vpshufb a0, b3, b3; \
95 vpshufb a0, a1, a1; \
96 vpshufb a0, c0, c0; \
97 vpshufb a0, c1, c1; \
98 vpshufb a0, c2, c2; \
99 vpshufb a0, c3, c3; \
100 vpshufb a0, d0, d0; \
101 vpshufb a0, d1, d1; \
102 vpshufb a0, d2, d2; \
103 vpshufb a0, d3, d3; \
104 vmovdqu d3, st1; \
105 vmovdqu st0, d3; \
106 vpshufb a0, d3, a0; \
107 vmovdqu d2, st0; \
108 \
109 transpose_4x4(a0, b0, c0, d0, d2, d3); \
110 transpose_4x4(a1, b1, c1, d1, d2, d3); \
111 vmovdqu st0, d2; \
112 vmovdqu st1, d3; \
113 \
114 vmovdqu b0, st0; \
115 vmovdqu b1, st1; \
116 transpose_4x4(a2, b2, c2, d2, b0, b1); \
117 transpose_4x4(a3, b3, c3, d3, b0, b1); \
118 vmovdqu st0, b0; \
119 vmovdqu st1, b1; \
120 /* does not adjust output bytes inside vectors */
121
122#define debyteslice_16x16b(a0, b0, c0, d0, \
123 a1, b1, c1, d1, \
124 a2, b2, c2, d2, \
125 a3, b3, c3, d3, \
126 st0, st1) \
127 vmovdqu d2, st0; \
128 vmovdqu d3, st1; \
129 transpose_4x4(a0, a1, a2, a3, d2, d3); \
130 transpose_4x4(b0, b1, b2, b3, d2, d3); \
131 vmovdqu st0, d2; \
132 vmovdqu st1, d3; \
133 \
134 vmovdqu a0, st0; \
135 vmovdqu a1, st1; \
136 transpose_4x4(c0, c1, c2, c3, a0, a1); \
137 transpose_4x4(d0, d1, d2, d3, a0, a1); \
138 \
139 vmovdqu .Lshufb_16x16b, a0; \
140 vmovdqu st1, a1; \
141 vpshufb a0, a2, a2; \
142 vpshufb a0, a3, a3; \
143 vpshufb a0, b0, b0; \
144 vpshufb a0, b1, b1; \
145 vpshufb a0, b2, b2; \
146 vpshufb a0, b3, b3; \
147 vpshufb a0, a1, a1; \
148 vpshufb a0, c0, c0; \
149 vpshufb a0, c1, c1; \
150 vpshufb a0, c2, c2; \
151 vpshufb a0, c3, c3; \
152 vpshufb a0, d0, d0; \
153 vpshufb a0, d1, d1; \
154 vpshufb a0, d2, d2; \
155 vpshufb a0, d3, d3; \
156 vmovdqu d3, st1; \
157 vmovdqu st0, d3; \
158 vpshufb a0, d3, a0; \
159 vmovdqu d2, st0; \
160 \
161 transpose_4x4(c0, d0, a0, b0, d2, d3); \
162 transpose_4x4(c1, d1, a1, b1, d2, d3); \
163 vmovdqu st0, d2; \
164 vmovdqu st1, d3; \
165 \
166 vmovdqu b0, st0; \
167 vmovdqu b1, st1; \
168 transpose_4x4(c2, d2, a2, b2, b0, b1); \
169 transpose_4x4(c3, d3, a3, b3, b0, b1); \
170 vmovdqu st0, b0; \
171 vmovdqu st1, b1; \
172 /* does not adjust output bytes inside vectors */
173
174/* load blocks to registers and apply pre-whitening */
175#define inpack16_pre(x0, x1, x2, x3, \
176 x4, x5, x6, x7, \
177 y0, y1, y2, y3, \
178 y4, y5, y6, y7, \
179 rio) \
180 vmovdqu (0 * 16)(rio), x0; \
181 vmovdqu (1 * 16)(rio), x1; \
182 vmovdqu (2 * 16)(rio), x2; \
183 vmovdqu (3 * 16)(rio), x3; \
184 vmovdqu (4 * 16)(rio), x4; \
185 vmovdqu (5 * 16)(rio), x5; \
186 vmovdqu (6 * 16)(rio), x6; \
187 vmovdqu (7 * 16)(rio), x7; \
188 vmovdqu (8 * 16)(rio), y0; \
189 vmovdqu (9 * 16)(rio), y1; \
190 vmovdqu (10 * 16)(rio), y2; \
191 vmovdqu (11 * 16)(rio), y3; \
192 vmovdqu (12 * 16)(rio), y4; \
193 vmovdqu (13 * 16)(rio), y5; \
194 vmovdqu (14 * 16)(rio), y6; \
195 vmovdqu (15 * 16)(rio), y7;
196
197/* byteslice pre-whitened blocks and store to temporary memory */
198#define inpack16_post(x0, x1, x2, x3, \
199 x4, x5, x6, x7, \
200 y0, y1, y2, y3, \
201 y4, y5, y6, y7, \
202 mem_ab, mem_cd) \
203 byteslice_16x16b(x0, x1, x2, x3, \
204 x4, x5, x6, x7, \
205 y0, y1, y2, y3, \
206 y4, y5, y6, y7, \
207 (mem_ab), (mem_cd)); \
208 \
209 vmovdqu x0, 0 * 16(mem_ab); \
210 vmovdqu x1, 1 * 16(mem_ab); \
211 vmovdqu x2, 2 * 16(mem_ab); \
212 vmovdqu x3, 3 * 16(mem_ab); \
213 vmovdqu x4, 4 * 16(mem_ab); \
214 vmovdqu x5, 5 * 16(mem_ab); \
215 vmovdqu x6, 6 * 16(mem_ab); \
216 vmovdqu x7, 7 * 16(mem_ab); \
217 vmovdqu y0, 0 * 16(mem_cd); \
218 vmovdqu y1, 1 * 16(mem_cd); \
219 vmovdqu y2, 2 * 16(mem_cd); \
220 vmovdqu y3, 3 * 16(mem_cd); \
221 vmovdqu y4, 4 * 16(mem_cd); \
222 vmovdqu y5, 5 * 16(mem_cd); \
223 vmovdqu y6, 6 * 16(mem_cd); \
224 vmovdqu y7, 7 * 16(mem_cd);
225
226#define write_output(x0, x1, x2, x3, \
227 x4, x5, x6, x7, \
228 y0, y1, y2, y3, \
229 y4, y5, y6, y7, \
230 mem) \
231 vmovdqu x0, 0 * 16(mem); \
232 vmovdqu x1, 1 * 16(mem); \
233 vmovdqu x2, 2 * 16(mem); \
234 vmovdqu x3, 3 * 16(mem); \
235 vmovdqu x4, 4 * 16(mem); \
236 vmovdqu x5, 5 * 16(mem); \
237 vmovdqu x6, 6 * 16(mem); \
238 vmovdqu x7, 7 * 16(mem); \
239 vmovdqu y0, 8 * 16(mem); \
240 vmovdqu y1, 9 * 16(mem); \
241 vmovdqu y2, 10 * 16(mem); \
242 vmovdqu y3, 11 * 16(mem); \
243 vmovdqu y4, 12 * 16(mem); \
244 vmovdqu y5, 13 * 16(mem); \
245 vmovdqu y6, 14 * 16(mem); \
246 vmovdqu y7, 15 * 16(mem); \
247
248#define aria_store_state_8way(x0, x1, x2, x3, \
249 x4, x5, x6, x7, \
250 mem_tmp, idx) \
251 vmovdqu x0, ((idx + 0) * 16)(mem_tmp); \
252 vmovdqu x1, ((idx + 1) * 16)(mem_tmp); \
253 vmovdqu x2, ((idx + 2) * 16)(mem_tmp); \
254 vmovdqu x3, ((idx + 3) * 16)(mem_tmp); \
255 vmovdqu x4, ((idx + 4) * 16)(mem_tmp); \
256 vmovdqu x5, ((idx + 5) * 16)(mem_tmp); \
257 vmovdqu x6, ((idx + 6) * 16)(mem_tmp); \
258 vmovdqu x7, ((idx + 7) * 16)(mem_tmp);
259
260#define aria_load_state_8way(x0, x1, x2, x3, \
261 x4, x5, x6, x7, \
262 mem_tmp, idx) \
263 vmovdqu ((idx + 0) * 16)(mem_tmp), x0; \
264 vmovdqu ((idx + 1) * 16)(mem_tmp), x1; \
265 vmovdqu ((idx + 2) * 16)(mem_tmp), x2; \
266 vmovdqu ((idx + 3) * 16)(mem_tmp), x3; \
267 vmovdqu ((idx + 4) * 16)(mem_tmp), x4; \
268 vmovdqu ((idx + 5) * 16)(mem_tmp), x5; \
269 vmovdqu ((idx + 6) * 16)(mem_tmp), x6; \
270 vmovdqu ((idx + 7) * 16)(mem_tmp), x7;
271
272#define aria_ark_8way(x0, x1, x2, x3, \
273 x4, x5, x6, x7, \
274 t0, rk, idx, round) \
275 /* AddRoundKey */ \
276 vpbroadcastb ((round * 16) + idx + 3)(rk), t0; \
277 vpxor t0, x0, x0; \
278 vpbroadcastb ((round * 16) + idx + 2)(rk), t0; \
279 vpxor t0, x1, x1; \
280 vpbroadcastb ((round * 16) + idx + 1)(rk), t0; \
281 vpxor t0, x2, x2; \
282 vpbroadcastb ((round * 16) + idx + 0)(rk), t0; \
283 vpxor t0, x3, x3; \
284 vpbroadcastb ((round * 16) + idx + 7)(rk), t0; \
285 vpxor t0, x4, x4; \
286 vpbroadcastb ((round * 16) + idx + 6)(rk), t0; \
287 vpxor t0, x5, x5; \
288 vpbroadcastb ((round * 16) + idx + 5)(rk), t0; \
289 vpxor t0, x6, x6; \
290 vpbroadcastb ((round * 16) + idx + 4)(rk), t0; \
291 vpxor t0, x7, x7;
292
293#define aria_sbox_8way_gfni(x0, x1, x2, x3, \
294 x4, x5, x6, x7, \
295 t0, t1, t2, t3, \
296 t4, t5, t6, t7) \
297 vpbroadcastq .Ltf_s2_bitmatrix, t0; \
298 vpbroadcastq .Ltf_inv_bitmatrix, t1; \
299 vpbroadcastq .Ltf_id_bitmatrix, t2; \
300 vpbroadcastq .Ltf_aff_bitmatrix, t3; \
301 vpbroadcastq .Ltf_x2_bitmatrix, t4; \
302 vgf2p8affineinvqb $(tf_s2_const), t0, x1, x1; \
303 vgf2p8affineinvqb $(tf_s2_const), t0, x5, x5; \
304 vgf2p8affineqb $(tf_inv_const), t1, x2, x2; \
305 vgf2p8affineqb $(tf_inv_const), t1, x6, x6; \
306 vgf2p8affineinvqb $0, t2, x2, x2; \
307 vgf2p8affineinvqb $0, t2, x6, x6; \
308 vgf2p8affineinvqb $(tf_aff_const), t3, x0, x0; \
309 vgf2p8affineinvqb $(tf_aff_const), t3, x4, x4; \
310 vgf2p8affineqb $(tf_x2_const), t4, x3, x3; \
311 vgf2p8affineqb $(tf_x2_const), t4, x7, x7; \
312 vgf2p8affineinvqb $0, t2, x3, x3; \
313 vgf2p8affineinvqb $0, t2, x7, x7
314
315#define aria_sbox_8way(x0, x1, x2, x3, \
316 x4, x5, x6, x7, \
317 t0, t1, t2, t3, \
318 t4, t5, t6, t7) \
319 vpxor t7, t7, t7; \
320 vmovdqa .Linv_shift_row, t0; \
321 vmovdqa .Lshift_row, t1; \
322 vpbroadcastd .L0f0f0f0f, t6; \
323 vmovdqa .Ltf_lo__inv_aff__and__s2, t2; \
324 vmovdqa .Ltf_hi__inv_aff__and__s2, t3; \
325 vmovdqa .Ltf_lo__x2__and__fwd_aff, t4; \
326 vmovdqa .Ltf_hi__x2__and__fwd_aff, t5; \
327 \
328 vaesenclast t7, x0, x0; \
329 vaesenclast t7, x4, x4; \
330 vaesenclast t7, x1, x1; \
331 vaesenclast t7, x5, x5; \
332 vaesdeclast t7, x2, x2; \
333 vaesdeclast t7, x6, x6; \
334 \
335 /* AES inverse shift rows */ \
336 vpshufb t0, x0, x0; \
337 vpshufb t0, x4, x4; \
338 vpshufb t0, x1, x1; \
339 vpshufb t0, x5, x5; \
340 vpshufb t1, x3, x3; \
341 vpshufb t1, x7, x7; \
342 vpshufb t1, x2, x2; \
343 vpshufb t1, x6, x6; \
344 \
345 /* affine transformation for S2 */ \
346 filter_8bit(x1, t2, t3, t6, t0); \
347 /* affine transformation for S2 */ \
348 filter_8bit(x5, t2, t3, t6, t0); \
349 \
350 /* affine transformation for X2 */ \
351 filter_8bit(x3, t4, t5, t6, t0); \
352 /* affine transformation for X2 */ \
353 filter_8bit(x7, t4, t5, t6, t0); \
354 vaesdeclast t7, x3, x3; \
355 vaesdeclast t7, x7, x7;
356
357#define aria_diff_m(x0, x1, x2, x3, \
358 t0, t1, t2, t3) \
359 /* T = rotr32(X, 8); */ \
360 /* X ^= T */ \
361 vpxor x0, x3, t0; \
362 vpxor x1, x0, t1; \
363 vpxor x2, x1, t2; \
364 vpxor x3, x2, t3; \
365 /* X = T ^ rotr(X, 16); */ \
366 vpxor t2, x0, x0; \
367 vpxor x1, t3, t3; \
368 vpxor t0, x2, x2; \
369 vpxor t1, x3, x1; \
370 vmovdqu t3, x3;
371
372#define aria_diff_word(x0, x1, x2, x3, \
373 x4, x5, x6, x7, \
374 y0, y1, y2, y3, \
375 y4, y5, y6, y7) \
376 /* t1 ^= t2; */ \
377 vpxor y0, x4, x4; \
378 vpxor y1, x5, x5; \
379 vpxor y2, x6, x6; \
380 vpxor y3, x7, x7; \
381 \
382 /* t2 ^= t3; */ \
383 vpxor y4, y0, y0; \
384 vpxor y5, y1, y1; \
385 vpxor y6, y2, y2; \
386 vpxor y7, y3, y3; \
387 \
388 /* t0 ^= t1; */ \
389 vpxor x4, x0, x0; \
390 vpxor x5, x1, x1; \
391 vpxor x6, x2, x2; \
392 vpxor x7, x3, x3; \
393 \
394 /* t3 ^= t1; */ \
395 vpxor x4, y4, y4; \
396 vpxor x5, y5, y5; \
397 vpxor x6, y6, y6; \
398 vpxor x7, y7, y7; \
399 \
400 /* t2 ^= t0; */ \
401 vpxor x0, y0, y0; \
402 vpxor x1, y1, y1; \
403 vpxor x2, y2, y2; \
404 vpxor x3, y3, y3; \
405 \
406 /* t1 ^= t2; */ \
407 vpxor y0, x4, x4; \
408 vpxor y1, x5, x5; \
409 vpxor y2, x6, x6; \
410 vpxor y3, x7, x7;
411
412#define aria_fe(x0, x1, x2, x3, \
413 x4, x5, x6, x7, \
414 y0, y1, y2, y3, \
415 y4, y5, y6, y7, \
416 mem_tmp, rk, round) \
417 aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
418 y0, rk, 8, round); \
419 \
420 aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
421 y0, y1, y2, y3, y4, y5, y6, y7); \
422 \
423 aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
424 aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
425 aria_store_state_8way(x0, x1, x2, x3, \
426 x4, x5, x6, x7, \
427 mem_tmp, 8); \
428 \
429 aria_load_state_8way(x0, x1, x2, x3, \
430 x4, x5, x6, x7, \
431 mem_tmp, 0); \
432 aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
433 y0, rk, 0, round); \
434 \
435 aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
436 y0, y1, y2, y3, y4, y5, y6, y7); \
437 \
438 aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
439 aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
440 aria_store_state_8way(x0, x1, x2, x3, \
441 x4, x5, x6, x7, \
442 mem_tmp, 0); \
443 aria_load_state_8way(y0, y1, y2, y3, \
444 y4, y5, y6, y7, \
445 mem_tmp, 8); \
446 aria_diff_word(x0, x1, x2, x3, \
447 x4, x5, x6, x7, \
448 y0, y1, y2, y3, \
449 y4, y5, y6, y7); \
450 /* aria_diff_byte() \
451 * T3 = ABCD -> BADC \
452 * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6 \
453 * T0 = ABCD -> CDAB \
454 * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1 \
455 * T1 = ABCD -> DCBA \
456 * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4 \
457 */ \
458 aria_diff_word(x2, x3, x0, x1, \
459 x7, x6, x5, x4, \
460 y0, y1, y2, y3, \
461 y5, y4, y7, y6); \
462 aria_store_state_8way(x3, x2, x1, x0, \
463 x6, x7, x4, x5, \
464 mem_tmp, 0);
465
466#define aria_fo(x0, x1, x2, x3, \
467 x4, x5, x6, x7, \
468 y0, y1, y2, y3, \
469 y4, y5, y6, y7, \
470 mem_tmp, rk, round) \
471 aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
472 y0, rk, 8, round); \
473 \
474 aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
475 y0, y1, y2, y3, y4, y5, y6, y7); \
476 \
477 aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
478 aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
479 aria_store_state_8way(x0, x1, x2, x3, \
480 x4, x5, x6, x7, \
481 mem_tmp, 8); \
482 \
483 aria_load_state_8way(x0, x1, x2, x3, \
484 x4, x5, x6, x7, \
485 mem_tmp, 0); \
486 aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
487 y0, rk, 0, round); \
488 \
489 aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
490 y0, y1, y2, y3, y4, y5, y6, y7); \
491 \
492 aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
493 aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
494 aria_store_state_8way(x0, x1, x2, x3, \
495 x4, x5, x6, x7, \
496 mem_tmp, 0); \
497 aria_load_state_8way(y0, y1, y2, y3, \
498 y4, y5, y6, y7, \
499 mem_tmp, 8); \
500 aria_diff_word(x0, x1, x2, x3, \
501 x4, x5, x6, x7, \
502 y0, y1, y2, y3, \
503 y4, y5, y6, y7); \
504 /* aria_diff_byte() \
505 * T1 = ABCD -> BADC \
506 * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6 \
507 * T2 = ABCD -> CDAB \
508 * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1 \
509 * T3 = ABCD -> DCBA \
510 * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4 \
511 */ \
512 aria_diff_word(x0, x1, x2, x3, \
513 x5, x4, x7, x6, \
514 y2, y3, y0, y1, \
515 y7, y6, y5, y4); \
516 aria_store_state_8way(x3, x2, x1, x0, \
517 x6, x7, x4, x5, \
518 mem_tmp, 0);
519
520#define aria_ff(x0, x1, x2, x3, \
521 x4, x5, x6, x7, \
522 y0, y1, y2, y3, \
523 y4, y5, y6, y7, \
524 mem_tmp, rk, round, last_round) \
525 aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
526 y0, rk, 8, round); \
527 \
528 aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
529 y0, y1, y2, y3, y4, y5, y6, y7); \
530 \
531 aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
532 y0, rk, 8, last_round); \
533 \
534 aria_store_state_8way(x0, x1, x2, x3, \
535 x4, x5, x6, x7, \
536 mem_tmp, 8); \
537 \
538 aria_load_state_8way(x0, x1, x2, x3, \
539 x4, x5, x6, x7, \
540 mem_tmp, 0); \
541 aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
542 y0, rk, 0, round); \
543 \
544 aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
545 y0, y1, y2, y3, y4, y5, y6, y7); \
546 \
547 aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
548 y0, rk, 0, last_round); \
549 \
550 aria_load_state_8way(y0, y1, y2, y3, \
551 y4, y5, y6, y7, \
552 mem_tmp, 8);
553
#define aria_fe_gfni(x0, x1, x2, x3, \
                x4, x5, x6, x7, \
                y0, y1, y2, y3, \
                y4, y5, y6, y7, \
                mem_tmp, rk, round) \
        aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
                y0, rk, 8, round); \
        \
        aria_sbox_8way_gfni(x2, x3, x0, x1, \
                x6, x7, x4, x5, \
                y0, y1, y2, y3, \
                y4, y5, y6, y7); \
        \
        aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
        aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
        aria_store_state_8way(x0, x1, x2, x3, \
                x4, x5, x6, x7, \
                mem_tmp, 8); \
        \
        aria_load_state_8way(x0, x1, x2, x3, \
                x4, x5, x6, x7, \
                mem_tmp, 0); \
        aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
                y0, rk, 0, round); \
        \
        aria_sbox_8way_gfni(x2, x3, x0, x1, \
                x6, x7, x4, x5, \
                y0, y1, y2, y3, \
                y4, y5, y6, y7); \
        \
        aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
        aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
        aria_store_state_8way(x0, x1, x2, x3, \
                x4, x5, x6, x7, \
                mem_tmp, 0); \
        aria_load_state_8way(y0, y1, y2, y3, \
                y4, y5, y6, y7, \
                mem_tmp, 8); \
        aria_diff_word(x0, x1, x2, x3, \
                x4, x5, x6, x7, \
                y0, y1, y2, y3, \
                y4, y5, y6, y7); \
        /* aria_diff_byte() \
         * T3 = ABCD -> BADC \
         * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6 \
         * T0 = ABCD -> CDAB \
         * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1 \
         * T1 = ABCD -> DCBA \
         * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4 \
         */ \
        aria_diff_word(x2, x3, x0, x1, \
                x7, x6, x5, x4, \
                y0, y1, y2, y3, \
                y5, y4, y7, y6); \
        aria_store_state_8way(x3, x2, x1, x0, \
                x6, x7, x4, x5, \
                mem_tmp, 0);

#define aria_fo_gfni(x0, x1, x2, x3, \
                x4, x5, x6, x7, \
                y0, y1, y2, y3, \
                y4, y5, y6, y7, \
                mem_tmp, rk, round) \
        aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
                y0, rk, 8, round); \
        \
        aria_sbox_8way_gfni(x0, x1, x2, x3, \
                x4, x5, x6, x7, \
                y0, y1, y2, y3, \
                y4, y5, y6, y7); \
        \
        aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
        aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
        aria_store_state_8way(x0, x1, x2, x3, \
                x4, x5, x6, x7, \
                mem_tmp, 8); \
        \
        aria_load_state_8way(x0, x1, x2, x3, \
                x4, x5, x6, x7, \
                mem_tmp, 0); \
        aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
                y0, rk, 0, round); \
        \
        aria_sbox_8way_gfni(x0, x1, x2, x3, \
                x4, x5, x6, x7, \
                y0, y1, y2, y3, \
                y4, y5, y6, y7); \
        \
        aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
        aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
        aria_store_state_8way(x0, x1, x2, x3, \
                x4, x5, x6, x7, \
                mem_tmp, 0); \
        aria_load_state_8way(y0, y1, y2, y3, \
                y4, y5, y6, y7, \
                mem_tmp, 8); \
        aria_diff_word(x0, x1, x2, x3, \
                x4, x5, x6, x7, \
                y0, y1, y2, y3, \
                y4, y5, y6, y7); \
        /* aria_diff_byte() \
         * T1 = ABCD -> BADC \
         * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6 \
         * T2 = ABCD -> CDAB \
         * T2 = y0, y1, y2, y3 -> y2, y3, y0, y1 \
         * T3 = ABCD -> DCBA \
         * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4 \
         */ \
        aria_diff_word(x0, x1, x2, x3, \
                x5, x4, x7, x6, \
                y2, y3, y0, y1, \
                y7, y6, y5, y4); \
        aria_store_state_8way(x3, x2, x1, x0, \
                x6, x7, x4, x5, \
                mem_tmp, 0);

#define aria_ff_gfni(x0, x1, x2, x3, \
                x4, x5, x6, x7, \
                y0, y1, y2, y3, \
                y4, y5, y6, y7, \
                mem_tmp, rk, round, last_round) \
        aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
                y0, rk, 8, round); \
        \
        aria_sbox_8way_gfni(x2, x3, x0, x1, \
                x6, x7, x4, x5, \
                y0, y1, y2, y3, \
                y4, y5, y6, y7); \
        \
        aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
                y0, rk, 8, last_round); \
        \
        aria_store_state_8way(x0, x1, x2, x3, \
                x4, x5, x6, x7, \
                mem_tmp, 8); \
        \
        aria_load_state_8way(x0, x1, x2, x3, \
                x4, x5, x6, x7, \
                mem_tmp, 0); \
        aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
                y0, rk, 0, round); \
        \
        aria_sbox_8way_gfni(x2, x3, x0, x1, \
                x6, x7, x4, x5, \
                y0, y1, y2, y3, \
                y4, y5, y6, y7); \
        \
        aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
                y0, rk, 0, last_round); \
        \
        aria_load_state_8way(y0, y1, y2, y3, \
                y4, y5, y6, y7, \
                mem_tmp, 8);

/* NB: section is mergeable, all elements must be aligned 16-byte blocks */
.section .rodata.cst16, "aM", @progbits, 16
.align 16

#define SHUFB_BYTES(idx) \
        0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)

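/* Byte-shuffle pattern used by the (de)byteslice_16x16b macros. */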
.Lshufb_16x16b:
        .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3);
/* For isolating SubBytes from AESENCLAST, inverse shift row */
.Linv_shift_row:
        .byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
        .byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
.Lshift_row:
        .byte 0x00, 0x05, 0x0a, 0x0f, 0x04, 0x09, 0x0e, 0x03
        .byte 0x08, 0x0d, 0x02, 0x07, 0x0c, 0x01, 0x06, 0x0b
/* For CTR-mode IV byteswap */
.Lbswap128_mask:
        .byte 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08
        .byte 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00

/* AES inverse affine and S2 combined:
 * 1 1 0 0 0 0 0 1     x0     0
 * 0 1 0 0 1 0 0 0     x1     0
 * 1 1 0 0 1 1 1 1     x2     0
 * 0 1 1 0 1 0 0 1     x3     1
 * 0 1 0 0 1 1 0 0  *  x4  +  0
 * 0 1 0 1 1 0 0 0     x5     0
 * 0 0 0 0 0 1 0 1     x6     0
 * 1 1 1 0 0 1 1 1     x7     1
 */
.Ltf_lo__inv_aff__and__s2:
        .octa 0x92172DA81A9FA520B2370D883ABF8500
.Ltf_hi__inv_aff__and__s2:
        .octa 0x2B15FFC1AF917B45E6D8320C625CB688

/* X2 and AES forward affine combined:
 * 1 0 1 1 0 0 0 1     x0     0
 * 0 1 1 1 1 0 1 1     x1     0
 * 0 0 0 1 1 0 1 0     x2     1
 * 0 1 0 0 0 1 0 0     x3     0
 * 0 0 1 1 1 0 1 1  *  x4  +  0
 * 0 1 0 0 1 0 0 0     x5     0
 * 1 1 0 1 0 0 1 1     x6     0
 * 0 1 0 0 1 0 1 0     x7     0
 */
.Ltf_lo__x2__and__fwd_aff:
        .octa 0xEFAE0544FCBD1657B8F95213ABEA4100
.Ltf_hi__x2__and__fwd_aff:
        .octa 0x3F893781E95FE1576CDA64D2BA0CB204

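/*
 * The 8x8 bit-matrices below are the affine-transform operands used by
 * the aria_sbox_8way_gfni path (GFNI affine instructions).
 */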
.section .rodata.cst8, "aM", @progbits, 8
.align 8
/* AES affine: */
#define tf_aff_const BV8(1, 1, 0, 0, 0, 1, 1, 0)
.Ltf_aff_bitmatrix:
        .quad BM8X8(BV8(1, 0, 0, 0, 1, 1, 1, 1),
                BV8(1, 1, 0, 0, 0, 1, 1, 1),
                BV8(1, 1, 1, 0, 0, 0, 1, 1),
                BV8(1, 1, 1, 1, 0, 0, 0, 1),
                BV8(1, 1, 1, 1, 1, 0, 0, 0),
                BV8(0, 1, 1, 1, 1, 1, 0, 0),
                BV8(0, 0, 1, 1, 1, 1, 1, 0),
                BV8(0, 0, 0, 1, 1, 1, 1, 1))

/* AES inverse affine: */
#define tf_inv_const BV8(1, 0, 1, 0, 0, 0, 0, 0)
.Ltf_inv_bitmatrix:
        .quad BM8X8(BV8(0, 0, 1, 0, 0, 1, 0, 1),
                BV8(1, 0, 0, 1, 0, 0, 1, 0),
                BV8(0, 1, 0, 0, 1, 0, 0, 1),
                BV8(1, 0, 1, 0, 0, 1, 0, 0),
                BV8(0, 1, 0, 1, 0, 0, 1, 0),
                BV8(0, 0, 1, 0, 1, 0, 0, 1),
                BV8(1, 0, 0, 1, 0, 1, 0, 0),
                BV8(0, 1, 0, 0, 1, 0, 1, 0))

/* S2: */
#define tf_s2_const BV8(0, 1, 0, 0, 0, 1, 1, 1)
.Ltf_s2_bitmatrix:
        .quad BM8X8(BV8(0, 1, 0, 1, 0, 1, 1, 1),
                BV8(0, 0, 1, 1, 1, 1, 1, 1),
                BV8(1, 1, 1, 0, 1, 1, 0, 1),
                BV8(1, 1, 0, 0, 0, 0, 1, 1),
                BV8(0, 1, 0, 0, 0, 0, 1, 1),
                BV8(1, 1, 0, 0, 1, 1, 1, 0),
                BV8(0, 1, 1, 0, 0, 0, 1, 1),
                BV8(1, 1, 1, 1, 0, 1, 1, 0))

/* X2: */
#define tf_x2_const BV8(0, 0, 1, 1, 0, 1, 0, 0)
.Ltf_x2_bitmatrix:
        .quad BM8X8(BV8(0, 0, 0, 1, 1, 0, 0, 0),
                BV8(0, 0, 1, 0, 0, 1, 1, 0),
                BV8(0, 0, 0, 0, 1, 0, 1, 0),
                BV8(1, 1, 1, 0, 0, 0, 1, 1),
                BV8(1, 1, 1, 0, 1, 1, 0, 0),
                BV8(0, 1, 1, 0, 1, 0, 1, 1),
                BV8(1, 0, 1, 1, 1, 1, 0, 1),
                BV8(1, 0, 0, 1, 0, 0, 1, 1))

/* Identity matrix: */
.Ltf_id_bitmatrix:
        .quad BM8X8(BV8(1, 0, 0, 0, 0, 0, 0, 0),
                BV8(0, 1, 0, 0, 0, 0, 0, 0),
                BV8(0, 0, 1, 0, 0, 0, 0, 0),
                BV8(0, 0, 0, 1, 0, 0, 0, 0),
                BV8(0, 0, 0, 0, 1, 0, 0, 0),
                BV8(0, 0, 0, 0, 0, 1, 0, 0),
                BV8(0, 0, 0, 0, 0, 0, 1, 0),
                BV8(0, 0, 0, 0, 0, 0, 0, 1))

/* 4-bit mask */
.section .rodata.cst4.L0f0f0f0f, "aM", @progbits, 4
.align 4
.L0f0f0f0f:
        .long 0x0f0f0f0f

.text

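/*
 * Core 16-way ARIA transform, AES-NI/AVX S-box path: runs the full round
 * schedule on 16 byte-sliced blocks and picks the 12-, 14- or 16-round
 * variant (ARIA-128/192/256) from rounds(CTX).
 */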
SYM_FUNC_START_LOCAL(__aria_aesni_avx_crypt_16way)
        /* input:
         * %r9: rk
         * %rsi: dst
         * %rdx: src
         * %xmm0..%xmm15: 16 byte-sliced blocks
         */

        FRAME_BEGIN

        movq %rsi, %rax;
        leaq 8 * 16(%rax), %r8;

        inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
                %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
                %xmm15, %rax, %r8);
        aria_fo(%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15,
                %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
                %rax, %r9, 0);
        aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
                %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
                %xmm15, %rax, %r9, 1);
        aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
                %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
                %rax, %r9, 2);
        aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
                %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
                %xmm15, %rax, %r9, 3);
        aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
                %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
                %rax, %r9, 4);
        aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
                %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
                %xmm15, %rax, %r9, 5);
        aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
                %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
                %rax, %r9, 6);
        aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
                %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
                %xmm15, %rax, %r9, 7);
        aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
                %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
                %rax, %r9, 8);
        aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
                %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
                %xmm15, %rax, %r9, 9);
        aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
                %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
                %rax, %r9, 10);
        cmpl $12, rounds(CTX);
        jne .Laria_192;
        aria_ff(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
                %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
                %xmm15, %rax, %r9, 11, 12);
        jmp .Laria_end;
.Laria_192:
        aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
                %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
                %xmm15, %rax, %r9, 11);
        aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
                %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
                %rax, %r9, 12);
        cmpl $14, rounds(CTX);
        jne .Laria_256;
        aria_ff(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
                %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
                %xmm15, %rax, %r9, 13, 14);
        jmp .Laria_end;
.Laria_256:
        aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
                %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
                %xmm15, %rax, %r9, 13);
        aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
                %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
                %rax, %r9, 14);
        aria_ff(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
                %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
                %xmm15, %rax, %r9, 15, 16);
.Laria_end:
        debyteslice_16x16b(%xmm8, %xmm12, %xmm1, %xmm4,
                %xmm9, %xmm13, %xmm0, %xmm5,
                %xmm10, %xmm14, %xmm3, %xmm6,
                %xmm11, %xmm15, %xmm2, %xmm7,
                (%rax), (%r8));

        FRAME_END
        RET;
SYM_FUNC_END(__aria_aesni_avx_crypt_16way)

SYM_TYPED_FUNC_START(aria_aesni_avx_encrypt_16way)
        /* input:
         * %rdi: ctx, CTX
         * %rsi: dst
         * %rdx: src
         */

        FRAME_BEGIN

        leaq enc_key(CTX), %r9;

        inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
                %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
                %xmm15, %rdx);

        call __aria_aesni_avx_crypt_16way;

        write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
                %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
                %xmm15, %rax);

        FRAME_END
        RET;
SYM_FUNC_END(aria_aesni_avx_encrypt_16way)

SYM_TYPED_FUNC_START(aria_aesni_avx_decrypt_16way)
        /* input:
         * %rdi: ctx, CTX
         * %rsi: dst
         * %rdx: src
         */

        FRAME_BEGIN

        leaq dec_key(CTX), %r9;

        inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
                %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
                %xmm15, %rdx);

        call __aria_aesni_avx_crypt_16way;

        write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
                %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
                %xmm15, %rax);

        FRAME_END
        RET;
SYM_FUNC_END(aria_aesni_avx_decrypt_16way)

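/*
 * Build 16 consecutive counter blocks from the big-endian 128-bit IV at
 * (%r8): blocks 0-7 are staged through the keystream buffer at (%rcx) and
 * returned in %xmm0..%xmm7, blocks 8-15 are returned in %xmm8..%xmm15,
 * and the IV advanced by 16 is stored back to (%r8).
 */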
SYM_FUNC_START_LOCAL(__aria_aesni_avx_ctr_gen_keystream_16way)
        /* input:
         * %rdi: ctx
         * %rsi: dst
         * %rdx: src
         * %rcx: keystream
         * %r8: iv (big endian, 128bit)
         */

        FRAME_BEGIN
        /* load IV and byteswap */
        vmovdqu (%r8), %xmm8;

        vmovdqa .Lbswap128_mask (%rip), %xmm1;
        vpshufb %xmm1, %xmm8, %xmm3; /* be => le */

        vpcmpeqd %xmm0, %xmm0, %xmm0;
        vpsrldq $8, %xmm0, %xmm0; /* low: -1, high: 0 */

        /* construct IVs */
        inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
        vpshufb %xmm1, %xmm3, %xmm9;
        inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
        vpshufb %xmm1, %xmm3, %xmm10;
        inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
        vpshufb %xmm1, %xmm3, %xmm11;
        inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
        vpshufb %xmm1, %xmm3, %xmm12;
        inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
        vpshufb %xmm1, %xmm3, %xmm13;
        inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
        vpshufb %xmm1, %xmm3, %xmm14;
        inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
        vpshufb %xmm1, %xmm3, %xmm15;
        vmovdqu %xmm8, (0 * 16)(%rcx);
        vmovdqu %xmm9, (1 * 16)(%rcx);
        vmovdqu %xmm10, (2 * 16)(%rcx);
        vmovdqu %xmm11, (3 * 16)(%rcx);
        vmovdqu %xmm12, (4 * 16)(%rcx);
        vmovdqu %xmm13, (5 * 16)(%rcx);
        vmovdqu %xmm14, (6 * 16)(%rcx);
        vmovdqu %xmm15, (7 * 16)(%rcx);

        inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
        vpshufb %xmm1, %xmm3, %xmm8;
        inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
        vpshufb %xmm1, %xmm3, %xmm9;
        inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
        vpshufb %xmm1, %xmm3, %xmm10;
        inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
        vpshufb %xmm1, %xmm3, %xmm11;
        inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
        vpshufb %xmm1, %xmm3, %xmm12;
        inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
        vpshufb %xmm1, %xmm3, %xmm13;
        inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
        vpshufb %xmm1, %xmm3, %xmm14;
        inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
        vpshufb %xmm1, %xmm3, %xmm15;
        inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
        vpshufb %xmm1, %xmm3, %xmm4;
        vmovdqu %xmm4, (%r8);

        vmovdqu (0 * 16)(%rcx), %xmm0;
        vmovdqu (1 * 16)(%rcx), %xmm1;
        vmovdqu (2 * 16)(%rcx), %xmm2;
        vmovdqu (3 * 16)(%rcx), %xmm3;
        vmovdqu (4 * 16)(%rcx), %xmm4;
        vmovdqu (5 * 16)(%rcx), %xmm5;
        vmovdqu (6 * 16)(%rcx), %xmm6;
        vmovdqu (7 * 16)(%rcx), %xmm7;

        FRAME_END
        RET;
SYM_FUNC_END(__aria_aesni_avx_ctr_gen_keystream_16way)

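/*
 * CTR mode, AES-NI S-box path: generate the counter blocks, encrypt them
 * in place with the 16-way core, then XOR the result with the source data.
 */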
SYM_TYPED_FUNC_START(aria_aesni_avx_ctr_crypt_16way)
        /* input:
         * %rdi: ctx
         * %rsi: dst
         * %rdx: src
         * %rcx: keystream
         * %r8: iv (big endian, 128bit)
         */
        FRAME_BEGIN

        call __aria_aesni_avx_ctr_gen_keystream_16way;

        leaq (%rsi), %r10;
        leaq (%rdx), %r11;
        leaq (%rcx), %rsi;
        leaq (%rcx), %rdx;
        leaq enc_key(CTX), %r9;

        call __aria_aesni_avx_crypt_16way;

        vpxor (0 * 16)(%r11), %xmm1, %xmm1;
        vpxor (1 * 16)(%r11), %xmm0, %xmm0;
        vpxor (2 * 16)(%r11), %xmm3, %xmm3;
        vpxor (3 * 16)(%r11), %xmm2, %xmm2;
        vpxor (4 * 16)(%r11), %xmm4, %xmm4;
        vpxor (5 * 16)(%r11), %xmm5, %xmm5;
        vpxor (6 * 16)(%r11), %xmm6, %xmm6;
        vpxor (7 * 16)(%r11), %xmm7, %xmm7;
        vpxor (8 * 16)(%r11), %xmm8, %xmm8;
        vpxor (9 * 16)(%r11), %xmm9, %xmm9;
        vpxor (10 * 16)(%r11), %xmm10, %xmm10;
        vpxor (11 * 16)(%r11), %xmm11, %xmm11;
        vpxor (12 * 16)(%r11), %xmm12, %xmm12;
        vpxor (13 * 16)(%r11), %xmm13, %xmm13;
        vpxor (14 * 16)(%r11), %xmm14, %xmm14;
        vpxor (15 * 16)(%r11), %xmm15, %xmm15;
        write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
                %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
                %xmm15, %r10);

        FRAME_END
        RET;
SYM_FUNC_END(aria_aesni_avx_ctr_crypt_16way)

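/*
 * Core 16-way ARIA transform, GFNI S-box path: same round schedule and
 * register usage as __aria_aesni_avx_crypt_16way above.
 */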
SYM_FUNC_START_LOCAL(__aria_aesni_avx_gfni_crypt_16way)
        /* input:
         * %r9: rk
         * %rsi: dst
         * %rdx: src
         * %xmm0..%xmm15: 16 byte-sliced blocks
         */

        FRAME_BEGIN

        movq %rsi, %rax;
        leaq 8 * 16(%rax), %r8;

        inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3,
                %xmm4, %xmm5, %xmm6, %xmm7,
                %xmm8, %xmm9, %xmm10, %xmm11,
                %xmm12, %xmm13, %xmm14,
                %xmm15, %rax, %r8);
        aria_fo_gfni(%xmm8, %xmm9, %xmm10, %xmm11,
                %xmm12, %xmm13, %xmm14, %xmm15,
                %xmm0, %xmm1, %xmm2, %xmm3,
                %xmm4, %xmm5, %xmm6, %xmm7,
                %rax, %r9, 0);
        aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
                %xmm4, %xmm5, %xmm6, %xmm7,
                %xmm8, %xmm9, %xmm10, %xmm11,
                %xmm12, %xmm13, %xmm14,
                %xmm15, %rax, %r9, 1);
        aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
                %xmm12, %xmm13, %xmm14, %xmm15,
                %xmm0, %xmm1, %xmm2, %xmm3,
                %xmm4, %xmm5, %xmm6, %xmm7,
                %rax, %r9, 2);
        aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
                %xmm4, %xmm5, %xmm6, %xmm7,
                %xmm8, %xmm9, %xmm10, %xmm11,
                %xmm12, %xmm13, %xmm14,
                %xmm15, %rax, %r9, 3);
        aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
                %xmm12, %xmm13, %xmm14, %xmm15,
                %xmm0, %xmm1, %xmm2, %xmm3,
                %xmm4, %xmm5, %xmm6, %xmm7,
                %rax, %r9, 4);
        aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
                %xmm4, %xmm5, %xmm6, %xmm7,
                %xmm8, %xmm9, %xmm10, %xmm11,
                %xmm12, %xmm13, %xmm14,
                %xmm15, %rax, %r9, 5);
        aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
                %xmm12, %xmm13, %xmm14, %xmm15,
                %xmm0, %xmm1, %xmm2, %xmm3,
                %xmm4, %xmm5, %xmm6, %xmm7,
                %rax, %r9, 6);
        aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
                %xmm4, %xmm5, %xmm6, %xmm7,
                %xmm8, %xmm9, %xmm10, %xmm11,
                %xmm12, %xmm13, %xmm14,
                %xmm15, %rax, %r9, 7);
        aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
                %xmm12, %xmm13, %xmm14, %xmm15,
                %xmm0, %xmm1, %xmm2, %xmm3,
                %xmm4, %xmm5, %xmm6, %xmm7,
                %rax, %r9, 8);
        aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
                %xmm4, %xmm5, %xmm6, %xmm7,
                %xmm8, %xmm9, %xmm10, %xmm11,
                %xmm12, %xmm13, %xmm14,
                %xmm15, %rax, %r9, 9);
        aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
                %xmm12, %xmm13, %xmm14, %xmm15,
                %xmm0, %xmm1, %xmm2, %xmm3,
                %xmm4, %xmm5, %xmm6, %xmm7,
                %rax, %r9, 10);
        cmpl $12, rounds(CTX);
        jne .Laria_gfni_192;
        aria_ff_gfni(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
                %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
                %xmm15, %rax, %r9, 11, 12);
        jmp .Laria_gfni_end;
.Laria_gfni_192:
        aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
                %xmm4, %xmm5, %xmm6, %xmm7,
                %xmm8, %xmm9, %xmm10, %xmm11,
                %xmm12, %xmm13, %xmm14,
                %xmm15, %rax, %r9, 11);
        aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
                %xmm12, %xmm13, %xmm14, %xmm15,
                %xmm0, %xmm1, %xmm2, %xmm3,
                %xmm4, %xmm5, %xmm6, %xmm7,
                %rax, %r9, 12);
        cmpl $14, rounds(CTX);
        jne .Laria_gfni_256;
        aria_ff_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
                %xmm4, %xmm5, %xmm6, %xmm7,
                %xmm8, %xmm9, %xmm10, %xmm11,
                %xmm12, %xmm13, %xmm14,
                %xmm15, %rax, %r9, 13, 14);
        jmp .Laria_gfni_end;
.Laria_gfni_256:
        aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
                %xmm4, %xmm5, %xmm6, %xmm7,
                %xmm8, %xmm9, %xmm10, %xmm11,
                %xmm12, %xmm13, %xmm14,
                %xmm15, %rax, %r9, 13);
        aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
                %xmm12, %xmm13, %xmm14, %xmm15,
                %xmm0, %xmm1, %xmm2, %xmm3,
                %xmm4, %xmm5, %xmm6, %xmm7,
                %rax, %r9, 14);
        aria_ff_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
                %xmm4, %xmm5, %xmm6, %xmm7,
                %xmm8, %xmm9, %xmm10, %xmm11,
                %xmm12, %xmm13, %xmm14,
                %xmm15, %rax, %r9, 15, 16);
.Laria_gfni_end:
        debyteslice_16x16b(%xmm8, %xmm12, %xmm1, %xmm4,
                %xmm9, %xmm13, %xmm0, %xmm5,
                %xmm10, %xmm14, %xmm3, %xmm6,
                %xmm11, %xmm15, %xmm2, %xmm7,
                (%rax), (%r8));

        FRAME_END
        RET;
SYM_FUNC_END(__aria_aesni_avx_gfni_crypt_16way)

SYM_TYPED_FUNC_START(aria_aesni_avx_gfni_encrypt_16way)
        /* input:
         * %rdi: ctx, CTX
         * %rsi: dst
         * %rdx: src
         */

        FRAME_BEGIN

        leaq enc_key(CTX), %r9;

        inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
                %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
                %xmm15, %rdx);

        call __aria_aesni_avx_gfni_crypt_16way;

        write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
                %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
                %xmm15, %rax);

        FRAME_END
        RET;
SYM_FUNC_END(aria_aesni_avx_gfni_encrypt_16way)

SYM_TYPED_FUNC_START(aria_aesni_avx_gfni_decrypt_16way)
        /* input:
         * %rdi: ctx, CTX
         * %rsi: dst
         * %rdx: src
         */

        FRAME_BEGIN

        leaq dec_key(CTX), %r9;

        inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
                %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
                %xmm15, %rdx);

        call __aria_aesni_avx_gfni_crypt_16way;

        write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
                %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
                %xmm15, %rax);

        FRAME_END
        RET;
SYM_FUNC_END(aria_aesni_avx_gfni_decrypt_16way)

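/* CTR mode, GFNI S-box path: keystream generation and XOR as above. */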
SYM_TYPED_FUNC_START(aria_aesni_avx_gfni_ctr_crypt_16way)
        /* input:
         * %rdi: ctx
         * %rsi: dst
         * %rdx: src
         * %rcx: keystream
         * %r8: iv (big endian, 128bit)
         */
        FRAME_BEGIN

        call __aria_aesni_avx_ctr_gen_keystream_16way;

        leaq (%rsi), %r10;
        leaq (%rdx), %r11;
        leaq (%rcx), %rsi;
        leaq (%rcx), %rdx;
        leaq enc_key(CTX), %r9;

        call __aria_aesni_avx_gfni_crypt_16way;

        vpxor (0 * 16)(%r11), %xmm1, %xmm1;
        vpxor (1 * 16)(%r11), %xmm0, %xmm0;
        vpxor (2 * 16)(%r11), %xmm3, %xmm3;
        vpxor (3 * 16)(%r11), %xmm2, %xmm2;
        vpxor (4 * 16)(%r11), %xmm4, %xmm4;
        vpxor (5 * 16)(%r11), %xmm5, %xmm5;
        vpxor (6 * 16)(%r11), %xmm6, %xmm6;
        vpxor (7 * 16)(%r11), %xmm7, %xmm7;
        vpxor (8 * 16)(%r11), %xmm8, %xmm8;
        vpxor (9 * 16)(%r11), %xmm9, %xmm9;
        vpxor (10 * 16)(%r11), %xmm10, %xmm10;
        vpxor (11 * 16)(%r11), %xmm11, %xmm11;
        vpxor (12 * 16)(%r11), %xmm12, %xmm12;
        vpxor (13 * 16)(%r11), %xmm13, %xmm13;
        vpxor (14 * 16)(%r11), %xmm14, %xmm14;
        vpxor (15 * 16)(%r11), %xmm15, %xmm15;
        write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
                %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
                %xmm15, %r10);

        FRAME_END
        RET;
SYM_FUNC_END(aria_aesni_avx_gfni_ctr_crypt_16way)