Loading...
1/* SPDX-License-Identifier: GPL-2.0-or-later */
2/*
3 * Camellia Cipher Algorithm (x86_64)
4 *
5 * Copyright (C) 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
6 */
7
8#include <linux/linkage.h>
9
10.file "camellia-x86_64-asm_64.S"
11.text
12
/*
 * Lookup tables defined outside this file.  The round macros below index
 * them with a byte value scaled by 8 and XOR the loaded quadword into the
 * cipher state.
 */
13.extern camellia_sp10011110;
14.extern camellia_sp22000222;
15.extern camellia_sp03303033;
16.extern camellia_sp00444404;
17.extern camellia_sp02220222;
18.extern camellia_sp30333033;
19.extern camellia_sp44044404;
20.extern camellia_sp11101110;
21
/* Short local aliases for the table symbols, used by roundsm/roundsm2. */
22#define sp10011110 camellia_sp10011110
23#define sp22000222 camellia_sp22000222
24#define sp03303033 camellia_sp03303033
25#define sp00444404 camellia_sp00444404
26#define sp02220222 camellia_sp02220222
27#define sp30333033 camellia_sp30333033
28#define sp44044404 camellia_sp44044404
29#define sp11101110 camellia_sp11101110
30
/* Byte length of the subkey table; key_length is read right after it. */
31#define CAMELLIA_TABLE_BYTE_LEN 272
32
33/* struct camellia_ctx: */
/* Byte offsets relative to CTX: 64-bit subkeys start at 0. */
34#define key_table 0
35#define key_length CAMELLIA_TABLE_BYTE_LEN
36
37/* register macros */
/*
 * CTX is the context pointer.  RIO is the current input/output pointer; it
 * shares %rsi with the scratch register RT0, so it is only valid until the
 * first round macro runs and must be reloaded before output is written.
 */
38#define CTX %rdi
39#define RIO %rsi
40#define RIOd %esi
41
/* Cipher state: RAB0/RCD0 hold block 0, RAB1/RCD1 block 1 (2-way code only). */
42#define RAB0 %rax
43#define RCD0 %rcx
44#define RAB1 %rbx
45#define RCD1 %rdx
46
/* 32-bit views of the state registers. */
47#define RAB0d %eax
48#define RCD0d %ecx
49#define RAB1d %ebx
50#define RCD1d %edx
51
/* Bits 0..7 of the state registers. */
52#define RAB0bl %al
53#define RCD0bl %cl
54#define RAB1bl %bl
55#define RCD1bl %dl
56
/* Bits 8..15 of the state registers. */
57#define RAB0bh %ah
58#define RCD0bh %ch
59#define RAB1bh %bh
60#define RCD1bh %dh
61
/*
 * Scratch registers.  RT1 is %r12, which the entry points park in RR12 and
 * restore before returning.
 */
62#define RT0 %rsi
63#define RT1 %r12
64#define RT2 %r8
65
66#define RT0d %esi
67#define RT1d %r12d
68#define RT2d %r8d
69
70#define RT2bl %r8b
71
/*
 * RXOR keeps the 'xor' argument (the 2-way decrypt path reuses it to park
 * %rbx), RR12 keeps the caller's %r12, RDST keeps the destination pointer.
 */
72#define RXOR %r9
73#define RR12 %r10
74#define RDST %r11
75
76#define RXORd %r9d
77#define RXORbl %r9b
78
/*
 * xor2ror16(T0, T1, tmp1, tmp2, ab, dst)
 *
 * dst ^= T0[bits 0..7 of ab] ^ T1[bits 8..15 of ab], with 8-byte table
 * entries; ab is then rotated right by 16 so the next two bytes become the
 * low bytes.  Each table base is first loaded RIP-relative with leaq and
 * then combined with the index register.  The low-byte index goes through
 * tmp2 and the high-byte index through tmp1.  Clobbers tmp1 and tmp2.
 */
79#define xor2ror16(T0, T1, tmp1, tmp2, ab, dst) \
80 leaq T0(%rip), tmp1; \
81 movzbl ab ## bl, tmp2 ## d; \
82 xorq (tmp1, tmp2, 8), dst; \
83 leaq T1(%rip), tmp2; \
84 movzbl ab ## bh, tmp1 ## d; \
85 rorq $16, ab; \
86 xorq (tmp2, tmp1, 8), dst;
87
88/**********************************************************************
89 1-way camellia
90 **********************************************************************/
/*
 * roundsm(ab, subkey, cd): one round on a single block.
 *
 * Loads the 64-bit subkey at index 'subkey' into RT2, then consumes all
 * eight bytes of ab ## 0 in four xor2ror16 steps that alternately
 * accumulate into cd ## 0 and RT2, and finally folds RT2 into cd ## 0.
 * The four 16-bit rotations total 64 bits, so ab ## 0 ends up unchanged.
 * Clobbers RT0, RT1, RT2.
 */
91#define roundsm(ab, subkey, cd) \
92 movq (key_table + ((subkey) * 2) * 4)(CTX), RT2; \
93 \
94 xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 0, cd ## 0); \
95 xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 0, RT2); \
96 xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 0, cd ## 0); \
97 xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 0, RT2); \
98 \
99 xorq RT2, cd ## 0;
100
/*
 * fls(l, r, kl, kr): key-dependent mixing applied between round groups
 * (presumably Camellia's FL / FL^-1 layer -- confirm against the spec).
 *
 * With lo()/hi() the low/high 32 bits of a quadword and K[i] the 64-bit
 * subkey at index i, the following updates are applied in this order:
 *   hi(l) ^= rol32(lo(K[kl]) & lo(l), 1)
 *   lo(r) ^= hi(K[kr]) | hi(r)
 *   lo(l) ^= hi(K[kl]) | hi(l)
 *   hi(r) ^= rol32(lo(K[kr]) & lo(r), 1)
 * l and r name register pairs; only the ## 0 member is touched.
 * Clobbers RT0, RT1, RT2.
 */
101#define fls(l, r, kl, kr) \
102 movl (key_table + ((kl) * 2) * 4)(CTX), RT0d; \
103 andl l ## 0d, RT0d; \
104 roll $1, RT0d; \
105 shlq $32, RT0; \
106 xorq RT0, l ## 0; \
107 movq (key_table + ((kr) * 2) * 4)(CTX), RT1; \
108 orq r ## 0, RT1; \
109 shrq $32, RT1; \
110 xorq RT1, r ## 0; \
111 \
112 movq (key_table + ((kl) * 2) * 4)(CTX), RT2; \
113 orq l ## 0, RT2; \
114 shrq $32, RT2; \
115 xorq RT2, l ## 0; \
116 movl (key_table + ((kr) * 2) * 4)(CTX), RT0d; \
117 andl r ## 0d, RT0d; \
118 roll $1, RT0d; \
119 shlq $32, RT0; \
120 xorq RT0, r ## 0;
121
/*
 * enc_rounds(i): six rounds with subkeys i+2 .. i+7 in ascending order,
 * swapping the roles of RAB and RCD after every round.
 */
122#define enc_rounds(i) \
123 roundsm(RAB, i + 2, RCD); \
124 roundsm(RCD, i + 3, RAB); \
125 roundsm(RAB, i + 4, RCD); \
126 roundsm(RCD, i + 5, RAB); \
127 roundsm(RAB, i + 6, RCD); \
128 roundsm(RCD, i + 7, RAB);
129
/* enc_fls(i): fls step with subkey i for the RAB half and i+1 for RCD. */
130#define enc_fls(i) \
131 fls(RAB, RCD, i + 0, i + 1);
132
/*
 * enc_inpack(): load the 16-byte block at RIO into RAB0 (bytes 0-7) and
 * RCD0 (bytes 8-15).  Each quadword is byte-swapped and rotated by 32
 * (exchanging its 32-bit halves); subkey 0 is then XORed into RAB0.
 */
133#define enc_inpack() \
134 movq (RIO), RAB0; \
135 bswapq RAB0; \
136 rolq $32, RAB0; \
137 movq 4*2(RIO), RCD0; \
138 bswapq RCD0; \
139 rorq $32, RCD0; \
140 xorq key_table(CTX), RAB0;
141
/*
 * enc_outunpack(op, max): XOR the subkey at index 'max' (a register) into
 * RCD0, then rotate by 32 and byte-swap both halves and write RCD0 to
 * bytes 0-7 and RAB0 to bytes 8-15 of the block at RIO.  'op' is pasted
 * with 'q': mov overwrites the destination, xor XORs into it.
 */
142#define enc_outunpack(op, max) \
143 xorq key_table(CTX, max, 8), RCD0; \
144 rorq $32, RCD0; \
145 bswapq RCD0; \
146 op ## q RCD0, (RIO); \
147 rolq $32, RAB0; \
148 bswapq RAB0; \
149 op ## q RAB0, 4*2(RIO);
150
/*
 * dec_rounds(i): six rounds with subkeys i+7 down to i+2, swapping the
 * roles of RAB and RCD after every round.
 */
151#define dec_rounds(i) \
152 roundsm(RAB, i + 7, RCD); \
153 roundsm(RCD, i + 6, RAB); \
154 roundsm(RAB, i + 5, RCD); \
155 roundsm(RCD, i + 4, RAB); \
156 roundsm(RAB, i + 3, RCD); \
157 roundsm(RCD, i + 2, RAB);
158
/* dec_fls(i): fls step with subkey i+1 for the RAB half and i for RCD. */
159#define dec_fls(i) \
160 fls(RAB, RCD, i + 1, i + 0);
161
/*
 * dec_inpack(max): load the 16-byte block at RIO into RAB0 (bytes 0-7) and
 * RCD0 (bytes 8-15), byte-swapping each quadword and rotating it by 32,
 * then XOR the subkey at index 'max' (a register) into RAB0.
 */
162#define dec_inpack(max) \
163 movq (RIO), RAB0; \
164 bswapq RAB0; \
165 rolq $32, RAB0; \
166 movq 4*2(RIO), RCD0; \
167 bswapq RCD0; \
168 rorq $32, RCD0; \
169 xorq key_table(CTX, max, 8), RAB0;
170
/*
 * dec_outunpack(): XOR subkey 0 into RCD0, rotate by 32 and byte-swap both
 * halves, then store RCD0 to bytes 0-7 and RAB0 to bytes 8-15 at RIO.
 */
171#define dec_outunpack() \
172 xorq key_table(CTX), RCD0; \
173 rorq $32, RCD0; \
174 bswapq RCD0; \
175 movq RCD0, (RIO); \
176 rolq $32, RAB0; \
177 bswapq RAB0; \
178 movq RAB0, 4*2(RIO);
179
/*
 * __camellia_enc_blk(ctx, dst, src, xor)
 *
 * Runs the 1-way encryption round sequence on the 16-byte block at src and
 * either stores the result to dst (low byte of xor is zero) or XORs it into
 * dst.  Three groups of six rounds always run; unless key_length is 16 a
 * fourth group follows and the final subkey index is 32 instead of 24.
 * %r12 is used as scratch (RT1) and preserved via RR12.
 */
180SYM_FUNC_START(__camellia_enc_blk)
181 /* input:
182 * %rdi: ctx, CTX
183 * %rsi: dst
184 * %rdx: src
185 * %rcx: bool xor
186 */
187 movq %r12, RR12;
188
	/* %rcx doubles as RCD0 and %rsi as RIO/RT0: move the arguments away. */
189 movq %rcx, RXOR;
190 movq %rsi, RDST;
191 movq %rdx, RIO;
192
193 enc_inpack();
194
195 enc_rounds(0);
196 enc_fls(8);
197 enc_rounds(8);
198 enc_fls(16);
199 enc_rounds(16);
	/* RT1 is scratch inside the round macros, so max is set after them. */
200 movl $24, RT1d; /* max */
201
202 cmpb $16, key_length(CTX);
203 je .L__enc_done;
204
205 enc_fls(24);
206 enc_rounds(24);
207 movl $32, RT1d; /* max */
208
209.L__enc_done:
210 testb RXORbl, RXORbl;
	/* RIO was clobbered as RT0; mov leaves the flags from testb intact. */
211 movq RDST, RIO;
212
213 jnz .L__enc_xor;
214
215 enc_outunpack(mov, RT1);
216
217 movq RR12, %r12;
218 RET;
219
220.L__enc_xor:
221 enc_outunpack(xor, RT1);
222
223 movq RR12, %r12;
224 RET;
225SYM_FUNC_END(__camellia_enc_blk)
226
/*
 * camellia_dec_blk(ctx, dst, src)
 *
 * Runs the 1-way decryption round sequence on the 16-byte block at src and
 * stores the result to dst.  The starting subkey index ("max") is 24 when
 * key_length equals 16 and 32 otherwise; in the latter case one extra group
 * of six rounds plus an fls step runs first.  %r12 is preserved via RR12.
 */
227SYM_FUNC_START(camellia_dec_blk)
228 /* input:
229 * %rdi: ctx, CTX
230 * %rsi: dst
231 * %rdx: src
232 */
233 cmpl $16, key_length(CTX);
	/* mov does not alter flags; RXORd is just the cmov source for 24. */
234 movl $32, RT2d;
235 movl $24, RXORd;
236 cmovel RXORd, RT2d; /* max */
237
238 movq %r12, RR12;
239 movq %rsi, RDST;
240 movq %rdx, RIO;
241
242 dec_inpack(RT2);
243
	/* RT2 still holds max here: dec_inpack does not write it. */
244 cmpb $24, RT2bl;
245 je .L__dec_rounds16;
246
247 dec_rounds(24);
248 dec_fls(24);
249
250.L__dec_rounds16:
251 dec_rounds(16);
252 dec_fls(16);
253 dec_rounds(8);
254 dec_fls(8);
255 dec_rounds(0);
256
	/* RIO was clobbered as RT0 by the rounds; reload the destination. */
257 movq RDST, RIO;
258
259 dec_outunpack();
260
261 movq RR12, %r12;
262 RET;
263SYM_FUNC_END(camellia_dec_blk)
264
265/**********************************************************************
266 2-way camellia
267 **********************************************************************/
/*
 * roundsm2(ab, subkey, cd): one round on two blocks at once.
 *
 * The subkey is loaded into RT2 and XORed into cd ## 1 up front.  Block 0
 * (ab ## 0 -> cd ## 0) alternates its table lookups between cd ## 0 and
 * RT2 and folds RT2 into cd ## 0 afterwards; block 1 (ab ## 1 -> cd ## 1)
 * accumulates every lookup directly into cd ## 1.  The fold for block 0 is
 * placed after block 1's first lookup pair.  Each ab register is rotated by
 * 4 * 16 = 64 bits in total and so ends up unchanged.
 * Clobbers RT0, RT1, RT2.
 */
268#define roundsm2(ab, subkey, cd) \
269 movq (key_table + ((subkey) * 2) * 4)(CTX), RT2; \
270 xorq RT2, cd ## 1; \
271 \
272 xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 0, cd ## 0); \
273 xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 0, RT2); \
274 xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 0, cd ## 0); \
275 xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 0, RT2); \
276 \
277 xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 1, cd ## 1); \
278 xorq RT2, cd ## 0; \
279 xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 1, cd ## 1); \
280 xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 1, cd ## 1); \
281 xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 1, cd ## 1);
282
/*
 * fls2(l, r, kl, kr): key-dependent mixing for two blocks (n = 0 and 1).
 *
 * With lo()/hi() the low/high 32 bits of a quadword and K[i] the 64-bit
 * subkey at index i, each block n receives, in this relative order:
 *   hi(l_n) ^= rol32(lo(K[kl]) & lo(l_n), 1)
 *   lo(r_n) ^= hi(K[kr]) | hi(r_n)
 *   lo(l_n) ^= hi(K[kl]) | hi(l_n)
 *   hi(r_n) ^= rol32(lo(K[kr]) & lo(r_n), 1)
 * The first two updates are issued for block 0 then block 1, followed by
 * the last two for block 0 then block 1.  Clobbers RT0, RT1, RT2.
 */
283#define fls2(l, r, kl, kr) \
284 movl (key_table + ((kl) * 2) * 4)(CTX), RT0d; \
285 andl l ## 0d, RT0d; \
286 roll $1, RT0d; \
287 shlq $32, RT0; \
288 xorq RT0, l ## 0; \
289 movq (key_table + ((kr) * 2) * 4)(CTX), RT1; \
290 orq r ## 0, RT1; \
291 shrq $32, RT1; \
292 xorq RT1, r ## 0; \
293 \
294 movl (key_table + ((kl) * 2) * 4)(CTX), RT2d; \
295 andl l ## 1d, RT2d; \
296 roll $1, RT2d; \
297 shlq $32, RT2; \
298 xorq RT2, l ## 1; \
299 movq (key_table + ((kr) * 2) * 4)(CTX), RT0; \
300 orq r ## 1, RT0; \
301 shrq $32, RT0; \
302 xorq RT0, r ## 1; \
303 \
304 movq (key_table + ((kl) * 2) * 4)(CTX), RT1; \
305 orq l ## 0, RT1; \
306 shrq $32, RT1; \
307 xorq RT1, l ## 0; \
308 movl (key_table + ((kr) * 2) * 4)(CTX), RT2d; \
309 andl r ## 0d, RT2d; \
310 roll $1, RT2d; \
311 shlq $32, RT2; \
312 xorq RT2, r ## 0; \
313 \
314 movq (key_table + ((kl) * 2) * 4)(CTX), RT0; \
315 orq l ## 1, RT0; \
316 shrq $32, RT0; \
317 xorq RT0, l ## 1; \
318 movl (key_table + ((kr) * 2) * 4)(CTX), RT1d; \
319 andl r ## 1d, RT1d; \
320 roll $1, RT1d; \
321 shlq $32, RT1; \
322 xorq RT1, r ## 1;
323
/*
 * enc_rounds2(i): six 2-way rounds with subkeys i+2 .. i+7 in ascending
 * order, swapping the roles of RAB and RCD after every round.
 */
324#define enc_rounds2(i) \
325 roundsm2(RAB, i + 2, RCD); \
326 roundsm2(RCD, i + 3, RAB); \
327 roundsm2(RAB, i + 4, RCD); \
328 roundsm2(RCD, i + 5, RAB); \
329 roundsm2(RAB, i + 6, RCD); \
330 roundsm2(RCD, i + 7, RAB);
331
/* enc_fls2(i): 2-way fls step with subkey i for RAB and i+1 for RCD. */
332#define enc_fls2(i) \
333 fls2(RAB, RCD, i + 0, i + 1);
334
/*
 * enc_inpack2(): load two consecutive 16-byte blocks from RIO: bytes 0-7
 * and 8-15 into RAB0/RCD0, bytes 16-23 and 24-31 into RAB1/RCD1.  Each
 * quadword is byte-swapped and rotated by 32 (a 32-bit rotation of a
 * 64-bit register swaps its halves in either direction); subkey 0 is then
 * XORed into RAB0 and RAB1.
 */
335#define enc_inpack2() \
336 movq (RIO), RAB0; \
337 bswapq RAB0; \
338 rorq $32, RAB0; \
339 movq 4*2(RIO), RCD0; \
340 bswapq RCD0; \
341 rolq $32, RCD0; \
342 xorq key_table(CTX), RAB0; \
343 \
344 movq 8*2(RIO), RAB1; \
345 bswapq RAB1; \
346 rorq $32, RAB1; \
347 movq 12*2(RIO), RCD1; \
348 bswapq RCD1; \
349 rolq $32, RCD1; \
350 xorq key_table(CTX), RAB1;
351
/*
 * enc_outunpack2(op, max): for each of the two blocks, XOR the subkey at
 * index 'max' (a register) into RCDn, rotate by 32 and byte-swap both
 * halves, and write RCDn then RABn to the block's 16 output bytes at RIO
 * (block 0 at offset 0, block 1 at offset 16).  'op' is pasted with 'q':
 * mov overwrites the destination, xor XORs into it.
 */
352#define enc_outunpack2(op, max) \
353 xorq key_table(CTX, max, 8), RCD0; \
354 rolq $32, RCD0; \
355 bswapq RCD0; \
356 op ## q RCD0, (RIO); \
357 rorq $32, RAB0; \
358 bswapq RAB0; \
359 op ## q RAB0, 4*2(RIO); \
360 \
361 xorq key_table(CTX, max, 8), RCD1; \
362 rolq $32, RCD1; \
363 bswapq RCD1; \
364 op ## q RCD1, 8*2(RIO); \
365 rorq $32, RAB1; \
366 bswapq RAB1; \
367 op ## q RAB1, 12*2(RIO);
368
/*
 * dec_rounds2(i): six 2-way rounds with subkeys i+7 down to i+2, swapping
 * the roles of RAB and RCD after every round.
 */
369#define dec_rounds2(i) \
370 roundsm2(RAB, i + 7, RCD); \
371 roundsm2(RCD, i + 6, RAB); \
372 roundsm2(RAB, i + 5, RCD); \
373 roundsm2(RCD, i + 4, RAB); \
374 roundsm2(RAB, i + 3, RCD); \
375 roundsm2(RCD, i + 2, RAB);
376
/* dec_fls2(i): 2-way fls step with subkey i+1 for RAB and i for RCD. */
377#define dec_fls2(i) \
378 fls2(RAB, RCD, i + 1, i + 0);
379
/*
 * dec_inpack2(max): load two consecutive 16-byte blocks from RIO into
 * RAB0/RCD0 and RAB1/RCD1, byte-swapping each quadword and rotating it by
 * 32, then XOR the subkey at index 'max' (a register) into RAB0 and RAB1.
 */
380#define dec_inpack2(max) \
381 movq (RIO), RAB0; \
382 bswapq RAB0; \
383 rorq $32, RAB0; \
384 movq 4*2(RIO), RCD0; \
385 bswapq RCD0; \
386 rolq $32, RCD0; \
387 xorq key_table(CTX, max, 8), RAB0; \
388 \
389 movq 8*2(RIO), RAB1; \
390 bswapq RAB1; \
391 rorq $32, RAB1; \
392 movq 12*2(RIO), RCD1; \
393 bswapq RCD1; \
394 rolq $32, RCD1; \
395 xorq key_table(CTX, max, 8), RAB1;
396
/*
 * dec_outunpack2(): for each of the two blocks, XOR subkey 0 into RCDn,
 * rotate by 32 and byte-swap both halves, then store RCDn followed by RABn
 * to the block's 16 output bytes at RIO (block 1 at offset 16).
 */
397#define dec_outunpack2() \
398 xorq key_table(CTX), RCD0; \
399 rolq $32, RCD0; \
400 bswapq RCD0; \
401 movq RCD0, (RIO); \
402 rorq $32, RAB0; \
403 bswapq RAB0; \
404 movq RAB0, 4*2(RIO); \
405 \
406 xorq key_table(CTX), RCD1; \
407 rolq $32, RCD1; \
408 bswapq RCD1; \
409 movq RCD1, 8*2(RIO); \
410 rorq $32, RAB1; \
411 bswapq RAB1; \
412 movq RAB1, 12*2(RIO);
413
/*
 * __camellia_enc_blk_2way(ctx, dst, src, xor)
 *
 * Runs the 2-way encryption round sequence on the two consecutive 16-byte
 * blocks at src and either stores the result to dst (low byte of xor is
 * zero) or XORs it into dst.  Three groups of six rounds always run; unless
 * key_length is 16 a fourth group follows and the final subkey index is 32
 * instead of 24.
 *
 * %rbx (RAB1) is preserved on the stack and %r12 (RT1) via RR12.
 */
SYM_FUNC_START(__camellia_enc_blk_2way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: bool xor
	 */
	pushq %rbx;

	movq %r12, RR12;
	/* %rcx, %rsi and %rdx double as RCD0, RIO/RT0 and RCD1. */
	movq %rcx, RXOR;
	movq %rsi, RDST;
	movq %rdx, RIO;

	enc_inpack2();

	enc_rounds2(0);
	enc_fls2(8);
	enc_rounds2(8);
	enc_fls2(16);
	enc_rounds2(16);
	/* RT2 is scratch inside the round macros, so max is set after them. */
	movl $24, RT2d; /* max */

	cmpb $16, key_length(CTX);
	je .L__enc2_done;

	enc_fls2(24);
	enc_rounds2(24);
	movl $32, RT2d; /* max */

.L__enc2_done:
	/* Explicit byte suffix, matching the test in __camellia_enc_blk. */
	testb RXORbl, RXORbl;
	/* RIO was clobbered as RT0; mov leaves the flags from testb intact. */
	movq RDST, RIO;
	jnz .L__enc2_xor;

	enc_outunpack2(mov, RT2);

	movq RR12, %r12;
	popq %rbx;
	RET;

.L__enc2_xor:
	enc_outunpack2(xor, RT2);

	movq RR12, %r12;
	popq %rbx;
	RET;
SYM_FUNC_END(__camellia_enc_blk_2way)
462
/*
 * camellia_dec_blk_2way(ctx, dst, src)
 *
 * Runs the 2-way decryption round sequence on the two consecutive 16-byte
 * blocks at src and stores the result to dst.  The starting subkey index
 * ("max") is 24 when key_length equals 16 and 32 otherwise; in the latter
 * case one extra group of six rounds plus an fls step runs first.
 * %rbx (RAB1) is preserved via RXOR and %r12 (RT1) via RR12.
 */
463SYM_FUNC_START(camellia_dec_blk_2way)
464 /* input:
465 * %rdi: ctx, CTX
466 * %rsi: dst
467 * %rdx: src
468 */
469 cmpl $16, key_length(CTX);
	/* mov does not alter flags; RXORd is just the cmov source for 24. */
470 movl $32, RT2d;
471 movl $24, RXORd;
472 cmovel RXORd, RT2d; /* max */
473
	/* RXOR is free again: reuse it to hold the caller's %rbx. */
474 movq %rbx, RXOR;
475 movq %r12, RR12;
476 movq %rsi, RDST;
477 movq %rdx, RIO;
478
479 dec_inpack2(RT2);
480
	/* RT2 still holds max here: dec_inpack2 does not write it. */
481 cmpb $24, RT2bl;
482 je .L__dec2_rounds16;
483
484 dec_rounds2(24);
485 dec_fls2(24);
486
487.L__dec2_rounds16:
488 dec_rounds2(16);
489 dec_fls2(16);
490 dec_rounds2(8);
491 dec_fls2(8);
492 dec_rounds2(0);
493
	/* RIO was clobbered as RT0 by the rounds; reload the destination. */
494 movq RDST, RIO;
495
496 dec_outunpack2();
497
498 movq RR12, %r12;
499 movq RXOR, %rbx;
500 RET;
501SYM_FUNC_END(camellia_dec_blk_2way)
1/*
2 * Camellia Cipher Algorithm (x86_64)
3 *
4 * Copyright (C) 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
19 * USA
20 *
21 */
22
23#include <linux/linkage.h>
24
25.file "camellia-x86_64-asm_64.S"
26.text
27
/*
 * Lookup tables defined outside this file.  The round macros below index
 * them with a byte value scaled by 8 and XOR the loaded quadword into the
 * cipher state.
 */
28.extern camellia_sp10011110;
29.extern camellia_sp22000222;
30.extern camellia_sp03303033;
31.extern camellia_sp00444404;
32.extern camellia_sp02220222;
33.extern camellia_sp30333033;
34.extern camellia_sp44044404;
35.extern camellia_sp11101110;
36
/* Short local aliases for the table symbols, used by roundsm/roundsm2. */
37#define sp10011110 camellia_sp10011110
38#define sp22000222 camellia_sp22000222
39#define sp03303033 camellia_sp03303033
40#define sp00444404 camellia_sp00444404
41#define sp02220222 camellia_sp02220222
42#define sp30333033 camellia_sp30333033
43#define sp44044404 camellia_sp44044404
44#define sp11101110 camellia_sp11101110
45
/* Byte length of the subkey table; key_length is read right after it. */
46#define CAMELLIA_TABLE_BYTE_LEN 272
47
48/* struct camellia_ctx: */
/* Byte offsets relative to CTX: 64-bit subkeys start at 0. */
49#define key_table 0
50#define key_length CAMELLIA_TABLE_BYTE_LEN
51
52/* register macros */
/*
 * CTX is the context pointer.  RIO is the current input/output pointer; it
 * shares %rsi with the scratch register RT0, so it is only valid until the
 * first round macro runs and must be reloaded before output is written.
 */
53#define CTX %rdi
54#define RIO %rsi
55#define RIOd %esi
56
/* Cipher state: RAB0/RCD0 hold block 0, RAB1/RCD1 block 1 (2-way code only). */
57#define RAB0 %rax
58#define RCD0 %rcx
59#define RAB1 %rbx
60#define RCD1 %rdx
61
/* 32-bit views of the state registers. */
62#define RAB0d %eax
63#define RCD0d %ecx
64#define RAB1d %ebx
65#define RCD1d %edx
66
/* Bits 0..7 of the state registers. */
67#define RAB0bl %al
68#define RCD0bl %cl
69#define RAB1bl %bl
70#define RCD1bl %dl
71
/* Bits 8..15 of the state registers. */
72#define RAB0bh %ah
73#define RCD0bh %ch
74#define RAB1bh %bh
75#define RCD1bh %dh
76
/*
 * Scratch registers.  RT1 is %rbp, which the entry points park in RRBP and
 * restore before returning.
 * NOTE(review): while %rbp holds scratch data, frame-pointer based stack
 * walking through these functions is presumably unreliable -- consider
 * moving RT1 to another callee-saved register together with the save and
 * restore code in every entry point.
 */
77#define RT0 %rsi
78#define RT1 %rbp
79#define RT2 %r8
80
81#define RT0d %esi
82#define RT1d %ebp
83#define RT2d %r8d
84
85#define RT2bl %r8b
86
/*
 * RXOR keeps the 'xor' argument (the 2-way decrypt path reuses it to park
 * %rbx), RRBP keeps the caller's %rbp, RDST keeps the destination pointer.
 */
87#define RXOR %r9
88#define RRBP %r10
89#define RDST %r11
90
91#define RXORd %r9d
92#define RXORbl %r9b
93
/*
 * xor2ror16(T0, T1, tmp1, tmp2, ab, dst)
 *
 * dst ^= T0[bits 0..7 of ab] ^ T1[bits 8..15 of ab], with 8-byte table
 * entries; ab is then rotated right by 16 so the next two bytes become the
 * low bytes.
 *
 * The table bases are loaded RIP-relative with leaq so that no absolute
 * symbol address is encoded in the instruction stream (required for
 * position-independent builds).  A RIP-relative operand cannot carry an
 * index register, hence the separate base register.
 *
 * The high-byte index is always placed in tmp1 and the low-byte index in
 * tmp2, exactly as before, so the encodability constraints on ab ## bh are
 * unchanged.  Clobbers tmp1 and tmp2.
 */
#define xor2ror16(T0, T1, tmp1, tmp2, ab, dst) \
	leaq T0(%rip),			tmp1; \
	movzbl ab ## bl,		tmp2 ## d; \
	xorq (tmp1, tmp2, 8),		dst; \
	leaq T1(%rip),			tmp2; \
	movzbl ab ## bh,		tmp1 ## d; \
	rorq $16,			ab; \
	xorq (tmp2, tmp1, 8),		dst;
100
101/**********************************************************************
102 1-way camellia
103 **********************************************************************/
/*
 * roundsm(ab, subkey, cd): one round on a single block.
 *
 * Loads the 64-bit subkey at index 'subkey' into RT2, then consumes all
 * eight bytes of ab ## 0 in four xor2ror16 steps that alternately
 * accumulate into cd ## 0 and RT2, and finally folds RT2 into cd ## 0.
 * The four 16-bit rotations total 64 bits, so ab ## 0 ends up unchanged.
 * Clobbers RT0, RT1, RT2.
 */
104#define roundsm(ab, subkey, cd) \
105 movq (key_table + ((subkey) * 2) * 4)(CTX), RT2; \
106 \
107 xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 0, cd ## 0); \
108 xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 0, RT2); \
109 xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 0, cd ## 0); \
110 xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 0, RT2); \
111 \
112 xorq RT2, cd ## 0;
113
/*
 * fls(l, r, kl, kr): key-dependent mixing applied between round groups
 * (presumably Camellia's FL / FL^-1 layer -- confirm against the spec).
 *
 * With lo()/hi() the low/high 32 bits of a quadword and K[i] the 64-bit
 * subkey at index i, the following updates are applied in this order:
 *   hi(l) ^= rol32(lo(K[kl]) & lo(l), 1)
 *   lo(r) ^= hi(K[kr]) | hi(r)
 *   lo(l) ^= hi(K[kl]) | hi(l)
 *   hi(r) ^= rol32(lo(K[kr]) & lo(r), 1)
 * l and r name register pairs; only the ## 0 member is touched.
 * Clobbers RT0, RT1, RT2.
 */
114#define fls(l, r, kl, kr) \
115 movl (key_table + ((kl) * 2) * 4)(CTX), RT0d; \
116 andl l ## 0d, RT0d; \
117 roll $1, RT0d; \
118 shlq $32, RT0; \
119 xorq RT0, l ## 0; \
120 movq (key_table + ((kr) * 2) * 4)(CTX), RT1; \
121 orq r ## 0, RT1; \
122 shrq $32, RT1; \
123 xorq RT1, r ## 0; \
124 \
125 movq (key_table + ((kl) * 2) * 4)(CTX), RT2; \
126 orq l ## 0, RT2; \
127 shrq $32, RT2; \
128 xorq RT2, l ## 0; \
129 movl (key_table + ((kr) * 2) * 4)(CTX), RT0d; \
130 andl r ## 0d, RT0d; \
131 roll $1, RT0d; \
132 shlq $32, RT0; \
133 xorq RT0, r ## 0;
134
/*
 * enc_rounds(i): six rounds with subkeys i+2 .. i+7 in ascending order,
 * swapping the roles of RAB and RCD after every round.
 */
135#define enc_rounds(i) \
136 roundsm(RAB, i + 2, RCD); \
137 roundsm(RCD, i + 3, RAB); \
138 roundsm(RAB, i + 4, RCD); \
139 roundsm(RCD, i + 5, RAB); \
140 roundsm(RAB, i + 6, RCD); \
141 roundsm(RCD, i + 7, RAB);
142
/* enc_fls(i): fls step with subkey i for the RAB half and i+1 for RCD. */
143#define enc_fls(i) \
144 fls(RAB, RCD, i + 0, i + 1);
145
/*
 * enc_inpack(): load the 16-byte block at RIO into RAB0 (bytes 0-7) and
 * RCD0 (bytes 8-15).  Each quadword is byte-swapped and rotated by 32
 * (exchanging its 32-bit halves); subkey 0 is then XORed into RAB0.
 */
146#define enc_inpack() \
147 movq (RIO), RAB0; \
148 bswapq RAB0; \
149 rolq $32, RAB0; \
150 movq 4*2(RIO), RCD0; \
151 bswapq RCD0; \
152 rorq $32, RCD0; \
153 xorq key_table(CTX), RAB0;
154
/*
 * enc_outunpack(op, max): XOR the subkey at index 'max' (a register) into
 * RCD0, then rotate by 32 and byte-swap both halves and write RCD0 to
 * bytes 0-7 and RAB0 to bytes 8-15 of the block at RIO.  'op' is pasted
 * with 'q': mov overwrites the destination, xor XORs into it.
 */
155#define enc_outunpack(op, max) \
156 xorq key_table(CTX, max, 8), RCD0; \
157 rorq $32, RCD0; \
158 bswapq RCD0; \
159 op ## q RCD0, (RIO); \
160 rolq $32, RAB0; \
161 bswapq RAB0; \
162 op ## q RAB0, 4*2(RIO);
163
/*
 * dec_rounds(i): six rounds with subkeys i+7 down to i+2, swapping the
 * roles of RAB and RCD after every round.
 */
164#define dec_rounds(i) \
165 roundsm(RAB, i + 7, RCD); \
166 roundsm(RCD, i + 6, RAB); \
167 roundsm(RAB, i + 5, RCD); \
168 roundsm(RCD, i + 4, RAB); \
169 roundsm(RAB, i + 3, RCD); \
170 roundsm(RCD, i + 2, RAB);
171
/* dec_fls(i): fls step with subkey i+1 for the RAB half and i for RCD. */
172#define dec_fls(i) \
173 fls(RAB, RCD, i + 1, i + 0);
174
/*
 * dec_inpack(max): load the 16-byte block at RIO into RAB0 (bytes 0-7) and
 * RCD0 (bytes 8-15), byte-swapping each quadword and rotating it by 32,
 * then XOR the subkey at index 'max' (a register) into RAB0.
 */
175#define dec_inpack(max) \
176 movq (RIO), RAB0; \
177 bswapq RAB0; \
178 rolq $32, RAB0; \
179 movq 4*2(RIO), RCD0; \
180 bswapq RCD0; \
181 rorq $32, RCD0; \
182 xorq key_table(CTX, max, 8), RAB0;
183
/*
 * dec_outunpack(): XOR subkey 0 into RCD0, rotate by 32 and byte-swap both
 * halves, then store RCD0 to bytes 0-7 and RAB0 to bytes 8-15 at RIO.
 */
184#define dec_outunpack() \
185 xorq key_table(CTX), RCD0; \
186 rorq $32, RCD0; \
187 bswapq RCD0; \
188 movq RCD0, (RIO); \
189 rolq $32, RAB0; \
190 bswapq RAB0; \
191 movq RAB0, 4*2(RIO);
192
/*
 * __camellia_enc_blk(ctx, dst, src, xor)
 *
 * Runs the 1-way encryption round sequence on the 16-byte block at src and
 * either stores the result to dst (low byte of xor is zero) or XORs it into
 * dst.  Three groups of six rounds always run; unless key_length is 16 a
 * fourth group follows and the final subkey index is 32 instead of 24.
 * %rbp is used as scratch (RT1) and preserved via RRBP.
 */
193ENTRY(__camellia_enc_blk)
194 /* input:
195 * %rdi: ctx, CTX
196 * %rsi: dst
197 * %rdx: src
198 * %rcx: bool xor
199 */
200 movq %rbp, RRBP;
201
	/* %rcx doubles as RCD0 and %rsi as RIO/RT0: move the arguments away. */
202 movq %rcx, RXOR;
203 movq %rsi, RDST;
204 movq %rdx, RIO;
205
206 enc_inpack();
207
208 enc_rounds(0);
209 enc_fls(8);
210 enc_rounds(8);
211 enc_fls(16);
212 enc_rounds(16);
	/* RT1 is scratch inside the round macros, so max is set after them. */
213 movl $24, RT1d; /* max */
214
215 cmpb $16, key_length(CTX);
216 je .L__enc_done;
217
218 enc_fls(24);
219 enc_rounds(24);
220 movl $32, RT1d; /* max */
221
222.L__enc_done:
223 testb RXORbl, RXORbl;
	/* RIO was clobbered as RT0; mov leaves the flags from testb intact. */
224 movq RDST, RIO;
225
226 jnz .L__enc_xor;
227
228 enc_outunpack(mov, RT1);
229
230 movq RRBP, %rbp;
231 ret;
232
233.L__enc_xor:
234 enc_outunpack(xor, RT1);
235
236 movq RRBP, %rbp;
237 ret;
238ENDPROC(__camellia_enc_blk)
239
/*
 * camellia_dec_blk(ctx, dst, src)
 *
 * Runs the 1-way decryption round sequence on the 16-byte block at src and
 * stores the result to dst.  The starting subkey index ("max") is 24 when
 * key_length equals 16 and 32 otherwise; in the latter case one extra group
 * of six rounds plus an fls step runs first.  %rbp is preserved via RRBP.
 */
240ENTRY(camellia_dec_blk)
241 /* input:
242 * %rdi: ctx, CTX
243 * %rsi: dst
244 * %rdx: src
245 */
246 cmpl $16, key_length(CTX);
	/* mov does not alter flags; RXORd is just the cmov source for 24. */
247 movl $32, RT2d;
248 movl $24, RXORd;
249 cmovel RXORd, RT2d; /* max */
250
251 movq %rbp, RRBP;
252 movq %rsi, RDST;
253 movq %rdx, RIO;
254
255 dec_inpack(RT2);
256
	/* RT2 still holds max here: dec_inpack does not write it. */
257 cmpb $24, RT2bl;
258 je .L__dec_rounds16;
259
260 dec_rounds(24);
261 dec_fls(24);
262
263.L__dec_rounds16:
264 dec_rounds(16);
265 dec_fls(16);
266 dec_rounds(8);
267 dec_fls(8);
268 dec_rounds(0);
269
	/* RIO was clobbered as RT0 by the rounds; reload the destination. */
270 movq RDST, RIO;
271
272 dec_outunpack();
273
274 movq RRBP, %rbp;
275 ret;
276ENDPROC(camellia_dec_blk)
277
278/**********************************************************************
279 2-way camellia
280 **********************************************************************/
/*
 * roundsm2(ab, subkey, cd): one round on two blocks at once.
 *
 * The subkey is loaded into RT2 and XORed into cd ## 1 up front.  Block 0
 * (ab ## 0 -> cd ## 0) alternates its table lookups between cd ## 0 and
 * RT2 and folds RT2 into cd ## 0 afterwards; block 1 (ab ## 1 -> cd ## 1)
 * accumulates every lookup directly into cd ## 1.  The fold for block 0 is
 * placed after block 1's first lookup pair.  Each ab register is rotated by
 * 4 * 16 = 64 bits in total and so ends up unchanged.
 * Clobbers RT0, RT1, RT2.
 */
281#define roundsm2(ab, subkey, cd) \
282 movq (key_table + ((subkey) * 2) * 4)(CTX), RT2; \
283 xorq RT2, cd ## 1; \
284 \
285 xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 0, cd ## 0); \
286 xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 0, RT2); \
287 xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 0, cd ## 0); \
288 xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 0, RT2); \
289 \
290 xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 1, cd ## 1); \
291 xorq RT2, cd ## 0; \
292 xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 1, cd ## 1); \
293 xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 1, cd ## 1); \
294 xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 1, cd ## 1);
295
/*
 * fls2(l, r, kl, kr): key-dependent mixing for two blocks (n = 0 and 1).
 *
 * With lo()/hi() the low/high 32 bits of a quadword and K[i] the 64-bit
 * subkey at index i, each block n receives, in this relative order:
 *   hi(l_n) ^= rol32(lo(K[kl]) & lo(l_n), 1)
 *   lo(r_n) ^= hi(K[kr]) | hi(r_n)
 *   lo(l_n) ^= hi(K[kl]) | hi(l_n)
 *   hi(r_n) ^= rol32(lo(K[kr]) & lo(r_n), 1)
 * The first two updates are issued for block 0 then block 1, followed by
 * the last two for block 0 then block 1.  Clobbers RT0, RT1, RT2.
 */
296#define fls2(l, r, kl, kr) \
297 movl (key_table + ((kl) * 2) * 4)(CTX), RT0d; \
298 andl l ## 0d, RT0d; \
299 roll $1, RT0d; \
300 shlq $32, RT0; \
301 xorq RT0, l ## 0; \
302 movq (key_table + ((kr) * 2) * 4)(CTX), RT1; \
303 orq r ## 0, RT1; \
304 shrq $32, RT1; \
305 xorq RT1, r ## 0; \
306 \
307 movl (key_table + ((kl) * 2) * 4)(CTX), RT2d; \
308 andl l ## 1d, RT2d; \
309 roll $1, RT2d; \
310 shlq $32, RT2; \
311 xorq RT2, l ## 1; \
312 movq (key_table + ((kr) * 2) * 4)(CTX), RT0; \
313 orq r ## 1, RT0; \
314 shrq $32, RT0; \
315 xorq RT0, r ## 1; \
316 \
317 movq (key_table + ((kl) * 2) * 4)(CTX), RT1; \
318 orq l ## 0, RT1; \
319 shrq $32, RT1; \
320 xorq RT1, l ## 0; \
321 movl (key_table + ((kr) * 2) * 4)(CTX), RT2d; \
322 andl r ## 0d, RT2d; \
323 roll $1, RT2d; \
324 shlq $32, RT2; \
325 xorq RT2, r ## 0; \
326 \
327 movq (key_table + ((kl) * 2) * 4)(CTX), RT0; \
328 orq l ## 1, RT0; \
329 shrq $32, RT0; \
330 xorq RT0, l ## 1; \
331 movl (key_table + ((kr) * 2) * 4)(CTX), RT1d; \
332 andl r ## 1d, RT1d; \
333 roll $1, RT1d; \
334 shlq $32, RT1; \
335 xorq RT1, r ## 1;
336
/*
 * enc_rounds2(i): six 2-way rounds with subkeys i+2 .. i+7 in ascending
 * order, swapping the roles of RAB and RCD after every round.
 */
337#define enc_rounds2(i) \
338 roundsm2(RAB, i + 2, RCD); \
339 roundsm2(RCD, i + 3, RAB); \
340 roundsm2(RAB, i + 4, RCD); \
341 roundsm2(RCD, i + 5, RAB); \
342 roundsm2(RAB, i + 6, RCD); \
343 roundsm2(RCD, i + 7, RAB);
344
/* enc_fls2(i): 2-way fls step with subkey i for RAB and i+1 for RCD. */
345#define enc_fls2(i) \
346 fls2(RAB, RCD, i + 0, i + 1);
347
/*
 * enc_inpack2(): load two consecutive 16-byte blocks from RIO: bytes 0-7
 * and 8-15 into RAB0/RCD0, bytes 16-23 and 24-31 into RAB1/RCD1.  Each
 * quadword is byte-swapped and rotated by 32 (a 32-bit rotation of a
 * 64-bit register swaps its halves in either direction); subkey 0 is then
 * XORed into RAB0 and RAB1.
 */
348#define enc_inpack2() \
349 movq (RIO), RAB0; \
350 bswapq RAB0; \
351 rorq $32, RAB0; \
352 movq 4*2(RIO), RCD0; \
353 bswapq RCD0; \
354 rolq $32, RCD0; \
355 xorq key_table(CTX), RAB0; \
356 \
357 movq 8*2(RIO), RAB1; \
358 bswapq RAB1; \
359 rorq $32, RAB1; \
360 movq 12*2(RIO), RCD1; \
361 bswapq RCD1; \
362 rolq $32, RCD1; \
363 xorq key_table(CTX), RAB1;
364
/*
 * enc_outunpack2(op, max): for each of the two blocks, XOR the subkey at
 * index 'max' (a register) into RCDn, rotate by 32 and byte-swap both
 * halves, and write RCDn then RABn to the block's 16 output bytes at RIO
 * (block 0 at offset 0, block 1 at offset 16).  'op' is pasted with 'q':
 * mov overwrites the destination, xor XORs into it.
 */
365#define enc_outunpack2(op, max) \
366 xorq key_table(CTX, max, 8), RCD0; \
367 rolq $32, RCD0; \
368 bswapq RCD0; \
369 op ## q RCD0, (RIO); \
370 rorq $32, RAB0; \
371 bswapq RAB0; \
372 op ## q RAB0, 4*2(RIO); \
373 \
374 xorq key_table(CTX, max, 8), RCD1; \
375 rolq $32, RCD1; \
376 bswapq RCD1; \
377 op ## q RCD1, 8*2(RIO); \
378 rorq $32, RAB1; \
379 bswapq RAB1; \
380 op ## q RAB1, 12*2(RIO);
381
/*
 * dec_rounds2(i): six 2-way rounds with subkeys i+7 down to i+2, swapping
 * the roles of RAB and RCD after every round.
 */
382#define dec_rounds2(i) \
383 roundsm2(RAB, i + 7, RCD); \
384 roundsm2(RCD, i + 6, RAB); \
385 roundsm2(RAB, i + 5, RCD); \
386 roundsm2(RCD, i + 4, RAB); \
387 roundsm2(RAB, i + 3, RCD); \
388 roundsm2(RCD, i + 2, RAB);
389
/* dec_fls2(i): 2-way fls step with subkey i+1 for RAB and i for RCD. */
390#define dec_fls2(i) \
391 fls2(RAB, RCD, i + 1, i + 0);
392
/*
 * dec_inpack2(max): load two consecutive 16-byte blocks from RIO into
 * RAB0/RCD0 and RAB1/RCD1, byte-swapping each quadword and rotating it by
 * 32, then XOR the subkey at index 'max' (a register) into RAB0 and RAB1.
 */
393#define dec_inpack2(max) \
394 movq (RIO), RAB0; \
395 bswapq RAB0; \
396 rorq $32, RAB0; \
397 movq 4*2(RIO), RCD0; \
398 bswapq RCD0; \
399 rolq $32, RCD0; \
400 xorq key_table(CTX, max, 8), RAB0; \
401 \
402 movq 8*2(RIO), RAB1; \
403 bswapq RAB1; \
404 rorq $32, RAB1; \
405 movq 12*2(RIO), RCD1; \
406 bswapq RCD1; \
407 rolq $32, RCD1; \
408 xorq key_table(CTX, max, 8), RAB1;
409
/*
 * dec_outunpack2(): for each of the two blocks, XOR subkey 0 into RCDn,
 * rotate by 32 and byte-swap both halves, then store RCDn followed by RABn
 * to the block's 16 output bytes at RIO (block 1 at offset 16).
 */
410#define dec_outunpack2() \
411 xorq key_table(CTX), RCD0; \
412 rolq $32, RCD0; \
413 bswapq RCD0; \
414 movq RCD0, (RIO); \
415 rorq $32, RAB0; \
416 bswapq RAB0; \
417 movq RAB0, 4*2(RIO); \
418 \
419 xorq key_table(CTX), RCD1; \
420 rolq $32, RCD1; \
421 bswapq RCD1; \
422 movq RCD1, 8*2(RIO); \
423 rorq $32, RAB1; \
424 bswapq RAB1; \
425 movq RAB1, 12*2(RIO);
426
/*
 * __camellia_enc_blk_2way(ctx, dst, src, xor)
 *
 * Runs the 2-way encryption round sequence on the two consecutive 16-byte
 * blocks at src and either stores the result to dst (low byte of xor is
 * zero) or XORs it into dst.  Three groups of six rounds always run; unless
 * key_length is 16 a fourth group follows and the final subkey index is 32
 * instead of 24.
 *
 * %rbx (RAB1) is preserved on the stack and %rbp (RT1) via RRBP.
 */
ENTRY(__camellia_enc_blk_2way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: bool xor
	 */
	pushq %rbx;

	movq %rbp, RRBP;
	/* %rcx, %rsi and %rdx double as RCD0, RIO/RT0 and RCD1. */
	movq %rcx, RXOR;
	movq %rsi, RDST;
	movq %rdx, RIO;

	enc_inpack2();

	enc_rounds2(0);
	enc_fls2(8);
	enc_rounds2(8);
	enc_fls2(16);
	enc_rounds2(16);
	/* RT2 is scratch inside the round macros, so max is set after them. */
	movl $24, RT2d; /* max */

	cmpb $16, key_length(CTX);
	je .L__enc2_done;

	enc_fls2(24);
	enc_rounds2(24);
	movl $32, RT2d; /* max */

.L__enc2_done:
	/* Explicit byte suffix, matching the test in __camellia_enc_blk. */
	testb RXORbl, RXORbl;
	/* RIO was clobbered as RT0; mov leaves the flags from testb intact. */
	movq RDST, RIO;
	jnz .L__enc2_xor;

	enc_outunpack2(mov, RT2);

	movq RRBP, %rbp;
	popq %rbx;
	ret;

.L__enc2_xor:
	enc_outunpack2(xor, RT2);

	movq RRBP, %rbp;
	popq %rbx;
	ret;
ENDPROC(__camellia_enc_blk_2way)
475
/*
 * camellia_dec_blk_2way(ctx, dst, src)
 *
 * Runs the 2-way decryption round sequence on the two consecutive 16-byte
 * blocks at src and stores the result to dst.  The starting subkey index
 * ("max") is 24 when key_length equals 16 and 32 otherwise; in the latter
 * case one extra group of six rounds plus an fls step runs first.
 * %rbx (RAB1) is preserved via RXOR and %rbp (RT1) via RRBP.
 */
476ENTRY(camellia_dec_blk_2way)
477 /* input:
478 * %rdi: ctx, CTX
479 * %rsi: dst
480 * %rdx: src
481 */
482 cmpl $16, key_length(CTX);
	/* mov does not alter flags; RXORd is just the cmov source for 24. */
483 movl $32, RT2d;
484 movl $24, RXORd;
485 cmovel RXORd, RT2d; /* max */
486
	/* RXOR is free again: reuse it to hold the caller's %rbx. */
487 movq %rbx, RXOR;
488 movq %rbp, RRBP;
489 movq %rsi, RDST;
490 movq %rdx, RIO;
491
492 dec_inpack2(RT2);
493
	/* RT2 still holds max here: dec_inpack2 does not write it. */
494 cmpb $24, RT2bl;
495 je .L__dec2_rounds16;
496
497 dec_rounds2(24);
498 dec_fls2(24);
499
500.L__dec2_rounds16:
501 dec_rounds2(16);
502 dec_fls2(16);
503 dec_rounds2(8);
504 dec_fls2(8);
505 dec_rounds2(0);
506
	/* RIO was clobbered as RT0 by the rounds; reload the destination. */
507 movq RDST, RIO;
508
509 dec_outunpack2();
510
511 movq RRBP, %rbp;
512 movq RXOR, %rbx;
513 ret;
514ENDPROC(camellia_dec_blk_2way)