1/* SPDX-License-Identifier: GPL-2.0-or-later */
2/*
3 * Implement AES algorithm in Intel AES-NI instructions.
4 *
5 * The white paper of AES-NI instructions can be downloaded from:
6 * http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
7 *
8 * Copyright (C) 2008, Intel Corp.
9 * Author: Huang Ying <ying.huang@intel.com>
10 * Vinodh Gopal <vinodh.gopal@intel.com>
11 * Kahraman Akdemir
12 *
13 * Copyright (c) 2010, Intel Corporation.
14 *
15 * Ported x86_64 version to x86:
16 * Author: Mathias Krause <minipli@googlemail.com>
17 */
18
19#include <linux/linkage.h>
20#include <asm/frame.h>
21
22#define STATE1 %xmm0
23#define STATE2 %xmm4
24#define STATE3 %xmm5
25#define STATE4 %xmm6
26#define STATE STATE1
27#define IN1 %xmm1
28#define IN2 %xmm7
29#define IN3 %xmm8
30#define IN4 %xmm9
31#define IN IN1
32#define KEY %xmm2
33#define IV %xmm3
34
35#define BSWAP_MASK %xmm10
36#define CTR %xmm11
37#define INC %xmm12
38
39#define GF128MUL_MASK %xmm7
40
41#ifdef __x86_64__
42#define AREG %rax
43#define KEYP %rdi
44#define OUTP %rsi
45#define UKEYP OUTP
46#define INP %rdx
47#define LEN %rcx
48#define IVP %r8
49#define KLEN %r9d
50#define T1 %r10
51#define TKEYP T1
52#define T2 %r11
53#define TCTR_LOW T2
54#else
55#define AREG %eax
56#define KEYP %edi
57#define OUTP AREG
58#define UKEYP OUTP
59#define INP %edx
60#define LEN %esi
61#define IVP %ebp
62#define KLEN %ebx
63#define T1 %ecx
64#define TKEYP T1
65#endif
66
67SYM_FUNC_START_LOCAL(_key_expansion_256a)
68 pshufd $0b11111111, %xmm1, %xmm1
69 shufps $0b00010000, %xmm0, %xmm4
70 pxor %xmm4, %xmm0
71 shufps $0b10001100, %xmm0, %xmm4
72 pxor %xmm4, %xmm0
73 pxor %xmm1, %xmm0
74 movaps %xmm0, (TKEYP)
75 add $0x10, TKEYP
76 RET
77SYM_FUNC_END(_key_expansion_256a)
78SYM_FUNC_ALIAS_LOCAL(_key_expansion_128, _key_expansion_256a)
79
80SYM_FUNC_START_LOCAL(_key_expansion_192a)
81 pshufd $0b01010101, %xmm1, %xmm1
82 shufps $0b00010000, %xmm0, %xmm4
83 pxor %xmm4, %xmm0
84 shufps $0b10001100, %xmm0, %xmm4
85 pxor %xmm4, %xmm0
86 pxor %xmm1, %xmm0
87
88 movaps %xmm2, %xmm5
89 movaps %xmm2, %xmm6
90 pslldq $4, %xmm5
91 pshufd $0b11111111, %xmm0, %xmm3
92 pxor %xmm3, %xmm2
93 pxor %xmm5, %xmm2
94
95 movaps %xmm0, %xmm1
96 shufps $0b01000100, %xmm0, %xmm6
97 movaps %xmm6, (TKEYP)
98 shufps $0b01001110, %xmm2, %xmm1
99 movaps %xmm1, 0x10(TKEYP)
100 add $0x20, TKEYP
101 RET
102SYM_FUNC_END(_key_expansion_192a)
103
104SYM_FUNC_START_LOCAL(_key_expansion_192b)
105 pshufd $0b01010101, %xmm1, %xmm1
106 shufps $0b00010000, %xmm0, %xmm4
107 pxor %xmm4, %xmm0
108 shufps $0b10001100, %xmm0, %xmm4
109 pxor %xmm4, %xmm0
110 pxor %xmm1, %xmm0
111
112 movaps %xmm2, %xmm5
113 pslldq $4, %xmm5
114 pshufd $0b11111111, %xmm0, %xmm3
115 pxor %xmm3, %xmm2
116 pxor %xmm5, %xmm2
117
118 movaps %xmm0, (TKEYP)
119 add $0x10, TKEYP
120 RET
121SYM_FUNC_END(_key_expansion_192b)
122
123SYM_FUNC_START_LOCAL(_key_expansion_256b)
124 pshufd $0b10101010, %xmm1, %xmm1
125 shufps $0b00010000, %xmm2, %xmm4
126 pxor %xmm4, %xmm2
127 shufps $0b10001100, %xmm2, %xmm4
128 pxor %xmm4, %xmm2
129 pxor %xmm1, %xmm2
130 movaps %xmm2, (TKEYP)
131 add $0x10, TKEYP
132 RET
133SYM_FUNC_END(_key_expansion_256b)
134
135/*
136 * void aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
137 * unsigned int key_len)
138 */
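/*
 * The 480(KEYP) and 240 offsets used below follow the layout assumed here for
 * struct crypto_aes_ctx (a minimal C sketch of that layout, matching
 * include/crypto/aes.h, where the largest expanded key is 15 * 16 = 240 bytes):
 *
 *	struct crypto_aes_ctx {
 *		u32 key_enc[60];	// 240 bytes of encryption round keys, offset 0
 *		u32 key_dec[60];	// 240 bytes of decryption round keys, offset 240
 *		u32 key_length;		// 16, 24 or 32, offset 480
 *	};
 *
 * aesni_set_key() fills key_enc with AESKEYGENASSIST below and then derives
 * key_dec from it with AESIMC in the .Ldec_key loop.
 */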
139SYM_FUNC_START(aesni_set_key)
140 FRAME_BEGIN
141#ifndef __x86_64__
142 pushl KEYP
143 movl (FRAME_OFFSET+8)(%esp), KEYP # ctx
144 movl (FRAME_OFFSET+12)(%esp), UKEYP # in_key
145 movl (FRAME_OFFSET+16)(%esp), %edx # key_len
146#endif
147 movups (UKEYP), %xmm0 # user key (first 16 bytes)
148 movaps %xmm0, (KEYP)
149 lea 0x10(KEYP), TKEYP # key addr
150 movl %edx, 480(KEYP)
151 pxor %xmm4, %xmm4 # xmm4 is assumed 0 in _key_expansion_x
152 cmp $24, %dl
153 jb .Lenc_key128
154 je .Lenc_key192
155 movups 0x10(UKEYP), %xmm2 # other user key
156 movaps %xmm2, (TKEYP)
157 add $0x10, TKEYP
158 aeskeygenassist $0x1, %xmm2, %xmm1 # round 1
159 call _key_expansion_256a
160 aeskeygenassist $0x1, %xmm0, %xmm1
161 call _key_expansion_256b
162 aeskeygenassist $0x2, %xmm2, %xmm1 # round 2
163 call _key_expansion_256a
164 aeskeygenassist $0x2, %xmm0, %xmm1
165 call _key_expansion_256b
166 aeskeygenassist $0x4, %xmm2, %xmm1 # round 3
167 call _key_expansion_256a
168 aeskeygenassist $0x4, %xmm0, %xmm1
169 call _key_expansion_256b
170 aeskeygenassist $0x8, %xmm2, %xmm1 # round 4
171 call _key_expansion_256a
172 aeskeygenassist $0x8, %xmm0, %xmm1
173 call _key_expansion_256b
174 aeskeygenassist $0x10, %xmm2, %xmm1 # round 5
175 call _key_expansion_256a
176 aeskeygenassist $0x10, %xmm0, %xmm1
177 call _key_expansion_256b
178 aeskeygenassist $0x20, %xmm2, %xmm1 # round 6
179 call _key_expansion_256a
180 aeskeygenassist $0x20, %xmm0, %xmm1
181 call _key_expansion_256b
182 aeskeygenassist $0x40, %xmm2, %xmm1 # round 7
183 call _key_expansion_256a
184 jmp .Ldec_key
185.Lenc_key192:
186 movq 0x10(UKEYP), %xmm2 # other user key
187 aeskeygenassist $0x1, %xmm2, %xmm1 # round 1
188 call _key_expansion_192a
189 aeskeygenassist $0x2, %xmm2, %xmm1 # round 2
190 call _key_expansion_192b
191 aeskeygenassist $0x4, %xmm2, %xmm1 # round 3
192 call _key_expansion_192a
193 aeskeygenassist $0x8, %xmm2, %xmm1 # round 4
194 call _key_expansion_192b
195 aeskeygenassist $0x10, %xmm2, %xmm1 # round 5
196 call _key_expansion_192a
197 aeskeygenassist $0x20, %xmm2, %xmm1 # round 6
198 call _key_expansion_192b
199 aeskeygenassist $0x40, %xmm2, %xmm1 # round 7
200 call _key_expansion_192a
201 aeskeygenassist $0x80, %xmm2, %xmm1 # round 8
202 call _key_expansion_192b
203 jmp .Ldec_key
204.Lenc_key128:
205 aeskeygenassist $0x1, %xmm0, %xmm1 # round 1
206 call _key_expansion_128
207 aeskeygenassist $0x2, %xmm0, %xmm1 # round 2
208 call _key_expansion_128
209 aeskeygenassist $0x4, %xmm0, %xmm1 # round 3
210 call _key_expansion_128
211 aeskeygenassist $0x8, %xmm0, %xmm1 # round 4
212 call _key_expansion_128
213 aeskeygenassist $0x10, %xmm0, %xmm1 # round 5
214 call _key_expansion_128
215 aeskeygenassist $0x20, %xmm0, %xmm1 # round 6
216 call _key_expansion_128
217 aeskeygenassist $0x40, %xmm0, %xmm1 # round 7
218 call _key_expansion_128
219 aeskeygenassist $0x80, %xmm0, %xmm1 # round 8
220 call _key_expansion_128
221 aeskeygenassist $0x1b, %xmm0, %xmm1 # round 9
222 call _key_expansion_128
223 aeskeygenassist $0x36, %xmm0, %xmm1 # round 10
224 call _key_expansion_128
225.Ldec_key:
226 sub $0x10, TKEYP
227 movaps (KEYP), %xmm0
228 movaps (TKEYP), %xmm1
229 movaps %xmm0, 240(TKEYP)
230 movaps %xmm1, 240(KEYP)
231 add $0x10, KEYP
232 lea 240-16(TKEYP), UKEYP
233.align 4
234.Ldec_key_loop:
235 movaps (KEYP), %xmm0
236 aesimc %xmm0, %xmm1
237 movaps %xmm1, (UKEYP)
238 add $0x10, KEYP
239 sub $0x10, UKEYP
240 cmp TKEYP, KEYP
241 jb .Ldec_key_loop
242#ifndef __x86_64__
243 popl KEYP
244#endif
245 FRAME_END
246 RET
247SYM_FUNC_END(aesni_set_key)
248
249/*
250 * void aesni_enc(const void *ctx, u8 *dst, const u8 *src)
251 */
252SYM_FUNC_START(aesni_enc)
253 FRAME_BEGIN
254#ifndef __x86_64__
255 pushl KEYP
256 pushl KLEN
257 movl (FRAME_OFFSET+12)(%esp), KEYP # ctx
258 movl (FRAME_OFFSET+16)(%esp), OUTP # dst
259 movl (FRAME_OFFSET+20)(%esp), INP # src
260#endif
261 movl 480(KEYP), KLEN # key length
262 movups (INP), STATE # input
263 call _aesni_enc1
264 movups STATE, (OUTP) # output
265#ifndef __x86_64__
266 popl KLEN
267 popl KEYP
268#endif
269 FRAME_END
270 RET
271SYM_FUNC_END(aesni_enc)
272
273/*
274 * _aesni_enc1: internal ABI
275 * input:
276 * KEYP: key struct pointer
277 * KLEN:		key length
278 * STATE: initial state (input)
279 * output:
280 * STATE:	final state (output)
281 * changed:
282 * KEY
283 * TKEYP (T1)
284 */
285SYM_FUNC_START_LOCAL(_aesni_enc1)
286 movaps (KEYP), KEY # key
287 mov KEYP, TKEYP
288 pxor KEY, STATE # round 0
289 add $0x30, TKEYP
290 cmp $24, KLEN
291 jb .Lenc128
292 lea 0x20(TKEYP), TKEYP
293 je .Lenc192
294 add $0x20, TKEYP
295 movaps -0x60(TKEYP), KEY
296 aesenc KEY, STATE
297 movaps -0x50(TKEYP), KEY
298 aesenc KEY, STATE
299.align 4
300.Lenc192:
301 movaps -0x40(TKEYP), KEY
302 aesenc KEY, STATE
303 movaps -0x30(TKEYP), KEY
304 aesenc KEY, STATE
305.align 4
306.Lenc128:
307 movaps -0x20(TKEYP), KEY
308 aesenc KEY, STATE
309 movaps -0x10(TKEYP), KEY
310 aesenc KEY, STATE
311 movaps (TKEYP), KEY
312 aesenc KEY, STATE
313 movaps 0x10(TKEYP), KEY
314 aesenc KEY, STATE
315 movaps 0x20(TKEYP), KEY
316 aesenc KEY, STATE
317 movaps 0x30(TKEYP), KEY
318 aesenc KEY, STATE
319 movaps 0x40(TKEYP), KEY
320 aesenc KEY, STATE
321 movaps 0x50(TKEYP), KEY
322 aesenc KEY, STATE
323 movaps 0x60(TKEYP), KEY
324 aesenc KEY, STATE
325 movaps 0x70(TKEYP), KEY
326 aesenclast KEY, STATE
327 RET
328SYM_FUNC_END(_aesni_enc1)
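/*
 * What _aesni_enc1 computes, as a C sketch using the standard AES-NI compiler
 * intrinsics from <wmmintrin.h> (an illustration only; rk points at the
 * expanded key schedule and nrounds is 10, 12 or 14 for the 16/24/32-byte key
 * length held in KLEN):
 *
 *	#include <wmmintrin.h>		// compile with -maes
 *
 *	static __m128i aes_enc_block(const __m128i *rk, int nrounds, __m128i state)
 *	{
 *		state = _mm_xor_si128(state, rk[0]);		// round 0
 *		for (int i = 1; i < nrounds; i++)
 *			state = _mm_aesenc_si128(state, rk[i]);	// rounds 1..n-1
 *		return _mm_aesenclast_si128(state, rk[nrounds]);	// final round
 *	}
 *
 * _aesni_dec1/_aesni_dec4 below are the mirror image, using AESDEC/AESDECLAST
 * over the inverse-mixed (key_dec) schedule.
 */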
329
330/*
331 * _aesni_enc4: internal ABI
332 * input:
333 * KEYP: key struct pointer
334 * KLEN:		key length
335 * STATE1: initial state (input)
336 * STATE2
337 * STATE3
338 * STATE4
339 * output:
340 * STATE1:	final state (output)
341 * STATE2
342 * STATE3
343 * STATE4
344 * changed:
345 * KEY
346 * TKEYP (T1)
347 */
348SYM_FUNC_START_LOCAL(_aesni_enc4)
349 movaps (KEYP), KEY # key
350 mov KEYP, TKEYP
351 pxor KEY, STATE1 # round 0
352 pxor KEY, STATE2
353 pxor KEY, STATE3
354 pxor KEY, STATE4
355 add $0x30, TKEYP
356 cmp $24, KLEN
357 jb .L4enc128
358 lea 0x20(TKEYP), TKEYP
359 je .L4enc192
360 add $0x20, TKEYP
361 movaps -0x60(TKEYP), KEY
362 aesenc KEY, STATE1
363 aesenc KEY, STATE2
364 aesenc KEY, STATE3
365 aesenc KEY, STATE4
366 movaps -0x50(TKEYP), KEY
367 aesenc KEY, STATE1
368 aesenc KEY, STATE2
369 aesenc KEY, STATE3
370 aesenc KEY, STATE4
371#.align 4
372.L4enc192:
373 movaps -0x40(TKEYP), KEY
374 aesenc KEY, STATE1
375 aesenc KEY, STATE2
376 aesenc KEY, STATE3
377 aesenc KEY, STATE4
378 movaps -0x30(TKEYP), KEY
379 aesenc KEY, STATE1
380 aesenc KEY, STATE2
381 aesenc KEY, STATE3
382 aesenc KEY, STATE4
383#.align 4
384.L4enc128:
385 movaps -0x20(TKEYP), KEY
386 aesenc KEY, STATE1
387 aesenc KEY, STATE2
388 aesenc KEY, STATE3
389 aesenc KEY, STATE4
390 movaps -0x10(TKEYP), KEY
391 aesenc KEY, STATE1
392 aesenc KEY, STATE2
393 aesenc KEY, STATE3
394 aesenc KEY, STATE4
395 movaps (TKEYP), KEY
396 aesenc KEY, STATE1
397 aesenc KEY, STATE2
398 aesenc KEY, STATE3
399 aesenc KEY, STATE4
400 movaps 0x10(TKEYP), KEY
401 aesenc KEY, STATE1
402 aesenc KEY, STATE2
403 aesenc KEY, STATE3
404 aesenc KEY, STATE4
405 movaps 0x20(TKEYP), KEY
406 aesenc KEY, STATE1
407 aesenc KEY, STATE2
408 aesenc KEY, STATE3
409 aesenc KEY, STATE4
410 movaps 0x30(TKEYP), KEY
411 aesenc KEY, STATE1
412 aesenc KEY, STATE2
413 aesenc KEY, STATE3
414 aesenc KEY, STATE4
415 movaps 0x40(TKEYP), KEY
416 aesenc KEY, STATE1
417 aesenc KEY, STATE2
418 aesenc KEY, STATE3
419 aesenc KEY, STATE4
420 movaps 0x50(TKEYP), KEY
421 aesenc KEY, STATE1
422 aesenc KEY, STATE2
423 aesenc KEY, STATE3
424 aesenc KEY, STATE4
425 movaps 0x60(TKEYP), KEY
426 aesenc KEY, STATE1
427 aesenc KEY, STATE2
428 aesenc KEY, STATE3
429 aesenc KEY, STATE4
430 movaps 0x70(TKEYP), KEY
431 aesenclast KEY, STATE1 # last round
432 aesenclast KEY, STATE2
433 aesenclast KEY, STATE3
434 aesenclast KEY, STATE4
435 RET
436SYM_FUNC_END(_aesni_enc4)
437
438/*
439 * void aesni_dec (const void *ctx, u8 *dst, const u8 *src)
440 */
441SYM_FUNC_START(aesni_dec)
442 FRAME_BEGIN
443#ifndef __x86_64__
444 pushl KEYP
445 pushl KLEN
446 movl (FRAME_OFFSET+12)(%esp), KEYP # ctx
447 movl (FRAME_OFFSET+16)(%esp), OUTP # dst
448 movl (FRAME_OFFSET+20)(%esp), INP # src
449#endif
450 mov 480(KEYP), KLEN # key length
451 add $240, KEYP
452 movups (INP), STATE # input
453 call _aesni_dec1
454	movups STATE, (OUTP)	# output
455#ifndef __x86_64__
456 popl KLEN
457 popl KEYP
458#endif
459 FRAME_END
460 RET
461SYM_FUNC_END(aesni_dec)
462
463/*
464 * _aesni_dec1: internal ABI
465 * input:
466 * KEYP: key struct pointer
467 * KLEN: key length
468 * STATE: initial state (input)
469 * output:
470 * STATE:	final state (output)
471 * changed:
472 * KEY
473 * TKEYP (T1)
474 */
475SYM_FUNC_START_LOCAL(_aesni_dec1)
476 movaps (KEYP), KEY # key
477 mov KEYP, TKEYP
478 pxor KEY, STATE # round 0
479 add $0x30, TKEYP
480 cmp $24, KLEN
481 jb .Ldec128
482 lea 0x20(TKEYP), TKEYP
483 je .Ldec192
484 add $0x20, TKEYP
485 movaps -0x60(TKEYP), KEY
486 aesdec KEY, STATE
487 movaps -0x50(TKEYP), KEY
488 aesdec KEY, STATE
489.align 4
490.Ldec192:
491 movaps -0x40(TKEYP), KEY
492 aesdec KEY, STATE
493 movaps -0x30(TKEYP), KEY
494 aesdec KEY, STATE
495.align 4
496.Ldec128:
497 movaps -0x20(TKEYP), KEY
498 aesdec KEY, STATE
499 movaps -0x10(TKEYP), KEY
500 aesdec KEY, STATE
501 movaps (TKEYP), KEY
502 aesdec KEY, STATE
503 movaps 0x10(TKEYP), KEY
504 aesdec KEY, STATE
505 movaps 0x20(TKEYP), KEY
506 aesdec KEY, STATE
507 movaps 0x30(TKEYP), KEY
508 aesdec KEY, STATE
509 movaps 0x40(TKEYP), KEY
510 aesdec KEY, STATE
511 movaps 0x50(TKEYP), KEY
512 aesdec KEY, STATE
513 movaps 0x60(TKEYP), KEY
514 aesdec KEY, STATE
515 movaps 0x70(TKEYP), KEY
516 aesdeclast KEY, STATE
517 RET
518SYM_FUNC_END(_aesni_dec1)
519
520/*
521 * _aesni_dec4: internal ABI
522 * input:
523 * KEYP: key struct pointer
524 * KLEN: key length
525 * STATE1: initial state (input)
526 * STATE2
527 * STATE3
528 * STATE4
529 * output:
530 * STATE1:	final state (output)
531 * STATE2
532 * STATE3
533 * STATE4
534 * changed:
535 * KEY
536 * TKEYP (T1)
537 */
538SYM_FUNC_START_LOCAL(_aesni_dec4)
539 movaps (KEYP), KEY # key
540 mov KEYP, TKEYP
541 pxor KEY, STATE1 # round 0
542 pxor KEY, STATE2
543 pxor KEY, STATE3
544 pxor KEY, STATE4
545 add $0x30, TKEYP
546 cmp $24, KLEN
547 jb .L4dec128
548 lea 0x20(TKEYP), TKEYP
549 je .L4dec192
550 add $0x20, TKEYP
551 movaps -0x60(TKEYP), KEY
552 aesdec KEY, STATE1
553 aesdec KEY, STATE2
554 aesdec KEY, STATE3
555 aesdec KEY, STATE4
556 movaps -0x50(TKEYP), KEY
557 aesdec KEY, STATE1
558 aesdec KEY, STATE2
559 aesdec KEY, STATE3
560 aesdec KEY, STATE4
561.align 4
562.L4dec192:
563 movaps -0x40(TKEYP), KEY
564 aesdec KEY, STATE1
565 aesdec KEY, STATE2
566 aesdec KEY, STATE3
567 aesdec KEY, STATE4
568 movaps -0x30(TKEYP), KEY
569 aesdec KEY, STATE1
570 aesdec KEY, STATE2
571 aesdec KEY, STATE3
572 aesdec KEY, STATE4
573.align 4
574.L4dec128:
575 movaps -0x20(TKEYP), KEY
576 aesdec KEY, STATE1
577 aesdec KEY, STATE2
578 aesdec KEY, STATE3
579 aesdec KEY, STATE4
580 movaps -0x10(TKEYP), KEY
581 aesdec KEY, STATE1
582 aesdec KEY, STATE2
583 aesdec KEY, STATE3
584 aesdec KEY, STATE4
585 movaps (TKEYP), KEY
586 aesdec KEY, STATE1
587 aesdec KEY, STATE2
588 aesdec KEY, STATE3
589 aesdec KEY, STATE4
590 movaps 0x10(TKEYP), KEY
591 aesdec KEY, STATE1
592 aesdec KEY, STATE2
593 aesdec KEY, STATE3
594 aesdec KEY, STATE4
595 movaps 0x20(TKEYP), KEY
596 aesdec KEY, STATE1
597 aesdec KEY, STATE2
598 aesdec KEY, STATE3
599 aesdec KEY, STATE4
600 movaps 0x30(TKEYP), KEY
601 aesdec KEY, STATE1
602 aesdec KEY, STATE2
603 aesdec KEY, STATE3
604 aesdec KEY, STATE4
605 movaps 0x40(TKEYP), KEY
606 aesdec KEY, STATE1
607 aesdec KEY, STATE2
608 aesdec KEY, STATE3
609 aesdec KEY, STATE4
610 movaps 0x50(TKEYP), KEY
611 aesdec KEY, STATE1
612 aesdec KEY, STATE2
613 aesdec KEY, STATE3
614 aesdec KEY, STATE4
615 movaps 0x60(TKEYP), KEY
616 aesdec KEY, STATE1
617 aesdec KEY, STATE2
618 aesdec KEY, STATE3
619 aesdec KEY, STATE4
620 movaps 0x70(TKEYP), KEY
621 aesdeclast KEY, STATE1 # last round
622 aesdeclast KEY, STATE2
623 aesdeclast KEY, STATE3
624 aesdeclast KEY, STATE4
625 RET
626SYM_FUNC_END(_aesni_dec4)
627
628/*
629 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
630 * size_t len)
631 */
632SYM_FUNC_START(aesni_ecb_enc)
633 FRAME_BEGIN
634#ifndef __x86_64__
635 pushl LEN
636 pushl KEYP
637 pushl KLEN
638 movl (FRAME_OFFSET+16)(%esp), KEYP # ctx
639 movl (FRAME_OFFSET+20)(%esp), OUTP # dst
640 movl (FRAME_OFFSET+24)(%esp), INP # src
641 movl (FRAME_OFFSET+28)(%esp), LEN # len
642#endif
643 test LEN, LEN # check length
644 jz .Lecb_enc_ret
645 mov 480(KEYP), KLEN
646 cmp $16, LEN
647 jb .Lecb_enc_ret
648 cmp $64, LEN
649 jb .Lecb_enc_loop1
650.align 4
651.Lecb_enc_loop4:
652 movups (INP), STATE1
653 movups 0x10(INP), STATE2
654 movups 0x20(INP), STATE3
655 movups 0x30(INP), STATE4
656 call _aesni_enc4
657 movups STATE1, (OUTP)
658 movups STATE2, 0x10(OUTP)
659 movups STATE3, 0x20(OUTP)
660 movups STATE4, 0x30(OUTP)
661 sub $64, LEN
662 add $64, INP
663 add $64, OUTP
664 cmp $64, LEN
665 jge .Lecb_enc_loop4
666 cmp $16, LEN
667 jb .Lecb_enc_ret
668.align 4
669.Lecb_enc_loop1:
670 movups (INP), STATE1
671 call _aesni_enc1
672 movups STATE1, (OUTP)
673 sub $16, LEN
674 add $16, INP
675 add $16, OUTP
676 cmp $16, LEN
677 jge .Lecb_enc_loop1
678.Lecb_enc_ret:
679#ifndef __x86_64__
680 popl KLEN
681 popl KEYP
682 popl LEN
683#endif
684 FRAME_END
685 RET
686SYM_FUNC_END(aesni_ecb_enc)
687
688/*
689 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
690 * size_t len);
691 */
692SYM_FUNC_START(aesni_ecb_dec)
693 FRAME_BEGIN
694#ifndef __x86_64__
695 pushl LEN
696 pushl KEYP
697 pushl KLEN
698 movl (FRAME_OFFSET+16)(%esp), KEYP # ctx
699 movl (FRAME_OFFSET+20)(%esp), OUTP # dst
700 movl (FRAME_OFFSET+24)(%esp), INP # src
701 movl (FRAME_OFFSET+28)(%esp), LEN # len
702#endif
703 test LEN, LEN
704 jz .Lecb_dec_ret
705 mov 480(KEYP), KLEN
706 add $240, KEYP
707 cmp $16, LEN
708 jb .Lecb_dec_ret
709 cmp $64, LEN
710 jb .Lecb_dec_loop1
711.align 4
712.Lecb_dec_loop4:
713 movups (INP), STATE1
714 movups 0x10(INP), STATE2
715 movups 0x20(INP), STATE3
716 movups 0x30(INP), STATE4
717 call _aesni_dec4
718 movups STATE1, (OUTP)
719 movups STATE2, 0x10(OUTP)
720 movups STATE3, 0x20(OUTP)
721 movups STATE4, 0x30(OUTP)
722 sub $64, LEN
723 add $64, INP
724 add $64, OUTP
725 cmp $64, LEN
726 jge .Lecb_dec_loop4
727 cmp $16, LEN
728 jb .Lecb_dec_ret
729.align 4
730.Lecb_dec_loop1:
731 movups (INP), STATE1
732 call _aesni_dec1
733 movups STATE1, (OUTP)
734 sub $16, LEN
735 add $16, INP
736 add $16, OUTP
737 cmp $16, LEN
738 jge .Lecb_dec_loop1
739.Lecb_dec_ret:
740#ifndef __x86_64__
741 popl KLEN
742 popl KEYP
743 popl LEN
744#endif
745 FRAME_END
746 RET
747SYM_FUNC_END(aesni_ecb_dec)
748
749/*
750 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
751 * size_t len, u8 *iv)
752 */
753SYM_FUNC_START(aesni_cbc_enc)
754 FRAME_BEGIN
755#ifndef __x86_64__
756 pushl IVP
757 pushl LEN
758 pushl KEYP
759 pushl KLEN
760 movl (FRAME_OFFSET+20)(%esp), KEYP # ctx
761 movl (FRAME_OFFSET+24)(%esp), OUTP # dst
762 movl (FRAME_OFFSET+28)(%esp), INP # src
763 movl (FRAME_OFFSET+32)(%esp), LEN # len
764 movl (FRAME_OFFSET+36)(%esp), IVP # iv
765#endif
766 cmp $16, LEN
767 jb .Lcbc_enc_ret
768 mov 480(KEYP), KLEN
769 movups (IVP), STATE # load iv as initial state
770.align 4
771.Lcbc_enc_loop:
772 movups (INP), IN # load input
773 pxor IN, STATE
774 call _aesni_enc1
775 movups STATE, (OUTP) # store output
776 sub $16, LEN
777 add $16, INP
778 add $16, OUTP
779 cmp $16, LEN
780 jge .Lcbc_enc_loop
781 movups STATE, (IVP)
782.Lcbc_enc_ret:
783#ifndef __x86_64__
784 popl KLEN
785 popl KEYP
786 popl LEN
787 popl IVP
788#endif
789 FRAME_END
790 RET
791SYM_FUNC_END(aesni_cbc_enc)
792
793/*
794 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
795 * size_t len, u8 *iv)
796 */
797SYM_FUNC_START(aesni_cbc_dec)
798 FRAME_BEGIN
799#ifndef __x86_64__
800 pushl IVP
801 pushl LEN
802 pushl KEYP
803 pushl KLEN
804 movl (FRAME_OFFSET+20)(%esp), KEYP # ctx
805 movl (FRAME_OFFSET+24)(%esp), OUTP # dst
806 movl (FRAME_OFFSET+28)(%esp), INP # src
807 movl (FRAME_OFFSET+32)(%esp), LEN # len
808 movl (FRAME_OFFSET+36)(%esp), IVP # iv
809#endif
810 cmp $16, LEN
811 jb .Lcbc_dec_just_ret
812 mov 480(KEYP), KLEN
813 add $240, KEYP
814 movups (IVP), IV
815 cmp $64, LEN
816 jb .Lcbc_dec_loop1
817.align 4
818.Lcbc_dec_loop4:
819 movups (INP), IN1
820 movaps IN1, STATE1
821 movups 0x10(INP), IN2
822 movaps IN2, STATE2
823#ifdef __x86_64__
824 movups 0x20(INP), IN3
825 movaps IN3, STATE3
826 movups 0x30(INP), IN4
827 movaps IN4, STATE4
828#else
829 movups 0x20(INP), IN1
830 movaps IN1, STATE3
831 movups 0x30(INP), IN2
832 movaps IN2, STATE4
833#endif
834 call _aesni_dec4
835 pxor IV, STATE1
836#ifdef __x86_64__
837 pxor IN1, STATE2
838 pxor IN2, STATE3
839 pxor IN3, STATE4
840 movaps IN4, IV
841#else
842 pxor IN1, STATE4
843 movaps IN2, IV
844 movups (INP), IN1
845 pxor IN1, STATE2
846 movups 0x10(INP), IN2
847 pxor IN2, STATE3
848#endif
849 movups STATE1, (OUTP)
850 movups STATE2, 0x10(OUTP)
851 movups STATE3, 0x20(OUTP)
852 movups STATE4, 0x30(OUTP)
853 sub $64, LEN
854 add $64, INP
855 add $64, OUTP
856 cmp $64, LEN
857 jge .Lcbc_dec_loop4
858 cmp $16, LEN
859 jb .Lcbc_dec_ret
860.align 4
861.Lcbc_dec_loop1:
862 movups (INP), IN
863 movaps IN, STATE
864 call _aesni_dec1
865 pxor IV, STATE
866 movups STATE, (OUTP)
867 movaps IN, IV
868 sub $16, LEN
869 add $16, INP
870 add $16, OUTP
871 cmp $16, LEN
872 jge .Lcbc_dec_loop1
873.Lcbc_dec_ret:
874 movups IV, (IVP)
875.Lcbc_dec_just_ret:
876#ifndef __x86_64__
877 popl KLEN
878 popl KEYP
879 popl LEN
880 popl IVP
881#endif
882 FRAME_END
883 RET
884SYM_FUNC_END(aesni_cbc_dec)
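/*
 * The loop above keeps copies of the ciphertext blocks (IN1..IN4/IN) because
 * CBC decryption needs the previous ciphertext block both as the XOR value
 * for the next plaintext and as the chaining value returned in *iv, and dst
 * may alias src. A logically equivalent C sketch (dec_block stands for
 * one-block AES decryption; the names are illustrative only):
 *
 *	void cbc_dec_sketch(u8 *dst, const u8 *src, unsigned int nblocks,
 *			    u8 iv[16],
 *			    void (*dec_block)(u8 out[16], const u8 in[16]))
 *	{
 *		u8 prev[16], cur[16];
 *		int j;
 *
 *		memcpy(prev, iv, 16);
 *		while (nblocks--) {
 *			memcpy(cur, src, 16);		// save Ci before dst is written
 *			dec_block(dst, cur);
 *			for (j = 0; j < 16; j++)
 *				dst[j] ^= prev[j];	// Pi = Dec(Ci) ^ C(i-1)
 *			memcpy(prev, cur, 16);		// Ci chains into the next block
 *			src += 16;
 *			dst += 16;
 *		}
 *		memcpy(iv, prev, 16);			// next IV for the caller
 *	}
 */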
885
886/*
887 * void aesni_cts_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
888 * size_t len, u8 *iv)
889 */
890SYM_FUNC_START(aesni_cts_cbc_enc)
891 FRAME_BEGIN
892#ifndef __x86_64__
893 pushl IVP
894 pushl LEN
895 pushl KEYP
896 pushl KLEN
897 movl (FRAME_OFFSET+20)(%esp), KEYP # ctx
898 movl (FRAME_OFFSET+24)(%esp), OUTP # dst
899 movl (FRAME_OFFSET+28)(%esp), INP # src
900 movl (FRAME_OFFSET+32)(%esp), LEN # len
901 movl (FRAME_OFFSET+36)(%esp), IVP # iv
902 lea .Lcts_permute_table, T1
903#else
904 lea .Lcts_permute_table(%rip), T1
905#endif
906 mov 480(KEYP), KLEN
907 movups (IVP), STATE
908 sub $16, LEN
909 mov T1, IVP
910 add $32, IVP
911 add LEN, T1
912 sub LEN, IVP
913 movups (T1), %xmm4
914 movups (IVP), %xmm5
915
916 movups (INP), IN1
917 add LEN, INP
918 movups (INP), IN2
919
920 pxor IN1, STATE
921 call _aesni_enc1
922
923 pshufb %xmm5, IN2
924 pxor STATE, IN2
925 pshufb %xmm4, STATE
926 add OUTP, LEN
927 movups STATE, (LEN)
928
929 movaps IN2, STATE
930 call _aesni_enc1
931 movups STATE, (OUTP)
932
933#ifndef __x86_64__
934 popl KLEN
935 popl KEYP
936 popl LEN
937 popl IVP
938#endif
939 FRAME_END
940 RET
941SYM_FUNC_END(aesni_cts_cbc_enc)
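/*
 * aesni_cts_cbc_enc() performs the final ciphertext-stealing step of CBC-CS3:
 * it is called with only the last 16 < len <= 32 bytes, after the caller has
 * CBC-encrypted everything before them and left the chaining value in *iv.
 * A logical C sketch of that step (enc_block stands for one-block AES
 * encryption; the pshufb masks above perform the byte moves done here with
 * memcpy):
 *
 *	void cts_cbc_enc_tail_sketch(u8 *dst, const u8 *src, unsigned int len,
 *				     const u8 iv[16],
 *				     void (*enc_block)(u8 out[16], const u8 in[16]))
 *	{
 *		unsigned int tail = len - 16;	// bytes in the short final block
 *		u8 x[16], cprime[16], last[16] = {};
 *		int j;
 *
 *		for (j = 0; j < 16; j++)	// C' = E(P[n-1] ^ IV)
 *			x[j] = src[j] ^ iv[j];
 *		enc_block(cprime, x);
 *
 *		memcpy(last, src + 16, tail);	// P[n], zero padded
 *		for (j = 0; j < 16; j++)
 *			last[j] ^= cprime[j];
 *
 *		memcpy(dst + 16, cprime, tail);	// C[n]   = head of C' (short)
 *		enc_block(dst, last);		// C[n-1] = E(pad(P[n]) ^ C')
 *	}
 */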
942
943/*
944 * void aesni_cts_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
945 * size_t len, u8 *iv)
946 */
947SYM_FUNC_START(aesni_cts_cbc_dec)
948 FRAME_BEGIN
949#ifndef __x86_64__
950 pushl IVP
951 pushl LEN
952 pushl KEYP
953 pushl KLEN
954 movl (FRAME_OFFSET+20)(%esp), KEYP # ctx
955 movl (FRAME_OFFSET+24)(%esp), OUTP # dst
956 movl (FRAME_OFFSET+28)(%esp), INP # src
957 movl (FRAME_OFFSET+32)(%esp), LEN # len
958 movl (FRAME_OFFSET+36)(%esp), IVP # iv
959 lea .Lcts_permute_table, T1
960#else
961 lea .Lcts_permute_table(%rip), T1
962#endif
963 mov 480(KEYP), KLEN
964 add $240, KEYP
965 movups (IVP), IV
966 sub $16, LEN
967 mov T1, IVP
968 add $32, IVP
969 add LEN, T1
970 sub LEN, IVP
971 movups (T1), %xmm4
972
973 movups (INP), STATE
974 add LEN, INP
975 movups (INP), IN1
976
977 call _aesni_dec1
978 movaps STATE, IN2
979 pshufb %xmm4, STATE
980 pxor IN1, STATE
981
982 add OUTP, LEN
983 movups STATE, (LEN)
984
985 movups (IVP), %xmm0
986 pshufb %xmm0, IN1
987 pblendvb IN2, IN1
988 movaps IN1, STATE
989 call _aesni_dec1
990
991 pxor IV, STATE
992 movups STATE, (OUTP)
993
994#ifndef __x86_64__
995 popl KLEN
996 popl KEYP
997 popl LEN
998 popl IVP
999#endif
1000 FRAME_END
1001 RET
1002SYM_FUNC_END(aesni_cts_cbc_dec)
1003
1004.pushsection .rodata
1005.align 16
1006.Lcts_permute_table:
1007 .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
1008 .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
1009 .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
1010 .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
1011 .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
1012 .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
1013#ifdef __x86_64__
1014.Lbswap_mask:
1015 .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
1016#endif
1017.popsection
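/*
 * .Lcts_permute_table is sliced at byte offset k or 32 - k (k = length of the
 * short final block) to build pshufb masks: a mask byte with the top bit set
 * selects zero, any other value selects that source byte. A small C model of
 * pshufb for reference (a sketch; the mask bytes are exactly the table
 * entries above):
 *
 *	static void pshufb_model(u8 out[16], const u8 in[16], const u8 mask[16])
 *	{
 *		for (int i = 0; i < 16; i++)
 *			out[i] = (mask[i] & 0x80) ? 0 : in[mask[i] & 0x0f];
 *	}
 *
 * With this table, the mask at offset k moves in[0..k-1] up to out[16-k..15]
 * and zero-fills the low bytes, while the mask at offset 32 - k moves
 * in[16-k..15] down to out[0..k-1]; the CTS code uses that pair to splice the
 * short final block.
 */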
1018
1019#ifdef __x86_64__
1020/*
1021 * _aesni_inc_init: internal ABI
1022 * setup registers used by _aesni_inc
1023 * input:
1024 * IV
1025 * output:
1026 * CTR: == IV, in little endian
1027 * TCTR_LOW: == lower qword of CTR
1028 * INC: == 1, in little endian
1029 * BSWAP_MASK == endian swapping mask
1030 */
1031SYM_FUNC_START_LOCAL(_aesni_inc_init)
1032 movaps .Lbswap_mask(%rip), BSWAP_MASK
1033 movaps IV, CTR
1034 pshufb BSWAP_MASK, CTR
1035 mov $1, TCTR_LOW
1036 movq TCTR_LOW, INC
1037 movq CTR, TCTR_LOW
1038 RET
1039SYM_FUNC_END(_aesni_inc_init)
1040
1041/*
1042 * _aesni_inc: internal ABI
1043 * Increase IV by 1, IV is in big endian
1044 * input:
1045 * IV
1046 * CTR: == IV, in little endian
1047 * TCTR_LOW: == lower qword of CTR
1048 * INC: == 1, in little endian
1049 * BSWAP_MASK == endian swapping mask
1050 * output:
1051 * IV: Increase by 1
1052 * changed:
1053 * CTR: == output IV, in little endian
1054 * TCTR_LOW: == lower qword of CTR
1055 */
1056SYM_FUNC_START_LOCAL(_aesni_inc)
1057 paddq INC, CTR
1058 add $1, TCTR_LOW
1059 jnc .Linc_low
1060 pslldq $8, INC
1061 paddq INC, CTR
1062 psrldq $8, INC
1063.Linc_low:
1064 movaps CTR, IV
1065 pshufb BSWAP_MASK, IV
1066 RET
1067SYM_FUNC_END(_aesni_inc)
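/*
 * Together, _aesni_inc_init and _aesni_inc maintain the CTR block: the IV is
 * byte-swapped once into little endian (CTR), the low quadword is incremented
 * alongside TCTR_LOW, and the pslldq/paddq/psrldq sequence propagates a carry
 * into the high quadword before swapping back. The net effect is a plain
 * big-endian 128-bit increment, sketched in C as:
 *
 *	static void ctr_inc_be128_sketch(u8 ctr[16])
 *	{
 *		int i;
 *
 *		for (i = 15; i >= 0; i--)
 *			if (++ctr[i] != 0)	// stop when a byte does not wrap
 *				break;
 *	}
 */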
1068
1069/*
1070 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
1071 * size_t len, u8 *iv)
1072 */
1073SYM_FUNC_START(aesni_ctr_enc)
1074 FRAME_BEGIN
1075 cmp $16, LEN
1076 jb .Lctr_enc_just_ret
1077 mov 480(KEYP), KLEN
1078 movups (IVP), IV
1079 call _aesni_inc_init
1080 cmp $64, LEN
1081 jb .Lctr_enc_loop1
1082.align 4
1083.Lctr_enc_loop4:
1084 movaps IV, STATE1
1085 call _aesni_inc
1086 movups (INP), IN1
1087 movaps IV, STATE2
1088 call _aesni_inc
1089 movups 0x10(INP), IN2
1090 movaps IV, STATE3
1091 call _aesni_inc
1092 movups 0x20(INP), IN3
1093 movaps IV, STATE4
1094 call _aesni_inc
1095 movups 0x30(INP), IN4
1096 call _aesni_enc4
1097 pxor IN1, STATE1
1098 movups STATE1, (OUTP)
1099 pxor IN2, STATE2
1100 movups STATE2, 0x10(OUTP)
1101 pxor IN3, STATE3
1102 movups STATE3, 0x20(OUTP)
1103 pxor IN4, STATE4
1104 movups STATE4, 0x30(OUTP)
1105 sub $64, LEN
1106 add $64, INP
1107 add $64, OUTP
1108 cmp $64, LEN
1109 jge .Lctr_enc_loop4
1110 cmp $16, LEN
1111 jb .Lctr_enc_ret
1112.align 4
1113.Lctr_enc_loop1:
1114 movaps IV, STATE
1115 call _aesni_inc
1116 movups (INP), IN
1117 call _aesni_enc1
1118 pxor IN, STATE
1119 movups STATE, (OUTP)
1120 sub $16, LEN
1121 add $16, INP
1122 add $16, OUTP
1123 cmp $16, LEN
1124 jge .Lctr_enc_loop1
1125.Lctr_enc_ret:
1126 movups IV, (IVP)
1127.Lctr_enc_just_ret:
1128 FRAME_END
1129 RET
1130SYM_FUNC_END(aesni_ctr_enc)
1131
1132#endif
1133
1134.section .rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16
1135.align 16
1136.Lgf128mul_x_ble_mask:
1137 .octa 0x00000000000000010000000000000087
1138.previous
1139
1140/*
1141 * _aesni_gf128mul_x_ble: Multiply in GF(2^128) for XTS IVs
1142 * input:
1143 * IV: current IV
1144 * GF128MUL_MASK == mask with 0x87 and 0x01
1145 * output:
1146 * IV: next IV
1147 * changed:
1148 * KEY: == temporary value
1149 */
1150.macro _aesni_gf128mul_x_ble
1151 pshufd $0x13, IV, KEY
1152 paddq IV, IV
1153 psrad $31, KEY
1154 pand GF128MUL_MASK, KEY
1155 pxor KEY, IV
1156.endm
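/*
 * The macro above doubles the XTS tweak in GF(2^128) using the little-endian
 * block ("ble") convention: shift the 128-bit value left one bit and, when a
 * bit falls off the top, fold it back in as 0x87 (reduction by
 * x^128 + x^7 + x^2 + x + 1). The pshufd/psrad/pand steps build the
 * conditional masks that both carry the low quadword's top bit into the high
 * quadword and apply the 0x87 reduction. A C sketch on two 64-bit halves
 * (lo/hi are the tweak's little-endian quadwords; the names are illustrative):
 *
 *	struct le128_sketch { u64 lo, hi; };
 *
 *	static void gf128mul_x_ble_sketch(struct le128_sketch *t)
 *	{
 *		u64 carry = t->hi >> 63;	// bit shifted out of the top
 *
 *		t->hi = (t->hi << 1) | (t->lo >> 63);
 *		t->lo = (t->lo << 1) ^ (carry * 0x87);
 *	}
 */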
1157
1158.macro _aesni_xts_crypt enc
1159 FRAME_BEGIN
1160#ifndef __x86_64__
1161 pushl IVP
1162 pushl LEN
1163 pushl KEYP
1164 pushl KLEN
1165 movl (FRAME_OFFSET+20)(%esp), KEYP # ctx
1166 movl (FRAME_OFFSET+24)(%esp), OUTP # dst
1167 movl (FRAME_OFFSET+28)(%esp), INP # src
1168 movl (FRAME_OFFSET+32)(%esp), LEN # len
1169 movl (FRAME_OFFSET+36)(%esp), IVP # iv
1170 movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
1171#else
1172 movdqa .Lgf128mul_x_ble_mask(%rip), GF128MUL_MASK
1173#endif
1174 movups (IVP), IV
1175
1176 mov 480(KEYP), KLEN
1177.if !\enc
1178 add $240, KEYP
1179
1180 test $15, LEN
1181 jz .Lxts_loop4\@
1182 sub $16, LEN
1183.endif
1184
1185.Lxts_loop4\@:
1186 sub $64, LEN
1187 jl .Lxts_1x\@
1188
1189 movdqa IV, STATE1
1190 movdqu 0x00(INP), IN
1191 pxor IN, STATE1
1192 movdqu IV, 0x00(OUTP)
1193
1194 _aesni_gf128mul_x_ble
1195 movdqa IV, STATE2
1196 movdqu 0x10(INP), IN
1197 pxor IN, STATE2
1198 movdqu IV, 0x10(OUTP)
1199
1200 _aesni_gf128mul_x_ble
1201 movdqa IV, STATE3
1202 movdqu 0x20(INP), IN
1203 pxor IN, STATE3
1204 movdqu IV, 0x20(OUTP)
1205
1206 _aesni_gf128mul_x_ble
1207 movdqa IV, STATE4
1208 movdqu 0x30(INP), IN
1209 pxor IN, STATE4
1210 movdqu IV, 0x30(OUTP)
1211
1212.if \enc
1213 call _aesni_enc4
1214.else
1215 call _aesni_dec4
1216.endif
1217
1218 movdqu 0x00(OUTP), IN
1219 pxor IN, STATE1
1220 movdqu STATE1, 0x00(OUTP)
1221
1222 movdqu 0x10(OUTP), IN
1223 pxor IN, STATE2
1224 movdqu STATE2, 0x10(OUTP)
1225
1226 movdqu 0x20(OUTP), IN
1227 pxor IN, STATE3
1228 movdqu STATE3, 0x20(OUTP)
1229
1230 movdqu 0x30(OUTP), IN
1231 pxor IN, STATE4
1232 movdqu STATE4, 0x30(OUTP)
1233
1234 _aesni_gf128mul_x_ble
1235
1236 add $64, INP
1237 add $64, OUTP
1238 test LEN, LEN
1239 jnz .Lxts_loop4\@
1240
1241.Lxts_ret_iv\@:
1242 movups IV, (IVP)
1243
1244.Lxts_ret\@:
1245#ifndef __x86_64__
1246 popl KLEN
1247 popl KEYP
1248 popl LEN
1249 popl IVP
1250#endif
1251 FRAME_END
1252 RET
1253
1254.Lxts_1x\@:
1255 add $64, LEN
1256 jz .Lxts_ret_iv\@
1257.if \enc
1258 sub $16, LEN
1259 jl .Lxts_cts4\@
1260.endif
1261
1262.Lxts_loop1\@:
1263 movdqu (INP), STATE
1264.if \enc
1265 pxor IV, STATE
1266 call _aesni_enc1
1267.else
1268 add $16, INP
1269 sub $16, LEN
1270 jl .Lxts_cts1\@
1271 pxor IV, STATE
1272 call _aesni_dec1
1273.endif
1274 pxor IV, STATE
1275 _aesni_gf128mul_x_ble
1276
1277 test LEN, LEN
1278 jz .Lxts_out\@
1279
1280.if \enc
1281 add $16, INP
1282 sub $16, LEN
1283 jl .Lxts_cts1\@
1284.endif
1285
1286 movdqu STATE, (OUTP)
1287 add $16, OUTP
1288 jmp .Lxts_loop1\@
1289
1290.Lxts_out\@:
1291 movdqu STATE, (OUTP)
1292 jmp .Lxts_ret_iv\@
1293
1294.if \enc
1295.Lxts_cts4\@:
1296 movdqa STATE4, STATE
1297 sub $16, OUTP
1298.Lxts_cts1\@:
1299.else
1300.Lxts_cts1\@:
1301 movdqa IV, STATE4
1302 _aesni_gf128mul_x_ble
1303
1304 pxor IV, STATE
1305 call _aesni_dec1
1306 pxor IV, STATE
1307.endif
1308#ifndef __x86_64__
1309 lea .Lcts_permute_table, T1
1310#else
1311 lea .Lcts_permute_table(%rip), T1
1312#endif
1313 add LEN, INP /* rewind input pointer */
1314 add $16, LEN /* # bytes in final block */
1315 movups (INP), IN1
1316
1317 mov T1, IVP
1318 add $32, IVP
1319 add LEN, T1
1320 sub LEN, IVP
1321 add OUTP, LEN
1322
1323 movups (T1), %xmm4
1324 movaps STATE, IN2
1325 pshufb %xmm4, STATE
1326 movups STATE, (LEN)
1327
1328 movups (IVP), %xmm0
1329 pshufb %xmm0, IN1
1330 pblendvb IN2, IN1
1331 movaps IN1, STATE
1332
1333.if \enc
1334 pxor IV, STATE
1335 call _aesni_enc1
1336 pxor IV, STATE
1337.else
1338 pxor STATE4, STATE
1339 call _aesni_dec1
1340 pxor STATE4, STATE
1341.endif
1342
1343 movups STATE, (OUTP)
1344 jmp .Lxts_ret\@
1345.endm
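/*
 * The _aesni_xts_crypt macro walks the input in 64-byte chunks: each block is
 * XORed with the current tweak before and after the block cipher, and the
 * tweak is advanced with _aesni_gf128mul_x_ble between blocks; a trailing
 * partial block is handled by ciphertext stealing via .Lcts_permute_table.
 * A C sketch of the full-block path (crypt_block stands for one-block AES
 * encryption or decryption; it reuses the gf128mul_x_ble_sketch helper from
 * the note above):
 *
 *	static void xts_full_blocks_sketch(u8 *dst, const u8 *src,
 *					   unsigned int nblocks,
 *					   struct le128_sketch *tweak,
 *					   void (*crypt_block)(u8 out[16], const u8 in[16]))
 *	{
 *		const u8 *t = (const u8 *)tweak;	// 16 tweak bytes, little endian
 *		u8 x[16], y[16];
 *		int j;
 *
 *		while (nblocks--) {
 *			for (j = 0; j < 16; j++)
 *				x[j] = src[j] ^ t[j];
 *			crypt_block(y, x);
 *			for (j = 0; j < 16; j++)
 *				dst[j] = y[j] ^ t[j];
 *			gf128mul_x_ble_sketch(tweak);	// next tweak
 *			src += 16;
 *			dst += 16;
 *		}
 *	}
 */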
1346
1347/*
1348 * void aesni_xts_enc(const struct crypto_aes_ctx *ctx, u8 *dst,
1349 * const u8 *src, unsigned int len, le128 *iv)
1350 */
1351SYM_FUNC_START(aesni_xts_enc)
1352 _aesni_xts_crypt 1
1353SYM_FUNC_END(aesni_xts_enc)
1354
1355/*
1356 * void aesni_xts_dec(const struct crypto_aes_ctx *ctx, u8 *dst,
1357 * const u8 *src, unsigned int len, le128 *iv)
1358 */
1359SYM_FUNC_START(aesni_xts_dec)
1360 _aesni_xts_crypt 0
1361SYM_FUNC_END(aesni_xts_dec)
1/*
2 * Implement AES algorithm in Intel AES-NI instructions.
3 *
4 * The white paper of AES-NI instructions can be downloaded from:
5 * http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
6 *
7 * Copyright (C) 2008, Intel Corp.
8 * Author: Huang Ying <ying.huang@intel.com>
9 * Vinodh Gopal <vinodh.gopal@intel.com>
10 * Kahraman Akdemir
11 *
12 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
13 * interface for 64-bit kernels.
14 * Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
15 * Aidan O'Mahony (aidan.o.mahony@intel.com)
16 * Adrian Hoban <adrian.hoban@intel.com>
17 * James Guilford (james.guilford@intel.com)
18 * Gabriele Paoloni <gabriele.paoloni@intel.com>
19 * Tadeusz Struk (tadeusz.struk@intel.com)
20 * Wajdi Feghali (wajdi.k.feghali@intel.com)
21 * Copyright (c) 2010, Intel Corporation.
22 *
23 * Ported x86_64 version to x86:
24 * Author: Mathias Krause <minipli@googlemail.com>
25 *
26 * This program is free software; you can redistribute it and/or modify
27 * it under the terms of the GNU General Public License as published by
28 * the Free Software Foundation; either version 2 of the License, or
29 * (at your option) any later version.
30 */
31
32#include <linux/linkage.h>
33#include <asm/inst.h>
34
35#ifdef __x86_64__
36.data
37.align 16
38.Lgf128mul_x_ble_mask:
39 .octa 0x00000000000000010000000000000087
40
41POLY: .octa 0xC2000000000000000000000000000001
42TWOONE: .octa 0x00000001000000000000000000000001
43
44# order of these constants should not change.
45# more specifically, ALL_F should follow SHIFT_MASK,
46# and ZERO should follow ALL_F
47
48SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F
49MASK1: .octa 0x0000000000000000ffffffffffffffff
50MASK2: .octa 0xffffffffffffffff0000000000000000
51SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
52ALL_F: .octa 0xffffffffffffffffffffffffffffffff
53ZERO: .octa 0x00000000000000000000000000000000
54ONE: .octa 0x00000000000000000000000000000001
55F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
56dec: .octa 0x1
57enc: .octa 0x2
58
59
60.text
61
62
63#define STACK_OFFSET 8*3
64#define HashKey 16*0 // store HashKey <<1 mod poly here
65#define HashKey_2 16*1 // store HashKey^2 <<1 mod poly here
66#define HashKey_3 16*2 // store HashKey^3 <<1 mod poly here
67#define HashKey_4 16*3 // store HashKey^4 <<1 mod poly here
68#define HashKey_k 16*4 // store XOR of High 64 bits and Low 64
69 // bits of HashKey <<1 mod poly here
70 //(for Karatsuba purposes)
71#define HashKey_2_k 16*5 // store XOR of High 64 bits and Low 64
72 // bits of HashKey^2 <<1 mod poly here
73 // (for Karatsuba purposes)
74#define HashKey_3_k 16*6 // store XOR of High 64 bits and Low 64
75 // bits of HashKey^3 <<1 mod poly here
76 // (for Karatsuba purposes)
77#define HashKey_4_k 16*7 // store XOR of High 64 bits and Low 64
78 // bits of HashKey^4 <<1 mod poly here
79 // (for Karatsuba purposes)
80#define VARIABLE_OFFSET 16*8
81
82#define arg1 rdi
83#define arg2 rsi
84#define arg3 rdx
85#define arg4 rcx
86#define arg5 r8
87#define arg6 r9
88#define arg7 STACK_OFFSET+8(%r14)
89#define arg8 STACK_OFFSET+16(%r14)
90#define arg9 STACK_OFFSET+24(%r14)
91#define arg10 STACK_OFFSET+32(%r14)
92#endif
93
94
95#define STATE1 %xmm0
96#define STATE2 %xmm4
97#define STATE3 %xmm5
98#define STATE4 %xmm6
99#define STATE STATE1
100#define IN1 %xmm1
101#define IN2 %xmm7
102#define IN3 %xmm8
103#define IN4 %xmm9
104#define IN IN1
105#define KEY %xmm2
106#define IV %xmm3
107
108#define BSWAP_MASK %xmm10
109#define CTR %xmm11
110#define INC %xmm12
111
112#define GF128MUL_MASK %xmm10
113
114#ifdef __x86_64__
115#define AREG %rax
116#define KEYP %rdi
117#define OUTP %rsi
118#define UKEYP OUTP
119#define INP %rdx
120#define LEN %rcx
121#define IVP %r8
122#define KLEN %r9d
123#define T1 %r10
124#define TKEYP T1
125#define T2 %r11
126#define TCTR_LOW T2
127#else
128#define AREG %eax
129#define KEYP %edi
130#define OUTP AREG
131#define UKEYP OUTP
132#define INP %edx
133#define LEN %esi
134#define IVP %ebp
135#define KLEN %ebx
136#define T1 %ecx
137#define TKEYP T1
138#endif
139
140
141#ifdef __x86_64__
142/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
143*
144*
145* Input: A and B (128-bits each, bit-reflected)
146* Output: C = A*B*x mod poly, (i.e. >>1 )
147* To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
148* GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
149*
150*/
151.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
152 movdqa \GH, \TMP1
153 pshufd $78, \GH, \TMP2
154 pshufd $78, \HK, \TMP3
155 pxor \GH, \TMP2 # TMP2 = a1+a0
156 pxor \HK, \TMP3 # TMP3 = b1+b0
157 PCLMULQDQ 0x11, \HK, \TMP1 # TMP1 = a1*b1
158 PCLMULQDQ 0x00, \HK, \GH # GH = a0*b0
159 PCLMULQDQ 0x00, \TMP3, \TMP2 # TMP2 = (a0+a1)*(b1+b0)
160 pxor \GH, \TMP2
161	pxor	  \TMP1, \TMP2		# TMP2 = (a1*b0)+(a0*b1)
162 movdqa \TMP2, \TMP3
163 pslldq $8, \TMP3 # left shift TMP3 2 DWs
164 psrldq $8, \TMP2 # right shift TMP2 2 DWs
165 pxor \TMP3, \GH
166	pxor	  \TMP2, \TMP1		# TMP1:GH holds the result of GH*HK
167
168 # first phase of the reduction
169
170 movdqa \GH, \TMP2
171 movdqa \GH, \TMP3
172 movdqa \GH, \TMP4 # copy GH into TMP2,TMP3 and TMP4
173					# in order to perform
174 # independent shifts
175	pslld	  $31, \TMP2		# packed left shift <<31
176	pslld	  $30, \TMP3		# packed left shift <<30
177	pslld	  $25, \TMP4		# packed left shift <<25
178 pxor \TMP3, \TMP2 # xor the shifted versions
179 pxor \TMP4, \TMP2
180 movdqa \TMP2, \TMP5
181 psrldq $4, \TMP5 # right shift TMP5 1 DW
182 pslldq $12, \TMP2 # left shift TMP2 3 DWs
183 pxor \TMP2, \GH
184
185 # second phase of the reduction
186
187 movdqa \GH,\TMP2 # copy GH into TMP2,TMP3 and TMP4
188					# in order to perform
189 # independent shifts
190 movdqa \GH,\TMP3
191 movdqa \GH,\TMP4
192	psrld	  $1,\TMP2		# packed right shift >>1
193	psrld	  $2,\TMP3		# packed right shift >>2
194	psrld	  $7,\TMP4		# packed right shift >>7
195 pxor \TMP3,\TMP2 # xor the shifted versions
196 pxor \TMP4,\TMP2
197 pxor \TMP5, \TMP2
198 pxor \TMP2, \GH
199	pxor	  \TMP1, \GH		# result is in GH
200.endm
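/*
* GHASH_MUL above uses one level of Karatsuba so that only three carry-less
* multiplications (PCLMULQDQ) are needed per GHASH step. With A = a1*x^64 + a0
* and B = b1*x^64 + b0 (coefficients in GF(2), so "+" is XOR):
*
*	A*B = a1*b1*x^128 + ((a1+a0)*(b1+b0) + a1*b1 + a0*b0)*x^64 + a0*b0
*
* The three products are exactly the PCLMULQDQ 0x11, PCLMULQDQ 0x00 and
* (a1+a0)*(b1+b0) results combined above, followed by the two-phase reduction
* modulo the bit-reflected GHASH polynomial.
*/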
201
202/*
203* if a = number of total plaintext bytes
204* b = floor(a/16)
205* num_initial_blocks = b mod 4
206* decrypt the initial num_initial_blocks blocks and apply ghash on
207* the ciphertext
208* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
209* are clobbered
210* %arg1, %arg2, %arg3 and %r14 are used as pointers only, not modified
211*/
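/*
* For example: a = 100 plaintext bytes gives b = floor(100/16) = 6 full blocks
* and num_initial_blocks = 6 mod 4 = 2, so two blocks are handled by this macro
* before the 4-blocks-at-a-time loop takes over.
*/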
212
213
214.macro INITIAL_BLOCKS_DEC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
215XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
216 mov arg7, %r10 # %r10 = AAD
217 mov arg8, %r12 # %r12 = aadLen
218 mov %r12, %r11
219 pxor %xmm\i, %xmm\i
220_get_AAD_loop\num_initial_blocks\operation:
221 movd (%r10), \TMP1
222 pslldq $12, \TMP1
223 psrldq $4, %xmm\i
224 pxor \TMP1, %xmm\i
225 add $4, %r10
226 sub $4, %r12
227 jne _get_AAD_loop\num_initial_blocks\operation
228 cmp $16, %r11
229 je _get_AAD_loop2_done\num_initial_blocks\operation
230 mov $16, %r12
231_get_AAD_loop2\num_initial_blocks\operation:
232 psrldq $4, %xmm\i
233 sub $4, %r12
234 cmp %r11, %r12
235 jne _get_AAD_loop2\num_initial_blocks\operation
236_get_AAD_loop2_done\num_initial_blocks\operation:
237 movdqa SHUF_MASK(%rip), %xmm14
238 PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
239
240 xor %r11, %r11 # initialise the data pointer offset as zero
241
242 # start AES for num_initial_blocks blocks
243
244 mov %arg5, %rax # %rax = *Y0
245 movdqu (%rax), \XMM0 # XMM0 = Y0
246 movdqa SHUF_MASK(%rip), %xmm14
247 PSHUFB_XMM %xmm14, \XMM0
248
249.if (\i == 5) || (\i == 6) || (\i == 7)
250.irpc index, \i_seq
251 paddd ONE(%rip), \XMM0 # INCR Y0
252 movdqa \XMM0, %xmm\index
253 movdqa SHUF_MASK(%rip), %xmm14
254 PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap
255
256.endr
257.irpc index, \i_seq
258 pxor 16*0(%arg1), %xmm\index
259.endr
260.irpc index, \i_seq
261 movaps 0x10(%rdi), \TMP1
262 AESENC \TMP1, %xmm\index # Round 1
263.endr
264.irpc index, \i_seq
265 movaps 0x20(%arg1), \TMP1
266 AESENC \TMP1, %xmm\index # Round 2
267.endr
268.irpc index, \i_seq
269 movaps 0x30(%arg1), \TMP1
270	AESENC	   \TMP1, %xmm\index	      # Round 3
271.endr
272.irpc index, \i_seq
273 movaps 0x40(%arg1), \TMP1
274	AESENC	   \TMP1, %xmm\index	      # Round 4
275.endr
276.irpc index, \i_seq
277 movaps 0x50(%arg1), \TMP1
278	AESENC	   \TMP1, %xmm\index	      # Round 5
279.endr
280.irpc index, \i_seq
281 movaps 0x60(%arg1), \TMP1
282	AESENC	   \TMP1, %xmm\index	      # Round 6
283.endr
284.irpc index, \i_seq
285 movaps 0x70(%arg1), \TMP1
286	AESENC	   \TMP1, %xmm\index	      # Round 7
287.endr
288.irpc index, \i_seq
289 movaps 0x80(%arg1), \TMP1
290	AESENC	   \TMP1, %xmm\index	      # Round 8
291.endr
292.irpc index, \i_seq
293 movaps 0x90(%arg1), \TMP1
294	AESENC	   \TMP1, %xmm\index	      # Round 9
295.endr
296.irpc index, \i_seq
297 movaps 0xa0(%arg1), \TMP1
298 AESENCLAST \TMP1, %xmm\index # Round 10
299.endr
300.irpc index, \i_seq
301 movdqu (%arg3 , %r11, 1), \TMP1
302 pxor \TMP1, %xmm\index
303 movdqu %xmm\index, (%arg2 , %r11, 1)
304 # write back plaintext/ciphertext for num_initial_blocks
305 add $16, %r11
306
307 movdqa \TMP1, %xmm\index
308 movdqa SHUF_MASK(%rip), %xmm14
309 PSHUFB_XMM %xmm14, %xmm\index
310
311 # prepare plaintext/ciphertext for GHASH computation
312.endr
313.endif
314 GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
315 # apply GHASH on num_initial_blocks blocks
316
317.if \i == 5
318 pxor %xmm5, %xmm6
319 GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
320 pxor %xmm6, %xmm7
321 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
322 pxor %xmm7, %xmm8
323 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
324.elseif \i == 6
325 pxor %xmm6, %xmm7
326 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
327 pxor %xmm7, %xmm8
328 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
329.elseif \i == 7
330 pxor %xmm7, %xmm8
331 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
332.endif
333 cmp $64, %r13
334 jl _initial_blocks_done\num_initial_blocks\operation
335 # no need for precomputed values
336/*
337*
338* Precomputations for HashKey parallel with encryption of first 4 blocks.
339* HashKey_i_k holds XORed values of the low and high parts of HashKey_i
340*/
341 paddd ONE(%rip), \XMM0 # INCR Y0
342 movdqa \XMM0, \XMM1
343 movdqa SHUF_MASK(%rip), %xmm14
344 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
345
346 paddd ONE(%rip), \XMM0 # INCR Y0
347 movdqa \XMM0, \XMM2
348 movdqa SHUF_MASK(%rip), %xmm14
349 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
350
351 paddd ONE(%rip), \XMM0 # INCR Y0
352 movdqa \XMM0, \XMM3
353 movdqa SHUF_MASK(%rip), %xmm14
354 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
355
356 paddd ONE(%rip), \XMM0 # INCR Y0
357 movdqa \XMM0, \XMM4
358 movdqa SHUF_MASK(%rip), %xmm14
359 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
360
361 pxor 16*0(%arg1), \XMM1
362 pxor 16*0(%arg1), \XMM2
363 pxor 16*0(%arg1), \XMM3
364 pxor 16*0(%arg1), \XMM4
365 movdqa \TMP3, \TMP5
366 pshufd $78, \TMP3, \TMP1
367 pxor \TMP3, \TMP1
368 movdqa \TMP1, HashKey_k(%rsp)
369 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
370# TMP5 = HashKey^2<<1 (mod poly)
371 movdqa \TMP5, HashKey_2(%rsp)
372# HashKey_2 = HashKey^2<<1 (mod poly)
373 pshufd $78, \TMP5, \TMP1
374 pxor \TMP5, \TMP1
375 movdqa \TMP1, HashKey_2_k(%rsp)
376.irpc index, 1234 # do 4 rounds
377 movaps 0x10*\index(%arg1), \TMP1
378 AESENC \TMP1, \XMM1
379 AESENC \TMP1, \XMM2
380 AESENC \TMP1, \XMM3
381 AESENC \TMP1, \XMM4
382.endr
383 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
384# TMP5 = HashKey^3<<1 (mod poly)
385 movdqa \TMP5, HashKey_3(%rsp)
386 pshufd $78, \TMP5, \TMP1
387 pxor \TMP5, \TMP1
388 movdqa \TMP1, HashKey_3_k(%rsp)
389.irpc index, 56789 # do next 5 rounds
390 movaps 0x10*\index(%arg1), \TMP1
391 AESENC \TMP1, \XMM1
392 AESENC \TMP1, \XMM2
393 AESENC \TMP1, \XMM3
394 AESENC \TMP1, \XMM4
395.endr
396 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
397# TMP5 = HashKey^4<<1 (mod poly)
398 movdqa \TMP5, HashKey_4(%rsp)
399 pshufd $78, \TMP5, \TMP1
400 pxor \TMP5, \TMP1
401 movdqa \TMP1, HashKey_4_k(%rsp)
402 movaps 0xa0(%arg1), \TMP2
403 AESENCLAST \TMP2, \XMM1
404 AESENCLAST \TMP2, \XMM2
405 AESENCLAST \TMP2, \XMM3
406 AESENCLAST \TMP2, \XMM4
407 movdqu 16*0(%arg3 , %r11 , 1), \TMP1
408 pxor \TMP1, \XMM1
409 movdqu \XMM1, 16*0(%arg2 , %r11 , 1)
410 movdqa \TMP1, \XMM1
411 movdqu 16*1(%arg3 , %r11 , 1), \TMP1
412 pxor \TMP1, \XMM2
413 movdqu \XMM2, 16*1(%arg2 , %r11 , 1)
414 movdqa \TMP1, \XMM2
415 movdqu 16*2(%arg3 , %r11 , 1), \TMP1
416 pxor \TMP1, \XMM3
417 movdqu \XMM3, 16*2(%arg2 , %r11 , 1)
418 movdqa \TMP1, \XMM3
419 movdqu 16*3(%arg3 , %r11 , 1), \TMP1
420 pxor \TMP1, \XMM4
421 movdqu \XMM4, 16*3(%arg2 , %r11 , 1)
422 movdqa \TMP1, \XMM4
423 add $64, %r11
424 movdqa SHUF_MASK(%rip), %xmm14
425 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
426 pxor \XMMDst, \XMM1
427# combine GHASHed value with the corresponding ciphertext
428 movdqa SHUF_MASK(%rip), %xmm14
429 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
430 movdqa SHUF_MASK(%rip), %xmm14
431 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
432 movdqa SHUF_MASK(%rip), %xmm14
433 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
434
435_initial_blocks_done\num_initial_blocks\operation:
436
437.endm
438
439
440/*
441* if a = number of total plaintext bytes
442* b = floor(a/16)
443* num_initial_blocks = b mod 4
444* encrypt the initial num_initial_blocks blocks and apply ghash on
445* the ciphertext
446* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
447* are clobbered
448* %arg1, %arg2, %arg3 and %r14 are used as pointers only, not modified
449*/
450
451
452.macro INITIAL_BLOCKS_ENC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
453XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
454 mov arg7, %r10 # %r10 = AAD
455 mov arg8, %r12 # %r12 = aadLen
456 mov %r12, %r11
457 pxor %xmm\i, %xmm\i
458_get_AAD_loop\num_initial_blocks\operation:
459 movd (%r10), \TMP1
460 pslldq $12, \TMP1
461 psrldq $4, %xmm\i
462 pxor \TMP1, %xmm\i
463 add $4, %r10
464 sub $4, %r12
465 jne _get_AAD_loop\num_initial_blocks\operation
466 cmp $16, %r11
467 je _get_AAD_loop2_done\num_initial_blocks\operation
468 mov $16, %r12
469_get_AAD_loop2\num_initial_blocks\operation:
470 psrldq $4, %xmm\i
471 sub $4, %r12
472 cmp %r11, %r12
473 jne _get_AAD_loop2\num_initial_blocks\operation
474_get_AAD_loop2_done\num_initial_blocks\operation:
475 movdqa SHUF_MASK(%rip), %xmm14
476 PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
477
478 xor %r11, %r11 # initialise the data pointer offset as zero
479
480 # start AES for num_initial_blocks blocks
481
482 mov %arg5, %rax # %rax = *Y0
483 movdqu (%rax), \XMM0 # XMM0 = Y0
484 movdqa SHUF_MASK(%rip), %xmm14
485 PSHUFB_XMM %xmm14, \XMM0
486
487.if (\i == 5) || (\i == 6) || (\i == 7)
488.irpc index, \i_seq
489 paddd ONE(%rip), \XMM0 # INCR Y0
490 movdqa \XMM0, %xmm\index
491 movdqa SHUF_MASK(%rip), %xmm14
492 PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap
493
494.endr
495.irpc index, \i_seq
496 pxor 16*0(%arg1), %xmm\index
497.endr
498.irpc index, \i_seq
499 movaps 0x10(%rdi), \TMP1
500 AESENC \TMP1, %xmm\index # Round 1
501.endr
502.irpc index, \i_seq
503 movaps 0x20(%arg1), \TMP1
504 AESENC \TMP1, %xmm\index # Round 2
505.endr
506.irpc index, \i_seq
507 movaps 0x30(%arg1), \TMP1
508	AESENC	   \TMP1, %xmm\index	      # Round 3
509.endr
510.irpc index, \i_seq
511 movaps 0x40(%arg1), \TMP1
512	AESENC	   \TMP1, %xmm\index	      # Round 4
513.endr
514.irpc index, \i_seq
515 movaps 0x50(%arg1), \TMP1
516	AESENC	   \TMP1, %xmm\index	      # Round 5
517.endr
518.irpc index, \i_seq
519 movaps 0x60(%arg1), \TMP1
520	AESENC	   \TMP1, %xmm\index	      # Round 6
521.endr
522.irpc index, \i_seq
523 movaps 0x70(%arg1), \TMP1
524	AESENC	   \TMP1, %xmm\index	      # Round 7
525.endr
526.irpc index, \i_seq
527 movaps 0x80(%arg1), \TMP1
528	AESENC	   \TMP1, %xmm\index	      # Round 8
529.endr
530.irpc index, \i_seq
531 movaps 0x90(%arg1), \TMP1
532	AESENC	   \TMP1, %xmm\index	      # Round 9
533.endr
534.irpc index, \i_seq
535 movaps 0xa0(%arg1), \TMP1
536 AESENCLAST \TMP1, %xmm\index # Round 10
537.endr
538.irpc index, \i_seq
539 movdqu (%arg3 , %r11, 1), \TMP1
540 pxor \TMP1, %xmm\index
541 movdqu %xmm\index, (%arg2 , %r11, 1)
542 # write back plaintext/ciphertext for num_initial_blocks
543 add $16, %r11
544
545 movdqa SHUF_MASK(%rip), %xmm14
546 PSHUFB_XMM %xmm14, %xmm\index
547
548 # prepare plaintext/ciphertext for GHASH computation
549.endr
550.endif
551 GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
552 # apply GHASH on num_initial_blocks blocks
553
554.if \i == 5
555 pxor %xmm5, %xmm6
556 GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
557 pxor %xmm6, %xmm7
558 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
559 pxor %xmm7, %xmm8
560 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
561.elseif \i == 6
562 pxor %xmm6, %xmm7
563 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
564 pxor %xmm7, %xmm8
565 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
566.elseif \i == 7
567 pxor %xmm7, %xmm8
568 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
569.endif
570 cmp $64, %r13
571 jl _initial_blocks_done\num_initial_blocks\operation
572 # no need for precomputed values
573/*
574*
575* Precomputations for HashKey parallel with encryption of first 4 blocks.
576* HashKey_i_k holds XORed values of the low and high parts of HashKey_i
577*/
578 paddd ONE(%rip), \XMM0 # INCR Y0
579 movdqa \XMM0, \XMM1
580 movdqa SHUF_MASK(%rip), %xmm14
581 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
582
583 paddd ONE(%rip), \XMM0 # INCR Y0
584 movdqa \XMM0, \XMM2
585 movdqa SHUF_MASK(%rip), %xmm14
586 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
587
588 paddd ONE(%rip), \XMM0 # INCR Y0
589 movdqa \XMM0, \XMM3
590 movdqa SHUF_MASK(%rip), %xmm14
591 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
592
593 paddd ONE(%rip), \XMM0 # INCR Y0
594 movdqa \XMM0, \XMM4
595 movdqa SHUF_MASK(%rip), %xmm14
596 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
597
598 pxor 16*0(%arg1), \XMM1
599 pxor 16*0(%arg1), \XMM2
600 pxor 16*0(%arg1), \XMM3
601 pxor 16*0(%arg1), \XMM4
602 movdqa \TMP3, \TMP5
603 pshufd $78, \TMP3, \TMP1
604 pxor \TMP3, \TMP1
605 movdqa \TMP1, HashKey_k(%rsp)
606 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
607# TMP5 = HashKey^2<<1 (mod poly)
608 movdqa \TMP5, HashKey_2(%rsp)
609# HashKey_2 = HashKey^2<<1 (mod poly)
610 pshufd $78, \TMP5, \TMP1
611 pxor \TMP5, \TMP1
612 movdqa \TMP1, HashKey_2_k(%rsp)
613.irpc index, 1234 # do 4 rounds
614 movaps 0x10*\index(%arg1), \TMP1
615 AESENC \TMP1, \XMM1
616 AESENC \TMP1, \XMM2
617 AESENC \TMP1, \XMM3
618 AESENC \TMP1, \XMM4
619.endr
620 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
621# TMP5 = HashKey^3<<1 (mod poly)
622 movdqa \TMP5, HashKey_3(%rsp)
623 pshufd $78, \TMP5, \TMP1
624 pxor \TMP5, \TMP1
625 movdqa \TMP1, HashKey_3_k(%rsp)
626.irpc index, 56789 # do next 5 rounds
627 movaps 0x10*\index(%arg1), \TMP1
628 AESENC \TMP1, \XMM1
629 AESENC \TMP1, \XMM2
630 AESENC \TMP1, \XMM3
631 AESENC \TMP1, \XMM4
632.endr
633 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
634# TMP5 = HashKey^4<<1 (mod poly)
635 movdqa \TMP5, HashKey_4(%rsp)
636 pshufd $78, \TMP5, \TMP1
637 pxor \TMP5, \TMP1
638 movdqa \TMP1, HashKey_4_k(%rsp)
639 movaps 0xa0(%arg1), \TMP2
640 AESENCLAST \TMP2, \XMM1
641 AESENCLAST \TMP2, \XMM2
642 AESENCLAST \TMP2, \XMM3
643 AESENCLAST \TMP2, \XMM4
644 movdqu 16*0(%arg3 , %r11 , 1), \TMP1
645 pxor \TMP1, \XMM1
646 movdqu 16*1(%arg3 , %r11 , 1), \TMP1
647 pxor \TMP1, \XMM2
648 movdqu 16*2(%arg3 , %r11 , 1), \TMP1
649 pxor \TMP1, \XMM3
650 movdqu 16*3(%arg3 , %r11 , 1), \TMP1
651 pxor \TMP1, \XMM4
652 movdqu \XMM1, 16*0(%arg2 , %r11 , 1)
653 movdqu \XMM2, 16*1(%arg2 , %r11 , 1)
654 movdqu \XMM3, 16*2(%arg2 , %r11 , 1)
655 movdqu \XMM4, 16*3(%arg2 , %r11 , 1)
656
657 add $64, %r11
658 movdqa SHUF_MASK(%rip), %xmm14
659 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
660 pxor \XMMDst, \XMM1
661# combine GHASHed value with the corresponding ciphertext
662 movdqa SHUF_MASK(%rip), %xmm14
663 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
664 movdqa SHUF_MASK(%rip), %xmm14
665 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
666 movdqa SHUF_MASK(%rip), %xmm14
667 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
668
669_initial_blocks_done\num_initial_blocks\operation:
670
671.endm
672
673/*
674* encrypt 4 blocks at a time
675* ghash the 4 previously encrypted ciphertext blocks
676* arg1, %arg2, %arg3 are used as pointers only, not modified
677* %r11 is the data offset value
678*/
679.macro GHASH_4_ENCRYPT_4_PARALLEL_ENC TMP1 TMP2 TMP3 TMP4 TMP5 \
680TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
681
682 movdqa \XMM1, \XMM5
683 movdqa \XMM2, \XMM6
684 movdqa \XMM3, \XMM7
685 movdqa \XMM4, \XMM8
686
687 movdqa SHUF_MASK(%rip), %xmm15
688 # multiply TMP5 * HashKey using karatsuba
689
690 movdqa \XMM5, \TMP4
691 pshufd $78, \XMM5, \TMP6
692 pxor \XMM5, \TMP6
693 paddd ONE(%rip), \XMM0 # INCR CNT
694 movdqa HashKey_4(%rsp), \TMP5
695 PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1
696 movdqa \XMM0, \XMM1
697 paddd ONE(%rip), \XMM0 # INCR CNT
698 movdqa \XMM0, \XMM2
699 paddd ONE(%rip), \XMM0 # INCR CNT
700 movdqa \XMM0, \XMM3
701 paddd ONE(%rip), \XMM0 # INCR CNT
702 movdqa \XMM0, \XMM4
703 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
704 PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0
705 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
706 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
707 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
708
709 pxor (%arg1), \XMM1
710 pxor (%arg1), \XMM2
711 pxor (%arg1), \XMM3
712 pxor (%arg1), \XMM4
713 movdqa HashKey_4_k(%rsp), \TMP5
714 PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
715 movaps 0x10(%arg1), \TMP1
716 AESENC \TMP1, \XMM1 # Round 1
717 AESENC \TMP1, \XMM2
718 AESENC \TMP1, \XMM3
719 AESENC \TMP1, \XMM4
720 movaps 0x20(%arg1), \TMP1
721 AESENC \TMP1, \XMM1 # Round 2
722 AESENC \TMP1, \XMM2
723 AESENC \TMP1, \XMM3
724 AESENC \TMP1, \XMM4
725 movdqa \XMM6, \TMP1
726 pshufd $78, \XMM6, \TMP2
727 pxor \XMM6, \TMP2
728 movdqa HashKey_3(%rsp), \TMP5
729 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
730 movaps 0x30(%arg1), \TMP3
731 AESENC \TMP3, \XMM1 # Round 3
732 AESENC \TMP3, \XMM2
733 AESENC \TMP3, \XMM3
734 AESENC \TMP3, \XMM4
735 PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0
736 movaps 0x40(%arg1), \TMP3
737 AESENC \TMP3, \XMM1 # Round 4
738 AESENC \TMP3, \XMM2
739 AESENC \TMP3, \XMM3
740 AESENC \TMP3, \XMM4
741 movdqa HashKey_3_k(%rsp), \TMP5
742 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
743 movaps 0x50(%arg1), \TMP3
744 AESENC \TMP3, \XMM1 # Round 5
745 AESENC \TMP3, \XMM2
746 AESENC \TMP3, \XMM3
747 AESENC \TMP3, \XMM4
748 pxor \TMP1, \TMP4
749# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
750 pxor \XMM6, \XMM5
751 pxor \TMP2, \TMP6
752 movdqa \XMM7, \TMP1
753 pshufd $78, \XMM7, \TMP2
754 pxor \XMM7, \TMP2
755 movdqa HashKey_2(%rsp ), \TMP5
756
757 # Multiply TMP5 * HashKey using karatsuba
758
759 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
760 movaps 0x60(%arg1), \TMP3
761 AESENC \TMP3, \XMM1 # Round 6
762 AESENC \TMP3, \XMM2
763 AESENC \TMP3, \XMM3
764 AESENC \TMP3, \XMM4
765 PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0
766 movaps 0x70(%arg1), \TMP3
767 AESENC \TMP3, \XMM1 # Round 7
768 AESENC \TMP3, \XMM2
769 AESENC \TMP3, \XMM3
770 AESENC \TMP3, \XMM4
771 movdqa HashKey_2_k(%rsp), \TMP5
772 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
773 movaps 0x80(%arg1), \TMP3
774 AESENC \TMP3, \XMM1 # Round 8
775 AESENC \TMP3, \XMM2
776 AESENC \TMP3, \XMM3
777 AESENC \TMP3, \XMM4
778 pxor \TMP1, \TMP4
779# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
780 pxor \XMM7, \XMM5
781 pxor \TMP2, \TMP6
782
783 # Multiply XMM8 * HashKey
784 # XMM8 and TMP5 hold the values for the two operands
785
786 movdqa \XMM8, \TMP1
787 pshufd $78, \XMM8, \TMP2
788 pxor \XMM8, \TMP2
789 movdqa HashKey(%rsp), \TMP5
790 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
791 movaps 0x90(%arg1), \TMP3
792 AESENC \TMP3, \XMM1 # Round 9
793 AESENC \TMP3, \XMM2
794 AESENC \TMP3, \XMM3
795 AESENC \TMP3, \XMM4
796 PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0
797 movaps 0xa0(%arg1), \TMP3
798 AESENCLAST \TMP3, \XMM1 # Round 10
799 AESENCLAST \TMP3, \XMM2
800 AESENCLAST \TMP3, \XMM3
801 AESENCLAST \TMP3, \XMM4
802 movdqa HashKey_k(%rsp), \TMP5
803 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
804 movdqu (%arg3,%r11,1), \TMP3
805 pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
806 movdqu 16(%arg3,%r11,1), \TMP3
807 pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK
808 movdqu 32(%arg3,%r11,1), \TMP3
809 pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK
810 movdqu 48(%arg3,%r11,1), \TMP3
811 pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK
812 movdqu \XMM1, (%arg2,%r11,1) # Write to the ciphertext buffer
813 movdqu \XMM2, 16(%arg2,%r11,1) # Write to the ciphertext buffer
814 movdqu \XMM3, 32(%arg2,%r11,1) # Write to the ciphertext buffer
815 movdqu \XMM4, 48(%arg2,%r11,1) # Write to the ciphertext buffer
816 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
817 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
818 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
819 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
820
821 pxor \TMP4, \TMP1
822 pxor \XMM8, \XMM5
823 pxor \TMP6, \TMP2
824 pxor \TMP1, \TMP2
825 pxor \XMM5, \TMP2
826 movdqa \TMP2, \TMP3
827 pslldq $8, \TMP3 # left shift TMP3 2 DWs
828 psrldq $8, \TMP2 # right shift TMP2 2 DWs
829 pxor \TMP3, \XMM5
830 pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5
831
832 # first phase of reduction
833
834 movdqa \XMM5, \TMP2
835 movdqa \XMM5, \TMP3
836 movdqa \XMM5, \TMP4
837# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
838 pslld $31, \TMP2 # packed left shift << 31
839 pslld $30, \TMP3 # packed left shift << 30
840 pslld $25, \TMP4 # packed left shift << 25
841 pxor \TMP3, \TMP2 # xor the shifted versions
842 pxor \TMP4, \TMP2
843 movdqa \TMP2, \TMP5
844 psrldq $4, \TMP5 # right shift T5 1 DW
845 pslldq $12, \TMP2 # left shift T2 3 DWs
846 pxor \TMP2, \XMM5
847
848 # second phase of reduction
849
850 movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
851 movdqa \XMM5,\TMP3
852 movdqa \XMM5,\TMP4
853 psrld $1, \TMP2 # packed right shift >> 1
854 psrld $2, \TMP3 # packed right shift >> 2
855 psrld $7, \TMP4 # packed right shift >> 7
856 pxor \TMP3,\TMP2 # xor the shifted versions
857 pxor \TMP4,\TMP2
858 pxor \TMP5, \TMP2
859 pxor \TMP2, \XMM5
860 pxor \TMP1, \XMM5 # result is in XMM5
861
862 pxor \XMM5, \XMM1
863.endm
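
# The Karatsuba multiplies interleaved with the AES rounds above split each
# 128x128-bit carry-less multiply into three PCLMULQDQs: a1*b1, a0*b0 and
# (a1+a0)*(b1+b0).  A hedged C sketch of that decomposition (illustrative
# only, not the kernel code; assumes the PCLMUL/SSE2 intrinsics from
# <wmmintrin.h> and <emmintrin.h>, built with -mpclmul):
#
#	/* 128x128 -> 256-bit carry-less multiply, Karatsuba style. */
#	static inline void clmul_karatsuba(__m128i a, __m128i b,
#					   __m128i *hi, __m128i *lo)
#	{
#		__m128i a1b1 = _mm_clmulepi64_si128(a, b, 0x11); /* a1*b1 */
#		__m128i a0b0 = _mm_clmulepi64_si128(a, b, 0x00); /* a0*b0 */
#		/* pshufd $78 swaps the 64-bit halves, so a ^ swap(a) = a1^a0 */
#		__m128i am  = _mm_xor_si128(a, _mm_shuffle_epi32(a, 0x4e));
#		__m128i bm  = _mm_xor_si128(b, _mm_shuffle_epi32(b, 0x4e));
#		__m128i mid = _mm_clmulepi64_si128(am, bm, 0x00);
#
#		mid = _mm_xor_si128(mid, _mm_xor_si128(a1b1, a0b0));
#		*hi = _mm_xor_si128(a1b1, _mm_srli_si128(mid, 8));
#		*lo = _mm_xor_si128(a0b0, _mm_slli_si128(mid, 8));
#	}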
864
865/*
866* decrypt 4 blocks at a time
867* ghash the 4 previously decrypted ciphertext blocks
868* arg1, %arg2, %arg3 are used as pointers only, not modified
869* %r11 is the data offset value
870*/
871.macro GHASH_4_ENCRYPT_4_PARALLEL_DEC TMP1 TMP2 TMP3 TMP4 TMP5 \
872TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
873
874 movdqa \XMM1, \XMM5
875 movdqa \XMM2, \XMM6
876 movdqa \XMM3, \XMM7
877 movdqa \XMM4, \XMM8
878
879 movdqa SHUF_MASK(%rip), %xmm15
880 # multiply TMP5 * HashKey using karatsuba
881
882 movdqa \XMM5, \TMP4
883 pshufd $78, \XMM5, \TMP6
884 pxor \XMM5, \TMP6
885 paddd ONE(%rip), \XMM0 # INCR CNT
886 movdqa HashKey_4(%rsp), \TMP5
887 PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1
888 movdqa \XMM0, \XMM1
889 paddd ONE(%rip), \XMM0 # INCR CNT
890 movdqa \XMM0, \XMM2
891 paddd ONE(%rip), \XMM0 # INCR CNT
892 movdqa \XMM0, \XMM3
893 paddd ONE(%rip), \XMM0 # INCR CNT
894 movdqa \XMM0, \XMM4
895 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
896 PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0
897 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
898 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
899 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
900
901 pxor (%arg1), \XMM1
902 pxor (%arg1), \XMM2
903 pxor (%arg1), \XMM3
904 pxor (%arg1), \XMM4
905 movdqa HashKey_4_k(%rsp), \TMP5
906 PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
907 movaps 0x10(%arg1), \TMP1
908 AESENC \TMP1, \XMM1 # Round 1
909 AESENC \TMP1, \XMM2
910 AESENC \TMP1, \XMM3
911 AESENC \TMP1, \XMM4
912 movaps 0x20(%arg1), \TMP1
913 AESENC \TMP1, \XMM1 # Round 2
914 AESENC \TMP1, \XMM2
915 AESENC \TMP1, \XMM3
916 AESENC \TMP1, \XMM4
917 movdqa \XMM6, \TMP1
918 pshufd $78, \XMM6, \TMP2
919 pxor \XMM6, \TMP2
920 movdqa HashKey_3(%rsp), \TMP5
921 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
922 movaps 0x30(%arg1), \TMP3
923 AESENC \TMP3, \XMM1 # Round 3
924 AESENC \TMP3, \XMM2
925 AESENC \TMP3, \XMM3
926 AESENC \TMP3, \XMM4
927 PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0
928 movaps 0x40(%arg1), \TMP3
929 AESENC \TMP3, \XMM1 # Round 4
930 AESENC \TMP3, \XMM2
931 AESENC \TMP3, \XMM3
932 AESENC \TMP3, \XMM4
933 movdqa HashKey_3_k(%rsp), \TMP5
934 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
935 movaps 0x50(%arg1), \TMP3
936 AESENC \TMP3, \XMM1 # Round 5
937 AESENC \TMP3, \XMM2
938 AESENC \TMP3, \XMM3
939 AESENC \TMP3, \XMM4
940 pxor \TMP1, \TMP4
941# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
942 pxor \XMM6, \XMM5
943 pxor \TMP2, \TMP6
944 movdqa \XMM7, \TMP1
945 pshufd $78, \XMM7, \TMP2
946 pxor \XMM7, \TMP2
947 movdqa HashKey_2(%rsp), \TMP5
948
949 # Multiply TMP5 * HashKey using karatsuba
950
951 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
952 movaps 0x60(%arg1), \TMP3
953 AESENC \TMP3, \XMM1 # Round 6
954 AESENC \TMP3, \XMM2
955 AESENC \TMP3, \XMM3
956 AESENC \TMP3, \XMM4
957 PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0
958 movaps 0x70(%arg1), \TMP3
959 AESENC \TMP3, \XMM1 # Round 7
960 AESENC \TMP3, \XMM2
961 AESENC \TMP3, \XMM3
962 AESENC \TMP3, \XMM4
963 movdqa HashKey_2_k(%rsp), \TMP5
964 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
965 movaps 0x80(%arg1), \TMP3
966 AESENC \TMP3, \XMM1 # Round 8
967 AESENC \TMP3, \XMM2
968 AESENC \TMP3, \XMM3
969 AESENC \TMP3, \XMM4
970 pxor \TMP1, \TMP4
971# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
972 pxor \XMM7, \XMM5
973 pxor \TMP2, \TMP6
974
975 # Multiply XMM8 * HashKey
976 # XMM8 and TMP5 hold the values for the two operands
977
978 movdqa \XMM8, \TMP1
979 pshufd $78, \XMM8, \TMP2
980 pxor \XMM8, \TMP2
981 movdqa HashKey(%rsp), \TMP5
982 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
983 movaps 0x90(%arg1), \TMP3
984 AESENC \TMP3, \XMM1 # Round 9
985 AESENC \TMP3, \XMM2
986 AESENC \TMP3, \XMM3
987 AESENC \TMP3, \XMM4
988 PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0
989 movaps 0xa0(%arg1), \TMP3
990 AESENCLAST \TMP3, \XMM1 # Round 10
991 AESENCLAST \TMP3, \XMM2
992 AESENCLAST \TMP3, \XMM3
993 AESENCLAST \TMP3, \XMM4
994 movdqa HashKey_k(%rsp), \TMP5
995 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
996 movdqu (%arg3,%r11,1), \TMP3
997 pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
998 movdqu \XMM1, (%arg2,%r11,1) # Write to plaintext buffer
999 movdqa \TMP3, \XMM1
1000 movdqu 16(%arg3,%r11,1), \TMP3
1001 pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK
1002 movdqu \XMM2, 16(%arg2,%r11,1) # Write to plaintext buffer
1003 movdqa \TMP3, \XMM2
1004 movdqu 32(%arg3,%r11,1), \TMP3
1005 pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK
1006 movdqu \XMM3, 32(%arg2,%r11,1) # Write to plaintext buffer
1007 movdqa \TMP3, \XMM3
1008 movdqu 48(%arg3,%r11,1), \TMP3
1009 pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK
1010 movdqu \XMM4, 48(%arg2,%r11,1) # Write to plaintext buffer
1011 movdqa \TMP3, \XMM4
1012 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
1013 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
1014 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
1015 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
1016
1017 pxor \TMP4, \TMP1
1018 pxor \XMM8, \XMM5
1019 pxor \TMP6, \TMP2
1020 pxor \TMP1, \TMP2
1021 pxor \XMM5, \TMP2
1022 movdqa \TMP2, \TMP3
1023 pslldq $8, \TMP3 # left shift TMP3 2 DWs
1024 psrldq $8, \TMP2 # right shift TMP2 2 DWs
1025 pxor \TMP3, \XMM5
1026 pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5
1027
1028 # first phase of reduction
1029
1030 movdqa \XMM5, \TMP2
1031 movdqa \XMM5, \TMP3
1032 movdqa \XMM5, \TMP4
1033# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
1034 pslld $31, \TMP2 # packed left shift << 31
1035 pslld $30, \TMP3 # packed left shift << 30
1036 pslld $25, \TMP4 # packed left shift << 25
1037 pxor \TMP3, \TMP2 # xor the shifted versions
1038 pxor \TMP4, \TMP2
1039 movdqa \TMP2, \TMP5
1040 psrldq $4, \TMP5 # right shift T5 1 DW
1041 pslldq $12, \TMP2 # left shift T2 3 DWs
1042 pxor \TMP2, \XMM5
1043
1044 # second phase of reduction
1045
1046 movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
1047 movdqa \XMM5,\TMP3
1048 movdqa \XMM5,\TMP4
1049 psrld $1, \TMP2 # packed right shift >> 1
1050 psrld $2, \TMP3 # packed right shift >> 2
1051 psrld $7, \TMP4 # packed right shift >> 7
1052 pxor \TMP3,\TMP2 # xor the shifted versions
1053 pxor \TMP4,\TMP2
1054 pxor \TMP5, \TMP2
1055 pxor \TMP2, \XMM5
1056 pxor \TMP1, \XMM5 # result is in XMM5
1057
1058 pxor \XMM5, \XMM1
1059.endm
1060
1061/* GHASH the last 4 ciphertext blocks. */
1062.macro GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
1063TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
1064
1065 # Multiply TMP6 * HashKey (using Karatsuba)
1066
1067 movdqa \XMM1, \TMP6
1068 pshufd $78, \XMM1, \TMP2
1069 pxor \XMM1, \TMP2
1070 movdqa HashKey_4(%rsp), \TMP5
1071 PCLMULQDQ 0x11, \TMP5, \TMP6 # TMP6 = a1*b1
1072 PCLMULQDQ 0x00, \TMP5, \XMM1 # XMM1 = a0*b0
1073 movdqa HashKey_4_k(%rsp), \TMP4
1074 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1075 movdqa \XMM1, \XMMDst
1076 movdqa \TMP2, \XMM1 # result in TMP6, XMMDst, XMM1
1077
1078 # Multiply TMP1 * HashKey (using Karatsuba)
1079
1080 movdqa \XMM2, \TMP1
1081 pshufd $78, \XMM2, \TMP2
1082 pxor \XMM2, \TMP2
1083 movdqa HashKey_3(%rsp), \TMP5
1084 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1085 PCLMULQDQ 0x00, \TMP5, \XMM2 # XMM2 = a0*b0
1086 movdqa HashKey_3_k(%rsp), \TMP4
1087 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1088 pxor \TMP1, \TMP6
1089 pxor \XMM2, \XMMDst
1090 pxor \TMP2, \XMM1
1091# results accumulated in TMP6, XMMDst, XMM1
1092
1093 # Multiply TMP1 * HashKey (using Karatsuba)
1094
1095 movdqa \XMM3, \TMP1
1096 pshufd $78, \XMM3, \TMP2
1097 pxor \XMM3, \TMP2
1098 movdqa HashKey_2(%rsp), \TMP5
1099 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1100 PCLMULQDQ 0x00, \TMP5, \XMM3 # XMM3 = a0*b0
1101 movdqa HashKey_2_k(%rsp), \TMP4
1102 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1103 pxor \TMP1, \TMP6
1104 pxor \XMM3, \XMMDst
1105 pxor \TMP2, \XMM1 # results accumulated in TMP6, XMMDst, XMM1
1106
1107 # Multiply TMP1 * HashKey (using Karatsuba)
1108 movdqa \XMM4, \TMP1
1109 pshufd $78, \XMM4, \TMP2
1110 pxor \XMM4, \TMP2
1111 movdqa HashKey(%rsp), \TMP5
1112 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1113 PCLMULQDQ 0x00, \TMP5, \XMM4 # XMM4 = a0*b0
1114 movdqa HashKey_k(%rsp), \TMP4
1115 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1116 pxor \TMP1, \TMP6
1117 pxor \XMM4, \XMMDst
1118 pxor \XMM1, \TMP2
1119 pxor \TMP6, \TMP2
1120 pxor \XMMDst, \TMP2
1121 # middle section of the temp results combined as in karatsuba algorithm
1122 movdqa \TMP2, \TMP4
1123 pslldq $8, \TMP4 # left shift TMP4 2 DWs
1124 psrldq $8, \TMP2 # right shift TMP2 2 DWs
1125 pxor \TMP4, \XMMDst
1126 pxor \TMP2, \TMP6
1127# TMP6:XMMDst holds the result of the accumulated carry-less multiplications
1128 # first phase of the reduction
1129 movdqa \XMMDst, \TMP2
1130 movdqa \XMMDst, \TMP3
1131 movdqa \XMMDst, \TMP4
1132# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
1133 pslld $31, \TMP2 # packed left shifting << 31
1134 pslld $30, \TMP3 # packed left shifting << 30
1135 pslld $25, \TMP4 # packed left shifting << 25
1136 pxor \TMP3, \TMP2 # xor the shifted versions
1137 pxor \TMP4, \TMP2
1138 movdqa \TMP2, \TMP7
1139 psrldq $4, \TMP7 # right shift TMP7 1 DW
1140 pslldq $12, \TMP2 # left shift TMP2 3 DWs
1141 pxor \TMP2, \XMMDst
1142
1143 # second phase of the reduction
1144 movdqa \XMMDst, \TMP2
1145 # make 3 copies of XMMDst for doing 3 shift operations
1146 movdqa \XMMDst, \TMP3
1147 movdqa \XMMDst, \TMP4
1148 psrld $1, \TMP2 # packed right shift >> 1
1149 psrld $2, \TMP3 # packed right shift >> 2
1150 psrld $7, \TMP4 # packed right shift >> 7
1151 pxor \TMP3, \TMP2 # xor the shifted versions
1152 pxor \TMP4, \TMP2
1153 pxor \TMP7, \TMP2
1154 pxor \TMP2, \XMMDst
1155 pxor \TMP6, \XMMDst # reduced result is in XMMDst
1156.endm
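
# Hedged note on what the four multiplies above accumulate: with H^1..H^4
# precomputed on the stack (HashKey..HashKey_4), the parallel form
#	Dst = X1*H^4 ^ X2*H^3 ^ X3*H^2 ^ X4*H
# equals hashing the four blocks serially,
#	Dst = (((X1*H ^ X2)*H ^ X3)*H ^ X4)*H
# so a single reduction at the end covers all four blocks.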
1157
1158/* Encrypt a single block with the first 11 round keys (AES-128) */
1159.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1
1160
1161 pxor (%arg1), \XMM0
1162 movaps 16(%arg1), \TMP1
1163 AESENC \TMP1, \XMM0
1164 movaps 32(%arg1), \TMP1
1165 AESENC \TMP1, \XMM0
1166 movaps 48(%arg1), \TMP1
1167 AESENC \TMP1, \XMM0
1168 movaps 64(%arg1), \TMP1
1169 AESENC \TMP1, \XMM0
1170 movaps 80(%arg1), \TMP1
1171 AESENC \TMP1, \XMM0
1172 movaps 96(%arg1), \TMP1
1173 AESENC \TMP1, \XMM0
1174 movaps 112(%arg1), \TMP1
1175 AESENC \TMP1, \XMM0
1176 movaps 128(%arg1), \TMP1
1177 AESENC \TMP1, \XMM0
1178 movaps 144(%arg1), \TMP1
1179 AESENC \TMP1, \XMM0
1180 movaps 160(%arg1), \TMP1
1181 AESENCLAST \TMP1, \XMM0
1182.endm
1183
1184
1185/*****************************************************************************
1186* void aesni_gcm_dec(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1187* u8 *out, // Plaintext output. Encrypt in-place is allowed.
1188* const u8 *in, // Ciphertext input
1189* u64 plaintext_len, // Length of data in bytes for decryption.
1190* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
1191* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1192* // concatenated with 0x00000001. 16-byte aligned pointer.
1193* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
1194* const u8 *aad, // Additional Authentication Data (AAD)
1195* u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1196* u8 *auth_tag, // Authenticated Tag output. The driver will compare this to the
1197* // given authentication tag and only return the plaintext if they match.
1198* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
1199* // (most likely), 12 or 8.
1200*
1201* Assumptions:
1202*
1203* keys:
1204* keys are pre-expanded and aligned to 16 bytes. we are using the first
1205* set of 11 keys in the data structure void *aes_ctx
1206*
1207* iv:
1208* 0 1 2 3
1209* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1210* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1211* | Salt (From the SA) |
1212* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1213* | Initialization Vector |
1214* | (This is the sequence number from IPSec header) |
1215* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1216* | 0x1 |
1217* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1218*
1219*
1220*
1221* AAD:
1222* AAD padded to 128 bits with 0
1223* for example, assume AAD is a u32 vector
1224*
1225* if AAD is 8 bytes:
1226* AAD[3] = {A0, A1};
1227* padded AAD in xmm register = {A1 A0 0 0}
1228*
1229* 0 1 2 3
1230* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1231* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1232* | SPI (A1) |
1233* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1234* | 32-bit Sequence Number (A0) |
1235* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1236* | 0x0 |
1237* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1238*
1239* AAD Format with 32-bit Sequence Number
1240*
1241* if AAD is 12 bytes:
1242* AAD[3] = {A0, A1, A2};
1243* padded AAD in xmm register = {A2 A1 A0 0}
1244*
1245* 0 1 2 3
1246* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1247* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1250* | SPI (A2) |
1251* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1252* | 64-bit Extended Sequence Number {A1,A0} |
1253* | |
1254* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1255* | 0x0 |
1256* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1257*
1258* AAD Format with 64-bit Extended Sequence Number
1259*
1260* aadLen:
1261* from the definition of the spec, aadLen can only be 8 or 12 bytes.
1262* The code also supports 16; for any other size it will fail.
1263*
1264* TLen:
1265* from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
1266* For other sizes, the code will fail.
1267*
1268* poly = x^128 + x^127 + x^126 + x^121 + 1
1269*
1270*****************************************************************************/
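
# A hedged sketch of a caller (illustrative glue, not the kernel wrapper):
# aesni_gcm_dec() writes the computed tag to auth_tag and leaves the compare
# to the caller, e.g. with the kernel's constant-time crypto_memneq() and
# -EBADMSG on mismatch.  gcm_dec_and_verify() and its parameter names are
# made up for this sketch; the prototype used is the one documented above.
#
#	static int gcm_dec_and_verify(void *aes_ctx, u8 *out, const u8 *in,
#				      u64 len, u8 *iv, u8 *hash_subkey,
#				      const u8 *aad, u64 aad_len,
#				      const u8 *expected_tag, u64 tag_len)
#	{
#		u8 tag[16];
#
#		aesni_gcm_dec(aes_ctx, out, in, len, iv, hash_subkey,
#			      aad, aad_len, tag, tag_len);
#		return crypto_memneq(tag, expected_tag, tag_len) ? -EBADMSG : 0;
#	}
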
1271ENTRY(aesni_gcm_dec)
1272 push %r12
1273 push %r13
1274 push %r14
1275 mov %rsp, %r14
1276/*
1277* states of %xmm registers %xmm6:%xmm15 not saved
1278* all %xmm registers are clobbered
1279*/
1280 sub $VARIABLE_OFFSET, %rsp
1281 and $~63, %rsp # align rsp to 64 bytes
1282 mov %arg6, %r12
1283 movdqu (%r12), %xmm13 # %xmm13 = HashKey
1284 movdqa SHUF_MASK(%rip), %xmm2
1285 PSHUFB_XMM %xmm2, %xmm13
1286
1287
1288# Precompute HashKey<<1 (mod poly) from the hash key (required for GHASH)
1289
1290 movdqa %xmm13, %xmm2
1291 psllq $1, %xmm13
1292 psrlq $63, %xmm2
1293 movdqa %xmm2, %xmm1
1294 pslldq $8, %xmm2
1295 psrldq $8, %xmm1
1296 por %xmm2, %xmm13
1297
1298 # Reduction
1299
1300 pshufd $0x24, %xmm1, %xmm2
1301 pcmpeqd TWOONE(%rip), %xmm2
1302 pand POLY(%rip), %xmm2
1303 pxor %xmm2, %xmm13 # %xmm13 holds the HashKey<<1 (mod poly)
1304
1305
1306 # Decrypt first few blocks
1307
1308 movdqa %xmm13, HashKey(%rsp) # store HashKey<<1 (mod poly)
1309 mov %arg4, %r13 # save the number of bytes of plaintext/ciphertext
1310 and $-16, %r13 # %r13 = %r13 - (%r13 mod 16)
1311 mov %r13, %r12
1312 and $(3<<4), %r12
1313 jz _initial_num_blocks_is_0_decrypt
1314 cmp $(2<<4), %r12
1315 jb _initial_num_blocks_is_1_decrypt
1316 je _initial_num_blocks_is_2_decrypt
1317_initial_num_blocks_is_3_decrypt:
1318 INITIAL_BLOCKS_DEC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1319%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, dec
1320 sub $48, %r13
1321 jmp _initial_blocks_decrypted
1322_initial_num_blocks_is_2_decrypt:
1323 INITIAL_BLOCKS_DEC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1324%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, dec
1325 sub $32, %r13
1326 jmp _initial_blocks_decrypted
1327_initial_num_blocks_is_1_decrypt:
1328 INITIAL_BLOCKS_DEC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1329%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, dec
1330 sub $16, %r13
1331 jmp _initial_blocks_decrypted
1332_initial_num_blocks_is_0_decrypt:
1333 INITIAL_BLOCKS_DEC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1334%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, dec
1335_initial_blocks_decrypted:
1336 cmp $0, %r13
1337 je _zero_cipher_left_decrypt
1338 sub $64, %r13
1339 je _four_cipher_left_decrypt
1340_decrypt_by_4:
1341 GHASH_4_ENCRYPT_4_PARALLEL_DEC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
1342%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, dec
1343 add $64, %r11
1344 sub $64, %r13
1345 jne _decrypt_by_4
1346_four_cipher_left_decrypt:
1347 GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
1348%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
1349_zero_cipher_left_decrypt:
1350 mov %arg4, %r13
1351 and $15, %r13 # %r13 = arg4 (mod 16)
1352 je _multiple_of_16_bytes_decrypt
1353
1354 # Handle the last <16 byte block separately
1355
1356 paddd ONE(%rip), %xmm0 # increment CNT to get Yn
1357 movdqa SHUF_MASK(%rip), %xmm10
1358 PSHUFB_XMM %xmm10, %xmm0
1359
1360 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Yn)
1361 sub $16, %r11
1362 add %r13, %r11
1363 movdqu (%arg3,%r11,1), %xmm1 # receive the last <16 byte block
1364 lea SHIFT_MASK+16(%rip), %r12
1365 sub %r13, %r12
1366# adjust the shuffle mask pointer to be able to shift 16-%r13 bytes
1367# (%r13 is the number of bytes in plaintext mod 16)
1368 movdqu (%r12), %xmm2 # get the appropriate shuffle mask
1369 PSHUFB_XMM %xmm2, %xmm1 # right shift 16-%r13 bytes
1370
1371 movdqa %xmm1, %xmm2
1372 pxor %xmm1, %xmm0 # Ciphertext XOR E(K, Yn)
1373 movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
1374 # get the appropriate mask to mask out top 16-%r13 bytes of %xmm0
1375 pand %xmm1, %xmm0 # mask out top 16-%r13 bytes of %xmm0
1376 pand %xmm1, %xmm2
1377 movdqa SHUF_MASK(%rip), %xmm10
1378 PSHUFB_XMM %xmm10, %xmm2
1379
1380 pxor %xmm2, %xmm8
1381 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1382 # GHASH computation for the last <16 byte block
1383 sub %r13, %r11
1384 add $16, %r11
1385
1386 # output %r13 bytes
1387 MOVQ_R64_XMM %xmm0, %rax
1388 cmp $8, %r13
1389 jle _less_than_8_bytes_left_decrypt
1390 mov %rax, (%arg2 , %r11, 1)
1391 add $8, %r11
1392 psrldq $8, %xmm0
1393 MOVQ_R64_XMM %xmm0, %rax
1394 sub $8, %r13
1395_less_than_8_bytes_left_decrypt:
1396 mov %al, (%arg2, %r11, 1)
1397 add $1, %r11
1398 shr $8, %rax
1399 sub $1, %r13
1400 jne _less_than_8_bytes_left_decrypt
1401_multiple_of_16_bytes_decrypt:
1402 mov arg8, %r12 # %r12 = aadLen (number of bytes)
1403 shl $3, %r12 # convert into number of bits
1404 movd %r12d, %xmm15 # len(A) in %xmm15
1405 shl $3, %arg4 # len(C) in bits (*8)
1406 MOVQ_R64_XMM %arg4, %xmm1
1407 pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000
1408 pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C)
1409 pxor %xmm15, %xmm8
1410 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1411 # final GHASH computation
1412 movdqa SHUF_MASK(%rip), %xmm10
1413 PSHUFB_XMM %xmm10, %xmm8
1414
1415 mov %arg5, %rax # %rax = *Y0
1416 movdqu (%rax), %xmm0 # %xmm0 = Y0
1417 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Y0)
1418 pxor %xmm8, %xmm0
1419_return_T_decrypt:
1420 mov arg9, %r10 # %r10 = authTag
1421 mov arg10, %r11 # %r11 = auth_tag_len
1422 cmp $16, %r11
1423 je _T_16_decrypt
1424 cmp $12, %r11
1425 je _T_12_decrypt
1426_T_8_decrypt:
1427 MOVQ_R64_XMM %xmm0, %rax
1428 mov %rax, (%r10)
1429 jmp _return_T_done_decrypt
1430_T_12_decrypt:
1431 MOVQ_R64_XMM %xmm0, %rax
1432 mov %rax, (%r10)
1433 psrldq $8, %xmm0
1434 movd %xmm0, %eax
1435 mov %eax, 8(%r10)
1436 jmp _return_T_done_decrypt
1437_T_16_decrypt:
1438 movdqu %xmm0, (%r10)
1439_return_T_done_decrypt:
1440 mov %r14, %rsp
1441 pop %r14
1442 pop %r13
1443 pop %r12
1444 ret
1445ENDPROC(aesni_gcm_dec)
1446
1447
1448/*****************************************************************************
1449* void aesni_gcm_enc(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1450* u8 *out, // Ciphertext output. Encrypt in-place is allowed.
1451* const u8 *in, // Plaintext input
1452* u64 plaintext_len, // Length of data in bytes for encryption.
1453* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
1454* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1455* // concatenated with 0x00000001. 16-byte aligned pointer.
1456* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
1457* const u8 *aad, // Additional Authentication Data (AAD)
1458* u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1459* u8 *auth_tag, // Authenticated Tag output.
1460* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
1461* // 12 or 8.
1462*
1463* Assumptions:
1464*
1465* keys:
1466* keys are pre-expanded and aligned to 16 bytes. we are using the
1467* first set of 11 keys in the data structure void *aes_ctx
1468*
1469*
1470* iv:
1471* 0 1 2 3
1472* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1473* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1474* | Salt (From the SA) |
1475* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1476* | Initialization Vector |
1477* | (This is the sequence number from IPSec header) |
1478* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1479* | 0x1 |
1480* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1481*
1482*
1483*
1484* AAD:
1485* AAD padded to 128 bits with 0
1486* for example, assume AAD is a u32 vector
1487*
1488* if AAD is 8 bytes:
1489* AAD[3] = {A0, A1};
1490* padded AAD in xmm register = {A1 A0 0 0}
1491*
1492* 0 1 2 3
1493* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1494* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1495* | SPI (A1) |
1496* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1497* | 32-bit Sequence Number (A0) |
1498* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1499* | 0x0 |
1500* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1501*
1502* AAD Format with 32-bit Sequence Number
1503*
1504* if AAD is 12 bytes:
1505* AAD[3] = {A0, A1, A2};
1506* padded AAD in xmm register = {A2 A1 A0 0}
1507*
1508* 0 1 2 3
1509* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1510* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1511* | SPI (A2) |
1512* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1513* | 64-bit Extended Sequence Number {A1,A0} |
1514* | |
1515* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1516* | 0x0 |
1517* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1518*
1519* AAD Format with 64-bit Extended Sequence Number
1520*
1521* aadLen:
1522* from the definition of the spec, aadLen can only be 8 or 12 bytes.
1523* The code also supports 16; for any other size it will fail.
1524*
1525* TLen:
1526* from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
1527* For other sizes, the code will fail.
1528*
1529* poly = x^128 + x^127 + x^126 + x^121 + 1
1530***************************************************************************/
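
# The tag computed below follows the GCM spec: S = GHASH_H(AAD || pad ||
# C || pad || len(AAD)||len(C)) and T = E(K, Y0) XOR S, truncated to
# auth_tag_len bytes.  A hedged C sketch of the spec-level 16-byte length
# block that the code folds into the hash (illustrative only; the code
# below keeps it in the byte-swapped form it uses for hashing; uses the
# kernel's put_unaligned_be64() helper and bit counts per the spec):
#
#	static void gcm_len_block(u8 block[16], u64 aad_len, u64 ct_len)
#	{
#		put_unaligned_be64(aad_len * 8, block);     /* len(A) in bits */
#		put_unaligned_be64(ct_len * 8, block + 8);  /* len(C) in bits */
#	}
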
1531ENTRY(aesni_gcm_enc)
1532 push %r12
1533 push %r13
1534 push %r14
1535 mov %rsp, %r14
1536#
1537# states of %xmm registers %xmm6:%xmm15 not saved
1538# all %xmm registers are clobbered
1539#
1540 sub $VARIABLE_OFFSET, %rsp
1541 and $~63, %rsp
1542 mov %arg6, %r12
1543 movdqu (%r12), %xmm13
1544 movdqa SHUF_MASK(%rip), %xmm2
1545 PSHUFB_XMM %xmm2, %xmm13
1546
1547
1548# precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
1549
1550 movdqa %xmm13, %xmm2
1551 psllq $1, %xmm13
1552 psrlq $63, %xmm2
1553 movdqa %xmm2, %xmm1
1554 pslldq $8, %xmm2
1555 psrldq $8, %xmm1
1556 por %xmm2, %xmm13
1557
1558 # reduce HashKey<<1
1559
1560 pshufd $0x24, %xmm1, %xmm2
1561 pcmpeqd TWOONE(%rip), %xmm2
1562 pand POLY(%rip), %xmm2
1563 pxor %xmm2, %xmm13 # %xmm13 holds the HashKey<<1 (mod poly)
1564 movdqa %xmm13, HashKey(%rsp) # store HashKey<<1 (mod poly)
1565 mov %arg4, %r13 # save the number of bytes of plaintext/ciphertext
1566 and $-16, %r13
1567 mov %r13, %r12
1568
1569 # Encrypt first few blocks
1570
1571 and $(3<<4), %r12
1572 jz _initial_num_blocks_is_0_encrypt
1573 cmp $(2<<4), %r12
1574 jb _initial_num_blocks_is_1_encrypt
1575 je _initial_num_blocks_is_2_encrypt
1576_initial_num_blocks_is_3_encrypt:
1577 INITIAL_BLOCKS_ENC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1578%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, enc
1579 sub $48, %r13
1580 jmp _initial_blocks_encrypted
1581_initial_num_blocks_is_2_encrypt:
1582 INITIAL_BLOCKS_ENC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1583%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, enc
1584 sub $32, %r13
1585 jmp _initial_blocks_encrypted
1586_initial_num_blocks_is_1_encrypt:
1587 INITIAL_BLOCKS_ENC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1588%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, enc
1589 sub $16, %r13
1590 jmp _initial_blocks_encrypted
1591_initial_num_blocks_is_0_encrypt:
1592 INITIAL_BLOCKS_ENC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1593%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, enc
1594_initial_blocks_encrypted:
1595
1596 # Main loop - Encrypt remaining blocks
1597
1598 cmp $0, %r13
1599 je _zero_cipher_left_encrypt
1600 sub $64, %r13
1601 je _four_cipher_left_encrypt
1602_encrypt_by_4_encrypt:
1603 GHASH_4_ENCRYPT_4_PARALLEL_ENC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
1604%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, enc
1605 add $64, %r11
1606 sub $64, %r13
1607 jne _encrypt_by_4_encrypt
1608_four_cipher_left_encrypt:
1609 GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
1610%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
1611_zero_cipher_left_encrypt:
1612 mov %arg4, %r13
1613 and $15, %r13 # %r13 = arg4 (mod 16)
1614 je _multiple_of_16_bytes_encrypt
1615
1616 # Handle the last <16 Byte block separately
1617 paddd ONE(%rip), %xmm0 # INCR CNT to get Yn
1618 movdqa SHUF_MASK(%rip), %xmm10
1619 PSHUFB_XMM %xmm10, %xmm0
1620
1621
1622 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # Encrypt(K, Yn)
1623 sub $16, %r11
1624 add %r13, %r11
1625 movdqu (%arg3,%r11,1), %xmm1 # receive the last <16 byte blocks
1626 lea SHIFT_MASK+16(%rip), %r12
1627 sub %r13, %r12
1628 # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
1629 # (%r13 is the number of bytes in plaintext mod 16)
1630 movdqu (%r12), %xmm2 # get the appropriate shuffle mask
1631 PSHUFB_XMM %xmm2, %xmm1 # shift right 16-r13 byte
1632 pxor %xmm1, %xmm0 # Plaintext XOR Encrypt(K, Yn)
1633 movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
1634 # get the appropriate mask to mask out top 16-r13 bytes of xmm0
1635 pand %xmm1, %xmm0 # mask out top 16-r13 bytes of xmm0
1636 movdqa SHUF_MASK(%rip), %xmm10
1637 PSHUFB_XMM %xmm10, %xmm0
1638
1639 pxor %xmm0, %xmm8
1640 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1641 # GHASH computation for the last <16 byte block
1642 sub %r13, %r11
1643 add $16, %r11
1644
1645 movdqa SHUF_MASK(%rip), %xmm10
1646 PSHUFB_XMM %xmm10, %xmm0
1647
1648 # shuffle xmm0 back to output as ciphertext
1649
1650 # Output %r13 bytes
1651 MOVQ_R64_XMM %xmm0, %rax
1652 cmp $8, %r13
1653 jle _less_than_8_bytes_left_encrypt
1654 mov %rax, (%arg2 , %r11, 1)
1655 add $8, %r11
1656 psrldq $8, %xmm0
1657 MOVQ_R64_XMM %xmm0, %rax
1658 sub $8, %r13
1659_less_than_8_bytes_left_encrypt:
1660 mov %al, (%arg2, %r11, 1)
1661 add $1, %r11
1662 shr $8, %rax
1663 sub $1, %r13
1664 jne _less_than_8_bytes_left_encrypt
1665_multiple_of_16_bytes_encrypt:
1666 mov arg8, %r12 # %r12 = aadLen (number of bytes)
1667 shl $3, %r12
1668 movd %r12d, %xmm15 # len(A) in %xmm15
1669 shl $3, %arg4 # len(C) in bits (*8)
1670 MOVQ_R64_XMM %arg4, %xmm1
1671 pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000
1672 pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C)
1673 pxor %xmm15, %xmm8
1674 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1675 # final GHASH computation
1676 movdqa SHUF_MASK(%rip), %xmm10
1677 PSHUFB_XMM %xmm10, %xmm8 # perform a 16 byte swap
1678
1679 mov %arg5, %rax # %rax = *Y0
1680 movdqu (%rax), %xmm0 # %xmm0 = Y0
1681 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm15 # Encrypt(K, Y0)
1682 pxor %xmm8, %xmm0
1683_return_T_encrypt:
1684 mov arg9, %r10 # %r10 = authTag
1685 mov arg10, %r11 # %r11 = auth_tag_len
1686 cmp $16, %r11
1687 je _T_16_encrypt
1688 cmp $12, %r11
1689 je _T_12_encrypt
1690_T_8_encrypt:
1691 MOVQ_R64_XMM %xmm0, %rax
1692 mov %rax, (%r10)
1693 jmp _return_T_done_encrypt
1694_T_12_encrypt:
1695 MOVQ_R64_XMM %xmm0, %rax
1696 mov %rax, (%r10)
1697 psrldq $8, %xmm0
1698 movd %xmm0, %eax
1699 mov %eax, 8(%r10)
1700 jmp _return_T_done_encrypt
1701_T_16_encrypt:
1702 movdqu %xmm0, (%r10)
1703_return_T_done_encrypt:
1704 mov %r14, %rsp
1705 pop %r14
1706 pop %r13
1707 pop %r12
1708 ret
1709ENDPROC(aesni_gcm_enc)
1710
1711#endif
1712
1713
1714.align 4
1715_key_expansion_128:
1716_key_expansion_256a:
1717 pshufd $0b11111111, %xmm1, %xmm1
1718 shufps $0b00010000, %xmm0, %xmm4
1719 pxor %xmm4, %xmm0
1720 shufps $0b10001100, %xmm0, %xmm4
1721 pxor %xmm4, %xmm0
1722 pxor %xmm1, %xmm0
1723 movaps %xmm0, (TKEYP)
1724 add $0x10, TKEYP
1725 ret
1726ENDPROC(_key_expansion_128)
1727ENDPROC(_key_expansion_256a)
1728
1729.align 4
1730_key_expansion_192a:
1731 pshufd $0b01010101, %xmm1, %xmm1
1732 shufps $0b00010000, %xmm0, %xmm4
1733 pxor %xmm4, %xmm0
1734 shufps $0b10001100, %xmm0, %xmm4
1735 pxor %xmm4, %xmm0
1736 pxor %xmm1, %xmm0
1737
1738 movaps %xmm2, %xmm5
1739 movaps %xmm2, %xmm6
1740 pslldq $4, %xmm5
1741 pshufd $0b11111111, %xmm0, %xmm3
1742 pxor %xmm3, %xmm2
1743 pxor %xmm5, %xmm2
1744
1745 movaps %xmm0, %xmm1
1746 shufps $0b01000100, %xmm0, %xmm6
1747 movaps %xmm6, (TKEYP)
1748 shufps $0b01001110, %xmm2, %xmm1
1749 movaps %xmm1, 0x10(TKEYP)
1750 add $0x20, TKEYP
1751 ret
1752ENDPROC(_key_expansion_192a)
1753
1754.align 4
1755_key_expansion_192b:
1756 pshufd $0b01010101, %xmm1, %xmm1
1757 shufps $0b00010000, %xmm0, %xmm4
1758 pxor %xmm4, %xmm0
1759 shufps $0b10001100, %xmm0, %xmm4
1760 pxor %xmm4, %xmm0
1761 pxor %xmm1, %xmm0
1762
1763 movaps %xmm2, %xmm5
1764 pslldq $4, %xmm5
1765 pshufd $0b11111111, %xmm0, %xmm3
1766 pxor %xmm3, %xmm2
1767 pxor %xmm5, %xmm2
1768
1769 movaps %xmm0, (TKEYP)
1770 add $0x10, TKEYP
1771 ret
1772ENDPROC(_key_expansion_192b)
1773
1774.align 4
1775_key_expansion_256b:
1776 pshufd $0b10101010, %xmm1, %xmm1
1777 shufps $0b00010000, %xmm2, %xmm4
1778 pxor %xmm4, %xmm2
1779 shufps $0b10001100, %xmm2, %xmm4
1780 pxor %xmm4, %xmm2
1781 pxor %xmm1, %xmm2
1782 movaps %xmm2, (TKEYP)
1783 add $0x10, TKEYP
1784 ret
1785ENDPROC(_key_expansion_256b)
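
# A hedged intrinsics rendering of the 128-bit expansion step implemented
# above with shufps/pxor (illustrative only, not the kernel code): the xor
# cascade folds in the three left-shifted copies of the key, and the
# AESKEYGENASSIST result is broadcast from its top doubleword.  Assumes
# <wmmintrin.h>/<emmintrin.h>; the caller passes
# kga = _mm_aeskeygenassist_si128(key, rcon), mirroring the
# AESKEYGENASSIST + call pairs in aesni_set_key below.
#
#	static __m128i aes128_expand_step(__m128i key, __m128i kga)
#	{
#		kga = _mm_shuffle_epi32(kga, 0xff);	/* broadcast word 3 */
#		key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
#		key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
#		key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
#		return _mm_xor_si128(key, kga);
#	}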
1786
1787/*
1788 * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
1789 * unsigned int key_len)
1790 */
1791ENTRY(aesni_set_key)
1792#ifndef __x86_64__
1793 pushl KEYP
1794 movl 8(%esp), KEYP # ctx
1795 movl 12(%esp), UKEYP # in_key
1796 movl 16(%esp), %edx # key_len
1797#endif
1798 movups (UKEYP), %xmm0 # user key (first 16 bytes)
1799 movaps %xmm0, (KEYP)
1800 lea 0x10(KEYP), TKEYP # key addr
1801 movl %edx, 480(KEYP)
1802 pxor %xmm4, %xmm4 # xmm4 is assumed 0 in _key_expansion_x
1803 cmp $24, %dl
1804 jb .Lenc_key128
1805 je .Lenc_key192
1806 movups 0x10(UKEYP), %xmm2 # other user key
1807 movaps %xmm2, (TKEYP)
1808 add $0x10, TKEYP
1809 AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
1810 call _key_expansion_256a
1811 AESKEYGENASSIST 0x1 %xmm0 %xmm1
1812 call _key_expansion_256b
1813 AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2
1814 call _key_expansion_256a
1815 AESKEYGENASSIST 0x2 %xmm0 %xmm1
1816 call _key_expansion_256b
1817 AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3
1818 call _key_expansion_256a
1819 AESKEYGENASSIST 0x4 %xmm0 %xmm1
1820 call _key_expansion_256b
1821 AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4
1822 call _key_expansion_256a
1823 AESKEYGENASSIST 0x8 %xmm0 %xmm1
1824 call _key_expansion_256b
1825 AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5
1826 call _key_expansion_256a
1827 AESKEYGENASSIST 0x10 %xmm0 %xmm1
1828 call _key_expansion_256b
1829 AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6
1830 call _key_expansion_256a
1831 AESKEYGENASSIST 0x20 %xmm0 %xmm1
1832 call _key_expansion_256b
1833 AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7
1834 call _key_expansion_256a
1835 jmp .Ldec_key
1836.Lenc_key192:
1837 movq 0x10(UKEYP), %xmm2 # other user key
1838 AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
1839 call _key_expansion_192a
1840 AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2
1841 call _key_expansion_192b
1842 AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3
1843 call _key_expansion_192a
1844 AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4
1845 call _key_expansion_192b
1846 AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5
1847 call _key_expansion_192a
1848 AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6
1849 call _key_expansion_192b
1850 AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7
1851 call _key_expansion_192a
1852 AESKEYGENASSIST 0x80 %xmm2 %xmm1 # round 8
1853 call _key_expansion_192b
1854 jmp .Ldec_key
1855.Lenc_key128:
1856 AESKEYGENASSIST 0x1 %xmm0 %xmm1 # round 1
1857 call _key_expansion_128
1858 AESKEYGENASSIST 0x2 %xmm0 %xmm1 # round 2
1859 call _key_expansion_128
1860 AESKEYGENASSIST 0x4 %xmm0 %xmm1 # round 3
1861 call _key_expansion_128
1862 AESKEYGENASSIST 0x8 %xmm0 %xmm1 # round 4
1863 call _key_expansion_128
1864 AESKEYGENASSIST 0x10 %xmm0 %xmm1 # round 5
1865 call _key_expansion_128
1866 AESKEYGENASSIST 0x20 %xmm0 %xmm1 # round 6
1867 call _key_expansion_128
1868 AESKEYGENASSIST 0x40 %xmm0 %xmm1 # round 7
1869 call _key_expansion_128
1870 AESKEYGENASSIST 0x80 %xmm0 %xmm1 # round 8
1871 call _key_expansion_128
1872 AESKEYGENASSIST 0x1b %xmm0 %xmm1 # round 9
1873 call _key_expansion_128
1874 AESKEYGENASSIST 0x36 %xmm0 %xmm1 # round 10
1875 call _key_expansion_128
1876.Ldec_key:
1877 sub $0x10, TKEYP
1878 movaps (KEYP), %xmm0
1879 movaps (TKEYP), %xmm1
1880 movaps %xmm0, 240(TKEYP)
1881 movaps %xmm1, 240(KEYP)
1882 add $0x10, KEYP
1883 lea 240-16(TKEYP), UKEYP
1884.align 4
1885.Ldec_key_loop:
1886 movaps (KEYP), %xmm0
1887 AESIMC %xmm0 %xmm1
1888 movaps %xmm1, (UKEYP)
1889 add $0x10, KEYP
1890 sub $0x10, UKEYP
1891 cmp TKEYP, KEYP
1892 jb .Ldec_key_loop
1893 xor AREG, AREG
1894#ifndef __x86_64__
1895 popl KEYP
1896#endif
1897 ret
1898ENDPROC(aesni_set_key)
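
# The .Ldec_key section above derives the decryption schedule for the
# equivalent inverse cipher: the encryption round keys taken in reverse
# order, with AESIMC applied to every key except the first and the last.
# A hedged intrinsics sketch (illustrative only, not the kernel code;
# nr is the number of rounds, 10/12/14):
#
#	static void make_dec_schedule(const __m128i enc[], __m128i dec[], int nr)
#	{
#		int i;
#
#		dec[0] = enc[nr];		/* last enc key, used first */
#		for (i = 1; i < nr; i++)
#			dec[i] = _mm_aesimc_si128(enc[nr - i]);
#		dec[nr] = enc[0];		/* round-0 key, used last */
#	}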
1899
1900/*
1901 * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
1902 */
1903ENTRY(aesni_enc)
1904#ifndef __x86_64__
1905 pushl KEYP
1906 pushl KLEN
1907 movl 12(%esp), KEYP
1908 movl 16(%esp), OUTP
1909 movl 20(%esp), INP
1910#endif
1911 movl 480(KEYP), KLEN # key length
1912 movups (INP), STATE # input
1913 call _aesni_enc1
1914 movups STATE, (OUTP) # output
1915#ifndef __x86_64__
1916 popl KLEN
1917 popl KEYP
1918#endif
1919 ret
1920ENDPROC(aesni_enc)
1921
1922/*
1923 * _aesni_enc1: internal ABI
1924 * input:
1925 * KEYP: key struct pointer
1926 * KLEN: key length
1927 * STATE: initial state (input)
1928 * output:
1929 * STATE: final state (output)
1930 * changed:
1931 * KEY
1932 * TKEYP (T1)
1933 */
1934.align 4
1935_aesni_enc1:
1936 movaps (KEYP), KEY # key
1937 mov KEYP, TKEYP
1938 pxor KEY, STATE # round 0
1939 add $0x30, TKEYP
1940 cmp $24, KLEN
1941 jb .Lenc128
1942 lea 0x20(TKEYP), TKEYP
1943 je .Lenc192
1944 add $0x20, TKEYP
1945 movaps -0x60(TKEYP), KEY
1946 AESENC KEY STATE
1947 movaps -0x50(TKEYP), KEY
1948 AESENC KEY STATE
1949.align 4
1950.Lenc192:
1951 movaps -0x40(TKEYP), KEY
1952 AESENC KEY STATE
1953 movaps -0x30(TKEYP), KEY
1954 AESENC KEY STATE
1955.align 4
1956.Lenc128:
1957 movaps -0x20(TKEYP), KEY
1958 AESENC KEY STATE
1959 movaps -0x10(TKEYP), KEY
1960 AESENC KEY STATE
1961 movaps (TKEYP), KEY
1962 AESENC KEY STATE
1963 movaps 0x10(TKEYP), KEY
1964 AESENC KEY STATE
1965 movaps 0x20(TKEYP), KEY
1966 AESENC KEY STATE
1967 movaps 0x30(TKEYP), KEY
1968 AESENC KEY STATE
1969 movaps 0x40(TKEYP), KEY
1970 AESENC KEY STATE
1971 movaps 0x50(TKEYP), KEY
1972 AESENC KEY STATE
1973 movaps 0x60(TKEYP), KEY
1974 AESENC KEY STATE
1975 movaps 0x70(TKEYP), KEY
1976 AESENCLAST KEY STATE
1977 ret
1978ENDPROC(_aesni_enc1)
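
# A hedged C-with-intrinsics sketch of what _aesni_enc1 computes
# (illustrative only, not the kernel code; assumes the crypto_aes_ctx
# layout used above, with the expanded encryption keys at the start of
# the struct and key_length at offset 480):
#
#	static __m128i aesni_enc1_sketch(const struct crypto_aes_ctx *ctx,
#					 __m128i b)
#	{
#		const __m128i *rk = (const __m128i *)ctx->key_enc;
#		int rounds = 6 + ctx->key_length / 4;	/* 10, 12 or 14 */
#		int i;
#
#		b = _mm_xor_si128(b, _mm_loadu_si128(&rk[0]));	/* round 0 */
#		for (i = 1; i < rounds; i++)
#			b = _mm_aesenc_si128(b, _mm_loadu_si128(&rk[i]));
#		return _mm_aesenclast_si128(b, _mm_loadu_si128(&rk[rounds]));
#	}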
1979
1980/*
1981 * _aesni_enc4: internal ABI
1982 * input:
1983 * KEYP: key struct pointer
1984 * KLEN: key length
1985 * STATE1: initial state (input)
1986 * STATE2
1987 * STATE3
1988 * STATE4
1989 * output:
1990 * STATE1: final state (output)
1991 * STATE2
1992 * STATE3
1993 * STATE4
1994 * changed:
1995 * KEY
1996 * TKEYP (T1)
1997 */
1998.align 4
1999_aesni_enc4:
2000 movaps (KEYP), KEY # key
2001 mov KEYP, TKEYP
2002 pxor KEY, STATE1 # round 0
2003 pxor KEY, STATE2
2004 pxor KEY, STATE3
2005 pxor KEY, STATE4
2006 add $0x30, TKEYP
2007 cmp $24, KLEN
2008 jb .L4enc128
2009 lea 0x20(TKEYP), TKEYP
2010 je .L4enc192
2011 add $0x20, TKEYP
2012 movaps -0x60(TKEYP), KEY
2013 AESENC KEY STATE1
2014 AESENC KEY STATE2
2015 AESENC KEY STATE3
2016 AESENC KEY STATE4
2017 movaps -0x50(TKEYP), KEY
2018 AESENC KEY STATE1
2019 AESENC KEY STATE2
2020 AESENC KEY STATE3
2021 AESENC KEY STATE4
2022#.align 4
2023.L4enc192:
2024 movaps -0x40(TKEYP), KEY
2025 AESENC KEY STATE1
2026 AESENC KEY STATE2
2027 AESENC KEY STATE3
2028 AESENC KEY STATE4
2029 movaps -0x30(TKEYP), KEY
2030 AESENC KEY STATE1
2031 AESENC KEY STATE2
2032 AESENC KEY STATE3
2033 AESENC KEY STATE4
2034#.align 4
2035.L4enc128:
2036 movaps -0x20(TKEYP), KEY
2037 AESENC KEY STATE1
2038 AESENC KEY STATE2
2039 AESENC KEY STATE3
2040 AESENC KEY STATE4
2041 movaps -0x10(TKEYP), KEY
2042 AESENC KEY STATE1
2043 AESENC KEY STATE2
2044 AESENC KEY STATE3
2045 AESENC KEY STATE4
2046 movaps (TKEYP), KEY
2047 AESENC KEY STATE1
2048 AESENC KEY STATE2
2049 AESENC KEY STATE3
2050 AESENC KEY STATE4
2051 movaps 0x10(TKEYP), KEY
2052 AESENC KEY STATE1
2053 AESENC KEY STATE2
2054 AESENC KEY STATE3
2055 AESENC KEY STATE4
2056 movaps 0x20(TKEYP), KEY
2057 AESENC KEY STATE1
2058 AESENC KEY STATE2
2059 AESENC KEY STATE3
2060 AESENC KEY STATE4
2061 movaps 0x30(TKEYP), KEY
2062 AESENC KEY STATE1
2063 AESENC KEY STATE2
2064 AESENC KEY STATE3
2065 AESENC KEY STATE4
2066 movaps 0x40(TKEYP), KEY
2067 AESENC KEY STATE1
2068 AESENC KEY STATE2
2069 AESENC KEY STATE3
2070 AESENC KEY STATE4
2071 movaps 0x50(TKEYP), KEY
2072 AESENC KEY STATE1
2073 AESENC KEY STATE2
2074 AESENC KEY STATE3
2075 AESENC KEY STATE4
2076 movaps 0x60(TKEYP), KEY
2077 AESENC KEY STATE1
2078 AESENC KEY STATE2
2079 AESENC KEY STATE3
2080 AESENC KEY STATE4
2081 movaps 0x70(TKEYP), KEY
2082 AESENCLAST KEY STATE1 # last round
2083 AESENCLAST KEY STATE2
2084 AESENCLAST KEY STATE3
2085 AESENCLAST KEY STATE4
2086 ret
2087ENDPROC(_aesni_enc4)
2088
2089/*
2090 * void aesni_dec (struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
2091 */
2092ENTRY(aesni_dec)
2093#ifndef __x86_64__
2094 pushl KEYP
2095 pushl KLEN
2096 movl 12(%esp), KEYP
2097 movl 16(%esp), OUTP
2098 movl 20(%esp), INP
2099#endif
2100 mov 480(KEYP), KLEN # key length
2101 add $240, KEYP
2102 movups (INP), STATE # input
2103 call _aesni_dec1
2104 movups STATE, (OUTP) #output
2105#ifndef __x86_64__
2106 popl KLEN
2107 popl KEYP
2108#endif
2109 ret
2110ENDPROC(aesni_dec)
2111
2112/*
2113 * _aesni_dec1: internal ABI
2114 * input:
2115 * KEYP: key struct pointer
2116 * KLEN: key length
2117 * STATE: initial state (input)
2118 * output:
2119 * STATE: final state (output)
2120 * changed:
2121 * KEY
2122 * TKEYP (T1)
2123 */
2124.align 4
2125_aesni_dec1:
2126 movaps (KEYP), KEY # key
2127 mov KEYP, TKEYP
2128 pxor KEY, STATE # round 0
2129 add $0x30, TKEYP
2130 cmp $24, KLEN
2131 jb .Ldec128
2132 lea 0x20(TKEYP), TKEYP
2133 je .Ldec192
2134 add $0x20, TKEYP
2135 movaps -0x60(TKEYP), KEY
2136 AESDEC KEY STATE
2137 movaps -0x50(TKEYP), KEY
2138 AESDEC KEY STATE
2139.align 4
2140.Ldec192:
2141 movaps -0x40(TKEYP), KEY
2142 AESDEC KEY STATE
2143 movaps -0x30(TKEYP), KEY
2144 AESDEC KEY STATE
2145.align 4
2146.Ldec128:
2147 movaps -0x20(TKEYP), KEY
2148 AESDEC KEY STATE
2149 movaps -0x10(TKEYP), KEY
2150 AESDEC KEY STATE
2151 movaps (TKEYP), KEY
2152 AESDEC KEY STATE
2153 movaps 0x10(TKEYP), KEY
2154 AESDEC KEY STATE
2155 movaps 0x20(TKEYP), KEY
2156 AESDEC KEY STATE
2157 movaps 0x30(TKEYP), KEY
2158 AESDEC KEY STATE
2159 movaps 0x40(TKEYP), KEY
2160 AESDEC KEY STATE
2161 movaps 0x50(TKEYP), KEY
2162 AESDEC KEY STATE
2163 movaps 0x60(TKEYP), KEY
2164 AESDEC KEY STATE
2165 movaps 0x70(TKEYP), KEY
2166 AESDECLAST KEY STATE
2167 ret
2168ENDPROC(_aesni_dec1)
2169
2170/*
2171 * _aesni_dec4: internal ABI
2172 * input:
2173 * KEYP: key struct pointer
2174 * KLEN: key length
2175 * STATE1: initial state (input)
2176 * STATE2
2177 * STATE3
2178 * STATE4
2179 * output:
2180 * STATE1: final state (output)
2181 * STATE2
2182 * STATE3
2183 * STATE4
2184 * changed:
2185 * KEY
2186 * TKEYP (T1)
2187 */
2188.align 4
2189_aesni_dec4:
2190 movaps (KEYP), KEY # key
2191 mov KEYP, TKEYP
2192 pxor KEY, STATE1 # round 0
2193 pxor KEY, STATE2
2194 pxor KEY, STATE3
2195 pxor KEY, STATE4
2196 add $0x30, TKEYP
2197 cmp $24, KLEN
2198 jb .L4dec128
2199 lea 0x20(TKEYP), TKEYP
2200 je .L4dec192
2201 add $0x20, TKEYP
2202 movaps -0x60(TKEYP), KEY
2203 AESDEC KEY STATE1
2204 AESDEC KEY STATE2
2205 AESDEC KEY STATE3
2206 AESDEC KEY STATE4
2207 movaps -0x50(TKEYP), KEY
2208 AESDEC KEY STATE1
2209 AESDEC KEY STATE2
2210 AESDEC KEY STATE3
2211 AESDEC KEY STATE4
2212.align 4
2213.L4dec192:
2214 movaps -0x40(TKEYP), KEY
2215 AESDEC KEY STATE1
2216 AESDEC KEY STATE2
2217 AESDEC KEY STATE3
2218 AESDEC KEY STATE4
2219 movaps -0x30(TKEYP), KEY
2220 AESDEC KEY STATE1
2221 AESDEC KEY STATE2
2222 AESDEC KEY STATE3
2223 AESDEC KEY STATE4
2224.align 4
2225.L4dec128:
2226 movaps -0x20(TKEYP), KEY
2227 AESDEC KEY STATE1
2228 AESDEC KEY STATE2
2229 AESDEC KEY STATE3
2230 AESDEC KEY STATE4
2231 movaps -0x10(TKEYP), KEY
2232 AESDEC KEY STATE1
2233 AESDEC KEY STATE2
2234 AESDEC KEY STATE3
2235 AESDEC KEY STATE4
2236 movaps (TKEYP), KEY
2237 AESDEC KEY STATE1
2238 AESDEC KEY STATE2
2239 AESDEC KEY STATE3
2240 AESDEC KEY STATE4
2241 movaps 0x10(TKEYP), KEY
2242 AESDEC KEY STATE1
2243 AESDEC KEY STATE2
2244 AESDEC KEY STATE3
2245 AESDEC KEY STATE4
2246 movaps 0x20(TKEYP), KEY
2247 AESDEC KEY STATE1
2248 AESDEC KEY STATE2
2249 AESDEC KEY STATE3
2250 AESDEC KEY STATE4
2251 movaps 0x30(TKEYP), KEY
2252 AESDEC KEY STATE1
2253 AESDEC KEY STATE2
2254 AESDEC KEY STATE3
2255 AESDEC KEY STATE4
2256 movaps 0x40(TKEYP), KEY
2257 AESDEC KEY STATE1
2258 AESDEC KEY STATE2
2259 AESDEC KEY STATE3
2260 AESDEC KEY STATE4
2261 movaps 0x50(TKEYP), KEY
2262 AESDEC KEY STATE1
2263 AESDEC KEY STATE2
2264 AESDEC KEY STATE3
2265 AESDEC KEY STATE4
2266 movaps 0x60(TKEYP), KEY
2267 AESDEC KEY STATE1
2268 AESDEC KEY STATE2
2269 AESDEC KEY STATE3
2270 AESDEC KEY STATE4
2271 movaps 0x70(TKEYP), KEY
2272 AESDECLAST KEY STATE1 # last round
2273 AESDECLAST KEY STATE2
2274 AESDECLAST KEY STATE3
2275 AESDECLAST KEY STATE4
2276 ret
2277ENDPROC(_aesni_dec4)
2278
2279/*
2280 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2281 * size_t len)
2282 */
2283ENTRY(aesni_ecb_enc)
2284#ifndef __x86_64__
2285 pushl LEN
2286 pushl KEYP
2287 pushl KLEN
2288 movl 16(%esp), KEYP
2289 movl 20(%esp), OUTP
2290 movl 24(%esp), INP
2291 movl 28(%esp), LEN
2292#endif
2293 test LEN, LEN # check length
2294 jz .Lecb_enc_ret
2295 mov 480(KEYP), KLEN
2296 cmp $16, LEN
2297 jb .Lecb_enc_ret
2298 cmp $64, LEN
2299 jb .Lecb_enc_loop1
2300.align 4
2301.Lecb_enc_loop4:
2302 movups (INP), STATE1
2303 movups 0x10(INP), STATE2
2304 movups 0x20(INP), STATE3
2305 movups 0x30(INP), STATE4
2306 call _aesni_enc4
2307 movups STATE1, (OUTP)
2308 movups STATE2, 0x10(OUTP)
2309 movups STATE3, 0x20(OUTP)
2310 movups STATE4, 0x30(OUTP)
2311 sub $64, LEN
2312 add $64, INP
2313 add $64, OUTP
2314 cmp $64, LEN
2315 jge .Lecb_enc_loop4
2316 cmp $16, LEN
2317 jb .Lecb_enc_ret
2318.align 4
2319.Lecb_enc_loop1:
2320 movups (INP), STATE1
2321 call _aesni_enc1
2322 movups STATE1, (OUTP)
2323 sub $16, LEN
2324 add $16, INP
2325 add $16, OUTP
2326 cmp $16, LEN
2327 jge .Lecb_enc_loop1
2328.Lecb_enc_ret:
2329#ifndef __x86_64__
2330 popl KLEN
2331 popl KEYP
2332 popl LEN
2333#endif
2334 ret
2335ENDPROC(aesni_ecb_enc)
2336
2337/*
2338 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2339 * size_t len);
2340 */
2341ENTRY(aesni_ecb_dec)
2342#ifndef __x86_64__
2343 pushl LEN
2344 pushl KEYP
2345 pushl KLEN
2346 movl 16(%esp), KEYP
2347 movl 20(%esp), OUTP
2348 movl 24(%esp), INP
2349 movl 28(%esp), LEN
2350#endif
2351 test LEN, LEN
2352 jz .Lecb_dec_ret
2353 mov 480(KEYP), KLEN
2354 add $240, KEYP
2355 cmp $16, LEN
2356 jb .Lecb_dec_ret
2357 cmp $64, LEN
2358 jb .Lecb_dec_loop1
2359.align 4
2360.Lecb_dec_loop4:
2361 movups (INP), STATE1
2362 movups 0x10(INP), STATE2
2363 movups 0x20(INP), STATE3
2364 movups 0x30(INP), STATE4
2365 call _aesni_dec4
2366 movups STATE1, (OUTP)
2367 movups STATE2, 0x10(OUTP)
2368 movups STATE3, 0x20(OUTP)
2369 movups STATE4, 0x30(OUTP)
2370 sub $64, LEN
2371 add $64, INP
2372 add $64, OUTP
2373 cmp $64, LEN
2374 jge .Lecb_dec_loop4
2375 cmp $16, LEN
2376 jb .Lecb_dec_ret
2377.align 4
2378.Lecb_dec_loop1:
2379 movups (INP), STATE1
2380 call _aesni_dec1
2381 movups STATE1, (OUTP)
2382 sub $16, LEN
2383 add $16, INP
2384 add $16, OUTP
2385 cmp $16, LEN
2386 jge .Lecb_dec_loop1
2387.Lecb_dec_ret:
2388#ifndef __x86_64__
2389 popl KLEN
2390 popl KEYP
2391 popl LEN
2392#endif
2393 ret
2394ENDPROC(aesni_ecb_dec)
2395
2396/*
2397 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2398 * size_t len, u8 *iv)
2399 */
2400ENTRY(aesni_cbc_enc)
2401#ifndef __x86_64__
2402 pushl IVP
2403 pushl LEN
2404 pushl KEYP
2405 pushl KLEN
2406 movl 20(%esp), KEYP
2407 movl 24(%esp), OUTP
2408 movl 28(%esp), INP
2409 movl 32(%esp), LEN
2410 movl 36(%esp), IVP
2411#endif
2412 cmp $16, LEN
2413 jb .Lcbc_enc_ret
2414 mov 480(KEYP), KLEN
2415 movups (IVP), STATE # load iv as initial state
2416.align 4
2417.Lcbc_enc_loop:
2418 movups (INP), IN # load input
2419 pxor IN, STATE
2420 call _aesni_enc1
2421 movups STATE, (OUTP) # store output
2422 sub $16, LEN
2423 add $16, INP
2424 add $16, OUTP
2425 cmp $16, LEN
2426 jge .Lcbc_enc_loop
2427 movups STATE, (IVP)
2428.Lcbc_enc_ret:
2429#ifndef __x86_64__
2430 popl KLEN
2431 popl KEYP
2432 popl LEN
2433 popl IVP
2434#endif
2435 ret
2436ENDPROC(aesni_cbc_enc)
2437
2438/*
2439 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2440 * size_t len, u8 *iv)
2441 */
2442ENTRY(aesni_cbc_dec)
2443#ifndef __x86_64__
2444 pushl IVP
2445 pushl LEN
2446 pushl KEYP
2447 pushl KLEN
2448 movl 20(%esp), KEYP
2449 movl 24(%esp), OUTP
2450 movl 28(%esp), INP
2451 movl 32(%esp), LEN
2452 movl 36(%esp), IVP
2453#endif
2454 cmp $16, LEN
2455 jb .Lcbc_dec_just_ret
2456 mov 480(KEYP), KLEN
2457 add $240, KEYP
2458 movups (IVP), IV
2459 cmp $64, LEN
2460 jb .Lcbc_dec_loop1
2461.align 4
2462.Lcbc_dec_loop4:
2463 movups (INP), IN1
2464 movaps IN1, STATE1
2465 movups 0x10(INP), IN2
2466 movaps IN2, STATE2
2467#ifdef __x86_64__
2468 movups 0x20(INP), IN3
2469 movaps IN3, STATE3
2470 movups 0x30(INP), IN4
2471 movaps IN4, STATE4
2472#else
2473 movups 0x20(INP), IN1
2474 movaps IN1, STATE3
2475 movups 0x30(INP), IN2
2476 movaps IN2, STATE4
2477#endif
2478 call _aesni_dec4
2479 pxor IV, STATE1
2480#ifdef __x86_64__
2481 pxor IN1, STATE2
2482 pxor IN2, STATE3
2483 pxor IN3, STATE4
2484 movaps IN4, IV
2485#else
2486 pxor IN1, STATE4
2487 movaps IN2, IV
2488 movups (INP), IN1
2489 pxor IN1, STATE2
2490 movups 0x10(INP), IN2
2491 pxor IN2, STATE3
2492#endif
2493 movups STATE1, (OUTP)
2494 movups STATE2, 0x10(OUTP)
2495 movups STATE3, 0x20(OUTP)
2496 movups STATE4, 0x30(OUTP)
2497 sub $64, LEN
2498 add $64, INP
2499 add $64, OUTP
2500 cmp $64, LEN
2501 jge .Lcbc_dec_loop4
2502 cmp $16, LEN
2503 jb .Lcbc_dec_ret
2504.align 4
2505.Lcbc_dec_loop1:
2506 movups (INP), IN
2507 movaps IN, STATE
2508 call _aesni_dec1
2509 pxor IV, STATE
2510 movups STATE, (OUTP)
2511 movaps IN, IV
2512 sub $16, LEN
2513 add $16, INP
2514 add $16, OUTP
2515 cmp $16, LEN
2516 jge .Lcbc_dec_loop1
2517.Lcbc_dec_ret:
2518 movups IV, (IVP)
2519.Lcbc_dec_just_ret:
2520#ifndef __x86_64__
2521 popl KLEN
2522 popl KEYP
2523 popl LEN
2524 popl IVP
2525#endif
2526 ret
2527ENDPROC(aesni_cbc_dec)
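
# A hedged reference for the CBC chaining above (illustrative only;
# aes_decrypt_block() is a stand-in, not a real kernel helper):
# P[i] = Decrypt(K, C[i]) XOR C[i-1] with C[-1] = IV, and the last
# ciphertext block is written back through *iv so that chained calls
# continue correctly.
#
#	static void cbc_dec_ref(const void *key, u8 *dst, const u8 *src,
#				size_t len, u8 iv[16])
#	{
#		u8 prev[16], cur[16];
#		size_t n, i;
#
#		memcpy(prev, iv, 16);
#		for (n = 0; n + 16 <= len; n += 16) {
#			memcpy(cur, src + n, 16);	/* save C[i] first */
#			aes_decrypt_block(key, dst + n, cur);
#			for (i = 0; i < 16; i++)
#				dst[n + i] ^= prev[i];
#			memcpy(prev, cur, 16);
#		}
#		memcpy(iv, prev, 16);
#	}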
2528
2529#ifdef __x86_64__
2530.align 16
2531.Lbswap_mask:
2532 .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
2533
2534/*
2535 * _aesni_inc_init: internal ABI
2536 * setup registers used by _aesni_inc
2537 * input:
2538 * IV
2539 * output:
2540 * CTR: == IV, in little endian
2541 * TCTR_LOW: == lower qword of CTR
2542 * INC: == 1, in little endian
2543 * BSWAP_MASK == endian swapping mask
2544 */
2545.align 4
2546_aesni_inc_init:
2547 movaps .Lbswap_mask, BSWAP_MASK
2548 movaps IV, CTR
2549 PSHUFB_XMM BSWAP_MASK CTR
2550 mov $1, TCTR_LOW
2551 MOVQ_R64_XMM TCTR_LOW INC
2552 MOVQ_R64_XMM CTR TCTR_LOW
2553 ret
2554ENDPROC(_aesni_inc_init)
2555
2556/*
2557 * _aesni_inc: internal ABI
2558 * Increase IV by 1, IV is in big endian
2559 * input:
2560 * IV
2561 * CTR: == IV, in little endian
2562 * TCTR_LOW: == lower qword of CTR
2563 * INC: == 1, in little endian
2564 * BSWAP_MASK == endian swapping mask
2565 * output:
2566 * IV: increased by 1
2567 * changed:
2568 * CTR: == output IV, in little endian
2569 * TCTR_LOW: == lower qword of CTR
2570 */
2571.align 4
2572_aesni_inc:
2573 paddq INC, CTR
2574 add $1, TCTR_LOW
2575 jnc .Linc_low
2576 pslldq $8, INC
2577 paddq INC, CTR
2578 psrldq $8, INC
2579.Linc_low:
2580 movaps CTR, IV
2581 PSHUFB_XMM BSWAP_MASK IV
2582 ret
2583ENDPROC(_aesni_inc)
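
# A hedged C sketch of the increment performed by _aesni_inc_init/_aesni_inc
# (illustrative only; uses the kernel's get/put_unaligned_be64() helpers):
# the 128-bit big-endian counter is bumped by one, with the carry out of
# the low quadword propagated into the high quadword.
#
#	static void ctr128_inc_be(u8 iv[16])
#	{
#		u64 hi = get_unaligned_be64(iv);
#		u64 lo = get_unaligned_be64(iv + 8);
#
#		lo++;
#		hi += (lo == 0);	/* carry out of the low qword */
#		put_unaligned_be64(hi, iv);
#		put_unaligned_be64(lo, iv + 8);
#	}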
2584
2585/*
2586 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2587 * size_t len, u8 *iv)
2588 */
2589ENTRY(aesni_ctr_enc)
2590 cmp $16, LEN
2591 jb .Lctr_enc_just_ret
2592 mov 480(KEYP), KLEN
2593 movups (IVP), IV
2594 call _aesni_inc_init
2595 cmp $64, LEN
2596 jb .Lctr_enc_loop1
2597.align 4
2598.Lctr_enc_loop4:
2599 movaps IV, STATE1
2600 call _aesni_inc
2601 movups (INP), IN1
2602 movaps IV, STATE2
2603 call _aesni_inc
2604 movups 0x10(INP), IN2
2605 movaps IV, STATE3
2606 call _aesni_inc
2607 movups 0x20(INP), IN3
2608 movaps IV, STATE4
2609 call _aesni_inc
2610 movups 0x30(INP), IN4
2611 call _aesni_enc4
2612 pxor IN1, STATE1
2613 movups STATE1, (OUTP)
2614 pxor IN2, STATE2
2615 movups STATE2, 0x10(OUTP)
2616 pxor IN3, STATE3
2617 movups STATE3, 0x20(OUTP)
2618 pxor IN4, STATE4
2619 movups STATE4, 0x30(OUTP)
2620 sub $64, LEN
2621 add $64, INP
2622 add $64, OUTP
2623 cmp $64, LEN
2624 jge .Lctr_enc_loop4
2625 cmp $16, LEN
2626 jb .Lctr_enc_ret
2627.align 4
2628.Lctr_enc_loop1:
2629 movaps IV, STATE
2630 call _aesni_inc
2631 movups (INP), IN
2632 call _aesni_enc1
2633 pxor IN, STATE
2634 movups STATE, (OUTP)
2635 sub $16, LEN
2636 add $16, INP
2637 add $16, OUTP
2638 cmp $16, LEN
2639 jge .Lctr_enc_loop1
2640.Lctr_enc_ret:
2641 movups IV, (IVP)
2642.Lctr_enc_just_ret:
2643 ret
2644ENDPROC(aesni_ctr_enc)
2645
2646/*
2647 * _aesni_gf128mul_x_ble: internal ABI
2648 * Multiply in GF(2^128) for XTS IVs
2649 * input:
2650 * IV: current IV
2651 * GF128MUL_MASK == mask with 0x87 and 0x01
2652 * output:
2653 * IV: next IV
2654 * changed:
2655 * CTR: == temporary value
2656 */
2657#define _aesni_gf128mul_x_ble() \
2658 pshufd $0x13, IV, CTR; \
2659 paddq IV, IV; \
2660 psrad $31, CTR; \
2661 pand GF128MUL_MASK, CTR; \
2662 pxor CTR, IV;
2663
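# A hedged C sketch of the tweak update above (illustrative only, written
# independently of the kernel's gf128mul helpers): multiply the 128-bit
# little-endian tweak by x in GF(2^128), folding the carry bit back in
# with the 0x87 reduction constant.
#
#	static void xts_tweak_times_x(u64 t[2])	/* t[0] = low, t[1] = high */
#	{
#		u64 carry = t[1] >> 63;		/* bit shifted out at the top */
#
#		t[1] = (t[1] << 1) | (t[0] >> 63);
#		t[0] = (t[0] << 1) ^ (carry ? 0x87 : 0);
#	}
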
2664/*
2665 * void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2666 * bool enc, u8 *iv)
2667 */
2668ENTRY(aesni_xts_crypt8)
2669 cmpb $0, %cl
2670 movl $0, %ecx
2671 movl $240, %r10d
2672 leaq _aesni_enc4, %r11
2673 leaq _aesni_dec4, %rax
2674 cmovel %r10d, %ecx
2675 cmoveq %rax, %r11
2676
2677 movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
2678 movups (IVP), IV
2679
2680 mov 480(KEYP), KLEN
2681 addq %rcx, KEYP
2682
2683 movdqa IV, STATE1
2684 movdqu 0x00(INP), INC
2685 pxor INC, STATE1
2686 movdqu IV, 0x00(OUTP)
2687
2688 _aesni_gf128mul_x_ble()
2689 movdqa IV, STATE2
2690 movdqu 0x10(INP), INC
2691 pxor INC, STATE2
2692 movdqu IV, 0x10(OUTP)
2693
2694 _aesni_gf128mul_x_ble()
2695 movdqa IV, STATE3
2696 movdqu 0x20(INP), INC
2697 pxor INC, STATE3
2698 movdqu IV, 0x20(OUTP)
2699
2700 _aesni_gf128mul_x_ble()
2701 movdqa IV, STATE4
2702 movdqu 0x30(INP), INC
2703 pxor INC, STATE4
2704 movdqu IV, 0x30(OUTP)
2705
2706 call *%r11
2707
2708 movdqu 0x00(OUTP), INC
2709 pxor INC, STATE1
2710 movdqu STATE1, 0x00(OUTP)
2711
2712 _aesni_gf128mul_x_ble()
2713 movdqa IV, STATE1
2714 movdqu 0x40(INP), INC
2715 pxor INC, STATE1
2716 movdqu IV, 0x40(OUTP)
2717
2718 movdqu 0x10(OUTP), INC
2719 pxor INC, STATE2
2720 movdqu STATE2, 0x10(OUTP)
2721
2722 _aesni_gf128mul_x_ble()
2723 movdqa IV, STATE2
2724 movdqu 0x50(INP), INC
2725 pxor INC, STATE2
2726 movdqu IV, 0x50(OUTP)
2727
2728 movdqu 0x20(OUTP), INC
2729 pxor INC, STATE3
2730 movdqu STATE3, 0x20(OUTP)
2731
2732 _aesni_gf128mul_x_ble()
2733 movdqa IV, STATE3
2734 movdqu 0x60(INP), INC
2735 pxor INC, STATE3
2736 movdqu IV, 0x60(OUTP)
2737
2738 movdqu 0x30(OUTP), INC
2739 pxor INC, STATE4
2740 movdqu STATE4, 0x30(OUTP)
2741
2742 _aesni_gf128mul_x_ble()
2743 movdqa IV, STATE4
2744 movdqu 0x70(INP), INC
2745 pxor INC, STATE4
2746 movdqu IV, 0x70(OUTP)
2747
2748 _aesni_gf128mul_x_ble()
2749 movups IV, (IVP)
2750
2751 call *%r11
2752
2753 movdqu 0x40(OUTP), INC
2754 pxor INC, STATE1
2755 movdqu STATE1, 0x40(OUTP)
2756
2757 movdqu 0x50(OUTP), INC
2758 pxor INC, STATE2
2759 movdqu STATE2, 0x50(OUTP)
2760
2761 movdqu 0x60(OUTP), INC
2762 pxor INC, STATE3
2763 movdqu STATE3, 0x60(OUTP)
2764
2765 movdqu 0x70(OUTP), INC
2766 pxor INC, STATE4
2767 movdqu STATE4, 0x70(OUTP)
2768
2769 ret
2770ENDPROC(aesni_xts_crypt8)
2771
2772#endif