/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Implement AES algorithm in Intel AES-NI instructions.
 *
 * The white paper of AES-NI instructions can be downloaded from:
 * http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
 *
 * Copyright (C) 2008, Intel Corp.
 * Author: Huang Ying <ying.huang@intel.com>
 *         Vinodh Gopal <vinodh.gopal@intel.com>
 *         Kahraman Akdemir
 *
 * Copyright (c) 2010, Intel Corporation.
 *
 * Ported x86_64 version to x86:
 * Author: Mathias Krause <minipli@googlemail.com>
 */
18
#include <linux/linkage.h>
#include <asm/frame.h>

/*
 * xmm register aliases. STATE1..STATE4 / IN1..IN4 let the 4-way helpers
 * (_aesni_enc4 / _aesni_dec4) process four blocks in parallel.
 */
#define STATE1	%xmm0
#define STATE2	%xmm4
#define STATE3	%xmm5
#define STATE4	%xmm6
#define STATE	STATE1
#define IN1	%xmm1
#define IN2	%xmm7
#define IN3	%xmm8
#define IN4	%xmm9
#define IN	IN1
#define KEY	%xmm2
#define IV	%xmm3

#define BSWAP_MASK %xmm10
#define CTR	%xmm11
#define INC	%xmm12

#define GF128MUL_MASK %xmm7

/*
 * General-purpose register aliases. On 32-bit x86 fewer registers are
 * available, so several aliases share one register (e.g. OUTP == AREG)
 * and callers reload values where needed.
 */
#ifdef __x86_64__
#define AREG	%rax
#define KEYP	%rdi
#define OUTP	%rsi
#define UKEYP	OUTP
#define INP	%rdx
#define LEN	%rcx
#define IVP	%r8
#define KLEN	%r9d
#define T1	%r10
#define TKEYP	T1
#define T2	%r11
#define TCTR_LOW T2
#else
#define AREG	%eax
#define KEYP	%edi
#define OUTP	AREG
#define UKEYP	OUTP
#define INP	%edx
#define LEN	%esi
#define IVP	%ebp
#define KLEN	%ebx
#define T1	%ecx
#define TKEYP	T1
#endif
66
/*
 * _key_expansion_256a: internal ABI
 * Derive the next even (low-half) round key of the AES-256 schedule.
 * input:
 *	%xmm0:	previous round key
 *	%xmm1:	aeskeygenassist result for this round
 *	%xmm4:	must be zero on entry (set up by aesni_set_key)
 *	TKEYP:	destination for the new round key
 * output:
 *	%xmm0:	new round key, also stored at (TKEYP)
 *	TKEYP:	advanced by 0x10
 */
SYM_FUNC_START_LOCAL(_key_expansion_256a)
	pshufd $0b11111111, %xmm1, %xmm1	# broadcast keygenassist word
	shufps $0b00010000, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	shufps $0b10001100, %xmm0, %xmm4
	pxor %xmm4, %xmm0			# xmm0 ^= shifted copies of itself
	pxor %xmm1, %xmm0
	movaps %xmm0, (TKEYP)
	add $0x10, TKEYP
	RET
SYM_FUNC_END(_key_expansion_256a)
/* The AES-128 per-round expansion is the identical computation. */
SYM_FUNC_ALIAS_LOCAL(_key_expansion_128, _key_expansion_256a)
79
/*
 * _key_expansion_192a: internal ABI
 * AES-192 expansion step that emits two 16-byte round keys (the 24-byte
 * schedule advances 1.5 round keys per step; this variant stores both
 * completed键 halves).
 * input:
 *	%xmm0, %xmm2:	current 24-byte key state (16 + 8 bytes)
 *	%xmm1:		aeskeygenassist result for this round
 *	%xmm4:		must be zero on entry
 *	TKEYP:		destination pointer
 * output:
 *	%xmm0, %xmm2:	updated key state
 *	TKEYP:		advanced by 0x20 (two round keys written)
 */
SYM_FUNC_START_LOCAL(_key_expansion_192a)
	pshufd $0b01010101, %xmm1, %xmm1	# broadcast keygenassist word
	shufps $0b00010000, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	shufps $0b10001100, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	pxor %xmm1, %xmm0

	movaps %xmm2, %xmm5
	movaps %xmm2, %xmm6
	pslldq $4, %xmm5
	pshufd $0b11111111, %xmm0, %xmm3
	pxor %xmm3, %xmm2
	pxor %xmm5, %xmm2

	movaps %xmm0, %xmm1
	shufps $0b01000100, %xmm0, %xmm6	# pack previous tail with new head
	movaps %xmm6, (TKEYP)
	shufps $0b01001110, %xmm2, %xmm1
	movaps %xmm1, 0x10(TKEYP)
	add $0x20, TKEYP
	RET
SYM_FUNC_END(_key_expansion_192a)
103
/*
 * _key_expansion_192b: internal ABI
 * AES-192 expansion step that emits a single 16-byte round key and keeps
 * the remaining 8 bytes of key state in %xmm2 for the next step.
 * input:
 *	%xmm0, %xmm2:	current 24-byte key state
 *	%xmm1:		aeskeygenassist result for this round
 *	%xmm4:		must be zero on entry
 *	TKEYP:		destination pointer
 * output:
 *	%xmm0, %xmm2:	updated key state
 *	TKEYP:		advanced by 0x10
 */
SYM_FUNC_START_LOCAL(_key_expansion_192b)
	pshufd $0b01010101, %xmm1, %xmm1	# broadcast keygenassist word
	shufps $0b00010000, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	shufps $0b10001100, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	pxor %xmm1, %xmm0

	movaps %xmm2, %xmm5
	pslldq $4, %xmm5
	pshufd $0b11111111, %xmm0, %xmm3
	pxor %xmm3, %xmm2
	pxor %xmm5, %xmm2

	movaps %xmm0, (TKEYP)
	add $0x10, TKEYP
	RET
SYM_FUNC_END(_key_expansion_192b)
122
/*
 * _key_expansion_256b: internal ABI
 * Derive the next odd (high-half) round key of the AES-256 schedule.
 * input:
 *	%xmm2:	previous high-half round key
 *	%xmm1:	aeskeygenassist result (from %xmm0)
 *	%xmm4:	must be zero on entry
 *	TKEYP:	destination pointer
 * output:
 *	%xmm2:	new round key, also stored at (TKEYP)
 *	TKEYP:	advanced by 0x10
 */
SYM_FUNC_START_LOCAL(_key_expansion_256b)
	pshufd $0b10101010, %xmm1, %xmm1	# broadcast keygenassist word
	shufps $0b00010000, %xmm2, %xmm4
	pxor %xmm4, %xmm2
	shufps $0b10001100, %xmm2, %xmm4
	pxor %xmm4, %xmm2			# xmm2 ^= shifted copies of itself
	pxor %xmm1, %xmm2
	movaps %xmm2, (TKEYP)
	add $0x10, TKEYP
	RET
SYM_FUNC_END(_key_expansion_256b)
134
/*
 * void aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
 *		      unsigned int key_len)
 *
 * Expand the user key: the encryption round keys are stored at ctx
 * offset 0, the decryption round keys (InvMixColumns of the encryption
 * keys, in reverse order) at offset 240, and key_len at offset 480.
 */
SYM_FUNC_START(aesni_set_key)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl KEYP
	movl (FRAME_OFFSET+8)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+12)(%esp), UKEYP	# in_key
	movl (FRAME_OFFSET+16)(%esp), %edx	# key_len
#endif
	movups (UKEYP), %xmm0		# user key (first 16 bytes)
	movaps %xmm0, (KEYP)
	lea 0x10(KEYP), TKEYP		# key addr
	movl %edx, 480(KEYP)		# save key_len for the en/decrypt entry points
	pxor %xmm4, %xmm4		# xmm4 is assumed 0 in _key_expansion_x
	cmp $24, %dl			# dispatch on key length (16/24/32 bytes)
	jb .Lenc_key128
	je .Lenc_key192
	movups 0x10(UKEYP), %xmm2	# other user key
	movaps %xmm2, (TKEYP)
	add $0x10, TKEYP
	aeskeygenassist $0x1, %xmm2, %xmm1	# round 1
	call _key_expansion_256a
	aeskeygenassist $0x1, %xmm0, %xmm1
	call _key_expansion_256b
	aeskeygenassist $0x2, %xmm2, %xmm1	# round 2
	call _key_expansion_256a
	aeskeygenassist $0x2, %xmm0, %xmm1
	call _key_expansion_256b
	aeskeygenassist $0x4, %xmm2, %xmm1	# round 3
	call _key_expansion_256a
	aeskeygenassist $0x4, %xmm0, %xmm1
	call _key_expansion_256b
	aeskeygenassist $0x8, %xmm2, %xmm1	# round 4
	call _key_expansion_256a
	aeskeygenassist $0x8, %xmm0, %xmm1
	call _key_expansion_256b
	aeskeygenassist $0x10, %xmm2, %xmm1	# round 5
	call _key_expansion_256a
	aeskeygenassist $0x10, %xmm0, %xmm1
	call _key_expansion_256b
	aeskeygenassist $0x20, %xmm2, %xmm1	# round 6
	call _key_expansion_256a
	aeskeygenassist $0x20, %xmm0, %xmm1
	call _key_expansion_256b
	aeskeygenassist $0x40, %xmm2, %xmm1	# round 7
	call _key_expansion_256a
	jmp .Ldec_key
.Lenc_key192:
	movq 0x10(UKEYP), %xmm2		# other user key (8 bytes)
	aeskeygenassist $0x1, %xmm2, %xmm1	# round 1
	call _key_expansion_192a
	aeskeygenassist $0x2, %xmm2, %xmm1	# round 2
	call _key_expansion_192b
	aeskeygenassist $0x4, %xmm2, %xmm1	# round 3
	call _key_expansion_192a
	aeskeygenassist $0x8, %xmm2, %xmm1	# round 4
	call _key_expansion_192b
	aeskeygenassist $0x10, %xmm2, %xmm1	# round 5
	call _key_expansion_192a
	aeskeygenassist $0x20, %xmm2, %xmm1	# round 6
	call _key_expansion_192b
	aeskeygenassist $0x40, %xmm2, %xmm1	# round 7
	call _key_expansion_192a
	aeskeygenassist $0x80, %xmm2, %xmm1	# round 8
	call _key_expansion_192b
	jmp .Ldec_key
.Lenc_key128:
	aeskeygenassist $0x1, %xmm0, %xmm1	# round 1
	call _key_expansion_128
	aeskeygenassist $0x2, %xmm0, %xmm1	# round 2
	call _key_expansion_128
	aeskeygenassist $0x4, %xmm0, %xmm1	# round 3
	call _key_expansion_128
	aeskeygenassist $0x8, %xmm0, %xmm1	# round 4
	call _key_expansion_128
	aeskeygenassist $0x10, %xmm0, %xmm1	# round 5
	call _key_expansion_128
	aeskeygenassist $0x20, %xmm0, %xmm1	# round 6
	call _key_expansion_128
	aeskeygenassist $0x40, %xmm0, %xmm1	# round 7
	call _key_expansion_128
	aeskeygenassist $0x80, %xmm0, %xmm1	# round 8
	call _key_expansion_128
	aeskeygenassist $0x1b, %xmm0, %xmm1	# round 9
	call _key_expansion_128
	aeskeygenassist $0x36, %xmm0, %xmm1	# round 10
	call _key_expansion_128
.Ldec_key:
	# Build the decryption schedule at offset 240: swap the first and
	# last round keys, then store aesimc (InvMixColumns) of the middle
	# round keys in reverse order.
	sub $0x10, TKEYP
	movaps (KEYP), %xmm0
	movaps (TKEYP), %xmm1
	movaps %xmm0, 240(TKEYP)
	movaps %xmm1, 240(KEYP)
	add $0x10, KEYP
	lea 240-16(TKEYP), UKEYP
.align 4
.Ldec_key_loop:
	movaps (KEYP), %xmm0
	aesimc %xmm0, %xmm1
	movaps %xmm1, (UKEYP)
	add $0x10, KEYP
	sub $0x10, UKEYP
	cmp TKEYP, KEYP
	jb .Ldec_key_loop
#ifndef __x86_64__
	popl KEYP
#endif
	FRAME_END
	RET
SYM_FUNC_END(aesni_set_key)
248
/*
 * void aesni_enc(const void *ctx, u8 *dst, const u8 *src)
 *
 * Encrypt one 16-byte block from src to dst with the expanded key in ctx.
 */
SYM_FUNC_START(aesni_enc)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+12)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+16)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+20)(%esp), INP	# src
#endif
	movl 480(KEYP), KLEN		# key length
	movups (INP), STATE		# input
	call _aesni_enc1
	movups STATE, (OUTP)		# output
#ifndef __x86_64__
	popl KLEN
	popl KEYP
#endif
	FRAME_END
	RET
SYM_FUNC_END(aesni_enc)
272
/*
 * _aesni_enc1: internal ABI
 * Encrypt a single block held in STATE.
 * input:
 *	KEYP: key struct pointer (encryption round keys)
 *	KLEN: key length in bytes (16/24/32) — selects 10/12/14 rounds
 *	STATE: initial state (input)
 * output:
 *	STATE: final state (output)
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
SYM_FUNC_START_LOCAL(_aesni_enc1)
	movaps (KEYP), KEY		# key
	mov KEYP, TKEYP
	pxor KEY, STATE			# round 0 (AddRoundKey)
	# Advance TKEYP so fixed negative offsets address the key-size
	# specific extra rounds: +0x30 (128), +0x50 (192), +0x70 (256).
	add $0x30, TKEYP
	cmp $24, KLEN
	jb .Lenc128
	lea 0x20(TKEYP), TKEYP
	je .Lenc192
	add $0x20, TKEYP
	movaps -0x60(TKEYP), KEY	# rounds 1-2, AES-256 only
	aesenc KEY, STATE
	movaps -0x50(TKEYP), KEY
	aesenc KEY, STATE
.align 4
.Lenc192:
	movaps -0x40(TKEYP), KEY	# two extra rounds for AES-192/256
	aesenc KEY, STATE
	movaps -0x30(TKEYP), KEY
	aesenc KEY, STATE
.align 4
.Lenc128:
	movaps -0x20(TKEYP), KEY	# ten rounds common to all key sizes
	aesenc KEY, STATE
	movaps -0x10(TKEYP), KEY
	aesenc KEY, STATE
	movaps (TKEYP), KEY
	aesenc KEY, STATE
	movaps 0x10(TKEYP), KEY
	aesenc KEY, STATE
	movaps 0x20(TKEYP), KEY
	aesenc KEY, STATE
	movaps 0x30(TKEYP), KEY
	aesenc KEY, STATE
	movaps 0x40(TKEYP), KEY
	aesenc KEY, STATE
	movaps 0x50(TKEYP), KEY
	aesenc KEY, STATE
	movaps 0x60(TKEYP), KEY
	aesenc KEY, STATE
	movaps 0x70(TKEYP), KEY
	aesenclast KEY, STATE		# final round omits MixColumns
	RET
SYM_FUNC_END(_aesni_enc1)
329
/*
 * _aesni_enc4: internal ABI
 * Encrypt four independent blocks in parallel (interleaved to hide
 * aesenc latency).
 * input:
 *	KEYP: key struct pointer (encryption round keys)
 *	KLEN: key length in bytes (16/24/32)
 *	STATE1: initial state (input)
 *	STATE2
 *	STATE3
 *	STATE4
 * output:
 *	STATE1: final state (output)
 *	STATE2
 *	STATE3
 *	STATE4
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
SYM_FUNC_START_LOCAL(_aesni_enc4)
	movaps (KEYP), KEY		# key
	mov KEYP, TKEYP
	pxor KEY, STATE1		# round 0
	pxor KEY, STATE2
	pxor KEY, STATE3
	pxor KEY, STATE4
	add $0x30, TKEYP		# same offset scheme as _aesni_enc1
	cmp $24, KLEN
	jb .L4enc128
	lea 0x20(TKEYP), TKEYP
	je .L4enc192
	add $0x20, TKEYP
	movaps -0x60(TKEYP), KEY	# rounds 1-2, AES-256 only
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
	movaps -0x50(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
#.align 4
.L4enc192:
	movaps -0x40(TKEYP), KEY	# two extra rounds for AES-192/256
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
	movaps -0x30(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
#.align 4
.L4enc128:
	movaps -0x20(TKEYP), KEY	# ten rounds common to all key sizes
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
	movaps -0x10(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
	movaps (TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
	movaps 0x10(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
	movaps 0x20(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
	movaps 0x30(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
	movaps 0x40(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
	movaps 0x50(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
	movaps 0x60(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
	movaps 0x70(TKEYP), KEY
	aesenclast KEY, STATE1		# last round
	aesenclast KEY, STATE2
	aesenclast KEY, STATE3
	aesenclast KEY, STATE4
	RET
SYM_FUNC_END(_aesni_enc4)
437
/*
 * void aesni_dec(const void *ctx, u8 *dst, const u8 *src)
 *
 * Decrypt one 16-byte block from src to dst with the expanded key in ctx.
 */
SYM_FUNC_START(aesni_dec)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+12)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+16)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+20)(%esp), INP	# src
#endif
	mov 480(KEYP), KLEN		# key length
	add $240, KEYP			# decryption round keys live at offset 240
	movups (INP), STATE		# input
	call _aesni_dec1
	movups STATE, (OUTP)		# output
#ifndef __x86_64__
	popl KLEN
	popl KEYP
#endif
	FRAME_END
	RET
SYM_FUNC_END(aesni_dec)
462
/*
 * _aesni_dec1: internal ABI
 * Decrypt a single block held in STATE.
 * input:
 *	KEYP: key struct pointer (pointing at the decryption round keys)
 *	KLEN: key length in bytes (16/24/32) — selects 10/12/14 rounds
 *	STATE: initial state (input)
 * output:
 *	STATE: final state (output)
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
SYM_FUNC_START_LOCAL(_aesni_dec1)
	movaps (KEYP), KEY		# key
	mov KEYP, TKEYP
	pxor KEY, STATE			# round 0 (AddRoundKey)
	# Same offset scheme as _aesni_enc1: negative offsets select the
	# key-size specific extra rounds.
	add $0x30, TKEYP
	cmp $24, KLEN
	jb .Ldec128
	lea 0x20(TKEYP), TKEYP
	je .Ldec192
	add $0x20, TKEYP
	movaps -0x60(TKEYP), KEY	# rounds 1-2, AES-256 only
	aesdec KEY, STATE
	movaps -0x50(TKEYP), KEY
	aesdec KEY, STATE
.align 4
.Ldec192:
	movaps -0x40(TKEYP), KEY	# two extra rounds for AES-192/256
	aesdec KEY, STATE
	movaps -0x30(TKEYP), KEY
	aesdec KEY, STATE
.align 4
.Ldec128:
	movaps -0x20(TKEYP), KEY	# ten rounds common to all key sizes
	aesdec KEY, STATE
	movaps -0x10(TKEYP), KEY
	aesdec KEY, STATE
	movaps (TKEYP), KEY
	aesdec KEY, STATE
	movaps 0x10(TKEYP), KEY
	aesdec KEY, STATE
	movaps 0x20(TKEYP), KEY
	aesdec KEY, STATE
	movaps 0x30(TKEYP), KEY
	aesdec KEY, STATE
	movaps 0x40(TKEYP), KEY
	aesdec KEY, STATE
	movaps 0x50(TKEYP), KEY
	aesdec KEY, STATE
	movaps 0x60(TKEYP), KEY
	aesdec KEY, STATE
	movaps 0x70(TKEYP), KEY
	aesdeclast KEY, STATE
	RET
SYM_FUNC_END(_aesni_dec1)
519
/*
 * _aesni_dec4: internal ABI
 * Decrypt four independent blocks in parallel (interleaved to hide
 * aesdec latency).
 * input:
 *	KEYP: key struct pointer (pointing at the decryption round keys)
 *	KLEN: key length in bytes (16/24/32)
 *	STATE1: initial state (input)
 *	STATE2
 *	STATE3
 *	STATE4
 * output:
 *	STATE1: final state (output)
 *	STATE2
 *	STATE3
 *	STATE4
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
SYM_FUNC_START_LOCAL(_aesni_dec4)
	movaps (KEYP), KEY		# key
	mov KEYP, TKEYP
	pxor KEY, STATE1		# round 0
	pxor KEY, STATE2
	pxor KEY, STATE3
	pxor KEY, STATE4
	add $0x30, TKEYP		# same offset scheme as _aesni_dec1
	cmp $24, KLEN
	jb .L4dec128
	lea 0x20(TKEYP), TKEYP
	je .L4dec192
	add $0x20, TKEYP
	movaps -0x60(TKEYP), KEY	# rounds 1-2, AES-256 only
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps -0x50(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
.align 4
.L4dec192:
	movaps -0x40(TKEYP), KEY	# two extra rounds for AES-192/256
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps -0x30(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
.align 4
.L4dec128:
	movaps -0x20(TKEYP), KEY	# ten rounds common to all key sizes
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps -0x10(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps (TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps 0x10(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps 0x20(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps 0x30(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps 0x40(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps 0x50(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps 0x60(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps 0x70(TKEYP), KEY
	aesdeclast KEY, STATE1		# last round
	aesdeclast KEY, STATE2
	aesdeclast KEY, STATE3
	aesdeclast KEY, STATE4
	RET
SYM_FUNC_END(_aesni_dec4)
627
/*
 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len)
 *
 * ECB-encrypt len bytes (processed in whole 16-byte blocks; a trailing
 * partial block is ignored). Four blocks at a time while len >= 64.
 */
SYM_FUNC_START(aesni_ecb_enc)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+16)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+20)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+24)(%esp), INP	# src
	movl (FRAME_OFFSET+28)(%esp), LEN	# len
#endif
	test LEN, LEN			# check length
	jz .Lecb_enc_ret
	mov 480(KEYP), KLEN
	cmp $16, LEN
	jb .Lecb_enc_ret
	cmp $64, LEN
	jb .Lecb_enc_loop1
.align 4
.Lecb_enc_loop4:
	movups (INP), STATE1
	movups 0x10(INP), STATE2
	movups 0x20(INP), STATE3
	movups 0x30(INP), STATE4
	call _aesni_enc4
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jge .Lecb_enc_loop4
	cmp $16, LEN
	jb .Lecb_enc_ret
.align 4
.Lecb_enc_loop1:
	movups (INP), STATE1
	call _aesni_enc1
	movups STATE1, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lecb_enc_loop1
.Lecb_enc_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
#endif
	FRAME_END
	RET
SYM_FUNC_END(aesni_ecb_enc)
687
/*
 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len);
 *
 * ECB-decrypt len bytes (whole 16-byte blocks only). Four blocks at a
 * time while len >= 64.
 */
SYM_FUNC_START(aesni_ecb_dec)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+16)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+20)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+24)(%esp), INP	# src
	movl (FRAME_OFFSET+28)(%esp), LEN	# len
#endif
	test LEN, LEN
	jz .Lecb_dec_ret
	mov 480(KEYP), KLEN
	add $240, KEYP			# use the decryption round keys
	cmp $16, LEN
	jb .Lecb_dec_ret
	cmp $64, LEN
	jb .Lecb_dec_loop1
.align 4
.Lecb_dec_loop4:
	movups (INP), STATE1
	movups 0x10(INP), STATE2
	movups 0x20(INP), STATE3
	movups 0x30(INP), STATE4
	call _aesni_dec4
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jge .Lecb_dec_loop4
	cmp $16, LEN
	jb .Lecb_dec_ret
.align 4
.Lecb_dec_loop1:
	movups (INP), STATE1
	call _aesni_dec1
	movups STATE1, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lecb_dec_loop1
.Lecb_dec_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
#endif
	FRAME_END
	RET
SYM_FUNC_END(aesni_ecb_dec)
748
/*
 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 *
 * CBC encryption is inherently serial (each block chains on the
 * previous ciphertext), so only one block is processed per iteration.
 * The final ciphertext block is written back to *iv.
 */
SYM_FUNC_START(aesni_cbc_enc)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
#endif
	cmp $16, LEN
	jb .Lcbc_enc_ret
	mov 480(KEYP), KLEN
	movups (IVP), STATE	# load iv as initial state
.align 4
.Lcbc_enc_loop:
	movups (INP), IN	# load input
	pxor IN, STATE		# chain: plaintext XOR previous ciphertext/iv
	call _aesni_enc1
	movups STATE, (OUTP)	# store output
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lcbc_enc_loop
	movups STATE, (IVP)	# save last ciphertext block as next iv
.Lcbc_enc_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	FRAME_END
	RET
SYM_FUNC_END(aesni_cbc_enc)
792
/*
 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 *
 * CBC decryption can run four blocks in parallel since the chaining
 * value for each block is the (already available) previous ciphertext.
 * The last ciphertext block is written back to *iv.
 */
SYM_FUNC_START(aesni_cbc_dec)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
#endif
	cmp $16, LEN
	jb .Lcbc_dec_just_ret
	mov 480(KEYP), KLEN
	add $240, KEYP			# use the decryption round keys
	movups (IVP), IV
	cmp $64, LEN
	jb .Lcbc_dec_loop1
.align 4
.Lcbc_dec_loop4:
	movups (INP), IN1
	movaps IN1, STATE1
	movups 0x10(INP), IN2
	movaps IN2, STATE2
#ifdef __x86_64__
	movups 0x20(INP), IN3
	movaps IN3, STATE3
	movups 0x30(INP), IN4
	movaps IN4, STATE4
#else
	# 32-bit build has no IN3/IN4 registers: reuse IN1/IN2 and reload
	# the ciphertext for chaining after the decryption call below.
	movups 0x20(INP), IN1
	movaps IN1, STATE3
	movups 0x30(INP), IN2
	movaps IN2, STATE4
#endif
	call _aesni_dec4
	pxor IV, STATE1			# unchain against previous iv/ciphertext
#ifdef __x86_64__
	pxor IN1, STATE2
	pxor IN2, STATE3
	pxor IN3, STATE4
	movaps IN4, IV
#else
	pxor IN1, STATE4
	movaps IN2, IV
	movups (INP), IN1		# reload ciphertext blocks 0 and 1
	pxor IN1, STATE2
	movups 0x10(INP), IN2
	pxor IN2, STATE3
#endif
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jge .Lcbc_dec_loop4
	cmp $16, LEN
	jb .Lcbc_dec_ret
.align 4
.Lcbc_dec_loop1:
	movups (INP), IN
	movaps IN, STATE
	call _aesni_dec1
	pxor IV, STATE
	movups STATE, (OUTP)
	movaps IN, IV			# this ciphertext chains the next block
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lcbc_dec_loop1
.Lcbc_dec_ret:
	movups IV, (IVP)		# save last ciphertext block as next iv
.Lcbc_dec_just_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	FRAME_END
	RET
SYM_FUNC_END(aesni_cbc_dec)
885
/*
 * void aesni_cts_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *			  size_t len, u8 *iv)
 *
 * Ciphertext-stealing CBC encryption of the final (16 < len <= 32 byte)
 * chunk: encrypt the first full block, steal its tail to pad the short
 * final block, and emit the blocks in swapped order. Uses
 * .Lcts_permute_table to build the shift/pad shuffle masks from LEN.
 */
SYM_FUNC_START(aesni_cts_cbc_enc)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
	lea .Lcts_permute_table, T1
#else
	lea .Lcts_permute_table(%rip), T1
#endif
	mov 480(KEYP), KLEN
	movups (IVP), STATE
	sub $16, LEN			# LEN = bytes in final partial block
	mov T1, IVP
	add $32, IVP
	add LEN, T1			# T1  -> shift mask for LEN bytes
	sub LEN, IVP			# IVP -> complementary mask
	movups (T1), %xmm4
	movups (IVP), %xmm5

	movups (INP), IN1
	add LEN, INP
	movups (INP), IN2		# final (partial) input block

	pxor IN1, STATE
	call _aesni_enc1		# C(n-1) = Enc(P(n-1) ^ iv)

	pshufb %xmm5, IN2
	pxor STATE, IN2			# pad final block with stolen ciphertext
	pshufb %xmm4, STATE
	add OUTP, LEN
	movups STATE, (LEN)		# truncated C(n-1) goes last

	movaps IN2, STATE
	call _aesni_enc1
	movups STATE, (OUTP)		# C(n) goes first

#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	FRAME_END
	RET
SYM_FUNC_END(aesni_cts_cbc_enc)
942
/*
 * void aesni_cts_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *			  size_t len, u8 *iv)
 *
 * Inverse of aesni_cts_cbc_enc: decrypt the final swapped/stolen pair of
 * blocks (16 < len <= 32 bytes). Shuffle masks are derived from LEN via
 * .Lcts_permute_table, as in the encrypt path.
 */
SYM_FUNC_START(aesni_cts_cbc_dec)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
	lea .Lcts_permute_table, T1
#else
	lea .Lcts_permute_table(%rip), T1
#endif
	mov 480(KEYP), KLEN
	add $240, KEYP			# use the decryption round keys
	movups (IVP), IV
	sub $16, LEN			# LEN = bytes in final partial block
	mov T1, IVP
	add $32, IVP
	add LEN, T1			# T1  -> shift mask for LEN bytes
	sub LEN, IVP			# IVP -> complementary mask
	movups (T1), %xmm4

	movups (INP), STATE		# first stored block = C(n)
	add LEN, INP
	movups (INP), IN1		# truncated C(n-1)

	call _aesni_dec1
	movaps STATE, IN2
	pshufb %xmm4, STATE
	pxor IN1, STATE			# recover tail of final plaintext

	add OUTP, LEN
	movups STATE, (LEN)		# write final (short) plaintext block

	movups (IVP), %xmm0
	pshufb %xmm0, IN1
	pblendvb IN2, IN1		# rebuild full C(n-1) from stolen bytes
	movaps IN1, STATE
	call _aesni_dec1

	pxor IV, STATE
	movups STATE, (OUTP)		# write plaintext block n-1

#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	FRAME_END
	RET
SYM_FUNC_END(aesni_cts_cbc_dec)
1003
.pushsection .rodata
.align 16
/*
 * pshufb mask table for the CTS helpers: a window of 0x80 (zeroing)
 * bytes, the identity permutation 0x00-0x0f, then more 0x80 bytes.
 * Indexing at (table + LEN) / (table + 32 - LEN) yields masks that
 * shift/truncate a block by the partial-block length.
 */
.Lcts_permute_table:
	.byte		0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
	.byte		0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
	.byte		0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
	.byte		0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
	.byte		0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
	.byte		0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
#ifdef __x86_64__
/* Byte-reversal mask for converting the big-endian CTR IV (x86_64 only). */
.Lbswap_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
#endif
.popsection
1018
#ifdef __x86_64__
/*
 * _aesni_inc_init: internal ABI
 * setup registers used by _aesni_inc
 * input:
 *	IV
 * output:
 *	CTR:	== IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 *	INC:	== 1, in little endian
 *	BSWAP_MASK == endian swapping mask
 */
SYM_FUNC_START_LOCAL(_aesni_inc_init)
	movaps .Lbswap_mask(%rip), BSWAP_MASK
	movaps IV, CTR
	pshufb BSWAP_MASK, CTR		# keep the counter little endian internally
	mov $1, TCTR_LOW
	movq TCTR_LOW, INC
	movq CTR, TCTR_LOW		# mirror low qword for cheap carry detection
	RET
SYM_FUNC_END(_aesni_inc_init)
1040
/*
 * _aesni_inc: internal ABI
 * Increase IV by 1, IV is in big endian
 * input:
 *	IV
 *	CTR:	== IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 *	INC:	== 1, in little endian
 *	BSWAP_MASK == endian swapping mask
 * output:
 *	IV:	Increased by 1
 * changed:
 *	CTR:	== output IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 */
SYM_FUNC_START_LOCAL(_aesni_inc)
	paddq INC, CTR			# 64-bit add; may wrap the low qword
	add $1, TCTR_LOW		# shadow add to detect the carry
	jnc .Linc_low
	pslldq $8, INC			# carry: propagate +1 into the high qword
	paddq INC, CTR
	psrldq $8, INC			# restore INC == 1
.Linc_low:
	movaps CTR, IV
	pshufb BSWAP_MASK, IV		# back to big endian for the wire format
	RET
SYM_FUNC_END(_aesni_inc)
1068
/*
 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 *
 * CTR mode keystream generation + XOR; x86_64 only (inside the
 * __x86_64__ guard). Four counters are encrypted in parallel while
 * len >= 64. The incremented counter is written back to *iv.
 */
SYM_FUNC_START(aesni_ctr_enc)
	FRAME_BEGIN
	cmp $16, LEN
	jb .Lctr_enc_just_ret
	mov 480(KEYP), KLEN
	movups (IVP), IV
	call _aesni_inc_init
	cmp $64, LEN
	jb .Lctr_enc_loop1
.align 4
.Lctr_enc_loop4:
	movaps IV, STATE1
	call _aesni_inc
	movups (INP), IN1
	movaps IV, STATE2
	call _aesni_inc
	movups 0x10(INP), IN2
	movaps IV, STATE3
	call _aesni_inc
	movups 0x20(INP), IN3
	movaps IV, STATE4
	call _aesni_inc
	movups 0x30(INP), IN4
	call _aesni_enc4		# encrypt four counter blocks
	pxor IN1, STATE1		# XOR keystream into the input
	movups STATE1, (OUTP)
	pxor IN2, STATE2
	movups STATE2, 0x10(OUTP)
	pxor IN3, STATE3
	movups STATE3, 0x20(OUTP)
	pxor IN4, STATE4
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jge .Lctr_enc_loop4
	cmp $16, LEN
	jb .Lctr_enc_ret
.align 4
.Lctr_enc_loop1:
	movaps IV, STATE
	call _aesni_inc
	movups (INP), IN
	call _aesni_enc1
	pxor IN, STATE
	movups STATE, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lctr_enc_loop1
.Lctr_enc_ret:
	movups IV, (IVP)		# save updated counter
.Lctr_enc_just_ret:
	FRAME_END
	RET
SYM_FUNC_END(aesni_ctr_enc)

#endif
1133
/* Reduction constant for GF(2^128) doubling in ble (XTS) bit ordering. */
.section .rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16
.align 16
.Lgf128mul_x_ble_mask:
	.octa 0x00000000000000010000000000000087
.previous

/*
 * _aesni_gf128mul_x_ble: Multiply in GF(2^128) for XTS IVs
 * (i.e. IV *= x: shift left by one bit, conditionally XOR in 0x87).
 * input:
 *	IV:	current IV
 *	GF128MUL_MASK == mask with 0x87 and 0x01
 * output:
 *	IV:	next IV
 * changed:
 *	KEY:	== temporary value
 */
.macro _aesni_gf128mul_x_ble
	pshufd $0x13, IV, KEY		# replicate the two MSB dwords
	paddq IV, IV			# shift each qword left by 1
	psrad $31, KEY			# arithmetic shift -> carry masks
	pand GF128MUL_MASK, KEY		# 0x87 reduction / cross-qword carry
	pxor KEY, IV
.endm
1157
/*
 * _aesni_xts_crypt: body shared by aesni_xts_enc (\enc == 1) and
 * aesni_xts_dec (\enc == 0). Processes four blocks at a time, one block
 * at a time for the remainder, and handles a trailing partial block via
 * ciphertext stealing (.Lcts_permute_table, as in the CTS-CBC helpers).
 * The tweak (IV) is advanced with _aesni_gf128mul_x_ble per block.
 */
.macro _aesni_xts_crypt	enc
	FRAME_BEGIN
#ifndef __x86_64__
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
	movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
#else
	movdqa .Lgf128mul_x_ble_mask(%rip), GF128MUL_MASK
#endif
	movups (IVP), IV

	mov 480(KEYP), KLEN
.if !\enc
	add $240, KEYP			# decryption round keys

	# For decryption with CTS, hold back one extra block so the
	# stolen-ciphertext pair is processed by the tail code.
	test $15, LEN
	jz .Lxts_loop4\@
	sub $16, LEN
.endif

.Lxts_loop4\@:
	sub $64, LEN
	jl .Lxts_1x\@

	movdqa IV, STATE1
	movdqu 0x00(INP), IN
	pxor IN, STATE1
	movdqu IV, 0x00(OUTP)		# stash tweaks in dst for the post-XOR

	_aesni_gf128mul_x_ble
	movdqa IV, STATE2
	movdqu 0x10(INP), IN
	pxor IN, STATE2
	movdqu IV, 0x10(OUTP)

	_aesni_gf128mul_x_ble
	movdqa IV, STATE3
	movdqu 0x20(INP), IN
	pxor IN, STATE3
	movdqu IV, 0x20(OUTP)

	_aesni_gf128mul_x_ble
	movdqa IV, STATE4
	movdqu 0x30(INP), IN
	pxor IN, STATE4
	movdqu IV, 0x30(OUTP)

.if \enc
	call _aesni_enc4
.else
	call _aesni_dec4
.endif

	movdqu 0x00(OUTP), IN		# re-read stashed tweaks and XOR
	pxor IN, STATE1
	movdqu STATE1, 0x00(OUTP)

	movdqu 0x10(OUTP), IN
	pxor IN, STATE2
	movdqu STATE2, 0x10(OUTP)

	movdqu 0x20(OUTP), IN
	pxor IN, STATE3
	movdqu STATE3, 0x20(OUTP)

	movdqu 0x30(OUTP), IN
	pxor IN, STATE4
	movdqu STATE4, 0x30(OUTP)

	_aesni_gf128mul_x_ble

	add $64, INP
	add $64, OUTP
	test LEN, LEN
	jnz .Lxts_loop4\@

.Lxts_ret_iv\@:
	movups IV, (IVP)		# save next tweak

.Lxts_ret\@:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	FRAME_END
	RET

.Lxts_1x\@:
	add $64, LEN			# undo the loop's subtraction
	jz .Lxts_ret_iv\@
.if \enc
	sub $16, LEN
	jl .Lxts_cts4\@
.endif

.Lxts_loop1\@:
	movdqu (INP), STATE
.if \enc
	pxor IV, STATE
	call _aesni_enc1
.else
	add $16, INP
	sub $16, LEN
	jl .Lxts_cts1\@			# partial tail: divert to CTS path
	pxor IV, STATE
	call _aesni_dec1
.endif
	pxor IV, STATE
	_aesni_gf128mul_x_ble

	test LEN, LEN
	jz .Lxts_out\@

.if \enc
	add $16, INP
	sub $16, LEN
	jl .Lxts_cts1\@			# partial tail: divert to CTS path
.endif

	movdqu STATE, (OUTP)
	add $16, OUTP
	jmp .Lxts_loop1\@

.Lxts_out\@:
	movdqu STATE, (OUTP)
	jmp .Lxts_ret_iv\@

.if \enc
.Lxts_cts4\@:
	movdqa STATE4, STATE
	sub $16, OUTP			# back up over the block to steal from
.Lxts_cts1\@:
.else
.Lxts_cts1\@:
	# Decrypt the second-to-last block with the *next* tweak first
	# (kept in STATE4), per the XTS-CTS block ordering.
	movdqa IV, STATE4
	_aesni_gf128mul_x_ble

	pxor IV, STATE
	call _aesni_dec1
	pxor IV, STATE
.endif
#ifndef __x86_64__
	lea .Lcts_permute_table, T1
#else
	lea .Lcts_permute_table(%rip), T1
#endif
	add LEN, INP		/* rewind input pointer */
	add $16, LEN		/* # bytes in final block */
	movups (INP), IN1

	mov T1, IVP
	add $32, IVP
	add LEN, T1
	sub LEN, IVP
	add OUTP, LEN

	movups (T1), %xmm4
	movaps STATE, IN2
	pshufb %xmm4, STATE
	movups STATE, (LEN)		# truncated stolen block goes last

	movups (IVP), %xmm0
	pshufb %xmm0, IN1
	pblendvb IN2, IN1		# merge stolen bytes into the full block
	movaps IN1, STATE

.if \enc
	pxor IV, STATE
	call _aesni_enc1
	pxor IV, STATE
.else
	pxor STATE4, STATE
	call _aesni_dec1
	pxor STATE4, STATE
.endif

	movups STATE, (OUTP)
	jmp .Lxts_ret\@
.endm
1346
/*
 * void aesni_xts_enc(const struct crypto_aes_ctx *ctx, u8 *dst,
 *		      const u8 *src, unsigned int len, le128 *iv)
 */
SYM_FUNC_START(aesni_xts_enc)
	_aesni_xts_crypt 1
SYM_FUNC_END(aesni_xts_enc)

/*
 * void aesni_xts_dec(const struct crypto_aes_ctx *ctx, u8 *dst,
 *		      const u8 *src, unsigned int len, le128 *iv)
 */
SYM_FUNC_START(aesni_xts_dec)
	_aesni_xts_crypt 0
SYM_FUNC_END(aesni_xts_dec)
1/* SPDX-License-Identifier: GPL-2.0-or-later */
2/*
3 * Implement AES algorithm in Intel AES-NI instructions.
4 *
5 * The white paper of AES-NI instructions can be downloaded from:
6 * http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
7 *
8 * Copyright (C) 2008, Intel Corp.
9 * Author: Huang Ying <ying.huang@intel.com>
10 * Vinodh Gopal <vinodh.gopal@intel.com>
11 * Kahraman Akdemir
12 *
13 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
14 * interface for 64-bit kernels.
15 * Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
16 * Aidan O'Mahony (aidan.o.mahony@intel.com)
17 * Adrian Hoban <adrian.hoban@intel.com>
18 * James Guilford (james.guilford@intel.com)
19 * Gabriele Paoloni <gabriele.paoloni@intel.com>
20 * Tadeusz Struk (tadeusz.struk@intel.com)
21 * Wajdi Feghali (wajdi.k.feghali@intel.com)
22 * Copyright (c) 2010, Intel Corporation.
23 *
24 * Ported x86_64 version to x86:
25 * Author: Mathias Krause <minipli@googlemail.com>
26 */
27
28#include <linux/linkage.h>
29#include <asm/frame.h>
30#include <asm/nospec-branch.h>
31
32/*
33 * The following macros are used to move an (un)aligned 16 byte value to/from
34 * an XMM register. This can done for either FP or integer values, for FP use
35 * movaps (move aligned packed single) or integer use movdqa (move double quad
36 * aligned). It doesn't make a performance difference which instruction is used
37 * since Nehalem (original Core i7) was released. However, the movaps is a byte
38 * shorter, so that is the one we'll use for now. (same for unaligned).
39 */
40#define MOVADQ movaps
41#define MOVUDQ movups
42
43#ifdef __x86_64__
44
45# constants in mergeable sections, linker can reorder and merge
46.section .rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16
47.align 16
48.Lgf128mul_x_ble_mask:
49 .octa 0x00000000000000010000000000000087
50.section .rodata.cst16.POLY, "aM", @progbits, 16
51.align 16
52POLY: .octa 0xC2000000000000000000000000000001
53.section .rodata.cst16.TWOONE, "aM", @progbits, 16
54.align 16
55TWOONE: .octa 0x00000001000000000000000000000001
56
57.section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16
58.align 16
59SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F
60.section .rodata.cst16.MASK1, "aM", @progbits, 16
61.align 16
62MASK1: .octa 0x0000000000000000ffffffffffffffff
63.section .rodata.cst16.MASK2, "aM", @progbits, 16
64.align 16
65MASK2: .octa 0xffffffffffffffff0000000000000000
66.section .rodata.cst16.ONE, "aM", @progbits, 16
67.align 16
68ONE: .octa 0x00000000000000000000000000000001
69.section .rodata.cst16.F_MIN_MASK, "aM", @progbits, 16
70.align 16
71F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
72.section .rodata.cst16.dec, "aM", @progbits, 16
73.align 16
74dec: .octa 0x1
75.section .rodata.cst16.enc, "aM", @progbits, 16
76.align 16
77enc: .octa 0x2
78
79# order of these constants should not change.
80# more specifically, ALL_F should follow SHIFT_MASK,
81# and zero should follow ALL_F
82.section .rodata, "a", @progbits
83.align 16
84SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
85ALL_F: .octa 0xffffffffffffffffffffffffffffffff
86 .octa 0x00000000000000000000000000000000
87
88.text
89
90
91#define STACK_OFFSET 8*3
92
93#define AadHash 16*0
94#define AadLen 16*1
95#define InLen (16*1)+8
96#define PBlockEncKey 16*2
97#define OrigIV 16*3
98#define CurCount 16*4
99#define PBlockLen 16*5
100#define HashKey 16*6 // store HashKey <<1 mod poly here
101#define HashKey_2 16*7 // store HashKey^2 <<1 mod poly here
102#define HashKey_3 16*8 // store HashKey^3 <<1 mod poly here
103#define HashKey_4 16*9 // store HashKey^4 <<1 mod poly here
104#define HashKey_k 16*10 // store XOR of High 64 bits and Low 64
105 // bits of HashKey <<1 mod poly here
106 //(for Karatsuba purposes)
107#define HashKey_2_k 16*11 // store XOR of High 64 bits and Low 64
108 // bits of HashKey^2 <<1 mod poly here
109 // (for Karatsuba purposes)
110#define HashKey_3_k 16*12 // store XOR of High 64 bits and Low 64
111 // bits of HashKey^3 <<1 mod poly here
112 // (for Karatsuba purposes)
113#define HashKey_4_k 16*13 // store XOR of High 64 bits and Low 64
114 // bits of HashKey^4 <<1 mod poly here
115 // (for Karatsuba purposes)
116
117#define arg1 rdi
118#define arg2 rsi
119#define arg3 rdx
120#define arg4 rcx
121#define arg5 r8
122#define arg6 r9
123#define arg7 STACK_OFFSET+8(%rsp)
124#define arg8 STACK_OFFSET+16(%rsp)
125#define arg9 STACK_OFFSET+24(%rsp)
126#define arg10 STACK_OFFSET+32(%rsp)
127#define arg11 STACK_OFFSET+40(%rsp)
128#define keysize 2*15*16(%arg1)
129#endif
130
131
132#define STATE1 %xmm0
133#define STATE2 %xmm4
134#define STATE3 %xmm5
135#define STATE4 %xmm6
136#define STATE STATE1
137#define IN1 %xmm1
138#define IN2 %xmm7
139#define IN3 %xmm8
140#define IN4 %xmm9
141#define IN IN1
142#define KEY %xmm2
143#define IV %xmm3
144
145#define BSWAP_MASK %xmm10
146#define CTR %xmm11
147#define INC %xmm12
148
149#define GF128MUL_MASK %xmm10
150
151#ifdef __x86_64__
152#define AREG %rax
153#define KEYP %rdi
154#define OUTP %rsi
155#define UKEYP OUTP
156#define INP %rdx
157#define LEN %rcx
158#define IVP %r8
159#define KLEN %r9d
160#define T1 %r10
161#define TKEYP T1
162#define T2 %r11
163#define TCTR_LOW T2
164#else
165#define AREG %eax
166#define KEYP %edi
167#define OUTP AREG
168#define UKEYP OUTP
169#define INP %edx
170#define LEN %esi
171#define IVP %ebp
172#define KLEN %ebx
173#define T1 %ecx
174#define TKEYP T1
175#endif
176
.macro FUNC_SAVE
	# Save the callee-saved GPRs (%r12-%r14) that the GCM code uses as
	# scratch.  FUNC_RESTORE pops them in the reverse order.
	push	%r12
	push	%r13
	push	%r14
#
# states of %xmm registers %xmm6:%xmm15 not saved
# all %xmm registers are clobbered
#
.endm
186
187
.macro FUNC_RESTORE
	# Restore the callee-saved GPRs pushed by FUNC_SAVE (reverse order).
	pop	%r14
	pop	%r13
	pop	%r12
.endm
193
194# Precompute hashkeys.
195# Input: Hash subkey.
196# Output: HashKeys stored in gcm_context_data. Only needs to be called
197# once per key.
198# clobbers r12, and tmp xmm registers.
.macro PRECOMPUTE SUBKEY TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 TMP7
	# Load the raw hash subkey H and byte-reflect it into GHASH bit order.
	mov	\SUBKEY, %r12
	movdqu	(%r12), \TMP3
	movdqa	SHUF_MASK(%rip), \TMP2
	pshufb	\TMP2, \TMP3

	# precompute HashKey<<1 mod poly from the HashKey (required for GHASH)

	movdqa	\TMP3, \TMP2
	psllq	$1, \TMP3
	psrlq	$63, \TMP2
	movdqa	\TMP2, \TMP1
	pslldq	$8, \TMP2
	psrldq	$8, \TMP1
	por	\TMP2, \TMP3

	# reduce HashKey<<1

	pshufd	$0x24, \TMP1, \TMP2
	pcmpeqd	TWOONE(%rip), \TMP2
	pand	POLY(%rip), \TMP2
	pxor	\TMP2, \TMP3
	movdqu	\TMP3, HashKey(%arg2)

	# HashKey_k = high64 ^ low64 of HashKey<<1 (Karatsuba middle term)
	movdqa	\TMP3, \TMP5
	pshufd	$78, \TMP3, \TMP1
	pxor	\TMP3, \TMP1
	movdqu	\TMP1, HashKey_k(%arg2)

	GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^2<<1 (mod poly)
	movdqu	\TMP5, HashKey_2(%arg2)
# HashKey_2 = HashKey^2<<1 (mod poly)
	pshufd	$78, \TMP5, \TMP1
	pxor	\TMP5, \TMP1
	movdqu	\TMP1, HashKey_2_k(%arg2)

	GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^3<<1 (mod poly)
	movdqu	\TMP5, HashKey_3(%arg2)
	pshufd	$78, \TMP5, \TMP1
	pxor	\TMP5, \TMP1
	movdqu	\TMP1, HashKey_3_k(%arg2)

	GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^4<<1 (mod poly)
	movdqu	\TMP5, HashKey_4(%arg2)
	pshufd	$78, \TMP5, \TMP1
	pxor	\TMP5, \TMP1
	movdqu	\TMP1, HashKey_4_k(%arg2)
.endm
250
251# GCM_INIT initializes a gcm_context struct to prepare for encoding/decoding.
252# Clobbers rax, r10-r13 and xmm0-xmm6, %xmm13
.macro GCM_INIT Iv SUBKEY AAD AADLEN
	# Initialize the gcm_context_data lengths and partial-block state.
	mov	\AADLEN, %r11
	mov	%r11, AadLen(%arg2)	# ctx_data.aad_length = aad_length
	xor	%r11d, %r11d
	mov	%r11, InLen(%arg2)	# ctx_data.in_length = 0
	mov	%r11, PBlockLen(%arg2)	# ctx_data.partial_block_length = 0
	mov	%r11, PBlockEncKey(%arg2) # ctx_data.partial_block_enc_key = 0
	mov	\Iv, %rax
	movdqu	(%rax), %xmm0
	movdqu	%xmm0, OrigIV(%arg2)	# ctx_data.orig_IV = iv

	# Byte-reflect the IV to get the running big-endian counter block.
	movdqa	SHUF_MASK(%rip), %xmm2
	pshufb	%xmm2, %xmm0
	movdqu	%xmm0, CurCount(%arg2)	# ctx_data.current_counter = iv

	# Derive HashKey..HashKey^4 (and Karatsuba terms), then GHASH the AAD.
	PRECOMPUTE \SUBKEY, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7
	movdqu	HashKey(%arg2), %xmm13

	CALC_AAD_HASH %xmm13, \AAD, \AADLEN, %xmm0, %xmm1, %xmm2, %xmm3, \
	%xmm4, %xmm5, %xmm6
.endm
274
275# GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context
276# struct has been initialized by GCM_INIT.
277# Requires the input data be at least 1 byte long because of READ_PARTIAL_BLOCK
278# Clobbers rax, r10-r13, and xmm0-xmm15
.macro GCM_ENC_DEC operation
	# Resume state saved in gcm_context_data: running GHASH in %xmm8,
	# hash subkey in %xmm13; account the new input length.
	movdqu	AadHash(%arg2), %xmm8
	movdqu	HashKey(%arg2), %xmm13
	add	%arg5, InLen(%arg2)

	xor	%r11d, %r11d # initialise the data pointer offset as zero
	PARTIAL_BLOCK %arg3 %arg4 %arg5 %r11 %xmm8 \operation

	sub	%r11, %arg5		# sub partial block data used
	mov	%arg5, %r13		# save the number of bytes

	and	$-16, %r13		# %r13 = %r13 - (%r13 mod 16)
	mov	%r13, %r12
	# Encrypt/Decrypt first few blocks

	# %r12 = (number of whole 16-byte blocks mod 4) << 4; dispatch so
	# that the remaining block count is a multiple of 4 for the main loop.
	and	$(3<<4), %r12
	jz	_initial_num_blocks_is_0_\@
	cmp	$(2<<4), %r12
	jb	_initial_num_blocks_is_1_\@
	je	_initial_num_blocks_is_2_\@
_initial_num_blocks_is_3_\@:
	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, \operation
	sub	$48, %r13
	jmp	_initial_blocks_\@
_initial_num_blocks_is_2_\@:
	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, \operation
	sub	$32, %r13
	jmp	_initial_blocks_\@
_initial_num_blocks_is_1_\@:
	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, \operation
	sub	$16, %r13
	jmp	_initial_blocks_\@
_initial_num_blocks_is_0_\@:
	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, \operation
_initial_blocks_\@:

	# Main loop - Encrypt/Decrypt remaining blocks

	cmp	$0, %r13
	je	_zero_cipher_left_\@
	sub	$64, %r13
	je	_four_cipher_left_\@
_crypt_by_4_\@:
	GHASH_4_ENCRYPT_4_PARALLEL_\operation	%xmm9, %xmm10, %xmm11, %xmm12, \
	%xmm13, %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, \
	%xmm7, %xmm8, enc
	add	$64, %r11
	sub	$64, %r13
	jne	_crypt_by_4_\@
_four_cipher_left_\@:
	GHASH_LAST_4	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
_zero_cipher_left_\@:
	# Persist the GHASH accumulator and counter for a later update call.
	movdqu	%xmm8, AadHash(%arg2)
	movdqu	%xmm0, CurCount(%arg2)

	mov	%arg5, %r13
	and	$15, %r13			# %r13 = arg5 (mod 16)
	je	_multiple_of_16_bytes_\@

	mov	%r13, PBlockLen(%arg2)

	# Handle the last <16 Byte block separately
	paddd	ONE(%rip), %xmm0		# INCR CNT to get Yn
	movdqu	%xmm0, CurCount(%arg2)
	movdqa	SHUF_MASK(%rip), %xmm10
	pshufb	%xmm10, %xmm0

	ENCRYPT_SINGLE_BLOCK	%xmm0, %xmm1	# Encrypt(K, Yn)
	movdqu	%xmm0, PBlockEncKey(%arg2)

	cmp	$16, %arg5
	jge	_large_enough_update_\@

	# Input shorter than a block: gather it byte-safely.
	lea	(%arg4,%r11,1), %r10
	mov	%r13, %r12
	READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1
	jmp	_data_read_\@

_large_enough_update_\@:
	# At least 16 bytes total were processed, so it is safe to read the
	# final 16 bytes (overlapping the previous block) in one load.
	sub	$16, %r11
	add	%r13, %r11

	# receive the last <16 Byte block
	movdqu	(%arg4, %r11, 1), %xmm1

	sub	%r13, %r11
	add	$16, %r11

	lea	SHIFT_MASK+16(%rip), %r12
	# adjust the shuffle mask pointer to be able to shift 16-r13 bytes
	# (r13 is the number of bytes in plaintext mod 16)
	sub	%r13, %r12
	# get the appropriate shuffle mask
	movdqu	(%r12), %xmm2
	# shift right 16-r13 bytes
	pshufb	%xmm2, %xmm1

_data_read_\@:
	lea	ALL_F+16(%rip), %r12
	sub	%r13, %r12

.ifc \operation, dec
	movdqa	%xmm1, %xmm2
.endif
	pxor	%xmm1, %xmm0		# XOR Encrypt(K, Yn)
	movdqu	(%r12), %xmm1
	# get the appropriate mask to mask out top 16-r13 bytes of xmm0
	pand	%xmm1, %xmm0		# mask out top 16-r13 bytes of xmm0
.ifc \operation, dec
	# For decryption, GHASH absorbs the (masked) ciphertext.
	pand	%xmm1, %xmm2
	movdqa	SHUF_MASK(%rip), %xmm10
	pshufb	%xmm10 ,%xmm2

	pxor	%xmm2, %xmm8
.else
	# For encryption, GHASH absorbs the freshly produced ciphertext.
	movdqa	SHUF_MASK(%rip), %xmm10
	pshufb	%xmm10,%xmm0

	pxor	%xmm0, %xmm8
.endif

	movdqu	%xmm8, AadHash(%arg2)
.ifc \operation, enc
	# GHASH computation for the last <16 byte block
	movdqa	SHUF_MASK(%rip), %xmm10
	# shuffle xmm0 back to output as ciphertext
	pshufb	%xmm10, %xmm0
.endif

	# Output %r13 bytes
	movq	%xmm0, %rax
	cmp	$8, %r13
	jle	_less_than_8_bytes_left_\@
	mov	%rax, (%arg3 , %r11, 1)
	add	$8, %r11
	psrldq	$8, %xmm0
	movq	%xmm0, %rax
	sub	$8, %r13
_less_than_8_bytes_left_\@:
	mov	%al, (%arg3, %r11, 1)
	add	$1, %r11
	shr	$8, %rax
	sub	$1, %r13
	jne	_less_than_8_bytes_left_\@
_multiple_of_16_bytes_\@:
.endm
430
431# GCM_COMPLETE Finishes update of tag of last partial block
432# Output: Authorization Tag (AUTH_TAG)
433# Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
.macro GCM_COMPLETE AUTHTAG AUTHTAGLEN
	movdqu	AadHash(%arg2), %xmm8
	movdqu	HashKey(%arg2), %xmm13

	mov	PBlockLen(%arg2), %r12

	cmp	$0, %r12
	je	_partial_done\@

	# Fold the outstanding partial block into the hash before the
	# length block is absorbed.
	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6

_partial_done\@:
	mov	AadLen(%arg2), %r12	# %r12 = aadLen (number of bytes)
	shl	$3, %r12		# convert into number of bits
	movd	%r12d, %xmm15		# len(A) in %xmm15
	mov	InLen(%arg2), %r12
	shl	$3, %r12		# len(C) in bits (*128)
	movq	%r12, %xmm1

	pslldq	$8, %xmm15		# %xmm15 = len(A)||0x0000000000000000
	pxor	%xmm1, %xmm15		# %xmm15 = len(A)||len(C)
	pxor	%xmm15, %xmm8
	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
	# final GHASH computation
	movdqa	SHUF_MASK(%rip), %xmm10
	pshufb	%xmm10, %xmm8

	# Tag = GHASH result XOR E(K, Y0).
	movdqu	OrigIV(%arg2), %xmm0	# %xmm0 = Y0
	ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1	# E(K, Y0)
	pxor	%xmm8, %xmm0
_return_T_\@:
	# Copy auth_tag_len (1..16) bytes of the tag out in 8/4/2/1-byte steps.
	mov	\AUTHTAG, %r10		# %r10 = authTag
	mov	\AUTHTAGLEN, %r11	# %r11 = auth_tag_len
	cmp	$16, %r11
	je	_T_16_\@
	cmp	$8, %r11
	jl	_T_4_\@
_T_8_\@:
	movq	%xmm0, %rax
	mov	%rax, (%r10)
	add	$8, %r10
	sub	$8, %r11
	psrldq	$8, %xmm0
	cmp	$0, %r11
	je	_return_T_done_\@
_T_4_\@:
	movd	%xmm0, %eax
	mov	%eax, (%r10)
	add	$4, %r10
	sub	$4, %r11
	psrldq	$4, %xmm0
	cmp	$0, %r11
	je	_return_T_done_\@
_T_123_\@:
	movd	%xmm0, %eax
	cmp	$2, %r11
	jl	_T_1_\@
	mov	%ax, (%r10)
	cmp	$2, %r11
	je	_return_T_done_\@
	add	$2, %r10
	sar	$16, %eax
_T_1_\@:
	mov	%al, (%r10)
	jmp	_return_T_done_\@
_T_16_\@:
	movdqu	%xmm0, (%r10)
_return_T_done_\@:
.endm
503
504#ifdef __x86_64__
505/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
506*
507*
508* Input: A and B (128-bits each, bit-reflected)
509* Output: C = A*B*x mod poly, (i.e. >>1 )
510* To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
511* GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
512*
513*/
.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
	# Karatsuba carry-less multiply: split both 128-bit operands into
	# 64-bit halves and form the three partial products.
	movdqa	\GH, \TMP1
	pshufd	$78, \GH, \TMP2
	pshufd	$78, \HK, \TMP3
	pxor	\GH, \TMP2		# TMP2 = a1+a0
	pxor	\HK, \TMP3		# TMP3 = b1+b0
	pclmulqdq $0x11, \HK, \TMP1	# TMP1 = a1*b1
	pclmulqdq $0x00, \HK, \GH	# GH = a0*b0
	pclmulqdq $0x00, \TMP3, \TMP2	# TMP2 = (a0+a1)*(b1+b0)
	pxor	\GH, \TMP2
	pxor	\TMP1, \TMP2		# TMP2 = (a0*b0)+(a1*b0)
	movdqa	\TMP2, \TMP3
	pslldq	$8, \TMP3		# left shift TMP3 2 DWs
	psrldq	$8, \TMP2		# right shift TMP2 2 DWs
	pxor	\TMP3, \GH
	pxor	\TMP2, \TMP1		# TMP1:GH holds the result of GH*HK

	# first phase of the reduction

	movdqa	\GH, \TMP2
	movdqa	\GH, \TMP3
	movdqa	\GH, \TMP4		# copy GH into TMP2,TMP3 and TMP4
					# in order to perform
					# independent shifts
	pslld	$31, \TMP2		# packed left shift <<31
	pslld	$30, \TMP3		# packed left shift <<30
	pslld	$25, \TMP4		# packed left shift <<25
	pxor	\TMP3, \TMP2		# xor the shifted versions
	pxor	\TMP4, \TMP2
	movdqa	\TMP2, \TMP5
	psrldq	$4, \TMP5		# right shift TMP5 1 DW
	pslldq	$12, \TMP2		# left shift TMP2 3 DWs
	pxor	\TMP2, \GH

	# second phase of the reduction

	movdqa	\GH,\TMP2		# copy GH into TMP2,TMP3 and TMP4
					# in order to perform
					# independent shifts
	movdqa	\GH,\TMP3
	movdqa	\GH,\TMP4
	psrld	$1,\TMP2		# packed right shift >>1
	psrld	$2,\TMP3		# packed right shift >>2
	psrld	$7,\TMP4		# packed right shift >>7
	pxor	\TMP3,\TMP2		# xor the shifted versions
	pxor	\TMP4,\TMP2
	pxor	\TMP5, \TMP2
	pxor	\TMP2, \GH
	pxor	\TMP1, \GH		# result is in GH
.endm
564
565# Reads DLEN bytes starting at DPTR and stores in XMMDst
566# where 0 < DLEN < 16
567# Clobbers %rax, DLEN and XMM1
.macro READ_PARTIAL_BLOCK DPTR DLEN XMM1 XMMDst
	cmp	$8, \DLEN
	jl	_read_lt8_\@
	# DLEN >= 8: load the low 8 bytes in one go, then gather the
	# remaining DLEN-8 bytes one at a time (never reads past DPTR+DLEN).
	mov	(\DPTR), %rax
	movq	%rax, \XMMDst
	sub	$8, \DLEN
	jz	_done_read_partial_block_\@
	xor	%eax, %eax
_read_next_byte_\@:
	# Accumulate high bytes most-significant-first into %rax.
	shl	$8, %rax
	mov	7(\DPTR, \DLEN, 1), %al
	dec	\DLEN
	jnz	_read_next_byte_\@
	movq	%rax, \XMM1
	pslldq	$8, \XMM1
	por	\XMM1, \XMMDst
	jmp	_done_read_partial_block_\@
_read_lt8_\@:
	# DLEN < 8: gather all bytes individually into %rax.
	xor	%eax, %eax
_read_next_byte_lt8_\@:
	shl	$8, %rax
	mov	-1(\DPTR, \DLEN, 1), %al
	dec	\DLEN
	jnz	_read_next_byte_lt8_\@
	movq	%rax, \XMMDst
_done_read_partial_block_\@:
.endm
595
596# CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
597# clobbers r10-11, xmm14
.macro CALC_AAD_HASH HASHKEY AAD AADLEN TMP1 TMP2 TMP3 TMP4 TMP5 \
	TMP6 TMP7
	MOVADQ	SHUF_MASK(%rip), %xmm14
	mov	\AAD, %r10		# %r10 = AAD
	mov	\AADLEN, %r11		# %r11 = aadLen
	pxor	\TMP7, \TMP7
	pxor	\TMP6, \TMP6		# TMP6 = running GHASH accumulator

	cmp	$16, %r11
	jl	_get_AAD_rest\@
_get_AAD_blocks\@:
	# Absorb one full 16-byte AAD block into the hash.
	movdqu	(%r10), \TMP7
	pshufb	%xmm14, \TMP7 # byte-reflect the AAD data
	pxor	\TMP7, \TMP6
	GHASH_MUL \TMP6, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
	add	$16, %r10
	sub	$16, %r11
	cmp	$16, %r11
	jge	_get_AAD_blocks\@

	movdqu	\TMP6, \TMP7

	/* read the last <16B of AAD */
_get_AAD_rest\@:
	cmp	$0, %r11
	je	_get_AAD_done\@

	READ_PARTIAL_BLOCK %r10, %r11, \TMP1, \TMP7
	pshufb	%xmm14, \TMP7 # byte-reflect the AAD data
	pxor	\TMP6, \TMP7
	GHASH_MUL \TMP7, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
	movdqu	\TMP7, \TMP6

_get_AAD_done\@:
	# Persist the AAD hash so GCM_ENC_DEC can resume from it.
	movdqu	\TMP6, AadHash(%arg2)
.endm
634
635# PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks
636# between update calls.
637# Requires the input data be at least 1 byte long due to READ_PARTIAL_BLOCK
638# Outputs encrypted bytes, and updates hash and partial info in gcm_data_context
639# Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13
.macro PARTIAL_BLOCK CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \
	AAD_HASH operation
	mov	PBlockLen(%arg2), %r13
	cmp	$0, %r13
	je	_partial_block_done_\@	# Leave Macro if no partial blocks
	# Read in input data without over reading
	cmp	$16, \PLAIN_CYPH_LEN
	jl	_fewer_than_16_bytes_\@
	movups	(\PLAIN_CYPH_IN), %xmm1	# If more than 16 bytes, just fill xmm
	jmp	_data_read_\@

_fewer_than_16_bytes_\@:
	lea	(\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10
	mov	\PLAIN_CYPH_LEN, %r12
	READ_PARTIAL_BLOCK %r10 %r12 %xmm0 %xmm1

	mov	PBlockLen(%arg2), %r13

_data_read_\@:				# Finished reading in data

	# %xmm9 = saved keystream E(K, Yn) for the partial block.
	movdqu	PBlockEncKey(%arg2), %xmm9
	movdqu	HashKey(%arg2), %xmm13

	lea	SHIFT_MASK(%rip), %r12

	# adjust the shuffle mask pointer to be able to shift r13 bytes
	# (16-r13 is the number of bytes in plaintext mod 16)
	add	%r13, %r12
	movdqu	(%r12), %xmm2		# get the appropriate shuffle mask
	pshufb	%xmm2, %xmm9		# shift right r13 bytes

.ifc \operation, dec
	movdqa	%xmm1, %xmm3
	pxor	%xmm1, %xmm9		# Cyphertext XOR E(K, Yn)

	mov	\PLAIN_CYPH_LEN, %r10
	add	%r13, %r10
	# Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
	sub	$16, %r10
	# Determine if partial block is not being filled and
	# shift mask accordingly
	jge	_no_extra_mask_1_\@
	sub	%r10, %r12
_no_extra_mask_1_\@:

	movdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
	# get the appropriate mask to mask out bottom r13 bytes of xmm9
	pand	%xmm1, %xmm9		# mask out bottom r13 bytes of xmm9

	# GHASH absorbs the masked ciphertext bytes (decrypt path).
	pand	%xmm1, %xmm3
	movdqa	SHUF_MASK(%rip), %xmm10
	pshufb	%xmm10, %xmm3
	pshufb	%xmm2, %xmm3
	pxor	%xmm3, \AAD_HASH

	cmp	$0, %r10
	jl	_partial_incomplete_1_\@

	# GHASH computation for the last <16 Byte block
	GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
	xor	%eax, %eax

	mov	%rax, PBlockLen(%arg2)
	jmp	_dec_done_\@
_partial_incomplete_1_\@:
	add	\PLAIN_CYPH_LEN, PBlockLen(%arg2)
_dec_done_\@:
	movdqu	\AAD_HASH, AadHash(%arg2)
.else
	pxor	%xmm1, %xmm9		# Plaintext XOR E(K, Yn)

	mov	\PLAIN_CYPH_LEN, %r10
	add	%r13, %r10
	# Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
	sub	$16, %r10
	# Determine if partial block is not being filled and
	# shift mask accordingly
	jge	_no_extra_mask_2_\@
	sub	%r10, %r12
_no_extra_mask_2_\@:

	movdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
	# get the appropriate mask to mask out bottom r13 bytes of xmm9
	pand	%xmm1, %xmm9

	# GHASH absorbs the newly produced ciphertext bytes (encrypt path).
	movdqa	SHUF_MASK(%rip), %xmm1
	pshufb	%xmm1, %xmm9
	pshufb	%xmm2, %xmm9
	pxor	%xmm9, \AAD_HASH

	cmp	$0, %r10
	jl	_partial_incomplete_2_\@

	# GHASH computation for the last <16 Byte block
	GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
	xor	%eax, %eax

	mov	%rax, PBlockLen(%arg2)
	jmp	_encode_done_\@
_partial_incomplete_2_\@:
	add	\PLAIN_CYPH_LEN, PBlockLen(%arg2)
_encode_done_\@:
	movdqu	\AAD_HASH, AadHash(%arg2)

	movdqa	SHUF_MASK(%rip), %xmm10
	# shuffle xmm9 back to output as ciphertext
	pshufb	%xmm10, %xmm9
	pshufb	%xmm2, %xmm9
.endif
	# output encrypted Bytes
	cmp	$0, %r10
	jl	_partial_fill_\@
	mov	%r13, %r12
	mov	$16, %r13
	# Set r13 to be the number of bytes to write out
	sub	%r12, %r13
	jmp	_count_set_\@
_partial_fill_\@:
	mov	\PLAIN_CYPH_LEN, %r13
_count_set_\@:
	# Emit %r13 bytes from %xmm9 in 8-byte then 1-byte steps.
	movdqa	%xmm9, %xmm0
	movq	%xmm0, %rax
	cmp	$8, %r13
	jle	_less_than_8_bytes_left_\@

	mov	%rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
	add	$8, \DATA_OFFSET
	psrldq	$8, %xmm0
	movq	%xmm0, %rax
	sub	$8, %r13
_less_than_8_bytes_left_\@:
	movb	%al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
	add	$1, \DATA_OFFSET
	shr	$8, %rax
	sub	$1, %r13
	jne	_less_than_8_bytes_left_\@
_partial_block_done_\@:
.endm # PARTIAL_BLOCK
778
779/*
780* if a = number of total plaintext bytes
781* b = floor(a/16)
782* num_initial_blocks = b mod 4
783* encrypt the initial num_initial_blocks blocks and apply ghash on
784* the ciphertext
785* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
786* are clobbered
* arg1, %arg2, %arg3 are used as pointers only, not modified
788*/
789
790
.macro INITIAL_BLOCKS_ENC_DEC TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
	XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
	MOVADQ	SHUF_MASK(%rip), %xmm14

	# Load the running AAD/GHASH state (stored by CALC_AAD_HASH /
	# GCM_ENC_DEC) into %xmm\i so it is folded in with the first blocks.
	movdqu AadHash(%arg2), %xmm\i

	# start AES for num_initial_blocks blocks

	movdqu CurCount(%arg2), \XMM0	# XMM0 = Y0

.if (\i == 5) || (\i == 6) || (\i == 7)

	# num_initial_blocks = 8 - \i (3, 2 or 1); \i_seq names the xmm
	# registers used for those blocks.
	MOVADQ		ONE(%RIP),\TMP1
	MOVADQ		0(%arg1),\TMP2
.irpc index, \i_seq
	paddd		\TMP1, \XMM0		# INCR Y0
.ifc \operation, dec
	movdqa	\XMM0, %xmm\index
.else
	MOVADQ		\XMM0, %xmm\index
.endif
	pshufb	%xmm14, %xmm\index	# perform a 16 byte swap
	pxor		\TMP2, %xmm\index
.endr
	lea	0x10(%arg1),%r10
	mov	keysize,%eax
	shr	$2,%eax			# 128->4, 192->6, 256->8
	add	$5,%eax			# 128->9, 192->11, 256->13

aes_loop_initial_\@:
	MOVADQ	(%r10),\TMP1
.irpc	index, \i_seq
	aesenc	\TMP1, %xmm\index
.endr
	add	$16,%r10
	sub	$1,%eax
	jnz	aes_loop_initial_\@

	MOVADQ	(%r10), \TMP1
.irpc index, \i_seq
	aesenclast \TMP1, %xmm\index		# Last Round
.endr
.irpc index, \i_seq
	movdqu	(%arg4 , %r11, 1), \TMP1
	pxor	\TMP1, %xmm\index
	movdqu	%xmm\index, (%arg3 , %r11, 1)
	# write back plaintext/ciphertext for num_initial_blocks
	add	$16, %r11

.ifc \operation, dec
	movdqa	\TMP1, %xmm\index
.endif
	pshufb	%xmm14, %xmm\index

	# prepare plaintext/ciphertext for GHASH computation
.endr
.endif

	# apply GHASH on num_initial_blocks blocks

.if \i == 5
	pxor	%xmm5, %xmm6
	GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	pxor	%xmm6, %xmm7
	GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	pxor	%xmm7, %xmm8
	GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.elseif \i == 6
	pxor	%xmm6, %xmm7
	GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	pxor	%xmm7, %xmm8
	GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.elseif \i == 7
	pxor	%xmm7, %xmm8
	GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.endif
	cmp	$64, %r13
	jl	_initial_blocks_done\@
	# no need for precomputed values
/*
*
* Precomputations for HashKey parallel with encryption of first 4 blocks.
* HashKey_i_k holds XORed values of the low and high parts of the HashKey_i
*/
	MOVADQ	   ONE(%RIP),\TMP1
	paddd	   \TMP1, \XMM0		# INCR Y0
	MOVADQ	   \XMM0, \XMM1
	pshufb	%xmm14, \XMM1		# perform a 16 byte swap

	paddd	   \TMP1, \XMM0		# INCR Y0
	MOVADQ	   \XMM0, \XMM2
	pshufb	%xmm14, \XMM2		# perform a 16 byte swap

	paddd	   \TMP1, \XMM0		# INCR Y0
	MOVADQ	   \XMM0, \XMM3
	pshufb	%xmm14, \XMM3		# perform a 16 byte swap

	paddd	   \TMP1, \XMM0		# INCR Y0
	MOVADQ	   \XMM0, \XMM4
	pshufb	%xmm14, \XMM4		# perform a 16 byte swap

	MOVADQ	   0(%arg1),\TMP1
	pxor	   \TMP1, \XMM1
	pxor	   \TMP1, \XMM2
	pxor	   \TMP1, \XMM3
	pxor	   \TMP1, \XMM4
.irpc index, 1234 # do 4 rounds
	movaps 0x10*\index(%arg1), \TMP1
	aesenc	   \TMP1, \XMM1
	aesenc	   \TMP1, \XMM2
	aesenc	   \TMP1, \XMM3
	aesenc	   \TMP1, \XMM4
.endr
.irpc index, 56789 # do next 5 rounds
	movaps 0x10*\index(%arg1), \TMP1
	aesenc	   \TMP1, \XMM1
	aesenc	   \TMP1, \XMM2
	aesenc	   \TMP1, \XMM3
	aesenc	   \TMP1, \XMM4
.endr
	lea	0xa0(%arg1),%r10
	mov	keysize,%eax
	shr	$2,%eax			# 128->4, 192->6, 256->8
	sub	$4,%eax			# 128->0, 192->2, 256->4
	jz	aes_loop_pre_done\@

aes_loop_pre_\@:
	MOVADQ	(%r10),\TMP2
.irpc	index, 1234
	aesenc	\TMP2, %xmm\index
.endr
	add	$16,%r10
	sub	$1,%eax
	jnz	aes_loop_pre_\@

aes_loop_pre_done\@:
	MOVADQ	(%r10), \TMP2
	aesenclast \TMP2, \XMM1
	aesenclast \TMP2, \XMM2
	aesenclast \TMP2, \XMM3
	aesenclast \TMP2, \XMM4
	movdqu	16*0(%arg4 , %r11 , 1), \TMP1
	pxor	\TMP1, \XMM1
.ifc \operation, dec
	movdqu	\XMM1, 16*0(%arg3 , %r11 , 1)
	movdqa	\TMP1, \XMM1
.endif
	movdqu	16*1(%arg4 , %r11 , 1), \TMP1
	pxor	\TMP1, \XMM2
.ifc \operation, dec
	movdqu	\XMM2, 16*1(%arg3 , %r11 , 1)
	movdqa	\TMP1, \XMM2
.endif
	movdqu	16*2(%arg4 , %r11 , 1), \TMP1
	pxor	\TMP1, \XMM3
.ifc \operation, dec
	movdqu	\XMM3, 16*2(%arg3 , %r11 , 1)
	movdqa	\TMP1, \XMM3
.endif
	movdqu	16*3(%arg4 , %r11 , 1), \TMP1
	pxor	\TMP1, \XMM4
.ifc \operation, dec
	movdqu	\XMM4, 16*3(%arg3 , %r11 , 1)
	movdqa	\TMP1, \XMM4
.else
	movdqu	\XMM1, 16*0(%arg3 , %r11 , 1)
	movdqu	\XMM2, 16*1(%arg3 , %r11 , 1)
	movdqu	\XMM3, 16*2(%arg3 , %r11 , 1)
	movdqu	\XMM4, 16*3(%arg3 , %r11 , 1)
.endif

	add	$64, %r11
	pshufb	%xmm14, \XMM1		# perform a 16 byte swap
	pxor	\XMMDst, \XMM1
# combine GHASHed value with the corresponding ciphertext
	pshufb	%xmm14, \XMM2		# perform a 16 byte swap
	pshufb	%xmm14, \XMM3		# perform a 16 byte swap
	pshufb	%xmm14, \XMM4		# perform a 16 byte swap

_initial_blocks_done\@:

.endm
973
974/*
975* encrypt 4 blocks at a time
976* ghash the 4 previously encrypted ciphertext blocks
977* arg1, %arg3, %arg4 are used as pointers only, not modified
978* %r11 is the data offset value
979*/
980.macro GHASH_4_ENCRYPT_4_PARALLEL_enc TMP1 TMP2 TMP3 TMP4 TMP5 \
981TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
982
983 movdqa \XMM1, \XMM5
984 movdqa \XMM2, \XMM6
985 movdqa \XMM3, \XMM7
986 movdqa \XMM4, \XMM8
987
988 movdqa SHUF_MASK(%rip), %xmm15
989 # multiply TMP5 * HashKey using karatsuba
990
991 movdqa \XMM5, \TMP4
992 pshufd $78, \XMM5, \TMP6
993 pxor \XMM5, \TMP6
994 paddd ONE(%rip), \XMM0 # INCR CNT
995 movdqu HashKey_4(%arg2), \TMP5
996 pclmulqdq $0x11, \TMP5, \TMP4 # TMP4 = a1*b1
997 movdqa \XMM0, \XMM1
998 paddd ONE(%rip), \XMM0 # INCR CNT
999 movdqa \XMM0, \XMM2
1000 paddd ONE(%rip), \XMM0 # INCR CNT
1001 movdqa \XMM0, \XMM3
1002 paddd ONE(%rip), \XMM0 # INCR CNT
1003 movdqa \XMM0, \XMM4
1004 pshufb %xmm15, \XMM1 # perform a 16 byte swap
1005 pclmulqdq $0x00, \TMP5, \XMM5 # XMM5 = a0*b0
1006 pshufb %xmm15, \XMM2 # perform a 16 byte swap
1007 pshufb %xmm15, \XMM3 # perform a 16 byte swap
1008 pshufb %xmm15, \XMM4 # perform a 16 byte swap
1009
1010 pxor (%arg1), \XMM1
1011 pxor (%arg1), \XMM2
1012 pxor (%arg1), \XMM3
1013 pxor (%arg1), \XMM4
1014 movdqu HashKey_4_k(%arg2), \TMP5
1015 pclmulqdq $0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
1016 movaps 0x10(%arg1), \TMP1
1017 aesenc \TMP1, \XMM1 # Round 1
1018 aesenc \TMP1, \XMM2
1019 aesenc \TMP1, \XMM3
1020 aesenc \TMP1, \XMM4
1021 movaps 0x20(%arg1), \TMP1
1022 aesenc \TMP1, \XMM1 # Round 2
1023 aesenc \TMP1, \XMM2
1024 aesenc \TMP1, \XMM3
1025 aesenc \TMP1, \XMM4
1026 movdqa \XMM6, \TMP1
1027 pshufd $78, \XMM6, \TMP2
1028 pxor \XMM6, \TMP2
1029 movdqu HashKey_3(%arg2), \TMP5
1030 pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
1031 movaps 0x30(%arg1), \TMP3
1032 aesenc \TMP3, \XMM1 # Round 3
1033 aesenc \TMP3, \XMM2
1034 aesenc \TMP3, \XMM3
1035 aesenc \TMP3, \XMM4
1036 pclmulqdq $0x00, \TMP5, \XMM6 # XMM6 = a0*b0
1037 movaps 0x40(%arg1), \TMP3
1038 aesenc \TMP3, \XMM1 # Round 4
1039 aesenc \TMP3, \XMM2
1040 aesenc \TMP3, \XMM3
1041 aesenc \TMP3, \XMM4
1042 movdqu HashKey_3_k(%arg2), \TMP5
1043 pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1044 movaps 0x50(%arg1), \TMP3
1045 aesenc \TMP3, \XMM1 # Round 5
1046 aesenc \TMP3, \XMM2
1047 aesenc \TMP3, \XMM3
1048 aesenc \TMP3, \XMM4
1049 pxor \TMP1, \TMP4
1050# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1051 pxor \XMM6, \XMM5
1052 pxor \TMP2, \TMP6
1053 movdqa \XMM7, \TMP1
1054 pshufd $78, \XMM7, \TMP2
1055 pxor \XMM7, \TMP2
1056 movdqu HashKey_2(%arg2), \TMP5
1057
1058 # Multiply TMP5 * HashKey using karatsuba
1059
1060 pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1061 movaps 0x60(%arg1), \TMP3
1062 aesenc \TMP3, \XMM1 # Round 6
1063 aesenc \TMP3, \XMM2
1064 aesenc \TMP3, \XMM3
1065 aesenc \TMP3, \XMM4
1066 pclmulqdq $0x00, \TMP5, \XMM7 # XMM7 = a0*b0
1067 movaps 0x70(%arg1), \TMP3
1068 aesenc \TMP3, \XMM1 # Round 7
1069 aesenc \TMP3, \XMM2
1070 aesenc \TMP3, \XMM3
1071 aesenc \TMP3, \XMM4
1072 movdqu HashKey_2_k(%arg2), \TMP5
1073 pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1074 movaps 0x80(%arg1), \TMP3
1075 aesenc \TMP3, \XMM1 # Round 8
1076 aesenc \TMP3, \XMM2
1077 aesenc \TMP3, \XMM3
1078 aesenc \TMP3, \XMM4
1079 pxor \TMP1, \TMP4
1080# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1081 pxor \XMM7, \XMM5
1082 pxor \TMP2, \TMP6
1083
1084 # Multiply XMM8 * HashKey
1085 # XMM8 and TMP5 hold the values for the two operands
1086
1087 movdqa \XMM8, \TMP1
1088 pshufd $78, \XMM8, \TMP2
1089 pxor \XMM8, \TMP2
1090 movdqu HashKey(%arg2), \TMP5
1091 pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1092 movaps 0x90(%arg1), \TMP3
1093 aesenc \TMP3, \XMM1 # Round 9
1094 aesenc \TMP3, \XMM2
1095 aesenc \TMP3, \XMM3
1096 aesenc \TMP3, \XMM4
1097 pclmulqdq $0x00, \TMP5, \XMM8 # XMM8 = a0*b0
1098 lea 0xa0(%arg1),%r10
1099 mov keysize,%eax
1100 shr $2,%eax # 128->4, 192->6, 256->8
1101 sub $4,%eax # 128->0, 192->2, 256->4
1102 jz aes_loop_par_enc_done\@
1103
1104aes_loop_par_enc\@:
1105 MOVADQ (%r10),\TMP3
1106.irpc index, 1234
1107 aesenc \TMP3, %xmm\index
1108.endr
1109 add $16,%r10
1110 sub $1,%eax
1111 jnz aes_loop_par_enc\@
1112
1113aes_loop_par_enc_done\@:
1114 MOVADQ (%r10), \TMP3
1115 aesenclast \TMP3, \XMM1 # Round 10
1116 aesenclast \TMP3, \XMM2
1117 aesenclast \TMP3, \XMM3
1118 aesenclast \TMP3, \XMM4
1119 movdqu HashKey_k(%arg2), \TMP5
1120 pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1121 movdqu (%arg4,%r11,1), \TMP3
1122 pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
1123 movdqu 16(%arg4,%r11,1), \TMP3
1124 pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK
1125 movdqu 32(%arg4,%r11,1), \TMP3
1126 pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK
1127 movdqu 48(%arg4,%r11,1), \TMP3
1128 pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK
1129 movdqu \XMM1, (%arg3,%r11,1) # Write to the ciphertext buffer
1130 movdqu \XMM2, 16(%arg3,%r11,1) # Write to the ciphertext buffer
1131 movdqu \XMM3, 32(%arg3,%r11,1) # Write to the ciphertext buffer
1132 movdqu \XMM4, 48(%arg3,%r11,1) # Write to the ciphertext buffer
1133 pshufb %xmm15, \XMM1 # perform a 16 byte swap
1134 pshufb %xmm15, \XMM2 # perform a 16 byte swap
1135 pshufb %xmm15, \XMM3 # perform a 16 byte swap
1136 pshufb %xmm15, \XMM4 # perform a 16 byte swap
1137
1138 pxor \TMP4, \TMP1
1139 pxor \XMM8, \XMM5
1140 pxor \TMP6, \TMP2
1141 pxor \TMP1, \TMP2
1142 pxor \XMM5, \TMP2
1143 movdqa \TMP2, \TMP3
1144 pslldq $8, \TMP3 # left shift TMP3 2 DWs
1145 psrldq $8, \TMP2 # right shift TMP2 2 DWs
1146 pxor \TMP3, \XMM5
1147 pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5
1148
1149 # first phase of reduction
1150
1151 movdqa \XMM5, \TMP2
1152 movdqa \XMM5, \TMP3
1153 movdqa \XMM5, \TMP4
1154# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
1155 pslld $31, \TMP2 # packed right shift << 31
1156 pslld $30, \TMP3 # packed right shift << 30
1157 pslld $25, \TMP4 # packed right shift << 25
1158 pxor \TMP3, \TMP2 # xor the shifted versions
1159 pxor \TMP4, \TMP2
1160 movdqa \TMP2, \TMP5
1161 psrldq $4, \TMP5 # right shift T5 1 DW
1162 pslldq $12, \TMP2 # left shift T2 3 DWs
1163 pxor \TMP2, \XMM5
1164
1165 # second phase of reduction
1166
1167 movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
1168 movdqa \XMM5,\TMP3
1169 movdqa \XMM5,\TMP4
1170 psrld $1, \TMP2 # packed left shift >>1
1171 psrld $2, \TMP3 # packed left shift >>2
1172 psrld $7, \TMP4 # packed left shift >>7
1173 pxor \TMP3,\TMP2 # xor the shifted versions
1174 pxor \TMP4,\TMP2
1175 pxor \TMP5, \TMP2
1176 pxor \TMP2, \XMM5
1177 pxor \TMP1, \XMM5 # result is in TMP1
1178
1179 pxor \XMM5, \XMM1
1180.endm
1181
1182/*
1183* decrypt 4 blocks at a time
1184* ghash the 4 previously decrypted ciphertext blocks
1185* arg1, %arg3, %arg4 are used as pointers only, not modified
1186* %r11 is the data offset value
1187*/
.macro GHASH_4_ENCRYPT_4_PARALLEL_dec TMP1 TMP2 TMP3 TMP4 TMP5 \
TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation

# Save the 4 ciphertext blocks from the previous iteration: they are the
# GHASH inputs, while XMM1-XMM4 are reused for the next 4 counter blocks.
	movdqa	\XMM1, \XMM5
	movdqa	\XMM2, \XMM6
	movdqa	\XMM3, \XMM7
	movdqa	\XMM4, \XMM8

	movdqa	SHUF_MASK(%rip), %xmm15
	# multiply XMM5 * HashKey_4 using Karatsuba

	movdqa	\XMM5, \TMP4
	pshufd	$78, \XMM5, \TMP6	# $78 = 0b01001110: swap the two qwords
	pxor	\XMM5, \TMP6		# TMP6 = a1+a0 (Karatsuba middle operand)
	paddd	ONE(%rip), \XMM0	# INCR CNT
	movdqu	HashKey_4(%arg2), \TMP5
	pclmulqdq $0x11, \TMP5, \TMP4	# TMP4 = a1*b1
	movdqa	\XMM0, \XMM1
	paddd	ONE(%rip), \XMM0	# INCR CNT
	movdqa	\XMM0, \XMM2
	paddd	ONE(%rip), \XMM0	# INCR CNT
	movdqa	\XMM0, \XMM3
	paddd	ONE(%rip), \XMM0	# INCR CNT
	movdqa	\XMM0, \XMM4
	pshufb	%xmm15, \XMM1		# perform a 16 byte swap
	pclmulqdq $0x00, \TMP5, \XMM5	# XMM5 = a0*b0
	pshufb	%xmm15, \XMM2		# perform a 16 byte swap
	pshufb	%xmm15, \XMM3		# perform a 16 byte swap
	pshufb	%xmm15, \XMM4		# perform a 16 byte swap

	pxor	(%arg1), \XMM1		# AES round 0 (key whitening)
	pxor	(%arg1), \XMM2
	pxor	(%arg1), \XMM3
	pxor	(%arg1), \XMM4
	movdqu	HashKey_4_k(%arg2), \TMP5
	pclmulqdq $0x00, \TMP5, \TMP6	# TMP6 = (a1+a0)*(b1+b0)
	movaps	0x10(%arg1), \TMP1
	aesenc	\TMP1, \XMM1		# Round 1
	aesenc	\TMP1, \XMM2
	aesenc	\TMP1, \XMM3
	aesenc	\TMP1, \XMM4
	movaps	0x20(%arg1), \TMP1
	aesenc	\TMP1, \XMM1		# Round 2
	aesenc	\TMP1, \XMM2
	aesenc	\TMP1, \XMM3
	aesenc	\TMP1, \XMM4
	# multiply XMM6 * HashKey_3 using Karatsuba (interleaved with rounds 3-5)
	movdqa	\XMM6, \TMP1
	pshufd	$78, \XMM6, \TMP2
	pxor	\XMM6, \TMP2		# TMP2 = a1+a0
	movdqu	HashKey_3(%arg2), \TMP5
	pclmulqdq $0x11, \TMP5, \TMP1	# TMP1 = a1 * b1
	movaps	0x30(%arg1), \TMP3
	aesenc	\TMP3, \XMM1		# Round 3
	aesenc	\TMP3, \XMM2
	aesenc	\TMP3, \XMM3
	aesenc	\TMP3, \XMM4
	pclmulqdq $0x00, \TMP5, \XMM6	# XMM6 = a0*b0
	movaps	0x40(%arg1), \TMP3
	aesenc	\TMP3, \XMM1		# Round 4
	aesenc	\TMP3, \XMM2
	aesenc	\TMP3, \XMM3
	aesenc	\TMP3, \XMM4
	movdqu	HashKey_3_k(%arg2), \TMP5
	pclmulqdq $0x00, \TMP5, \TMP2	# TMP2 = (a1+a0)*(b1+b0)
	movaps	0x50(%arg1), \TMP3
	aesenc	\TMP3, \XMM1		# Round 5
	aesenc	\TMP3, \XMM2
	aesenc	\TMP3, \XMM3
	aesenc	\TMP3, \XMM4
	pxor	\TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
	pxor	\XMM6, \XMM5
	pxor	\TMP2, \TMP6
	movdqa	\XMM7, \TMP1
	pshufd	$78, \XMM7, \TMP2
	pxor	\XMM7, \TMP2		# TMP2 = a1+a0
	movdqu	HashKey_2(%arg2), \TMP5

	# multiply XMM7 * HashKey_2 using Karatsuba (interleaved with rounds 6-8)

	pclmulqdq $0x11, \TMP5, \TMP1	# TMP1 = a1*b1
	movaps	0x60(%arg1), \TMP3
	aesenc	\TMP3, \XMM1		# Round 6
	aesenc	\TMP3, \XMM2
	aesenc	\TMP3, \XMM3
	aesenc	\TMP3, \XMM4
	pclmulqdq $0x00, \TMP5, \XMM7	# XMM7 = a0*b0
	movaps	0x70(%arg1), \TMP3
	aesenc	\TMP3, \XMM1		# Round 7
	aesenc	\TMP3, \XMM2
	aesenc	\TMP3, \XMM3
	aesenc	\TMP3, \XMM4
	movdqu	HashKey_2_k(%arg2), \TMP5
	pclmulqdq $0x00, \TMP5, \TMP2	# TMP2 = (a1+a0)*(b1+b0)
	movaps	0x80(%arg1), \TMP3
	aesenc	\TMP3, \XMM1		# Round 8
	aesenc	\TMP3, \XMM2
	aesenc	\TMP3, \XMM3
	aesenc	\TMP3, \XMM4
	pxor	\TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
	pxor	\XMM7, \XMM5
	pxor	\TMP2, \TMP6

	# Multiply XMM8 * HashKey
	# XMM8 and TMP5 hold the values for the two operands

	movdqa	\XMM8, \TMP1
	pshufd	$78, \XMM8, \TMP2
	pxor	\XMM8, \TMP2		# TMP2 = a1+a0
	movdqu	HashKey(%arg2), \TMP5
	pclmulqdq $0x11, \TMP5, \TMP1	# TMP1 = a1*b1
	movaps	0x90(%arg1), \TMP3
	aesenc	\TMP3, \XMM1		# Round 9
	aesenc	\TMP3, \XMM2
	aesenc	\TMP3, \XMM3
	aesenc	\TMP3, \XMM4
	pclmulqdq $0x00, \TMP5, \XMM8	# XMM8 = a0*b0
	# remaining AES rounds depend on the key size (clobbers %eax, %r10)
	lea	0xa0(%arg1),%r10
	mov	keysize,%eax
	shr	$2,%eax			# 128->4, 192->6, 256->8
	sub	$4,%eax			# 128->0, 192->2, 256->4
	jz	aes_loop_par_dec_done\@

aes_loop_par_dec\@:
	MOVADQ	(%r10),\TMP3
.irpc	index, 1234
	aesenc	\TMP3, %xmm\index
.endr
	add	$16,%r10
	sub	$1,%eax
	jnz	aes_loop_par_dec\@

aes_loop_par_dec_done\@:
	MOVADQ	(%r10), \TMP3
	aesenclast \TMP3, \XMM1		# last round
	aesenclast \TMP3, \XMM2
	aesenclast \TMP3, \XMM3
	aesenclast \TMP3, \XMM4
	movdqu	HashKey_k(%arg2), \TMP5
	pclmulqdq $0x00, \TMP5, \TMP2	# TMP2 = (a1+a0)*(b1+b0)
# Decrypt: XOR the keystream with the ciphertext, write plaintext out, and
# keep the raw ciphertext in XMM1-XMM4 for the next iteration's GHASH.
	movdqu	(%arg4,%r11,1), \TMP3
	pxor	\TMP3, \XMM1		# Ciphertext/Plaintext XOR EK
	movdqu	\XMM1, (%arg3,%r11,1)	# Write to plaintext buffer
	movdqa	\TMP3, \XMM1		# keep ciphertext for GHASH
	movdqu	16(%arg4,%r11,1), \TMP3
	pxor	\TMP3, \XMM2		# Ciphertext/Plaintext XOR EK
	movdqu	\XMM2, 16(%arg3,%r11,1)	# Write to plaintext buffer
	movdqa	\TMP3, \XMM2		# keep ciphertext for GHASH
	movdqu	32(%arg4,%r11,1), \TMP3
	pxor	\TMP3, \XMM3		# Ciphertext/Plaintext XOR EK
	movdqu	\XMM3, 32(%arg3,%r11,1)	# Write to plaintext buffer
	movdqa	\TMP3, \XMM3		# keep ciphertext for GHASH
	movdqu	48(%arg4,%r11,1), \TMP3
	pxor	\TMP3, \XMM4		# Ciphertext/Plaintext XOR EK
	movdqu	\XMM4, 48(%arg3,%r11,1)	# Write to plaintext buffer
	movdqa	\TMP3, \XMM4		# keep ciphertext for GHASH
	pshufb	%xmm15, \XMM1		# perform a 16 byte swap
	pshufb	%xmm15, \XMM2		# perform a 16 byte swap
	pshufb	%xmm15, \XMM3		# perform a 16 byte swap
	pshufb	%xmm15, \XMM4		# perform a 16 byte swap

# Combine the four Karatsuba partial products (standard Karatsuba recombine).
	pxor	\TMP4, \TMP1
	pxor	\XMM8, \XMM5
	pxor	\TMP6, \TMP2
	pxor	\TMP1, \TMP2
	pxor	\XMM5, \TMP2		# TMP2 = middle term
	movdqa	\TMP2, \TMP3
	pslldq	$8, \TMP3		# left shift TMP3 2 DWs
	psrldq	$8, \TMP2		# right shift TMP2 2 DWs
	pxor	\TMP3, \XMM5
	pxor	\TMP2, \TMP1		# accumulate the results in TMP1:XMM5

	# first phase of reduction (modulo x^128 + x^127 + x^126 + x^121 + 1)

	movdqa	\XMM5, \TMP2
	movdqa	\XMM5, \TMP3
	movdqa	\XMM5, \TMP4
# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
	pslld	$31, \TMP2		# packed left shift << 31
	pslld	$30, \TMP3		# packed left shift << 30
	pslld	$25, \TMP4		# packed left shift << 25
	pxor	\TMP3, \TMP2		# xor the shifted versions
	pxor	\TMP4, \TMP2
	movdqa	\TMP2, \TMP5
	psrldq	$4, \TMP5		# right shift T5 1 DW
	pslldq	$12, \TMP2		# left shift T2 3 DWs
	pxor	\TMP2, \XMM5

	# second phase of reduction

	movdqa	\XMM5,\TMP2		# make 3 copies of XMM5 into TMP2, TMP3, TMP4
	movdqa	\XMM5,\TMP3
	movdqa	\XMM5,\TMP4
	psrld	$1, \TMP2		# packed right shift >> 1
	psrld	$2, \TMP3		# packed right shift >> 2
	psrld	$7, \TMP4		# packed right shift >> 7
	pxor	\TMP3,\TMP2		# xor the shifted versions
	pxor	\TMP4,\TMP2
	pxor	\TMP5, \TMP2
	pxor	\TMP2, \XMM5
	pxor	\TMP1, \XMM5		# result is in XMM5

	pxor	\XMM5, \XMM1		# fold result into the running hash (XMM1)
.endm
1393
1394/* GHASH the last 4 ciphertext blocks. */
.macro GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst

# GHASH the last 4 ciphertext blocks (XMM1-XMM4, already byte-swapped) into
# XMMDst, using HashKey_4..HashKey and the precomputed *_k Karatsuba terms.

	# Multiply XMM1 * HashKey_4 (using Karatsuba)

	movdqa	\XMM1, \TMP6
	pshufd	$78, \XMM1, \TMP2	# $78 = 0b01001110: swap the two qwords
	pxor	\XMM1, \TMP2		# TMP2 = a1+a0
	movdqu	HashKey_4(%arg2), \TMP5
	pclmulqdq $0x11, \TMP5, \TMP6	# TMP6 = a1*b1
	pclmulqdq $0x00, \TMP5, \XMM1	# XMM1 = a0*b0
	movdqu	HashKey_4_k(%arg2), \TMP4
	pclmulqdq $0x00, \TMP4, \TMP2	# TMP2 = (a1+a0)*(b1+b0)
	movdqa	\XMM1, \XMMDst
	movdqa	\TMP2, \XMM1		# result in TMP6, XMMDst, XMM1

	# Multiply XMM2 * HashKey_3 (using Karatsuba)

	movdqa	\XMM2, \TMP1
	pshufd	$78, \XMM2, \TMP2
	pxor	\XMM2, \TMP2		# TMP2 = a1+a0
	movdqu	HashKey_3(%arg2), \TMP5
	pclmulqdq $0x11, \TMP5, \TMP1	# TMP1 = a1*b1
	pclmulqdq $0x00, \TMP5, \XMM2	# XMM2 = a0*b0
	movdqu	HashKey_3_k(%arg2), \TMP4
	pclmulqdq $0x00, \TMP4, \TMP2	# TMP2 = (a1+a0)*(b1+b0)
	pxor	\TMP1, \TMP6
	pxor	\XMM2, \XMMDst
	pxor	\TMP2, \XMM1
# results accumulated in TMP6, XMMDst, XMM1

	# Multiply XMM3 * HashKey_2 (using Karatsuba)

	movdqa	\XMM3, \TMP1
	pshufd	$78, \XMM3, \TMP2
	pxor	\XMM3, \TMP2		# TMP2 = a1+a0
	movdqu	HashKey_2(%arg2), \TMP5
	pclmulqdq $0x11, \TMP5, \TMP1	# TMP1 = a1*b1
	pclmulqdq $0x00, \TMP5, \XMM3	# XMM3 = a0*b0
	movdqu	HashKey_2_k(%arg2), \TMP4
	pclmulqdq $0x00, \TMP4, \TMP2	# TMP2 = (a1+a0)*(b1+b0)
	pxor	\TMP1, \TMP6
	pxor	\XMM3, \XMMDst
	pxor	\TMP2, \XMM1		# results accumulated in TMP6, XMMDst, XMM1

	# Multiply XMM4 * HashKey (using Karatsuba)
	movdqa	\XMM4, \TMP1
	pshufd	$78, \XMM4, \TMP2
	pxor	\XMM4, \TMP2		# TMP2 = a1+a0
	movdqu	HashKey(%arg2), \TMP5
	pclmulqdq $0x11, \TMP5, \TMP1	# TMP1 = a1*b1
	pclmulqdq $0x00, \TMP5, \XMM4	# XMM4 = a0*b0
	movdqu	HashKey_k(%arg2), \TMP4
	pclmulqdq $0x00, \TMP4, \TMP2	# TMP2 = (a1+a0)*(b1+b0)
	pxor	\TMP1, \TMP6
	pxor	\XMM4, \XMMDst
	pxor	\XMM1, \TMP2
	pxor	\TMP6, \TMP2
	pxor	\XMMDst, \TMP2
	# middle section of the temp results combined as in karatsuba algorithm
	movdqa	\TMP2, \TMP4
	pslldq	$8, \TMP4		# left shift TMP4 2 DWs
	psrldq	$8, \TMP2		# right shift TMP2 2 DWs
	pxor	\TMP4, \XMMDst
	pxor	\TMP2, \TMP6
# TMP6:XMMDst holds the result of the accumulated carry-less multiplications
	# first phase of the reduction (poly = x^128 + x^127 + x^126 + x^121 + 1)
	movdqa	\XMMDst, \TMP2
	movdqa	\XMMDst, \TMP3
	movdqa	\XMMDst, \TMP4
# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
	pslld	$31, \TMP2		# packed left shifting << 31
	pslld	$30, \TMP3		# packed left shifting << 30
	pslld	$25, \TMP4		# packed left shifting << 25
	pxor	\TMP3, \TMP2		# xor the shifted versions
	pxor	\TMP4, \TMP2
	movdqa	\TMP2, \TMP7
	psrldq	$4, \TMP7		# right shift TMP7 1 DW
	pslldq	$12, \TMP2		# left shift TMP2 3 DWs
	pxor	\TMP2, \XMMDst

	# second phase of the reduction
	movdqa	\XMMDst, \TMP2
	# make 3 copies of XMMDst for doing 3 shift operations
	movdqa	\XMMDst, \TMP3
	movdqa	\XMMDst, \TMP4
	psrld	$1, \TMP2		# packed right shift >> 1
	psrld	$2, \TMP3		# packed right shift >> 2
	psrld	$7, \TMP4		# packed right shift >> 7
	pxor	\TMP3, \TMP2		# xor the shifted versions
	pxor	\TMP4, \TMP2
	pxor	\TMP7, \TMP2
	pxor	\TMP2, \XMMDst
	pxor	\TMP6, \XMMDst		# reduced result is in XMMDst
.endm
1490
1491
1492/* Encryption of a single block
1493* uses eax & r10
1494*/
1495
.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1

# Encrypt the single block in \XMM0 in place with the expanded key schedule
# at %arg1. The number of rounds is derived from the stored key size.
# Clobbers: %eax, %r10, \TMP1.

	pxor	(%arg1), \XMM0		# round 0: key whitening
	mov	keysize,%eax
	shr	$2,%eax			# 128->4, 192->6, 256->8
	add	$5,%eax			# 128->9, 192->11, 256->13 (full rounds)
	lea	16(%arg1), %r10		# get first expanded key address

_esb_loop_\@:
	MOVADQ	(%r10),\TMP1
	aesenc	\TMP1,\XMM0
	add	$16,%r10
	sub	$1,%eax
	jnz	_esb_loop_\@

	MOVADQ	(%r10),\TMP1
	aesenclast \TMP1,\XMM0		# final round
.endm
1514/*****************************************************************************
1515* void aesni_gcm_dec(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1516* struct gcm_context_data *data
1517* // Context data
1518* u8 *out, // Plaintext output. Encrypt in-place is allowed.
1519* const u8 *in, // Ciphertext input
1520* u64 plaintext_len, // Length of data in bytes for decryption.
1521* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
1522* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1523* // concatenated with 0x00000001. 16-byte aligned pointer.
1524* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
1525* const u8 *aad, // Additional Authentication Data (AAD)
1526* u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1527* u8 *auth_tag, // Authenticated Tag output. The driver will compare this to the
1528* // given authentication tag and only return the plaintext if they match.
1529* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
1530* // (most likely), 12 or 8.
1531*
1532* Assumptions:
1533*
1534* keys:
1535* keys are pre-expanded and aligned to 16 bytes. we are using the first
1536* set of 11 keys in the data structure void *aes_ctx
1537*
1538* iv:
1539* 0 1 2 3
1540* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1541* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1542* | Salt (From the SA) |
1543* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1544* | Initialization Vector |
1545* | (This is the sequence number from IPSec header) |
1546* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1547* | 0x1 |
1548* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1549*
1550*
1551*
1552* AAD:
1553* AAD padded to 128 bits with 0
1554* for example, assume AAD is a u32 vector
1555*
1556* if AAD is 8 bytes:
1557* AAD[3] = {A0, A1};
1558* padded AAD in xmm register = {A1 A0 0 0}
1559*
1560* 0 1 2 3
1561* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1562* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1563* | SPI (A1) |
1564* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1565* | 32-bit Sequence Number (A0) |
1566* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1567* | 0x0 |
1568* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1569*
1570* AAD Format with 32-bit Sequence Number
1571*
1572* if AAD is 12 bytes:
1573* AAD[3] = {A0, A1, A2};
1574* padded AAD in xmm register = {A2 A1 A0 0}
1575*
1576* 0 1 2 3
1577* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1581* | SPI (A2) |
1582* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1583* | 64-bit Extended Sequence Number {A1,A0} |
1584* | |
1585* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1586* | 0x0 |
1587* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1588*
1589* AAD Format with 64-bit Extended Sequence Number
1590*
1591* poly = x^128 + x^127 + x^126 + x^121 + 1
1592*
1593*****************************************************************************/
SYM_FUNC_START(aesni_gcm_dec)
	FUNC_SAVE

	GCM_INIT %arg6, arg7, arg8, arg9	# set up IV, hash subkeys, AAD hash
	GCM_ENC_DEC dec				# decrypt + GHASH the whole buffer
	GCM_COMPLETE arg10, arg11		# finalize and emit the auth tag
	FUNC_RESTORE
	RET	# RET, not ret: SLS-mitigation macro used throughout this file
SYM_FUNC_END(aesni_gcm_dec)
1603
1604
1605/*****************************************************************************
1606* void aesni_gcm_enc(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1607* struct gcm_context_data *data
1608* // Context data
1609* u8 *out, // Ciphertext output. Encrypt in-place is allowed.
1610* const u8 *in, // Plaintext input
1611* u64 plaintext_len, // Length of data in bytes for encryption.
1612* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
1613* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1614* // concatenated with 0x00000001. 16-byte aligned pointer.
1615* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
1616* const u8 *aad, // Additional Authentication Data (AAD)
1617* u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1618* u8 *auth_tag, // Authenticated Tag output.
1619* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
1620* // 12 or 8.
1621*
1622* Assumptions:
1623*
1624* keys:
1625* keys are pre-expanded and aligned to 16 bytes. we are using the
1626* first set of 11 keys in the data structure void *aes_ctx
1627*
1628*
1629* iv:
1630* 0 1 2 3
1631* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1632* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1633* | Salt (From the SA) |
1634* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1635* | Initialization Vector |
1636* | (This is the sequence number from IPSec header) |
1637* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1638* | 0x1 |
1639* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1640*
1641*
1642*
1643* AAD:
1644* AAD padded to 128 bits with 0
1645* for example, assume AAD is a u32 vector
1646*
1647* if AAD is 8 bytes:
1648* AAD[3] = {A0, A1};
1649* padded AAD in xmm register = {A1 A0 0 0}
1650*
1651* 0 1 2 3
1652* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1653* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1654* | SPI (A1) |
1655* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1656* | 32-bit Sequence Number (A0) |
1657* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1658* | 0x0 |
1659* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1660*
1661* AAD Format with 32-bit Sequence Number
1662*
1663* if AAD is 12 bytes:
1664* AAD[3] = {A0, A1, A2};
1665* padded AAD in xmm register = {A2 A1 A0 0}
1666*
1667* 0 1 2 3
1668* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1669* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1670* | SPI (A2) |
1671* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1672* | 64-bit Extended Sequence Number {A1,A0} |
1673* | |
1674* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1675* | 0x0 |
1676* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1677*
1678* AAD Format with 64-bit Extended Sequence Number
1679*
1680* poly = x^128 + x^127 + x^126 + x^121 + 1
1681***************************************************************************/
SYM_FUNC_START(aesni_gcm_enc)
	FUNC_SAVE

	GCM_INIT %arg6, arg7, arg8, arg9	# set up IV, hash subkeys, AAD hash
	GCM_ENC_DEC enc				# encrypt + GHASH the whole buffer

	GCM_COMPLETE arg10, arg11		# finalize and emit the auth tag
	FUNC_RESTORE
	RET	# RET, not ret: SLS-mitigation macro used throughout this file
SYM_FUNC_END(aesni_gcm_enc)
1692
1693/*****************************************************************************
1694* void aesni_gcm_init(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1695* struct gcm_context_data *data,
1696* // context data
1697* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
1698* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1699* // concatenated with 0x00000001. 16-byte aligned pointer.
1700* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
1701* const u8 *aad, // Additional Authentication Data (AAD)
1702* u64 aad_len) // Length of AAD in bytes.
1703*/
SYM_FUNC_START(aesni_gcm_init)
	FUNC_SAVE
	GCM_INIT %arg3, %arg4,%arg5, %arg6	# precompute hash subkeys, hash AAD
	FUNC_RESTORE
	RET	# RET, not ret: SLS-mitigation macro used throughout this file
SYM_FUNC_END(aesni_gcm_init)
1710
1711/*****************************************************************************
1712* void aesni_gcm_enc_update(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1713* struct gcm_context_data *data,
1714* // context data
1715* u8 *out, // Ciphertext output. Encrypt in-place is allowed.
1716* const u8 *in, // Plaintext input
* u64 plaintext_len); // Length of data in bytes for encryption.
1718*/
SYM_FUNC_START(aesni_gcm_enc_update)
	FUNC_SAVE
	GCM_ENC_DEC enc		# encrypt + GHASH one chunk of a streamed op
	FUNC_RESTORE
	RET	# RET, not ret: SLS-mitigation macro used throughout this file
SYM_FUNC_END(aesni_gcm_enc_update)
1725
1726/*****************************************************************************
1727* void aesni_gcm_dec_update(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1728* struct gcm_context_data *data,
1729* // context data
* u8 *out, // Plaintext output. Decrypt in-place is allowed.
* const u8 *in, // Ciphertext input
* u64 ciphertext_len); // Length of data in bytes for decryption.
1733*/
SYM_FUNC_START(aesni_gcm_dec_update)
	FUNC_SAVE
	GCM_ENC_DEC dec		# decrypt + GHASH one chunk of a streamed op
	FUNC_RESTORE
	RET	# RET, not ret: SLS-mitigation macro used throughout this file
SYM_FUNC_END(aesni_gcm_dec_update)
1740
1741/*****************************************************************************
1742* void aesni_gcm_finalize(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1743* struct gcm_context_data *data,
1744* // context data
1745* u8 *auth_tag, // Authenticated Tag output.
1746* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
1747* // 12 or 8.
1748*/
SYM_FUNC_START(aesni_gcm_finalize)
	FUNC_SAVE
	GCM_COMPLETE %arg3 %arg4	# compute and store the authentication tag
	FUNC_RESTORE
	RET	# RET, not ret: SLS-mitigation macro used throughout this file
SYM_FUNC_END(aesni_gcm_finalize)
1755
1756#endif
1757
1758
# One round of AES-128/AES-256 "even" key expansion. xmm1 holds the
# aeskeygenassist output, xmm0 the previous round key; xmm4 is assumed zero
# on entry (see aesni_set_key). Writes the new round key at (TKEYP) and
# advances TKEYP by 16.
# _key_expansion_128 is the same code, so declare it as an alias using the
# modern SYM_FUNC_ALIAS_LOCAL annotation (matches this file's convention).
SYM_FUNC_START_LOCAL(_key_expansion_256a)
	pshufd $0b11111111, %xmm1, %xmm1	# broadcast keygenassist word
	shufps $0b00010000, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	shufps $0b10001100, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	pxor %xmm1, %xmm0
	movaps %xmm0, (TKEYP)
	add $0x10, TKEYP
	RET	# RET, not ret: SLS-mitigation macro used throughout this file
SYM_FUNC_END(_key_expansion_256a)
SYM_FUNC_ALIAS_LOCAL(_key_expansion_128, _key_expansion_256a)
1772
# One double-round of AES-192 key expansion ("a" variant): derives the next
# six key words from xmm0/xmm2 and the aeskeygenassist result in xmm1, and
# stores 32 bytes of schedule at (TKEYP). xmm4 is assumed zero on entry.
SYM_FUNC_START_LOCAL(_key_expansion_192a)
	pshufd $0b01010101, %xmm1, %xmm1	# broadcast keygenassist word
	shufps $0b00010000, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	shufps $0b10001100, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	pxor %xmm1, %xmm0

	movaps %xmm2, %xmm5
	movaps %xmm2, %xmm6
	pslldq $4, %xmm5
	pshufd $0b11111111, %xmm0, %xmm3
	pxor %xmm3, %xmm2
	pxor %xmm5, %xmm2

	movaps %xmm0, %xmm1
	shufps $0b01000100, %xmm0, %xmm6	# pack two 16-byte schedule entries
	movaps %xmm6, (TKEYP)
	shufps $0b01001110, %xmm2, %xmm1
	movaps %xmm1, 0x10(TKEYP)
	add $0x20, TKEYP
	RET	# RET, not ret: SLS-mitigation macro used throughout this file
SYM_FUNC_END(_key_expansion_192a)
1796
# One double-round of AES-192 key expansion ("b" variant): same derivation as
# _key_expansion_192a but stores only one 16-byte schedule entry at (TKEYP).
# xmm4 is assumed zero on entry.
SYM_FUNC_START_LOCAL(_key_expansion_192b)
	pshufd $0b01010101, %xmm1, %xmm1	# broadcast keygenassist word
	shufps $0b00010000, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	shufps $0b10001100, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	pxor %xmm1, %xmm0

	movaps %xmm2, %xmm5
	pslldq $4, %xmm5
	pshufd $0b11111111, %xmm0, %xmm3
	pxor %xmm3, %xmm2
	pxor %xmm5, %xmm2

	movaps %xmm0, (TKEYP)
	add $0x10, TKEYP
	RET	# RET, not ret: SLS-mitigation macro used throughout this file
SYM_FUNC_END(_key_expansion_192b)
1815
# "Odd" round of AES-256 key expansion: updates xmm2 from the
# aeskeygenassist result in xmm1 and stores the new round key at (TKEYP).
# xmm4 is assumed zero on entry.
SYM_FUNC_START_LOCAL(_key_expansion_256b)
	pshufd $0b10101010, %xmm1, %xmm1	# broadcast keygenassist word
	shufps $0b00010000, %xmm2, %xmm4
	pxor %xmm4, %xmm2
	shufps $0b10001100, %xmm2, %xmm4
	pxor %xmm4, %xmm2
	pxor %xmm1, %xmm2
	movaps %xmm2, (TKEYP)
	add $0x10, TKEYP
	RET	# RET, not ret: SLS-mitigation macro used throughout this file
SYM_FUNC_END(_key_expansion_256b)
1827
1828/*
1829 * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
1830 * unsigned int key_len)
1831 */
SYM_FUNC_START(aesni_set_key)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl KEYP
	movl (FRAME_OFFSET+8)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+12)(%esp), UKEYP	# in_key
	movl (FRAME_OFFSET+16)(%esp), %edx	# key_len
#endif
	movups (UKEYP), %xmm0		# user key (first 16 bytes)
	movaps %xmm0, (KEYP)
	lea 0x10(KEYP), TKEYP		# key addr
	movl %edx, 480(KEYP)		# stash key length for enc/dec routines
	pxor %xmm4, %xmm4		# xmm4 is assumed 0 in _key_expansion_x
	cmp $24, %dl			# dispatch on key length in bytes
	jb .Lenc_key128
	je .Lenc_key192
	# AES-256: 14 rounds, alternate 256a/256b expansion steps
	movups 0x10(UKEYP), %xmm2	# other user key
	movaps %xmm2, (TKEYP)
	add $0x10, TKEYP
	aeskeygenassist $0x1, %xmm2, %xmm1	# round 1
	call _key_expansion_256a
	aeskeygenassist $0x1, %xmm0, %xmm1
	call _key_expansion_256b
	aeskeygenassist $0x2, %xmm2, %xmm1	# round 2
	call _key_expansion_256a
	aeskeygenassist $0x2, %xmm0, %xmm1
	call _key_expansion_256b
	aeskeygenassist $0x4, %xmm2, %xmm1	# round 3
	call _key_expansion_256a
	aeskeygenassist $0x4, %xmm0, %xmm1
	call _key_expansion_256b
	aeskeygenassist $0x8, %xmm2, %xmm1	# round 4
	call _key_expansion_256a
	aeskeygenassist $0x8, %xmm0, %xmm1
	call _key_expansion_256b
	aeskeygenassist $0x10, %xmm2, %xmm1	# round 5
	call _key_expansion_256a
	aeskeygenassist $0x10, %xmm0, %xmm1
	call _key_expansion_256b
	aeskeygenassist $0x20, %xmm2, %xmm1	# round 6
	call _key_expansion_256a
	aeskeygenassist $0x20, %xmm0, %xmm1
	call _key_expansion_256b
	aeskeygenassist $0x40, %xmm2, %xmm1	# round 7
	call _key_expansion_256a
	jmp .Ldec_key
.Lenc_key192:
	# AES-192: 12 rounds, alternate 192a/192b expansion steps
	movq 0x10(UKEYP), %xmm2		# other user key
	aeskeygenassist $0x1, %xmm2, %xmm1	# round 1
	call _key_expansion_192a
	aeskeygenassist $0x2, %xmm2, %xmm1	# round 2
	call _key_expansion_192b
	aeskeygenassist $0x4, %xmm2, %xmm1	# round 3
	call _key_expansion_192a
	aeskeygenassist $0x8, %xmm2, %xmm1	# round 4
	call _key_expansion_192b
	aeskeygenassist $0x10, %xmm2, %xmm1	# round 5
	call _key_expansion_192a
	aeskeygenassist $0x20, %xmm2, %xmm1	# round 6
	call _key_expansion_192b
	aeskeygenassist $0x40, %xmm2, %xmm1	# round 7
	call _key_expansion_192a
	aeskeygenassist $0x80, %xmm2, %xmm1	# round 8
	call _key_expansion_192b
	jmp .Ldec_key
.Lenc_key128:
	# AES-128: 10 rounds, round constants 0x01..0x36
	aeskeygenassist $0x1, %xmm0, %xmm1	# round 1
	call _key_expansion_128
	aeskeygenassist $0x2, %xmm0, %xmm1	# round 2
	call _key_expansion_128
	aeskeygenassist $0x4, %xmm0, %xmm1	# round 3
	call _key_expansion_128
	aeskeygenassist $0x8, %xmm0, %xmm1	# round 4
	call _key_expansion_128
	aeskeygenassist $0x10, %xmm0, %xmm1	# round 5
	call _key_expansion_128
	aeskeygenassist $0x20, %xmm0, %xmm1	# round 6
	call _key_expansion_128
	aeskeygenassist $0x40, %xmm0, %xmm1	# round 7
	call _key_expansion_128
	aeskeygenassist $0x80, %xmm0, %xmm1	# round 8
	call _key_expansion_128
	aeskeygenassist $0x1b, %xmm0, %xmm1	# round 9
	call _key_expansion_128
	aeskeygenassist $0x36, %xmm0, %xmm1	# round 10
	call _key_expansion_128
.Ldec_key:
	# Build the decryption schedule: reversed round keys, inner ones
	# passed through aesimc (InvMixColumns), stored at KEYP+240.
	sub $0x10, TKEYP
	movaps (KEYP), %xmm0
	movaps (TKEYP), %xmm1
	movaps %xmm0, 240(TKEYP)
	movaps %xmm1, 240(KEYP)
	add $0x10, KEYP
	lea 240-16(TKEYP), UKEYP
.align 4
.Ldec_key_loop:
	movaps (KEYP), %xmm0
	aesimc %xmm0, %xmm1
	movaps %xmm1, (UKEYP)
	add $0x10, KEYP
	sub $0x10, UKEYP
	cmp TKEYP, KEYP
	jb .Ldec_key_loop
	xor AREG, AREG			# return 0 (success)
#ifndef __x86_64__
	popl KEYP
#endif
	FRAME_END
	RET	# RET, not ret: SLS-mitigation macro used throughout this file
SYM_FUNC_END(aesni_set_key)
1942
1943/*
1944 * void aesni_enc(const void *ctx, u8 *dst, const u8 *src)
1945 */
SYM_FUNC_START(aesni_enc)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+12)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+16)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+20)(%esp), INP	# src
#endif
	movl 480(KEYP), KLEN		# key length (stored by aesni_set_key)
	movups (INP), STATE		# input block (may be unaligned)
	call _aesni_enc1
	movups STATE, (OUTP)		# output block
#ifndef __x86_64__
	popl KLEN
	popl KEYP
#endif
	FRAME_END
	RET	# RET, not ret: SLS-mitigation macro used throughout this file
SYM_FUNC_END(aesni_enc)
1966
1967/*
1968 * _aesni_enc1: internal ABI
1969 * input:
1970 * KEYP: key struct pointer
 * KLEN: key length
1972 * STATE: initial state (input)
1973 * output:
 * STATE: final state (output)
1975 * changed:
1976 * KEY
1977 * TKEYP (T1)
1978 */
SYM_FUNC_START_LOCAL(_aesni_enc1)
	movaps (KEYP), KEY		# key
	mov KEYP, TKEYP
	pxor KEY, STATE			# round 0
	add $0x30, TKEYP
	cmp $24, KLEN			# select entry point by key length
	jb .Lenc128
	lea 0x20(TKEYP), TKEYP
	je .Lenc192
	# AES-256: two extra rounds before falling into the 192 path
	add $0x20, TKEYP
	movaps -0x60(TKEYP), KEY
	aesenc KEY, STATE
	movaps -0x50(TKEYP), KEY
	aesenc KEY, STATE
.align 4
.Lenc192:
	# AES-192: two extra rounds before falling into the 128 path
	movaps -0x40(TKEYP), KEY
	aesenc KEY, STATE
	movaps -0x30(TKEYP), KEY
	aesenc KEY, STATE
.align 4
.Lenc128:
	# common tail: the last 10 rounds shared by all key sizes
	movaps -0x20(TKEYP), KEY
	aesenc KEY, STATE
	movaps -0x10(TKEYP), KEY
	aesenc KEY, STATE
	movaps (TKEYP), KEY
	aesenc KEY, STATE
	movaps 0x10(TKEYP), KEY
	aesenc KEY, STATE
	movaps 0x20(TKEYP), KEY
	aesenc KEY, STATE
	movaps 0x30(TKEYP), KEY
	aesenc KEY, STATE
	movaps 0x40(TKEYP), KEY
	aesenc KEY, STATE
	movaps 0x50(TKEYP), KEY
	aesenc KEY, STATE
	movaps 0x60(TKEYP), KEY
	aesenc KEY, STATE
	movaps 0x70(TKEYP), KEY
	aesenclast KEY, STATE		# final round
	RET	# RET, not ret: SLS-mitigation macro used throughout this file
SYM_FUNC_END(_aesni_enc1)
2023
2024/*
2025 * _aesni_enc4: internal ABI
2026 * input:
2027 * KEYP: key struct pointer
2028 * KLEN: round count
2029 * STATE1: initial state (input)
2030 * STATE2
2031 * STATE3
2032 * STATE4
2033 * output:
2034 * STATE1: finial state (output)
2035 * STATE2
2036 * STATE3
2037 * STATE4
2038 * changed:
2039 * KEY
2040 * TKEYP (T1)
2041 */
SYM_FUNC_START_LOCAL(_aesni_enc4)
	movaps (KEYP), KEY		# key
	mov KEYP, TKEYP
	pxor KEY, STATE1		# round 0, four blocks interleaved
	pxor KEY, STATE2
	pxor KEY, STATE3
	pxor KEY, STATE4
	add $0x30, TKEYP
	cmp $24, KLEN			# select entry point by key length
	jb .L4enc128
	lea 0x20(TKEYP), TKEYP
	je .L4enc192
	# AES-256: two extra rounds before falling into the 192 path
	add $0x20, TKEYP
	movaps -0x60(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
	movaps -0x50(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
#.align 4
.L4enc192:
	# AES-192: two extra rounds before falling into the 128 path
	movaps -0x40(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
	movaps -0x30(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
#.align 4
.L4enc128:
	# common tail: the last 10 rounds shared by all key sizes
	movaps -0x20(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
	movaps -0x10(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
	movaps (TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
	movaps 0x10(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
	movaps 0x20(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
	movaps 0x30(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
	movaps 0x40(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
	movaps 0x50(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
	movaps 0x60(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
	movaps 0x70(TKEYP), KEY
	aesenclast KEY, STATE1		# last round
	aesenclast KEY, STATE2
	aesenclast KEY, STATE3
	aesenclast KEY, STATE4
	RET	# RET, not ret: SLS-mitigation macro used throughout this file
SYM_FUNC_END(_aesni_enc4)
2131
2132/*
2133 * void aesni_dec (const void *ctx, u8 *dst, const u8 *src)
2134 */
SYM_FUNC_START(aesni_dec)
	FRAME_BEGIN
#ifndef __x86_64__
	/* 32-bit: fetch args from the stack and preserve callee regs */
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+12)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+16)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+20)(%esp), INP	# src
#endif
	mov 480(KEYP), KLEN		# key length
	add $240, KEYP			# skip to the decryption key schedule
	movups (INP), STATE		# input
	call _aesni_dec1
	movups STATE, (OUTP)		# output
#ifndef __x86_64__
	popl KLEN
	popl KEYP
#endif
	FRAME_END
	RET
SYM_FUNC_END(aesni_dec)
2156
2157/*
2158 * _aesni_dec1: internal ABI
2159 * input:
2160 * KEYP: key struct pointer
2161 * KLEN: key length
2162 * STATE: initial state (input)
2163 * output:
 * STATE: final state (output)
2165 * changed:
2166 * KEY
2167 * TKEYP (T1)
2168 */
SYM_FUNC_START_LOCAL(_aesni_dec1)
	movaps (KEYP), KEY		# load round-0 key
	mov KEYP, TKEYP
	pxor KEY, STATE			# round 0: AddRoundKey
	add $0x30, TKEYP
	cmp $24, KLEN			# dispatch on key length (16/24/32 bytes)
	jb .Ldec128
	lea 0x20(TKEYP), TKEYP
	je .Ldec192
	add $0x20, TKEYP
	movaps -0x60(TKEYP), KEY	# AES-256 only: extra rounds 1-2
	aesdec KEY, STATE
	movaps -0x50(TKEYP), KEY
	aesdec KEY, STATE
.align 4
.Ldec192:
	movaps -0x40(TKEYP), KEY	# AES-192/256: extra rounds
	aesdec KEY, STATE
	movaps -0x30(TKEYP), KEY
	aesdec KEY, STATE
.align 4
.Ldec128:
	movaps -0x20(TKEYP), KEY	# 9 rounds common to all key sizes
	aesdec KEY, STATE
	movaps -0x10(TKEYP), KEY
	aesdec KEY, STATE
	movaps (TKEYP), KEY
	aesdec KEY, STATE
	movaps 0x10(TKEYP), KEY
	aesdec KEY, STATE
	movaps 0x20(TKEYP), KEY
	aesdec KEY, STATE
	movaps 0x30(TKEYP), KEY
	aesdec KEY, STATE
	movaps 0x40(TKEYP), KEY
	aesdec KEY, STATE
	movaps 0x50(TKEYP), KEY
	aesdec KEY, STATE
	movaps 0x60(TKEYP), KEY
	aesdec KEY, STATE
	movaps 0x70(TKEYP), KEY
	aesdeclast KEY, STATE		# last round
	RET
SYM_FUNC_END(_aesni_dec1)
2213
2214/*
2215 * _aesni_dec4: internal ABI
2216 * input:
2217 * KEYP: key struct pointer
2218 * KLEN: key length
2219 * STATE1: initial state (input)
2220 * STATE2
2221 * STATE3
2222 * STATE4
2223 * output:
 * STATE1: final state (output)
2225 * STATE2
2226 * STATE3
2227 * STATE4
2228 * changed:
2229 * KEY
2230 * TKEYP (T1)
2231 */
SYM_FUNC_START_LOCAL(_aesni_dec4)
	movaps (KEYP), KEY		# load round-0 key
	mov KEYP, TKEYP
	pxor KEY, STATE1		# round 0: AddRoundKey on all 4 blocks
	pxor KEY, STATE2
	pxor KEY, STATE3
	pxor KEY, STATE4
	add $0x30, TKEYP
	cmp $24, KLEN			# dispatch on key length (16/24/32 bytes)
	jb .L4dec128
	lea 0x20(TKEYP), TKEYP
	je .L4dec192
	add $0x20, TKEYP
	movaps -0x60(TKEYP), KEY	# AES-256 only: extra rounds 1-2
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps -0x50(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
.align 4
.L4dec192:
	movaps -0x40(TKEYP), KEY	# AES-192/256: extra rounds
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps -0x30(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
.align 4
.L4dec128:
	movaps -0x20(TKEYP), KEY	# 9 rounds common to all key sizes
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps -0x10(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps (TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps 0x10(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps 0x20(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps 0x30(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps 0x40(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps 0x50(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps 0x60(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps 0x70(TKEYP), KEY
	aesdeclast KEY, STATE1		# last round
	aesdeclast KEY, STATE2
	aesdeclast KEY, STATE3
	aesdeclast KEY, STATE4
	RET
SYM_FUNC_END(_aesni_dec4)
2321
2322/*
 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len)
2325 */
SYM_FUNC_START(aesni_ecb_enc)
	FRAME_BEGIN
#ifndef __x86_64__
	/* 32-bit: fetch args from the stack and preserve callee regs */
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+16)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+20)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+24)(%esp), INP	# src
	movl (FRAME_OFFSET+28)(%esp), LEN	# len
#endif
	test LEN, LEN		# nothing to do for len == 0
	jz .Lecb_enc_ret
	mov 480(KEYP), KLEN
	cmp $16, LEN		# need at least one full block
	jb .Lecb_enc_ret
	cmp $64, LEN		# 4-block fast path available?
	jb .Lecb_enc_loop1
.align 4
.Lecb_enc_loop4:
	movups (INP), STATE1
	movups 0x10(INP), STATE2
	movups 0x20(INP), STATE3
	movups 0x30(INP), STATE4
	call _aesni_enc4
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jge .Lecb_enc_loop4
	cmp $16, LEN
	jb .Lecb_enc_ret
.align 4
.Lecb_enc_loop1:
	/* remaining 1-3 blocks, one at a time */
	movups (INP), STATE1
	call _aesni_enc1
	movups STATE1, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lecb_enc_loop1
.Lecb_enc_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
#endif
	FRAME_END
	RET
SYM_FUNC_END(aesni_ecb_enc)
2381
2382/*
 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len);
2385 */
SYM_FUNC_START(aesni_ecb_dec)
	FRAME_BEGIN
#ifndef __x86_64__
	/* 32-bit: fetch args from the stack and preserve callee regs */
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+16)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+20)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+24)(%esp), INP	# src
	movl (FRAME_OFFSET+28)(%esp), LEN	# len
#endif
	test LEN, LEN		# nothing to do for len == 0
	jz .Lecb_dec_ret
	mov 480(KEYP), KLEN
	add $240, KEYP		# skip to the decryption key schedule
	cmp $16, LEN		# need at least one full block
	jb .Lecb_dec_ret
	cmp $64, LEN		# 4-block fast path available?
	jb .Lecb_dec_loop1
.align 4
.Lecb_dec_loop4:
	movups (INP), STATE1
	movups 0x10(INP), STATE2
	movups 0x20(INP), STATE3
	movups 0x30(INP), STATE4
	call _aesni_dec4
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jge .Lecb_dec_loop4
	cmp $16, LEN
	jb .Lecb_dec_ret
.align 4
.Lecb_dec_loop1:
	/* remaining 1-3 blocks, one at a time */
	movups (INP), STATE1
	call _aesni_dec1
	movups STATE1, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lecb_dec_loop1
.Lecb_dec_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
#endif
	FRAME_END
	RET
SYM_FUNC_END(aesni_ecb_dec)
2442
2443/*
 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
2446 */
SYM_FUNC_START(aesni_cbc_enc)
	FRAME_BEGIN
#ifndef __x86_64__
	/* 32-bit: fetch args from the stack and preserve callee regs */
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
#endif
	cmp $16, LEN		# need at least one full block
	jb .Lcbc_enc_ret
	mov 480(KEYP), KLEN
	movups (IVP), STATE	# load iv as initial state
.align 4
.Lcbc_enc_loop:
	/* CBC chains serially: each block XORs the previous ciphertext */
	movups (INP), IN	# load input
	pxor IN, STATE
	call _aesni_enc1
	movups STATE, (OUTP)	# store output
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lcbc_enc_loop
	movups STATE, (IVP)	# save last ciphertext as next IV
.Lcbc_enc_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	FRAME_END
	RET
SYM_FUNC_END(aesni_cbc_enc)
2486
2487/*
 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
2490 */
SYM_FUNC_START(aesni_cbc_dec)
	FRAME_BEGIN
#ifndef __x86_64__
	/* 32-bit: fetch args from the stack and preserve callee regs */
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
#endif
	cmp $16, LEN		# need at least one full block
	jb .Lcbc_dec_just_ret
	mov 480(KEYP), KLEN
	add $240, KEYP		# skip to the decryption key schedule
	movups (IVP), IV
	cmp $64, LEN		# 4-block fast path available?
	jb .Lcbc_dec_loop1
.align 4
.Lcbc_dec_loop4:
	movups (INP), IN1
	movaps IN1, STATE1
	movups 0x10(INP), IN2
	movaps IN2, STATE2
#ifdef __x86_64__
	movups 0x20(INP), IN3
	movaps IN3, STATE3
	movups 0x30(INP), IN4
	movaps IN4, STATE4
#else
	/* 32-bit has too few xmm regs: reuse IN1/IN2 and reload later */
	movups 0x20(INP), IN1
	movaps IN1, STATE3
	movups 0x30(INP), IN2
	movaps IN2, STATE4
#endif
	call _aesni_dec4
	pxor IV, STATE1		# un-chain: XOR previous ciphertext
#ifdef __x86_64__
	pxor IN1, STATE2
	pxor IN2, STATE3
	pxor IN3, STATE4
	movaps IN4, IV		# last ciphertext becomes next IV
#else
	pxor IN1, STATE4
	movaps IN2, IV		# last ciphertext becomes next IV
	movups (INP), IN1	# reload blocks clobbered above
	pxor IN1, STATE2
	movups 0x10(INP), IN2
	pxor IN2, STATE3
#endif
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jge .Lcbc_dec_loop4
	cmp $16, LEN
	jb .Lcbc_dec_ret
.align 4
.Lcbc_dec_loop1:
	/* remaining 1-3 blocks, one at a time */
	movups (INP), IN
	movaps IN, STATE
	call _aesni_dec1
	pxor IV, STATE
	movups STATE, (OUTP)
	movaps IN, IV
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lcbc_dec_loop1
.Lcbc_dec_ret:
	movups IV, (IVP)	# save next IV for the caller
.Lcbc_dec_just_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	FRAME_END
	RET
SYM_FUNC_END(aesni_cbc_dec)
2579
2580#ifdef __x86_64__
.pushsection .rodata
.align 16
/* pshufb mask that reverses the byte order of a 128-bit value
 * (big-endian CTR block <-> little-endian arithmetic form). */
.Lbswap_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
.popsection
2586
2587/*
2588 * _aesni_inc_init: internal ABI
2589 * setup registers used by _aesni_inc
2590 * input:
2591 * IV
2592 * output:
2593 * CTR: == IV, in little endian
2594 * TCTR_LOW: == lower qword of CTR
2595 * INC: == 1, in little endian
2596 * BSWAP_MASK == endian swapping mask
2597 */
SYM_FUNC_START_LOCAL(_aesni_inc_init)
	movaps .Lbswap_mask, BSWAP_MASK
	movaps IV, CTR
	pshufb BSWAP_MASK, CTR	# CTR = IV byte-swapped to little endian
	mov $1, TCTR_LOW
	movq TCTR_LOW, INC	# INC = 1 (low qword)
	movq CTR, TCTR_LOW	# mirror CTR's low qword for carry detection
	RET
SYM_FUNC_END(_aesni_inc_init)
2607
2608/*
2609 * _aesni_inc: internal ABI
2610 * Increase IV by 1, IV is in big endian
2611 * input:
2612 * IV
2613 * CTR: == IV, in little endian
2614 * TCTR_LOW: == lower qword of CTR
2615 * INC: == 1, in little endian
2616 * BSWAP_MASK == endian swapping mask
2617 * output:
2618 * IV: Increase by 1
2619 * changed:
2620 * CTR: == output IV, in little endian
2621 * TCTR_LOW: == lower qword of CTR
2622 */
SYM_FUNC_START_LOCAL(_aesni_inc)
	paddq INC, CTR		# add 1 to the low qword
	add $1, TCTR_LOW	# shadow add sets CF on low-qword overflow
	jnc .Linc_low
	pslldq $8, INC		# carry: propagate +1 into the high qword
	paddq INC, CTR
	psrldq $8, INC		# restore INC = 1 in the low qword
.Linc_low:
	movaps CTR, IV
	pshufb BSWAP_MASK, IV	# IV = CTR back in big endian
	RET
SYM_FUNC_END(_aesni_inc)
2635
2636/*
 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
2639 */
SYM_FUNC_START(aesni_ctr_enc)
	FRAME_BEGIN
	cmp $16, LEN		# need at least one full block
	jb .Lctr_enc_just_ret
	mov 480(KEYP), KLEN
	movups (IVP), IV
	call _aesni_inc_init	# set up CTR/INC/BSWAP_MASK/TCTR_LOW
	cmp $64, LEN		# 4-block fast path available?
	jb .Lctr_enc_loop1
.align 4
.Lctr_enc_loop4:
	/* generate 4 counter blocks, encrypt them, XOR with plaintext */
	movaps IV, STATE1
	call _aesni_inc
	movups (INP), IN1
	movaps IV, STATE2
	call _aesni_inc
	movups 0x10(INP), IN2
	movaps IV, STATE3
	call _aesni_inc
	movups 0x20(INP), IN3
	movaps IV, STATE4
	call _aesni_inc
	movups 0x30(INP), IN4
	call _aesni_enc4
	pxor IN1, STATE1
	movups STATE1, (OUTP)
	pxor IN2, STATE2
	movups STATE2, 0x10(OUTP)
	pxor IN3, STATE3
	movups STATE3, 0x20(OUTP)
	pxor IN4, STATE4
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jge .Lctr_enc_loop4
	cmp $16, LEN
	jb .Lctr_enc_ret
.align 4
.Lctr_enc_loop1:
	/* remaining 1-3 blocks, one at a time */
	movaps IV, STATE
	call _aesni_inc
	movups (INP), IN
	call _aesni_enc1
	pxor IN, STATE
	movups STATE, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lctr_enc_loop1
.Lctr_enc_ret:
	movups IV, (IVP)	# save next counter value for the caller
.Lctr_enc_just_ret:
	FRAME_END
	RET
SYM_FUNC_END(aesni_ctr_enc)
2698
2699/*
2700 * _aesni_gf128mul_x_ble: internal ABI
2701 * Multiply in GF(2^128) for XTS IVs
2702 * input:
2703 * IV: current IV
2704 * GF128MUL_MASK == mask with 0x87 and 0x01
2705 * output:
2706 * IV: next IV
2707 * changed:
2708 * CTR: == temporary value
2709 */
/*
 * IV <<= 1 in GF(2^128), bit-reflected convention: the bit shifted out of
 * the top is folded back in via the reduction constant held in
 * GF128MUL_MASK (0x87/0x01, loaded by the caller); CTR is clobbered.
 */
#define _aesni_gf128mul_x_ble() \
	pshufd $0x13, IV, CTR; \
	paddq IV, IV; \
	psrad $31, CTR; \
	pand GF128MUL_MASK, CTR; \
	pxor CTR, IV;
2716
2717/*
2718 * void aesni_xts_crypt8(const struct crypto_aes_ctx *ctx, u8 *dst,
2719 * const u8 *src, bool enc, le128 *iv)
2720 */
SYM_FUNC_START(aesni_xts_crypt8)
	FRAME_BEGIN
	/* Select encrypt or decrypt: enc != 0 -> _aesni_enc4 with key offset
	 * 0; enc == 0 -> _aesni_dec4 with the decryption schedule at +240. */
	cmpb $0, %cl
	movl $0, %ecx
	movl $240, %r10d
	leaq _aesni_enc4, %r11
	leaq _aesni_dec4, %rax
	cmovel %r10d, %ecx
	cmoveq %rax, %r11

	movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
	movups (IVP), IV

	mov 480(KEYP), KLEN
	addq %rcx, KEYP

	/* Blocks 0-3: XOR in the tweak, stash each tweak in OUTP as scratch */
	movdqa IV, STATE1
	movdqu 0x00(INP), INC
	pxor INC, STATE1
	movdqu IV, 0x00(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE2
	movdqu 0x10(INP), INC
	pxor INC, STATE2
	movdqu IV, 0x10(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE3
	movdqu 0x20(INP), INC
	pxor INC, STATE3
	movdqu IV, 0x20(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE4
	movdqu 0x30(INP), INC
	pxor INC, STATE4
	movdqu IV, 0x30(OUTP)

	CALL_NOSPEC r11

	/* Post-whiten block 0 with its stashed tweak, emit ciphertext */
	movdqu 0x00(OUTP), INC
	pxor INC, STATE1
	movdqu STATE1, 0x00(OUTP)

	/* Start block 4 while finishing blocks 1-3 (interleaved) */
	_aesni_gf128mul_x_ble()
	movdqa IV, STATE1
	movdqu 0x40(INP), INC
	pxor INC, STATE1
	movdqu IV, 0x40(OUTP)

	movdqu 0x10(OUTP), INC
	pxor INC, STATE2
	movdqu STATE2, 0x10(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE2
	movdqu 0x50(INP), INC
	pxor INC, STATE2
	movdqu IV, 0x50(OUTP)

	movdqu 0x20(OUTP), INC
	pxor INC, STATE3
	movdqu STATE3, 0x20(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE3
	movdqu 0x60(INP), INC
	pxor INC, STATE3
	movdqu IV, 0x60(OUTP)

	movdqu 0x30(OUTP), INC
	pxor INC, STATE4
	movdqu STATE4, 0x30(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE4
	movdqu 0x70(INP), INC
	pxor INC, STATE4
	movdqu IV, 0x70(OUTP)

	_aesni_gf128mul_x_ble()
	movups IV, (IVP)	# save next tweak for the caller

	CALL_NOSPEC r11

	/* Post-whiten blocks 4-7 with their stashed tweaks */
	movdqu 0x40(OUTP), INC
	pxor INC, STATE1
	movdqu STATE1, 0x40(OUTP)

	movdqu 0x50(OUTP), INC
	pxor INC, STATE2
	movdqu STATE2, 0x50(OUTP)

	movdqu 0x60(OUTP), INC
	pxor INC, STATE3
	movdqu STATE3, 0x60(OUTP)

	movdqu 0x70(OUTP), INC
	pxor INC, STATE4
	movdqu STATE4, 0x70(OUTP)

	FRAME_END
	RET
SYM_FUNC_END(aesni_xts_crypt8)
2826
2827#endif