1/*
2 * Implement AES algorithm in Intel AES-NI instructions.
3 *
4 * The white paper of AES-NI instructions can be downloaded from:
5 * http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
6 *
7 * Copyright (C) 2008, Intel Corp.
8 * Author: Huang Ying <ying.huang@intel.com>
9 * Vinodh Gopal <vinodh.gopal@intel.com>
10 * Kahraman Akdemir
11 *
12 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
13 * interface for 64-bit kernels.
14 * Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
15 * Aidan O'Mahony (aidan.o.mahony@intel.com)
16 * Adrian Hoban <adrian.hoban@intel.com>
17 * James Guilford (james.guilford@intel.com)
18 * Gabriele Paoloni <gabriele.paoloni@intel.com>
19 * Tadeusz Struk (tadeusz.struk@intel.com)
20 * Wajdi Feghali (wajdi.k.feghali@intel.com)
21 * Copyright (c) 2010, Intel Corporation.
22 *
23 * Ported x86_64 version to x86:
24 * Author: Mathias Krause <minipli@googlemail.com>
25 *
26 * This program is free software; you can redistribute it and/or modify
27 * it under the terms of the GNU General Public License as published by
28 * the Free Software Foundation; either version 2 of the License, or
29 * (at your option) any later version.
30 */
31
32#include <linux/linkage.h>
33#include <asm/inst.h>
34
35#ifdef __x86_64__
36.data
37POLY: .octa 0xC2000000000000000000000000000001
38TWOONE: .octa 0x00000001000000000000000000000001
39
40# order of these constants should not change.
41# more specifically, ALL_F should follow SHIFT_MASK,
42# and ZERO should follow ALL_F
43
44SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F
45MASK1: .octa 0x0000000000000000ffffffffffffffff
46MASK2: .octa 0xffffffffffffffff0000000000000000
47SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
48ALL_F: .octa 0xffffffffffffffffffffffffffffffff
49ZERO: .octa 0x00000000000000000000000000000000
50ONE: .octa 0x00000000000000000000000000000001
51F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
52dec: .octa 0x1
53enc: .octa 0x2
54
55
56.text
57
58
59#define STACK_OFFSET 8*3
60#define HashKey 16*0 // store HashKey <<1 mod poly here
61#define HashKey_2 16*1 // store HashKey^2 <<1 mod poly here
62#define HashKey_3 16*2 // store HashKey^3 <<1 mod poly here
63#define HashKey_4 16*3 // store HashKey^4 <<1 mod poly here
64#define HashKey_k 16*4 // store XOR of High 64 bits and Low 64
65 // bits of HashKey <<1 mod poly here
66 //(for Karatsuba purposes)
67#define HashKey_2_k 16*5 // store XOR of High 64 bits and Low 64
68 // bits of HashKey^2 <<1 mod poly here
69 // (for Karatsuba purposes)
70#define HashKey_3_k 16*6 // store XOR of High 64 bits and Low 64
71 // bits of HashKey^3 <<1 mod poly here
72 // (for Karatsuba purposes)
73#define HashKey_4_k 16*7 // store XOR of High 64 bits and Low 64
74 // bits of HashKey^4 <<1 mod poly here
75 // (for Karatsuba purposes)
76#define VARIABLE_OFFSET 16*8
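// The offsets above index the scratch area that aesni_gcm_enc/dec carve
// out on the stack (sub $VARIABLE_OFFSET, %rsp, then 64-byte aligned) to
// cache the precomputed HashKey powers for the 4-block parallel GHASH.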
77
78#define arg1 rdi
79#define arg2 rsi
80#define arg3 rdx
81#define arg4 rcx
82#define arg5 r8
83#define arg6 r9
84#define arg7 STACK_OFFSET+8(%r14)
85#define arg8 STACK_OFFSET+16(%r14)
86#define arg9 STACK_OFFSET+24(%r14)
87#define arg10 STACK_OFFSET+32(%r14)
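// arg7-arg10 are the caller's stack arguments; they are addressed through
// %r14, which aesni_gcm_enc/dec set to %rsp right after pushing %r12, %r13
// and %r14 (hence STACK_OFFSET = 8*3).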
88#endif
89
90
91#define STATE1 %xmm0
92#define STATE2 %xmm4
93#define STATE3 %xmm5
94#define STATE4 %xmm6
95#define STATE STATE1
96#define IN1 %xmm1
97#define IN2 %xmm7
98#define IN3 %xmm8
99#define IN4 %xmm9
100#define IN IN1
101#define KEY %xmm2
102#define IV %xmm3
103
104#define BSWAP_MASK %xmm10
105#define CTR %xmm11
106#define INC %xmm12
107
108#ifdef __x86_64__
109#define AREG %rax
110#define KEYP %rdi
111#define OUTP %rsi
112#define UKEYP OUTP
113#define INP %rdx
114#define LEN %rcx
115#define IVP %r8
116#define KLEN %r9d
117#define T1 %r10
118#define TKEYP T1
119#define T2 %r11
120#define TCTR_LOW T2
121#else
122#define AREG %eax
123#define KEYP %edi
124#define OUTP AREG
125#define UKEYP OUTP
126#define INP %edx
127#define LEN %esi
128#define IVP %ebp
129#define KLEN %ebx
130#define T1 %ecx
131#define TKEYP T1
132#endif
133
134
135#ifdef __x86_64__
136/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
137*
138*
139* Input: A and B (128-bits each, bit-reflected)
140* Output: C = A*B*x mod poly, (i.e. >>1 )
141* To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
142* GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
143*
144*/
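/*
* Karatsuba outline (as implemented below): split GH = GH1:GH0 and
* HK = HK1:HK0 into 64-bit halves.  The 256-bit carry-less product is
*
*	GH*HK = (GH1*HK1)<<128 xor (GH0*HK0) xor (middle)<<64, where
*	middle = (GH1 xor GH0)*(HK1 xor HK0) xor GH1*HK1 xor GH0*HK0
*
* so only three PCLMULQDQ operations are needed.  The two reduction
* phases below then fold the 256-bit product back to 128 bits modulo
* x^128 + x^127 + x^126 + x^121 + 1.
*/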
145.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
146 movdqa \GH, \TMP1
147 pshufd $78, \GH, \TMP2
148 pshufd $78, \HK, \TMP3
149 pxor \GH, \TMP2 # TMP2 = a1+a0
150 pxor \HK, \TMP3 # TMP3 = b1+b0
151 PCLMULQDQ 0x11, \HK, \TMP1 # TMP1 = a1*b1
152 PCLMULQDQ 0x00, \HK, \GH # GH = a0*b0
153 PCLMULQDQ 0x00, \TMP3, \TMP2 # TMP2 = (a0+a1)*(b1+b0)
154 pxor \GH, \TMP2
	pxor      \TMP1, \TMP2            # TMP2 = a1*b0 + a0*b1 (middle term)
156 movdqa \TMP2, \TMP3
157 pslldq $8, \TMP3 # left shift TMP3 2 DWs
158 psrldq $8, \TMP2 # right shift TMP2 2 DWs
159 pxor \TMP3, \GH
	pxor      \TMP2, \TMP1            # TMP1:GH holds the result of GH*HK
161
162 # first phase of the reduction
163
164 movdqa \GH, \TMP2
165 movdqa \GH, \TMP3
	movdqa    \GH, \TMP4              # copy GH into TMP2, TMP3 and TMP4
	                                  # in order to perform
	                                  # independent shifts
169 pslld $31, \TMP2 # packed right shift <<31
170 pslld $30, \TMP3 # packed right shift <<30
171 pslld $25, \TMP4 # packed right shift <<25
172 pxor \TMP3, \TMP2 # xor the shifted versions
173 pxor \TMP4, \TMP2
174 movdqa \TMP2, \TMP5
175 psrldq $4, \TMP5 # right shift TMP5 1 DW
176 pslldq $12, \TMP2 # left shift TMP2 3 DWs
177 pxor \TMP2, \GH
178
179 # second phase of the reduction
180
	movdqa    \GH,\TMP2               # copy GH into TMP2, TMP3 and TMP4
	                                  # in order to perform
	                                  # independent shifts
184 movdqa \GH,\TMP3
185 movdqa \GH,\TMP4
186 psrld $1,\TMP2 # packed left shift >>1
187 psrld $2,\TMP3 # packed left shift >>2
188 psrld $7,\TMP4 # packed left shift >>7
189 pxor \TMP3,\TMP2 # xor the shifted versions
190 pxor \TMP4,\TMP2
191 pxor \TMP5, \TMP2
192 pxor \TMP2, \GH
	pxor      \TMP1, \GH              # result is in GH
194.endm
195
196/*
197* if a = number of total plaintext bytes
198* b = floor(a/16)
199* num_initial_blocks = b mod 4
* decrypt the initial num_initial_blocks blocks and apply GHASH on
201* the ciphertext
202* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
203* are clobbered
* arg1, %arg2, %arg3, %r14 are used as pointers only, not modified
205*/
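/*
* Example: for a 100-byte payload, b = floor(100/16) = 6 full blocks,
* so num_initial_blocks = 6 mod 4 = 2 and the remaining blocks are
* handled by the 4-block parallel loop.
*/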
206
207
208.macro INITIAL_BLOCKS_DEC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
209XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
210 mov arg7, %r10 # %r10 = AAD
211 mov arg8, %r12 # %r12 = aadLen
212 mov %r12, %r11
213 pxor %xmm\i, %xmm\i
214_get_AAD_loop\num_initial_blocks\operation:
215 movd (%r10), \TMP1
216 pslldq $12, \TMP1
217 psrldq $4, %xmm\i
218 pxor \TMP1, %xmm\i
219 add $4, %r10
220 sub $4, %r12
221 jne _get_AAD_loop\num_initial_blocks\operation
222 cmp $16, %r11
223 je _get_AAD_loop2_done\num_initial_blocks\operation
224 mov $16, %r12
225_get_AAD_loop2\num_initial_blocks\operation:
226 psrldq $4, %xmm\i
227 sub $4, %r12
228 cmp %r11, %r12
229 jne _get_AAD_loop2\num_initial_blocks\operation
230_get_AAD_loop2_done\num_initial_blocks\operation:
231 movdqa SHUF_MASK(%rip), %xmm14
232 PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
233
234 xor %r11, %r11 # initialise the data pointer offset as zero
235
236 # start AES for num_initial_blocks blocks
237
238 mov %arg5, %rax # %rax = *Y0
239 movdqu (%rax), \XMM0 # XMM0 = Y0
240 movdqa SHUF_MASK(%rip), %xmm14
241 PSHUFB_XMM %xmm14, \XMM0
242
243.if (\i == 5) || (\i == 6) || (\i == 7)
244.irpc index, \i_seq
245 paddd ONE(%rip), \XMM0 # INCR Y0
246 movdqa \XMM0, %xmm\index
247 movdqa SHUF_MASK(%rip), %xmm14
248 PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap
249
250.endr
251.irpc index, \i_seq
252 pxor 16*0(%arg1), %xmm\index
253.endr
254.irpc index, \i_seq
255 movaps 0x10(%rdi), \TMP1
256 AESENC \TMP1, %xmm\index # Round 1
257.endr
258.irpc index, \i_seq
259 movaps 0x20(%arg1), \TMP1
260 AESENC \TMP1, %xmm\index # Round 2
261.endr
262.irpc index, \i_seq
263 movaps 0x30(%arg1), \TMP1
	AESENC     \TMP1, %xmm\index          # Round 3
265.endr
266.irpc index, \i_seq
267 movaps 0x40(%arg1), \TMP1
	AESENC     \TMP1, %xmm\index          # Round 4
269.endr
270.irpc index, \i_seq
271 movaps 0x50(%arg1), \TMP1
	AESENC     \TMP1, %xmm\index          # Round 5
273.endr
274.irpc index, \i_seq
275 movaps 0x60(%arg1), \TMP1
	AESENC     \TMP1, %xmm\index          # Round 6
277.endr
278.irpc index, \i_seq
279 movaps 0x70(%arg1), \TMP1
	AESENC     \TMP1, %xmm\index          # Round 7
281.endr
282.irpc index, \i_seq
283 movaps 0x80(%arg1), \TMP1
	AESENC     \TMP1, %xmm\index          # Round 8
285.endr
286.irpc index, \i_seq
287 movaps 0x90(%arg1), \TMP1
	AESENC     \TMP1, %xmm\index          # Round 9
289.endr
290.irpc index, \i_seq
291 movaps 0xa0(%arg1), \TMP1
292 AESENCLAST \TMP1, %xmm\index # Round 10
293.endr
294.irpc index, \i_seq
295 movdqu (%arg3 , %r11, 1), \TMP1
296 pxor \TMP1, %xmm\index
297 movdqu %xmm\index, (%arg2 , %r11, 1)
298 # write back plaintext/ciphertext for num_initial_blocks
299 add $16, %r11
300
301 movdqa \TMP1, %xmm\index
302 movdqa SHUF_MASK(%rip), %xmm14
303 PSHUFB_XMM %xmm14, %xmm\index
304
305 # prepare plaintext/ciphertext for GHASH computation
306.endr
307.endif
308 GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
309 # apply GHASH on num_initial_blocks blocks
310
311.if \i == 5
312 pxor %xmm5, %xmm6
313 GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
314 pxor %xmm6, %xmm7
315 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
316 pxor %xmm7, %xmm8
317 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
318.elseif \i == 6
319 pxor %xmm6, %xmm7
320 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
321 pxor %xmm7, %xmm8
322 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
323.elseif \i == 7
324 pxor %xmm7, %xmm8
325 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
326.endif
327 cmp $64, %r13
328 jl _initial_blocks_done\num_initial_blocks\operation
329 # no need for precomputed values
330/*
331*
332* Precomputations for HashKey parallel with encryption of first 4 blocks.
* HashKey_i_k holds the XORed values of the low and high parts of HashKey_i
334*/
335 paddd ONE(%rip), \XMM0 # INCR Y0
336 movdqa \XMM0, \XMM1
337 movdqa SHUF_MASK(%rip), %xmm14
338 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
339
340 paddd ONE(%rip), \XMM0 # INCR Y0
341 movdqa \XMM0, \XMM2
342 movdqa SHUF_MASK(%rip), %xmm14
343 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
344
345 paddd ONE(%rip), \XMM0 # INCR Y0
346 movdqa \XMM0, \XMM3
347 movdqa SHUF_MASK(%rip), %xmm14
348 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
349
350 paddd ONE(%rip), \XMM0 # INCR Y0
351 movdqa \XMM0, \XMM4
352 movdqa SHUF_MASK(%rip), %xmm14
353 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
354
355 pxor 16*0(%arg1), \XMM1
356 pxor 16*0(%arg1), \XMM2
357 pxor 16*0(%arg1), \XMM3
358 pxor 16*0(%arg1), \XMM4
359 movdqa \TMP3, \TMP5
360 pshufd $78, \TMP3, \TMP1
361 pxor \TMP3, \TMP1
362 movdqa \TMP1, HashKey_k(%rsp)
363 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
364# TMP5 = HashKey^2<<1 (mod poly)
365 movdqa \TMP5, HashKey_2(%rsp)
366# HashKey_2 = HashKey^2<<1 (mod poly)
367 pshufd $78, \TMP5, \TMP1
368 pxor \TMP5, \TMP1
369 movdqa \TMP1, HashKey_2_k(%rsp)
370.irpc index, 1234 # do 4 rounds
371 movaps 0x10*\index(%arg1), \TMP1
372 AESENC \TMP1, \XMM1
373 AESENC \TMP1, \XMM2
374 AESENC \TMP1, \XMM3
375 AESENC \TMP1, \XMM4
376.endr
377 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
378# TMP5 = HashKey^3<<1 (mod poly)
379 movdqa \TMP5, HashKey_3(%rsp)
380 pshufd $78, \TMP5, \TMP1
381 pxor \TMP5, \TMP1
382 movdqa \TMP1, HashKey_3_k(%rsp)
383.irpc index, 56789 # do next 5 rounds
384 movaps 0x10*\index(%arg1), \TMP1
385 AESENC \TMP1, \XMM1
386 AESENC \TMP1, \XMM2
387 AESENC \TMP1, \XMM3
388 AESENC \TMP1, \XMM4
389.endr
390 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^4<<1 (mod poly)
392 movdqa \TMP5, HashKey_4(%rsp)
393 pshufd $78, \TMP5, \TMP1
394 pxor \TMP5, \TMP1
395 movdqa \TMP1, HashKey_4_k(%rsp)
396 movaps 0xa0(%arg1), \TMP2
397 AESENCLAST \TMP2, \XMM1
398 AESENCLAST \TMP2, \XMM2
399 AESENCLAST \TMP2, \XMM3
400 AESENCLAST \TMP2, \XMM4
401 movdqu 16*0(%arg3 , %r11 , 1), \TMP1
402 pxor \TMP1, \XMM1
403 movdqu \XMM1, 16*0(%arg2 , %r11 , 1)
404 movdqa \TMP1, \XMM1
405 movdqu 16*1(%arg3 , %r11 , 1), \TMP1
406 pxor \TMP1, \XMM2
407 movdqu \XMM2, 16*1(%arg2 , %r11 , 1)
408 movdqa \TMP1, \XMM2
409 movdqu 16*2(%arg3 , %r11 , 1), \TMP1
410 pxor \TMP1, \XMM3
411 movdqu \XMM3, 16*2(%arg2 , %r11 , 1)
412 movdqa \TMP1, \XMM3
413 movdqu 16*3(%arg3 , %r11 , 1), \TMP1
414 pxor \TMP1, \XMM4
415 movdqu \XMM4, 16*3(%arg2 , %r11 , 1)
416 movdqa \TMP1, \XMM4
417 add $64, %r11
418 movdqa SHUF_MASK(%rip), %xmm14
419 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
420 pxor \XMMDst, \XMM1
421# combine GHASHed value with the corresponding ciphertext
422 movdqa SHUF_MASK(%rip), %xmm14
423 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
424 movdqa SHUF_MASK(%rip), %xmm14
425 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
426 movdqa SHUF_MASK(%rip), %xmm14
427 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
428
429_initial_blocks_done\num_initial_blocks\operation:
430
431.endm
432
433
434/*
435* if a = number of total plaintext bytes
436* b = floor(a/16)
437* num_initial_blocks = b mod 4
438* encrypt the initial num_initial_blocks blocks and apply ghash on
439* the ciphertext
440* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
441* are clobbered
* arg1, %arg2, %arg3, %r14 are used as pointers only, not modified
443*/
444
445
446.macro INITIAL_BLOCKS_ENC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
447XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
448 mov arg7, %r10 # %r10 = AAD
449 mov arg8, %r12 # %r12 = aadLen
450 mov %r12, %r11
451 pxor %xmm\i, %xmm\i
452_get_AAD_loop\num_initial_blocks\operation:
453 movd (%r10), \TMP1
454 pslldq $12, \TMP1
455 psrldq $4, %xmm\i
456 pxor \TMP1, %xmm\i
457 add $4, %r10
458 sub $4, %r12
459 jne _get_AAD_loop\num_initial_blocks\operation
460 cmp $16, %r11
461 je _get_AAD_loop2_done\num_initial_blocks\operation
462 mov $16, %r12
463_get_AAD_loop2\num_initial_blocks\operation:
464 psrldq $4, %xmm\i
465 sub $4, %r12
466 cmp %r11, %r12
467 jne _get_AAD_loop2\num_initial_blocks\operation
468_get_AAD_loop2_done\num_initial_blocks\operation:
469 movdqa SHUF_MASK(%rip), %xmm14
470 PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
471
472 xor %r11, %r11 # initialise the data pointer offset as zero
473
474 # start AES for num_initial_blocks blocks
475
476 mov %arg5, %rax # %rax = *Y0
477 movdqu (%rax), \XMM0 # XMM0 = Y0
478 movdqa SHUF_MASK(%rip), %xmm14
479 PSHUFB_XMM %xmm14, \XMM0
480
481.if (\i == 5) || (\i == 6) || (\i == 7)
482.irpc index, \i_seq
483 paddd ONE(%rip), \XMM0 # INCR Y0
484 movdqa \XMM0, %xmm\index
485 movdqa SHUF_MASK(%rip), %xmm14
486 PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap
487
488.endr
489.irpc index, \i_seq
490 pxor 16*0(%arg1), %xmm\index
491.endr
492.irpc index, \i_seq
493 movaps 0x10(%rdi), \TMP1
494 AESENC \TMP1, %xmm\index # Round 1
495.endr
496.irpc index, \i_seq
497 movaps 0x20(%arg1), \TMP1
498 AESENC \TMP1, %xmm\index # Round 2
499.endr
500.irpc index, \i_seq
501 movaps 0x30(%arg1), \TMP1
	AESENC     \TMP1, %xmm\index          # Round 3
503.endr
504.irpc index, \i_seq
505 movaps 0x40(%arg1), \TMP1
	AESENC     \TMP1, %xmm\index          # Round 4
507.endr
508.irpc index, \i_seq
509 movaps 0x50(%arg1), \TMP1
	AESENC     \TMP1, %xmm\index          # Round 5
511.endr
512.irpc index, \i_seq
513 movaps 0x60(%arg1), \TMP1
	AESENC     \TMP1, %xmm\index          # Round 6
515.endr
516.irpc index, \i_seq
517 movaps 0x70(%arg1), \TMP1
	AESENC     \TMP1, %xmm\index          # Round 7
519.endr
520.irpc index, \i_seq
521 movaps 0x80(%arg1), \TMP1
	AESENC     \TMP1, %xmm\index          # Round 8
523.endr
524.irpc index, \i_seq
525 movaps 0x90(%arg1), \TMP1
	AESENC     \TMP1, %xmm\index          # Round 9
527.endr
528.irpc index, \i_seq
529 movaps 0xa0(%arg1), \TMP1
530 AESENCLAST \TMP1, %xmm\index # Round 10
531.endr
532.irpc index, \i_seq
533 movdqu (%arg3 , %r11, 1), \TMP1
534 pxor \TMP1, %xmm\index
535 movdqu %xmm\index, (%arg2 , %r11, 1)
536 # write back plaintext/ciphertext for num_initial_blocks
537 add $16, %r11
538
539 movdqa SHUF_MASK(%rip), %xmm14
540 PSHUFB_XMM %xmm14, %xmm\index
541
542 # prepare plaintext/ciphertext for GHASH computation
543.endr
544.endif
545 GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
546 # apply GHASH on num_initial_blocks blocks
547
548.if \i == 5
549 pxor %xmm5, %xmm6
550 GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
551 pxor %xmm6, %xmm7
552 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
553 pxor %xmm7, %xmm8
554 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
555.elseif \i == 6
556 pxor %xmm6, %xmm7
557 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
558 pxor %xmm7, %xmm8
559 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
560.elseif \i == 7
561 pxor %xmm7, %xmm8
562 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
563.endif
564 cmp $64, %r13
565 jl _initial_blocks_done\num_initial_blocks\operation
566 # no need for precomputed values
567/*
568*
569* Precomputations for HashKey parallel with encryption of first 4 blocks.
* HashKey_i_k holds the XORed values of the low and high parts of HashKey_i
571*/
572 paddd ONE(%rip), \XMM0 # INCR Y0
573 movdqa \XMM0, \XMM1
574 movdqa SHUF_MASK(%rip), %xmm14
575 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
576
577 paddd ONE(%rip), \XMM0 # INCR Y0
578 movdqa \XMM0, \XMM2
579 movdqa SHUF_MASK(%rip), %xmm14
580 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
581
582 paddd ONE(%rip), \XMM0 # INCR Y0
583 movdqa \XMM0, \XMM3
584 movdqa SHUF_MASK(%rip), %xmm14
585 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
586
587 paddd ONE(%rip), \XMM0 # INCR Y0
588 movdqa \XMM0, \XMM4
589 movdqa SHUF_MASK(%rip), %xmm14
590 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
591
592 pxor 16*0(%arg1), \XMM1
593 pxor 16*0(%arg1), \XMM2
594 pxor 16*0(%arg1), \XMM3
595 pxor 16*0(%arg1), \XMM4
596 movdqa \TMP3, \TMP5
597 pshufd $78, \TMP3, \TMP1
598 pxor \TMP3, \TMP1
599 movdqa \TMP1, HashKey_k(%rsp)
600 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
601# TMP5 = HashKey^2<<1 (mod poly)
602 movdqa \TMP5, HashKey_2(%rsp)
603# HashKey_2 = HashKey^2<<1 (mod poly)
604 pshufd $78, \TMP5, \TMP1
605 pxor \TMP5, \TMP1
606 movdqa \TMP1, HashKey_2_k(%rsp)
607.irpc index, 1234 # do 4 rounds
608 movaps 0x10*\index(%arg1), \TMP1
609 AESENC \TMP1, \XMM1
610 AESENC \TMP1, \XMM2
611 AESENC \TMP1, \XMM3
612 AESENC \TMP1, \XMM4
613.endr
614 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
615# TMP5 = HashKey^3<<1 (mod poly)
616 movdqa \TMP5, HashKey_3(%rsp)
617 pshufd $78, \TMP5, \TMP1
618 pxor \TMP5, \TMP1
619 movdqa \TMP1, HashKey_3_k(%rsp)
620.irpc index, 56789 # do next 5 rounds
621 movaps 0x10*\index(%arg1), \TMP1
622 AESENC \TMP1, \XMM1
623 AESENC \TMP1, \XMM2
624 AESENC \TMP1, \XMM3
625 AESENC \TMP1, \XMM4
626.endr
627 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^4<<1 (mod poly)
629 movdqa \TMP5, HashKey_4(%rsp)
630 pshufd $78, \TMP5, \TMP1
631 pxor \TMP5, \TMP1
632 movdqa \TMP1, HashKey_4_k(%rsp)
633 movaps 0xa0(%arg1), \TMP2
634 AESENCLAST \TMP2, \XMM1
635 AESENCLAST \TMP2, \XMM2
636 AESENCLAST \TMP2, \XMM3
637 AESENCLAST \TMP2, \XMM4
638 movdqu 16*0(%arg3 , %r11 , 1), \TMP1
639 pxor \TMP1, \XMM1
640 movdqu 16*1(%arg3 , %r11 , 1), \TMP1
641 pxor \TMP1, \XMM2
642 movdqu 16*2(%arg3 , %r11 , 1), \TMP1
643 pxor \TMP1, \XMM3
644 movdqu 16*3(%arg3 , %r11 , 1), \TMP1
645 pxor \TMP1, \XMM4
646 movdqu \XMM1, 16*0(%arg2 , %r11 , 1)
647 movdqu \XMM2, 16*1(%arg2 , %r11 , 1)
648 movdqu \XMM3, 16*2(%arg2 , %r11 , 1)
649 movdqu \XMM4, 16*3(%arg2 , %r11 , 1)
650
651 add $64, %r11
652 movdqa SHUF_MASK(%rip), %xmm14
653 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
654 pxor \XMMDst, \XMM1
655# combine GHASHed value with the corresponding ciphertext
656 movdqa SHUF_MASK(%rip), %xmm14
657 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
658 movdqa SHUF_MASK(%rip), %xmm14
659 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
660 movdqa SHUF_MASK(%rip), %xmm14
661 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
662
663_initial_blocks_done\num_initial_blocks\operation:
664
665.endm
666
667/*
668* encrypt 4 blocks at a time
669* ghash the 4 previously encrypted ciphertext blocks
670* arg1, %arg2, %arg3 are used as pointers only, not modified
671* %r11 is the data offset value
672*/
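/*
* The AES rounds for the next four counter blocks are interleaved with
* the PCLMULQDQ operations that GHASH the previous four ciphertext
* blocks, so the two instruction streams can execute in parallel.
*/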
673.macro GHASH_4_ENCRYPT_4_PARALLEL_ENC TMP1 TMP2 TMP3 TMP4 TMP5 \
674TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
675
676 movdqa \XMM1, \XMM5
677 movdqa \XMM2, \XMM6
678 movdqa \XMM3, \XMM7
679 movdqa \XMM4, \XMM8
680
681 movdqa SHUF_MASK(%rip), %xmm15
682 # multiply TMP5 * HashKey using karatsuba
683
684 movdqa \XMM5, \TMP4
685 pshufd $78, \XMM5, \TMP6
686 pxor \XMM5, \TMP6
687 paddd ONE(%rip), \XMM0 # INCR CNT
688 movdqa HashKey_4(%rsp), \TMP5
689 PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1
690 movdqa \XMM0, \XMM1
691 paddd ONE(%rip), \XMM0 # INCR CNT
692 movdqa \XMM0, \XMM2
693 paddd ONE(%rip), \XMM0 # INCR CNT
694 movdqa \XMM0, \XMM3
695 paddd ONE(%rip), \XMM0 # INCR CNT
696 movdqa \XMM0, \XMM4
697 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
698 PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0
699 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
700 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
701 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
702
703 pxor (%arg1), \XMM1
704 pxor (%arg1), \XMM2
705 pxor (%arg1), \XMM3
706 pxor (%arg1), \XMM4
707 movdqa HashKey_4_k(%rsp), \TMP5
708 PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
709 movaps 0x10(%arg1), \TMP1
710 AESENC \TMP1, \XMM1 # Round 1
711 AESENC \TMP1, \XMM2
712 AESENC \TMP1, \XMM3
713 AESENC \TMP1, \XMM4
714 movaps 0x20(%arg1), \TMP1
715 AESENC \TMP1, \XMM1 # Round 2
716 AESENC \TMP1, \XMM2
717 AESENC \TMP1, \XMM3
718 AESENC \TMP1, \XMM4
719 movdqa \XMM6, \TMP1
720 pshufd $78, \XMM6, \TMP2
721 pxor \XMM6, \TMP2
722 movdqa HashKey_3(%rsp), \TMP5
723 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
724 movaps 0x30(%arg1), \TMP3
725 AESENC \TMP3, \XMM1 # Round 3
726 AESENC \TMP3, \XMM2
727 AESENC \TMP3, \XMM3
728 AESENC \TMP3, \XMM4
729 PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0
730 movaps 0x40(%arg1), \TMP3
731 AESENC \TMP3, \XMM1 # Round 4
732 AESENC \TMP3, \XMM2
733 AESENC \TMP3, \XMM3
734 AESENC \TMP3, \XMM4
735 movdqa HashKey_3_k(%rsp), \TMP5
736 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
737 movaps 0x50(%arg1), \TMP3
738 AESENC \TMP3, \XMM1 # Round 5
739 AESENC \TMP3, \XMM2
740 AESENC \TMP3, \XMM3
741 AESENC \TMP3, \XMM4
742 pxor \TMP1, \TMP4
743# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
744 pxor \XMM6, \XMM5
745 pxor \TMP2, \TMP6
746 movdqa \XMM7, \TMP1
747 pshufd $78, \XMM7, \TMP2
748 pxor \XMM7, \TMP2
749 movdqa HashKey_2(%rsp ), \TMP5
750
751 # Multiply TMP5 * HashKey using karatsuba
752
753 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
754 movaps 0x60(%arg1), \TMP3
755 AESENC \TMP3, \XMM1 # Round 6
756 AESENC \TMP3, \XMM2
757 AESENC \TMP3, \XMM3
758 AESENC \TMP3, \XMM4
759 PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0
760 movaps 0x70(%arg1), \TMP3
761 AESENC \TMP3, \XMM1 # Round 7
762 AESENC \TMP3, \XMM2
763 AESENC \TMP3, \XMM3
764 AESENC \TMP3, \XMM4
765 movdqa HashKey_2_k(%rsp), \TMP5
766 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
767 movaps 0x80(%arg1), \TMP3
768 AESENC \TMP3, \XMM1 # Round 8
769 AESENC \TMP3, \XMM2
770 AESENC \TMP3, \XMM3
771 AESENC \TMP3, \XMM4
772 pxor \TMP1, \TMP4
773# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
774 pxor \XMM7, \XMM5
775 pxor \TMP2, \TMP6
776
777 # Multiply XMM8 * HashKey
778 # XMM8 and TMP5 hold the values for the two operands
779
780 movdqa \XMM8, \TMP1
781 pshufd $78, \XMM8, \TMP2
782 pxor \XMM8, \TMP2
783 movdqa HashKey(%rsp), \TMP5
784 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
785 movaps 0x90(%arg1), \TMP3
786 AESENC \TMP3, \XMM1 # Round 9
787 AESENC \TMP3, \XMM2
788 AESENC \TMP3, \XMM3
789 AESENC \TMP3, \XMM4
790 PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0
791 movaps 0xa0(%arg1), \TMP3
792 AESENCLAST \TMP3, \XMM1 # Round 10
793 AESENCLAST \TMP3, \XMM2
794 AESENCLAST \TMP3, \XMM3
795 AESENCLAST \TMP3, \XMM4
796 movdqa HashKey_k(%rsp), \TMP5
797 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
798 movdqu (%arg3,%r11,1), \TMP3
799 pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
800 movdqu 16(%arg3,%r11,1), \TMP3
801 pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK
802 movdqu 32(%arg3,%r11,1), \TMP3
803 pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK
804 movdqu 48(%arg3,%r11,1), \TMP3
805 pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK
806 movdqu \XMM1, (%arg2,%r11,1) # Write to the ciphertext buffer
807 movdqu \XMM2, 16(%arg2,%r11,1) # Write to the ciphertext buffer
808 movdqu \XMM3, 32(%arg2,%r11,1) # Write to the ciphertext buffer
809 movdqu \XMM4, 48(%arg2,%r11,1) # Write to the ciphertext buffer
810 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
811 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
812 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
813 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
814
815 pxor \TMP4, \TMP1
816 pxor \XMM8, \XMM5
817 pxor \TMP6, \TMP2
818 pxor \TMP1, \TMP2
819 pxor \XMM5, \TMP2
820 movdqa \TMP2, \TMP3
821 pslldq $8, \TMP3 # left shift TMP3 2 DWs
822 psrldq $8, \TMP2 # right shift TMP2 2 DWs
823 pxor \TMP3, \XMM5
824 pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5
825
826 # first phase of reduction
827
828 movdqa \XMM5, \TMP2
829 movdqa \XMM5, \TMP3
830 movdqa \XMM5, \TMP4
831# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
832 pslld $31, \TMP2 # packed right shift << 31
833 pslld $30, \TMP3 # packed right shift << 30
834 pslld $25, \TMP4 # packed right shift << 25
835 pxor \TMP3, \TMP2 # xor the shifted versions
836 pxor \TMP4, \TMP2
837 movdqa \TMP2, \TMP5
838 psrldq $4, \TMP5 # right shift T5 1 DW
839 pslldq $12, \TMP2 # left shift T2 3 DWs
840 pxor \TMP2, \XMM5
841
842 # second phase of reduction
843
844 movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
845 movdqa \XMM5,\TMP3
846 movdqa \XMM5,\TMP4
847 psrld $1, \TMP2 # packed left shift >>1
848 psrld $2, \TMP3 # packed left shift >>2
849 psrld $7, \TMP4 # packed left shift >>7
850 pxor \TMP3,\TMP2 # xor the shifted versions
851 pxor \TMP4,\TMP2
852 pxor \TMP5, \TMP2
853 pxor \TMP2, \XMM5
	pxor	  \TMP1, \XMM5            # result is in XMM5
855
856 pxor \XMM5, \XMM1
857.endm
858
859/*
860* decrypt 4 blocks at a time
861* ghash the 4 previously decrypted ciphertext blocks
862* arg1, %arg2, %arg3 are used as pointers only, not modified
863* %r11 is the data offset value
864*/
865.macro GHASH_4_ENCRYPT_4_PARALLEL_DEC TMP1 TMP2 TMP3 TMP4 TMP5 \
866TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
867
868 movdqa \XMM1, \XMM5
869 movdqa \XMM2, \XMM6
870 movdqa \XMM3, \XMM7
871 movdqa \XMM4, \XMM8
872
873 movdqa SHUF_MASK(%rip), %xmm15
874 # multiply TMP5 * HashKey using karatsuba
875
876 movdqa \XMM5, \TMP4
877 pshufd $78, \XMM5, \TMP6
878 pxor \XMM5, \TMP6
879 paddd ONE(%rip), \XMM0 # INCR CNT
880 movdqa HashKey_4(%rsp), \TMP5
881 PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1
882 movdqa \XMM0, \XMM1
883 paddd ONE(%rip), \XMM0 # INCR CNT
884 movdqa \XMM0, \XMM2
885 paddd ONE(%rip), \XMM0 # INCR CNT
886 movdqa \XMM0, \XMM3
887 paddd ONE(%rip), \XMM0 # INCR CNT
888 movdqa \XMM0, \XMM4
889 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
890 PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0
891 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
892 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
893 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
894
895 pxor (%arg1), \XMM1
896 pxor (%arg1), \XMM2
897 pxor (%arg1), \XMM3
898 pxor (%arg1), \XMM4
899 movdqa HashKey_4_k(%rsp), \TMP5
900 PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
901 movaps 0x10(%arg1), \TMP1
902 AESENC \TMP1, \XMM1 # Round 1
903 AESENC \TMP1, \XMM2
904 AESENC \TMP1, \XMM3
905 AESENC \TMP1, \XMM4
906 movaps 0x20(%arg1), \TMP1
907 AESENC \TMP1, \XMM1 # Round 2
908 AESENC \TMP1, \XMM2
909 AESENC \TMP1, \XMM3
910 AESENC \TMP1, \XMM4
911 movdqa \XMM6, \TMP1
912 pshufd $78, \XMM6, \TMP2
913 pxor \XMM6, \TMP2
914 movdqa HashKey_3(%rsp), \TMP5
915 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
916 movaps 0x30(%arg1), \TMP3
917 AESENC \TMP3, \XMM1 # Round 3
918 AESENC \TMP3, \XMM2
919 AESENC \TMP3, \XMM3
920 AESENC \TMP3, \XMM4
921 PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0
922 movaps 0x40(%arg1), \TMP3
923 AESENC \TMP3, \XMM1 # Round 4
924 AESENC \TMP3, \XMM2
925 AESENC \TMP3, \XMM3
926 AESENC \TMP3, \XMM4
927 movdqa HashKey_3_k(%rsp), \TMP5
928 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
929 movaps 0x50(%arg1), \TMP3
930 AESENC \TMP3, \XMM1 # Round 5
931 AESENC \TMP3, \XMM2
932 AESENC \TMP3, \XMM3
933 AESENC \TMP3, \XMM4
934 pxor \TMP1, \TMP4
935# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
936 pxor \XMM6, \XMM5
937 pxor \TMP2, \TMP6
938 movdqa \XMM7, \TMP1
939 pshufd $78, \XMM7, \TMP2
940 pxor \XMM7, \TMP2
941 movdqa HashKey_2(%rsp ), \TMP5
942
943 # Multiply TMP5 * HashKey using karatsuba
944
945 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
946 movaps 0x60(%arg1), \TMP3
947 AESENC \TMP3, \XMM1 # Round 6
948 AESENC \TMP3, \XMM2
949 AESENC \TMP3, \XMM3
950 AESENC \TMP3, \XMM4
951 PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0
952 movaps 0x70(%arg1), \TMP3
953 AESENC \TMP3, \XMM1 # Round 7
954 AESENC \TMP3, \XMM2
955 AESENC \TMP3, \XMM3
956 AESENC \TMP3, \XMM4
957 movdqa HashKey_2_k(%rsp), \TMP5
958 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
959 movaps 0x80(%arg1), \TMP3
960 AESENC \TMP3, \XMM1 # Round 8
961 AESENC \TMP3, \XMM2
962 AESENC \TMP3, \XMM3
963 AESENC \TMP3, \XMM4
964 pxor \TMP1, \TMP4
965# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
966 pxor \XMM7, \XMM5
967 pxor \TMP2, \TMP6
968
969 # Multiply XMM8 * HashKey
970 # XMM8 and TMP5 hold the values for the two operands
971
972 movdqa \XMM8, \TMP1
973 pshufd $78, \XMM8, \TMP2
974 pxor \XMM8, \TMP2
975 movdqa HashKey(%rsp), \TMP5
976 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
977 movaps 0x90(%arg1), \TMP3
978 AESENC \TMP3, \XMM1 # Round 9
979 AESENC \TMP3, \XMM2
980 AESENC \TMP3, \XMM3
981 AESENC \TMP3, \XMM4
982 PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0
983 movaps 0xa0(%arg1), \TMP3
984 AESENCLAST \TMP3, \XMM1 # Round 10
985 AESENCLAST \TMP3, \XMM2
986 AESENCLAST \TMP3, \XMM3
987 AESENCLAST \TMP3, \XMM4
988 movdqa HashKey_k(%rsp), \TMP5
989 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
990 movdqu (%arg3,%r11,1), \TMP3
991 pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
992 movdqu \XMM1, (%arg2,%r11,1) # Write to plaintext buffer
993 movdqa \TMP3, \XMM1
994 movdqu 16(%arg3,%r11,1), \TMP3
995 pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK
996 movdqu \XMM2, 16(%arg2,%r11,1) # Write to plaintext buffer
997 movdqa \TMP3, \XMM2
998 movdqu 32(%arg3,%r11,1), \TMP3
999 pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK
1000 movdqu \XMM3, 32(%arg2,%r11,1) # Write to plaintext buffer
1001 movdqa \TMP3, \XMM3
1002 movdqu 48(%arg3,%r11,1), \TMP3
1003 pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK
1004 movdqu \XMM4, 48(%arg2,%r11,1) # Write to plaintext buffer
1005 movdqa \TMP3, \XMM4
1006 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
1007 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
1008 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
1009 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
1010
1011 pxor \TMP4, \TMP1
1012 pxor \XMM8, \XMM5
1013 pxor \TMP6, \TMP2
1014 pxor \TMP1, \TMP2
1015 pxor \XMM5, \TMP2
1016 movdqa \TMP2, \TMP3
1017 pslldq $8, \TMP3 # left shift TMP3 2 DWs
1018 psrldq $8, \TMP2 # right shift TMP2 2 DWs
1019 pxor \TMP3, \XMM5
1020 pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5
1021
1022 # first phase of reduction
1023
1024 movdqa \XMM5, \TMP2
1025 movdqa \XMM5, \TMP3
1026 movdqa \XMM5, \TMP4
1027# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
1028 pslld $31, \TMP2 # packed right shift << 31
1029 pslld $30, \TMP3 # packed right shift << 30
1030 pslld $25, \TMP4 # packed right shift << 25
1031 pxor \TMP3, \TMP2 # xor the shifted versions
1032 pxor \TMP4, \TMP2
1033 movdqa \TMP2, \TMP5
1034 psrldq $4, \TMP5 # right shift T5 1 DW
1035 pslldq $12, \TMP2 # left shift T2 3 DWs
1036 pxor \TMP2, \XMM5
1037
1038 # second phase of reduction
1039
1040 movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
1041 movdqa \XMM5,\TMP3
1042 movdqa \XMM5,\TMP4
1043 psrld $1, \TMP2 # packed left shift >>1
1044 psrld $2, \TMP3 # packed left shift >>2
1045 psrld $7, \TMP4 # packed left shift >>7
1046 pxor \TMP3,\TMP2 # xor the shifted versions
1047 pxor \TMP4,\TMP2
1048 pxor \TMP5, \TMP2
1049 pxor \TMP2, \XMM5
	pxor	  \TMP1, \XMM5            # result is in XMM5
1051
1052 pxor \XMM5, \XMM1
1053.endm
1054
1055/* GHASH the last 4 ciphertext blocks. */
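/*
* XMM1 (the oldest block) is multiplied by HashKey^4 and XMM4 (the newest)
* by HashKey; the Karatsuba partial products of all four multiplications
* are accumulated in TMP6:XMMDst (with XMM1 holding the middle terms) and
* reduced modulo the GCM polynomial only once at the end.
*/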
1056.macro GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
1057TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
1058
1059 # Multiply TMP6 * HashKey (using Karatsuba)
1060
1061 movdqa \XMM1, \TMP6
1062 pshufd $78, \XMM1, \TMP2
1063 pxor \XMM1, \TMP2
1064 movdqa HashKey_4(%rsp), \TMP5
1065 PCLMULQDQ 0x11, \TMP5, \TMP6 # TMP6 = a1*b1
1066 PCLMULQDQ 0x00, \TMP5, \XMM1 # XMM1 = a0*b0
1067 movdqa HashKey_4_k(%rsp), \TMP4
1068 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1069 movdqa \XMM1, \XMMDst
1070 movdqa \TMP2, \XMM1 # result in TMP6, XMMDst, XMM1
1071
1072 # Multiply TMP1 * HashKey (using Karatsuba)
1073
1074 movdqa \XMM2, \TMP1
1075 pshufd $78, \XMM2, \TMP2
1076 pxor \XMM2, \TMP2
1077 movdqa HashKey_3(%rsp), \TMP5
1078 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1079 PCLMULQDQ 0x00, \TMP5, \XMM2 # XMM2 = a0*b0
1080 movdqa HashKey_3_k(%rsp), \TMP4
1081 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1082 pxor \TMP1, \TMP6
1083 pxor \XMM2, \XMMDst
1084 pxor \TMP2, \XMM1
1085# results accumulated in TMP6, XMMDst, XMM1
1086
1087 # Multiply TMP1 * HashKey (using Karatsuba)
1088
1089 movdqa \XMM3, \TMP1
1090 pshufd $78, \XMM3, \TMP2
1091 pxor \XMM3, \TMP2
1092 movdqa HashKey_2(%rsp), \TMP5
1093 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1094 PCLMULQDQ 0x00, \TMP5, \XMM3 # XMM3 = a0*b0
1095 movdqa HashKey_2_k(%rsp), \TMP4
1096 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1097 pxor \TMP1, \TMP6
1098 pxor \XMM3, \XMMDst
1099 pxor \TMP2, \XMM1 # results accumulated in TMP6, XMMDst, XMM1
1100
1101 # Multiply TMP1 * HashKey (using Karatsuba)
1102 movdqa \XMM4, \TMP1
1103 pshufd $78, \XMM4, \TMP2
1104 pxor \XMM4, \TMP2
1105 movdqa HashKey(%rsp), \TMP5
1106 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1107 PCLMULQDQ 0x00, \TMP5, \XMM4 # XMM4 = a0*b0
1108 movdqa HashKey_k(%rsp), \TMP4
1109 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1110 pxor \TMP1, \TMP6
1111 pxor \XMM4, \XMMDst
1112 pxor \XMM1, \TMP2
1113 pxor \TMP6, \TMP2
1114 pxor \XMMDst, \TMP2
1115 # middle section of the temp results combined as in karatsuba algorithm
1116 movdqa \TMP2, \TMP4
1117 pslldq $8, \TMP4 # left shift TMP4 2 DWs
1118 psrldq $8, \TMP2 # right shift TMP2 2 DWs
1119 pxor \TMP4, \XMMDst
1120 pxor \TMP2, \TMP6
1121# TMP6:XMMDst holds the result of the accumulated carry-less multiplications
1122 # first phase of the reduction
1123 movdqa \XMMDst, \TMP2
1124 movdqa \XMMDst, \TMP3
1125 movdqa \XMMDst, \TMP4
1126# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
1127 pslld $31, \TMP2 # packed right shifting << 31
1128 pslld $30, \TMP3 # packed right shifting << 30
1129 pslld $25, \TMP4 # packed right shifting << 25
1130 pxor \TMP3, \TMP2 # xor the shifted versions
1131 pxor \TMP4, \TMP2
1132 movdqa \TMP2, \TMP7
1133 psrldq $4, \TMP7 # right shift TMP7 1 DW
1134 pslldq $12, \TMP2 # left shift TMP2 3 DWs
1135 pxor \TMP2, \XMMDst
1136
1137 # second phase of the reduction
1138 movdqa \XMMDst, \TMP2
1139 # make 3 copies of XMMDst for doing 3 shift operations
1140 movdqa \XMMDst, \TMP3
1141 movdqa \XMMDst, \TMP4
1142 psrld $1, \TMP2 # packed left shift >> 1
1143 psrld $2, \TMP3 # packed left shift >> 2
1144 psrld $7, \TMP4 # packed left shift >> 7
1145 pxor \TMP3, \TMP2 # xor the shifted versions
1146 pxor \TMP4, \TMP2
1147 pxor \TMP7, \TMP2
1148 pxor \TMP2, \XMMDst
1149 pxor \TMP6, \XMMDst # reduced result is in XMMDst
1150.endm
1151
/* Encryption of a single block */
1153.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1
1154
1155 pxor (%arg1), \XMM0
1156 movaps 16(%arg1), \TMP1
1157 AESENC \TMP1, \XMM0
1158 movaps 32(%arg1), \TMP1
1159 AESENC \TMP1, \XMM0
1160 movaps 48(%arg1), \TMP1
1161 AESENC \TMP1, \XMM0
1162 movaps 64(%arg1), \TMP1
1163 AESENC \TMP1, \XMM0
1164 movaps 80(%arg1), \TMP1
1165 AESENC \TMP1, \XMM0
1166 movaps 96(%arg1), \TMP1
1167 AESENC \TMP1, \XMM0
1168 movaps 112(%arg1), \TMP1
1169 AESENC \TMP1, \XMM0
1170 movaps 128(%arg1), \TMP1
1171 AESENC \TMP1, \XMM0
1172 movaps 144(%arg1), \TMP1
1173 AESENC \TMP1, \XMM0
1174 movaps 160(%arg1), \TMP1
1175 AESENCLAST \TMP1, \XMM0
1176.endm
1177
1178
1179/*****************************************************************************
1180* void aesni_gcm_dec(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1181* u8 *out, // Plaintext output. Encrypt in-place is allowed.
1182* const u8 *in, // Ciphertext input
1183* u64 plaintext_len, // Length of data in bytes for decryption.
1184* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
1185* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1186* // concatenated with 0x00000001. 16-byte aligned pointer.
1187* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
1188* const u8 *aad, // Additional Authentication Data (AAD)
1189* u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1190* u8 *auth_tag, // Authenticated Tag output. The driver will compare this to the
1191* // given authentication tag and only return the plaintext if they match.
1192* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
1193* // (most likely), 12 or 8.
1194*
1195* Assumptions:
1196*
1197* keys:
1198* keys are pre-expanded and aligned to 16 bytes. we are using the first
1199* set of 11 keys in the data structure void *aes_ctx
1200*
1201* iv:
1202* 0 1 2 3
1203* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1204* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1205* | Salt (From the SA) |
1206* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1207* | Initialization Vector |
1208* | (This is the sequence number from IPSec header) |
1209* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1210* | 0x1 |
1211* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1212*
1213*
1214*
1215* AAD:
1216* AAD padded to 128 bits with 0
1217* for example, assume AAD is a u32 vector
1218*
1219* if AAD is 8 bytes:
1220* AAD[3] = {A0, A1};
1221* padded AAD in xmm register = {A1 A0 0 0}
1222*
1223* 0 1 2 3
1224* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1225* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1226* | SPI (A1) |
1227* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1228* | 32-bit Sequence Number (A0) |
1229* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1230* | 0x0 |
1231* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1232*
1233* AAD Format with 32-bit Sequence Number
1234*
1235* if AAD is 12 bytes:
1236* AAD[3] = {A0, A1, A2};
1237* padded AAD in xmm register = {A2 A1 A0 0}
1238*
1239* 0 1 2 3
1240* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1241* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1244* | SPI (A2) |
1245* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1246* | 64-bit Extended Sequence Number {A1,A0} |
1247* | |
1248* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1249* | 0x0 |
1250* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1251*
1252* AAD Format with 64-bit Extended Sequence Number
1253*
1254* aadLen:
1255* from the definition of the spec, aadLen can only be 8 or 12 bytes.
1256* The code supports 16 too but for other sizes, the code will fail.
1257*
1258* TLen:
1259* from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
1260* For other sizes, the code will fail.
1261*
1262* poly = x^128 + x^127 + x^126 + x^121 + 1
1263*
1264*****************************************************************************/
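/*
* Hypothetical C-side usage sketch (illustration only; the local buffer
* and variable names below are assumptions, not part of this file):
*
*	u8 iv[16];
*	memcpy(iv, salt, 4);			// 4-byte salt from the SA
*	memcpy(iv + 4, esp_iv, 8);		// 8-byte IV from the ESP payload
*	*(__be32 *)(iv + 12) = cpu_to_be32(1);	// trailing 0x00000001
*	aesni_gcm_dec(aes_ctx, dst, src, plaintext_len, iv,
*		      hash_subkey, aad, aad_len, auth_tag, 16);
*/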
1265
1266ENTRY(aesni_gcm_dec)
1267 push %r12
1268 push %r13
1269 push %r14
1270 mov %rsp, %r14
1271/*
1272* states of %xmm registers %xmm6:%xmm15 not saved
1273* all %xmm registers are clobbered
1274*/
1275 sub $VARIABLE_OFFSET, %rsp
1276 and $~63, %rsp # align rsp to 64 bytes
1277 mov %arg6, %r12
1278 movdqu (%r12), %xmm13 # %xmm13 = HashKey
1279 movdqa SHUF_MASK(%rip), %xmm2
1280 PSHUFB_XMM %xmm2, %xmm13
1281
1282
1283# Precompute HashKey<<1 (mod poly) from the hash key (required for GHASH)
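# If the most significant bit of the byte-swapped HashKey is set, the <<1
# below shifts it out; the pshufd/pcmpeqd/pand sequence detects that case
# and xors POLY into the result so it stays reduced modulo the GCM poly.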
1284
1285 movdqa %xmm13, %xmm2
1286 psllq $1, %xmm13
1287 psrlq $63, %xmm2
1288 movdqa %xmm2, %xmm1
1289 pslldq $8, %xmm2
1290 psrldq $8, %xmm1
1291 por %xmm2, %xmm13
1292
1293 # Reduction
1294
1295 pshufd $0x24, %xmm1, %xmm2
1296 pcmpeqd TWOONE(%rip), %xmm2
1297 pand POLY(%rip), %xmm2
1298 pxor %xmm2, %xmm13 # %xmm13 holds the HashKey<<1 (mod poly)
1299
1300
1301 # Decrypt first few blocks
1302
1303 movdqa %xmm13, HashKey(%rsp) # store HashKey<<1 (mod poly)
1304 mov %arg4, %r13 # save the number of bytes of plaintext/ciphertext
1305 and $-16, %r13 # %r13 = %r13 - (%r13 mod 16)
1306 mov %r13, %r12
1307 and $(3<<4), %r12
1308 jz _initial_num_blocks_is_0_decrypt
1309 cmp $(2<<4), %r12
1310 jb _initial_num_blocks_is_1_decrypt
1311 je _initial_num_blocks_is_2_decrypt
1312_initial_num_blocks_is_3_decrypt:
1313 INITIAL_BLOCKS_DEC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1314%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, dec
1315 sub $48, %r13
1316 jmp _initial_blocks_decrypted
1317_initial_num_blocks_is_2_decrypt:
1318 INITIAL_BLOCKS_DEC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1319%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, dec
1320 sub $32, %r13
1321 jmp _initial_blocks_decrypted
1322_initial_num_blocks_is_1_decrypt:
1323 INITIAL_BLOCKS_DEC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1324%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, dec
1325 sub $16, %r13
1326 jmp _initial_blocks_decrypted
1327_initial_num_blocks_is_0_decrypt:
1328 INITIAL_BLOCKS_DEC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1329%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, dec
1330_initial_blocks_decrypted:
1331 cmp $0, %r13
1332 je _zero_cipher_left_decrypt
1333 sub $64, %r13
1334 je _four_cipher_left_decrypt
1335_decrypt_by_4:
1336 GHASH_4_ENCRYPT_4_PARALLEL_DEC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
1337%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, dec
1338 add $64, %r11
1339 sub $64, %r13
1340 jne _decrypt_by_4
1341_four_cipher_left_decrypt:
1342 GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
1343%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
1344_zero_cipher_left_decrypt:
1345 mov %arg4, %r13
1346 and $15, %r13 # %r13 = arg4 (mod 16)
1347 je _multiple_of_16_bytes_decrypt
1348
1349 # Handle the last <16 byte block separately
1350
1351 paddd ONE(%rip), %xmm0 # increment CNT to get Yn
1352 movdqa SHUF_MASK(%rip), %xmm10
1353 PSHUFB_XMM %xmm10, %xmm0
1354
1355 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Yn)
1356 sub $16, %r11
1357 add %r13, %r11
1358 movdqu (%arg3,%r11,1), %xmm1 # receive the last <16 byte block
1359 lea SHIFT_MASK+16(%rip), %r12
1360 sub %r13, %r12
1361# adjust the shuffle mask pointer to be able to shift 16-%r13 bytes
1362# (%r13 is the number of bytes in plaintext mod 16)
1363 movdqu (%r12), %xmm2 # get the appropriate shuffle mask
	PSHUFB_XMM %xmm2, %xmm1            # right shift 16-%r13 bytes
1365
1366 movdqa %xmm1, %xmm2
1367 pxor %xmm1, %xmm0 # Ciphertext XOR E(K, Yn)
1368 movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
1369 # get the appropriate mask to mask out top 16-%r13 bytes of %xmm0
1370 pand %xmm1, %xmm0 # mask out top 16-%r13 bytes of %xmm0
1371 pand %xmm1, %xmm2
1372 movdqa SHUF_MASK(%rip), %xmm10
1373 PSHUFB_XMM %xmm10 ,%xmm2
1374
1375 pxor %xmm2, %xmm8
1376 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1377 # GHASH computation for the last <16 byte block
1378 sub %r13, %r11
1379 add $16, %r11
1380
1381 # output %r13 bytes
1382 MOVQ_R64_XMM %xmm0, %rax
1383 cmp $8, %r13
1384 jle _less_than_8_bytes_left_decrypt
1385 mov %rax, (%arg2 , %r11, 1)
1386 add $8, %r11
1387 psrldq $8, %xmm0
1388 MOVQ_R64_XMM %xmm0, %rax
1389 sub $8, %r13
1390_less_than_8_bytes_left_decrypt:
1391 mov %al, (%arg2, %r11, 1)
1392 add $1, %r11
1393 shr $8, %rax
1394 sub $1, %r13
1395 jne _less_than_8_bytes_left_decrypt
1396_multiple_of_16_bytes_decrypt:
	mov	arg8, %r12                # %r12 = aadLen (number of bytes)
1398 shl $3, %r12 # convert into number of bits
1399 movd %r12d, %xmm15 # len(A) in %xmm15
	shl	$3, %arg4                 # len(C) in bits (*8)
1401 MOVQ_R64_XMM %arg4, %xmm1
1402 pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000
1403 pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C)
1404 pxor %xmm15, %xmm8
1405 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1406 # final GHASH computation
1407 movdqa SHUF_MASK(%rip), %xmm10
1408 PSHUFB_XMM %xmm10, %xmm8
1409
1410 mov %arg5, %rax # %rax = *Y0
1411 movdqu (%rax), %xmm0 # %xmm0 = Y0
1412 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Y0)
1413 pxor %xmm8, %xmm0
1414_return_T_decrypt:
1415 mov arg9, %r10 # %r10 = authTag
1416 mov arg10, %r11 # %r11 = auth_tag_len
1417 cmp $16, %r11
1418 je _T_16_decrypt
1419 cmp $12, %r11
1420 je _T_12_decrypt
1421_T_8_decrypt:
1422 MOVQ_R64_XMM %xmm0, %rax
1423 mov %rax, (%r10)
1424 jmp _return_T_done_decrypt
1425_T_12_decrypt:
1426 MOVQ_R64_XMM %xmm0, %rax
1427 mov %rax, (%r10)
1428 psrldq $8, %xmm0
1429 movd %xmm0, %eax
1430 mov %eax, 8(%r10)
1431 jmp _return_T_done_decrypt
1432_T_16_decrypt:
1433 movdqu %xmm0, (%r10)
1434_return_T_done_decrypt:
1435 mov %r14, %rsp
1436 pop %r14
1437 pop %r13
1438 pop %r12
1439 ret
1440
1441
1442/*****************************************************************************
1443* void aesni_gcm_enc(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1444* u8 *out, // Ciphertext output. Encrypt in-place is allowed.
1445* const u8 *in, // Plaintext input
1446* u64 plaintext_len, // Length of data in bytes for encryption.
1447* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
1448* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1449* // concatenated with 0x00000001. 16-byte aligned pointer.
1450* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
1451* const u8 *aad, // Additional Authentication Data (AAD)
1452* u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1453* u8 *auth_tag, // Authenticated Tag output.
1454* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
1455* // 12 or 8.
1456*
1457* Assumptions:
1458*
1459* keys:
1460* keys are pre-expanded and aligned to 16 bytes. we are using the
1461* first set of 11 keys in the data structure void *aes_ctx
1462*
1463*
1464* iv:
1465* 0 1 2 3
1466* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1467* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1468* | Salt (From the SA) |
1469* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1470* | Initialization Vector |
1471* | (This is the sequence number from IPSec header) |
1472* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1473* | 0x1 |
1474* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1475*
1476*
1477*
1478* AAD:
1479* AAD padded to 128 bits with 0
1480* for example, assume AAD is a u32 vector
1481*
1482* if AAD is 8 bytes:
1483* AAD[3] = {A0, A1};
1484* padded AAD in xmm register = {A1 A0 0 0}
1485*
1486* 0 1 2 3
1487* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1488* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1489* | SPI (A1) |
1490* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1491* | 32-bit Sequence Number (A0) |
1492* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1493* | 0x0 |
1494* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1495*
1496* AAD Format with 32-bit Sequence Number
1497*
1498* if AAD is 12 bytes:
1499* AAD[3] = {A0, A1, A2};
1500* padded AAD in xmm register = {A2 A1 A0 0}
1501*
1502* 0 1 2 3
1503* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1504* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1505* | SPI (A2) |
1506* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1507* | 64-bit Extended Sequence Number {A1,A0} |
1508* | |
1509* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1510* | 0x0 |
1511* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1512*
1513* AAD Format with 64-bit Extended Sequence Number
1514*
1515* aadLen:
1516* from the definition of the spec, aadLen can only be 8 or 12 bytes.
1517* The code supports 16 too but for other sizes, the code will fail.
1518*
1519* TLen:
1520* from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
1521* For other sizes, the code will fail.
1522*
1523* poly = x^128 + x^127 + x^126 + x^121 + 1
1524***************************************************************************/
1525ENTRY(aesni_gcm_enc)
1526 push %r12
1527 push %r13
1528 push %r14
1529 mov %rsp, %r14
1530#
1531# states of %xmm registers %xmm6:%xmm15 not saved
1532# all %xmm registers are clobbered
1533#
1534 sub $VARIABLE_OFFSET, %rsp
1535 and $~63, %rsp
1536 mov %arg6, %r12
1537 movdqu (%r12), %xmm13
1538 movdqa SHUF_MASK(%rip), %xmm2
1539 PSHUFB_XMM %xmm2, %xmm13
1540
1541
1542# precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
1543
1544 movdqa %xmm13, %xmm2
1545 psllq $1, %xmm13
1546 psrlq $63, %xmm2
1547 movdqa %xmm2, %xmm1
1548 pslldq $8, %xmm2
1549 psrldq $8, %xmm1
1550 por %xmm2, %xmm13
1551
1552 # reduce HashKey<<1
1553
1554 pshufd $0x24, %xmm1, %xmm2
1555 pcmpeqd TWOONE(%rip), %xmm2
1556 pand POLY(%rip), %xmm2
1557 pxor %xmm2, %xmm13
	movdqa	%xmm13, HashKey(%rsp)     # store HashKey<<1 (mod poly)
	mov	%arg4, %r13               # %r13 = plaintext length in bytes
	and	$-16, %r13                # %r13 = %r13 - (%r13 mod 16)
1561 mov %r13, %r12
1562
1563 # Encrypt first few blocks
1564
1565 and $(3<<4), %r12
1566 jz _initial_num_blocks_is_0_encrypt
1567 cmp $(2<<4), %r12
1568 jb _initial_num_blocks_is_1_encrypt
1569 je _initial_num_blocks_is_2_encrypt
1570_initial_num_blocks_is_3_encrypt:
1571 INITIAL_BLOCKS_ENC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1572%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, enc
1573 sub $48, %r13
1574 jmp _initial_blocks_encrypted
1575_initial_num_blocks_is_2_encrypt:
1576 INITIAL_BLOCKS_ENC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1577%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, enc
1578 sub $32, %r13
1579 jmp _initial_blocks_encrypted
1580_initial_num_blocks_is_1_encrypt:
1581 INITIAL_BLOCKS_ENC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1582%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, enc
1583 sub $16, %r13
1584 jmp _initial_blocks_encrypted
1585_initial_num_blocks_is_0_encrypt:
1586 INITIAL_BLOCKS_ENC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1587%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, enc
1588_initial_blocks_encrypted:
1589
1590 # Main loop - Encrypt remaining blocks
1591
1592 cmp $0, %r13
1593 je _zero_cipher_left_encrypt
1594 sub $64, %r13
1595 je _four_cipher_left_encrypt
1596_encrypt_by_4_encrypt:
1597 GHASH_4_ENCRYPT_4_PARALLEL_ENC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
1598%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, enc
1599 add $64, %r11
1600 sub $64, %r13
1601 jne _encrypt_by_4_encrypt
1602_four_cipher_left_encrypt:
1603 GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
1604%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
1605_zero_cipher_left_encrypt:
1606 mov %arg4, %r13
1607 and $15, %r13 # %r13 = arg4 (mod 16)
1608 je _multiple_of_16_bytes_encrypt
1609
1610 # Handle the last <16 Byte block separately
1611 paddd ONE(%rip), %xmm0 # INCR CNT to get Yn
1612 movdqa SHUF_MASK(%rip), %xmm10
1613 PSHUFB_XMM %xmm10, %xmm0
1614
1615
1616 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # Encrypt(K, Yn)
1617 sub $16, %r11
1618 add %r13, %r11
1619 movdqu (%arg3,%r11,1), %xmm1 # receive the last <16 byte blocks
1620 lea SHIFT_MASK+16(%rip), %r12
1621 sub %r13, %r12
1622 # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
1623 # (%r13 is the number of bytes in plaintext mod 16)
1624 movdqu (%r12), %xmm2 # get the appropriate shuffle mask
	PSHUFB_XMM	%xmm2, %xmm1		# shift right 16-r13 bytes
1626 pxor %xmm1, %xmm0 # Plaintext XOR Encrypt(K, Yn)
1627 movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
1628 # get the appropriate mask to mask out top 16-r13 bytes of xmm0
1629 pand %xmm1, %xmm0 # mask out top 16-r13 bytes of xmm0
1630 movdqa SHUF_MASK(%rip), %xmm10
1631 PSHUFB_XMM %xmm10,%xmm0
1632
1633 pxor %xmm0, %xmm8
1634 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1635 # GHASH computation for the last <16 byte block
1636 sub %r13, %r11
1637 add $16, %r11
1638
1639 movdqa SHUF_MASK(%rip), %xmm10
1640 PSHUFB_XMM %xmm10, %xmm0
1641
1642 # shuffle xmm0 back to output as ciphertext
1643
1644 # Output %r13 bytes
1645 MOVQ_R64_XMM %xmm0, %rax
1646 cmp $8, %r13
1647 jle _less_than_8_bytes_left_encrypt
1648 mov %rax, (%arg2 , %r11, 1)
1649 add $8, %r11
1650 psrldq $8, %xmm0
1651 MOVQ_R64_XMM %xmm0, %rax
1652 sub $8, %r13
1653_less_than_8_bytes_left_encrypt:
1654 mov %al, (%arg2, %r11, 1)
1655 add $1, %r11
1656 shr $8, %rax
1657 sub $1, %r13
1658 jne _less_than_8_bytes_left_encrypt
1659_multiple_of_16_bytes_encrypt:
	mov	arg8, %r12                # %r12 = aadLen (number of bytes)
1661 shl $3, %r12
1662 movd %r12d, %xmm15 # len(A) in %xmm15
	shl	$3, %arg4                 # len(C) in bits (*8)
1664 MOVQ_R64_XMM %arg4, %xmm1
1665 pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000
1666 pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C)
1667 pxor %xmm15, %xmm8
1668 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1669 # final GHASH computation
1670 movdqa SHUF_MASK(%rip), %xmm10
1671 PSHUFB_XMM %xmm10, %xmm8 # perform a 16 byte swap
1672
1673 mov %arg5, %rax # %rax = *Y0
1674 movdqu (%rax), %xmm0 # %xmm0 = Y0
1675 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm15 # Encrypt(K, Y0)
1676 pxor %xmm8, %xmm0
1677_return_T_encrypt:
1678 mov arg9, %r10 # %r10 = authTag
1679 mov arg10, %r11 # %r11 = auth_tag_len
1680 cmp $16, %r11
1681 je _T_16_encrypt
1682 cmp $12, %r11
1683 je _T_12_encrypt
1684_T_8_encrypt:
1685 MOVQ_R64_XMM %xmm0, %rax
1686 mov %rax, (%r10)
1687 jmp _return_T_done_encrypt
1688_T_12_encrypt:
1689 MOVQ_R64_XMM %xmm0, %rax
1690 mov %rax, (%r10)
1691 psrldq $8, %xmm0
1692 movd %xmm0, %eax
1693 mov %eax, 8(%r10)
1694 jmp _return_T_done_encrypt
1695_T_16_encrypt:
1696 movdqu %xmm0, (%r10)
1697_return_T_done_encrypt:
1698 mov %r14, %rsp
1699 pop %r14
1700 pop %r13
1701 pop %r12
1702 ret
1703
1704#endif
1705
1706
1707_key_expansion_128:
1708_key_expansion_256a:
1709 pshufd $0b11111111, %xmm1, %xmm1
1710 shufps $0b00010000, %xmm0, %xmm4
1711 pxor %xmm4, %xmm0
1712 shufps $0b10001100, %xmm0, %xmm4
1713 pxor %xmm4, %xmm0
1714 pxor %xmm1, %xmm0
1715 movaps %xmm0, (TKEYP)
1716 add $0x10, TKEYP
1717 ret
1718
1719.align 4
1720_key_expansion_192a:
1721 pshufd $0b01010101, %xmm1, %xmm1
1722 shufps $0b00010000, %xmm0, %xmm4
1723 pxor %xmm4, %xmm0
1724 shufps $0b10001100, %xmm0, %xmm4
1725 pxor %xmm4, %xmm0
1726 pxor %xmm1, %xmm0
1727
1728 movaps %xmm2, %xmm5
1729 movaps %xmm2, %xmm6
1730 pslldq $4, %xmm5
1731 pshufd $0b11111111, %xmm0, %xmm3
1732 pxor %xmm3, %xmm2
1733 pxor %xmm5, %xmm2
1734
1735 movaps %xmm0, %xmm1
1736 shufps $0b01000100, %xmm0, %xmm6
1737 movaps %xmm6, (TKEYP)
1738 shufps $0b01001110, %xmm2, %xmm1
1739 movaps %xmm1, 0x10(TKEYP)
1740 add $0x20, TKEYP
1741 ret
1742
1743.align 4
1744_key_expansion_192b:
1745 pshufd $0b01010101, %xmm1, %xmm1
1746 shufps $0b00010000, %xmm0, %xmm4
1747 pxor %xmm4, %xmm0
1748 shufps $0b10001100, %xmm0, %xmm4
1749 pxor %xmm4, %xmm0
1750 pxor %xmm1, %xmm0
1751
1752 movaps %xmm2, %xmm5
1753 pslldq $4, %xmm5
1754 pshufd $0b11111111, %xmm0, %xmm3
1755 pxor %xmm3, %xmm2
1756 pxor %xmm5, %xmm2
1757
1758 movaps %xmm0, (TKEYP)
1759 add $0x10, TKEYP
1760 ret
1761
1762.align 4
1763_key_expansion_256b:
1764 pshufd $0b10101010, %xmm1, %xmm1
1765 shufps $0b00010000, %xmm2, %xmm4
1766 pxor %xmm4, %xmm2
1767 shufps $0b10001100, %xmm2, %xmm4
1768 pxor %xmm4, %xmm2
1769 pxor %xmm1, %xmm2
1770 movaps %xmm2, (TKEYP)
1771 add $0x10, TKEYP
1772 ret
1773
1774/*
1775 * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
1776 * unsigned int key_len)
1777 */
1778ENTRY(aesni_set_key)
1779#ifndef __x86_64__
1780 pushl KEYP
1781 movl 8(%esp), KEYP # ctx
1782 movl 12(%esp), UKEYP # in_key
1783 movl 16(%esp), %edx # key_len
1784#endif
1785 movups (UKEYP), %xmm0 # user key (first 16 bytes)
1786 movaps %xmm0, (KEYP)
1787 lea 0x10(KEYP), TKEYP # key addr
1788 movl %edx, 480(KEYP)
1789 pxor %xmm4, %xmm4 # xmm4 is assumed 0 in _key_expansion_x
1790 cmp $24, %dl
1791 jb .Lenc_key128
1792 je .Lenc_key192
1793 movups 0x10(UKEYP), %xmm2 # other user key
1794 movaps %xmm2, (TKEYP)
1795 add $0x10, TKEYP
1796 AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
1797 call _key_expansion_256a
1798 AESKEYGENASSIST 0x1 %xmm0 %xmm1
1799 call _key_expansion_256b
1800 AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2
1801 call _key_expansion_256a
1802 AESKEYGENASSIST 0x2 %xmm0 %xmm1
1803 call _key_expansion_256b
1804 AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3
1805 call _key_expansion_256a
1806 AESKEYGENASSIST 0x4 %xmm0 %xmm1
1807 call _key_expansion_256b
1808 AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4
1809 call _key_expansion_256a
1810 AESKEYGENASSIST 0x8 %xmm0 %xmm1
1811 call _key_expansion_256b
1812 AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5
1813 call _key_expansion_256a
1814 AESKEYGENASSIST 0x10 %xmm0 %xmm1
1815 call _key_expansion_256b
1816 AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6
1817 call _key_expansion_256a
1818 AESKEYGENASSIST 0x20 %xmm0 %xmm1
1819 call _key_expansion_256b
1820 AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7
1821 call _key_expansion_256a
1822 jmp .Ldec_key
1823.Lenc_key192:
1824 movq 0x10(UKEYP), %xmm2 # other user key
1825 AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
1826 call _key_expansion_192a
1827 AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2
1828 call _key_expansion_192b
1829 AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3
1830 call _key_expansion_192a
1831 AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4
1832 call _key_expansion_192b
1833 AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5
1834 call _key_expansion_192a
1835 AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6
1836 call _key_expansion_192b
1837 AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7
1838 call _key_expansion_192a
1839 AESKEYGENASSIST 0x80 %xmm2 %xmm1 # round 8
1840 call _key_expansion_192b
1841 jmp .Ldec_key
1842.Lenc_key128:
1843 AESKEYGENASSIST 0x1 %xmm0 %xmm1 # round 1
1844 call _key_expansion_128
1845 AESKEYGENASSIST 0x2 %xmm0 %xmm1 # round 2
1846 call _key_expansion_128
1847 AESKEYGENASSIST 0x4 %xmm0 %xmm1 # round 3
1848 call _key_expansion_128
1849 AESKEYGENASSIST 0x8 %xmm0 %xmm1 # round 4
1850 call _key_expansion_128
1851 AESKEYGENASSIST 0x10 %xmm0 %xmm1 # round 5
1852 call _key_expansion_128
1853 AESKEYGENASSIST 0x20 %xmm0 %xmm1 # round 6
1854 call _key_expansion_128
1855 AESKEYGENASSIST 0x40 %xmm0 %xmm1 # round 7
1856 call _key_expansion_128
1857 AESKEYGENASSIST 0x80 %xmm0 %xmm1 # round 8
1858 call _key_expansion_128
1859 AESKEYGENASSIST 0x1b %xmm0 %xmm1 # round 9
1860 call _key_expansion_128
1861 AESKEYGENASSIST 0x36 %xmm0 %xmm1 # round 10
1862 call _key_expansion_128
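# Generate the decryption key schedule: swap the first and last encryption
# round keys into place, then apply AESIMC (InvMixColumns) to every
# intermediate round key and store the results in reverse order, as the
# AES Equivalent Inverse Cipher requires.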
1863.Ldec_key:
1864 sub $0x10, TKEYP
1865 movaps (KEYP), %xmm0
1866 movaps (TKEYP), %xmm1
1867 movaps %xmm0, 240(TKEYP)
1868 movaps %xmm1, 240(KEYP)
1869 add $0x10, KEYP
1870 lea 240-16(TKEYP), UKEYP
1871.align 4
1872.Ldec_key_loop:
1873 movaps (KEYP), %xmm0
1874 AESIMC %xmm0 %xmm1
1875 movaps %xmm1, (UKEYP)
1876 add $0x10, KEYP
1877 sub $0x10, UKEYP
1878 cmp TKEYP, KEYP
1879 jb .Ldec_key_loop
1880 xor AREG, AREG
1881#ifndef __x86_64__
1882 popl KEYP
1883#endif
1884 ret
1885
1886/*
1887 * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
1888 */
1889ENTRY(aesni_enc)
1890#ifndef __x86_64__
1891 pushl KEYP
1892 pushl KLEN
1893 movl 12(%esp), KEYP
1894 movl 16(%esp), OUTP
1895 movl 20(%esp), INP
1896#endif
1897 movl 480(KEYP), KLEN # key length
1898 movups (INP), STATE # input
1899 call _aesni_enc1
1900 movups STATE, (OUTP) # output
1901#ifndef __x86_64__
1902 popl KLEN
1903 popl KEYP
1904#endif
1905 ret
1906
1907/*
1908 * _aesni_enc1: internal ABI
1909 * input:
1910 * KEYP: key struct pointer
1911 * KLEN: round count
1912 * STATE: initial state (input)
1913 * output:
1914 *	STATE:		final state (output)
1915 * changed:
1916 * KEY
1917 * TKEYP (T1)
1918 */
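# KLEN holds the key length in bytes (16/24/32); comparing it with 24
# selects the 10-, 12- or 14-round path below.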
1919.align 4
1920_aesni_enc1:
1921 movaps (KEYP), KEY # key
1922 mov KEYP, TKEYP
1923 pxor KEY, STATE # round 0
1924 add $0x30, TKEYP
1925 cmp $24, KLEN
1926 jb .Lenc128
1927 lea 0x20(TKEYP), TKEYP
1928 je .Lenc192
1929 add $0x20, TKEYP
1930 movaps -0x60(TKEYP), KEY
1931 AESENC KEY STATE
1932 movaps -0x50(TKEYP), KEY
1933 AESENC KEY STATE
1934.align 4
1935.Lenc192:
1936 movaps -0x40(TKEYP), KEY
1937 AESENC KEY STATE
1938 movaps -0x30(TKEYP), KEY
1939 AESENC KEY STATE
1940.align 4
1941.Lenc128:
1942 movaps -0x20(TKEYP), KEY
1943 AESENC KEY STATE
1944 movaps -0x10(TKEYP), KEY
1945 AESENC KEY STATE
1946 movaps (TKEYP), KEY
1947 AESENC KEY STATE
1948 movaps 0x10(TKEYP), KEY
1949 AESENC KEY STATE
1950 movaps 0x20(TKEYP), KEY
1951 AESENC KEY STATE
1952 movaps 0x30(TKEYP), KEY
1953 AESENC KEY STATE
1954 movaps 0x40(TKEYP), KEY
1955 AESENC KEY STATE
1956 movaps 0x50(TKEYP), KEY
1957 AESENC KEY STATE
1958 movaps 0x60(TKEYP), KEY
1959 AESENC KEY STATE
1960 movaps 0x70(TKEYP), KEY
1961 AESENCLAST KEY STATE
1962 ret
1963
1964/*
1965 * _aesni_enc4: internal ABI
1966 * input:
1967 * KEYP: key struct pointer
1968 * KLEN: round count
1969 * STATE1: initial state (input)
1970 * STATE2
1971 * STATE3
1972 * STATE4
1973 * output:
1974 *	STATE1:		final state (output)
1975 * STATE2
1976 * STATE3
1977 * STATE4
1978 * changed:
1979 * KEY
1980 * TKEYP (T1)
1981 */
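# Same round structure as _aesni_enc1, but four independent blocks are fed
# through each round key in turn, which helps keep the AESENC pipeline busy.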
1982.align 4
1983_aesni_enc4:
1984 movaps (KEYP), KEY # key
1985 mov KEYP, TKEYP
1986 pxor KEY, STATE1 # round 0
1987 pxor KEY, STATE2
1988 pxor KEY, STATE3
1989 pxor KEY, STATE4
1990 add $0x30, TKEYP
1991 cmp $24, KLEN
1992 jb .L4enc128
1993 lea 0x20(TKEYP), TKEYP
1994 je .L4enc192
1995 add $0x20, TKEYP
1996 movaps -0x60(TKEYP), KEY
1997 AESENC KEY STATE1
1998 AESENC KEY STATE2
1999 AESENC KEY STATE3
2000 AESENC KEY STATE4
2001 movaps -0x50(TKEYP), KEY
2002 AESENC KEY STATE1
2003 AESENC KEY STATE2
2004 AESENC KEY STATE3
2005 AESENC KEY STATE4
2006#.align 4
2007.L4enc192:
2008 movaps -0x40(TKEYP), KEY
2009 AESENC KEY STATE1
2010 AESENC KEY STATE2
2011 AESENC KEY STATE3
2012 AESENC KEY STATE4
2013 movaps -0x30(TKEYP), KEY
2014 AESENC KEY STATE1
2015 AESENC KEY STATE2
2016 AESENC KEY STATE3
2017 AESENC KEY STATE4
2018#.align 4
2019.L4enc128:
2020 movaps -0x20(TKEYP), KEY
2021 AESENC KEY STATE1
2022 AESENC KEY STATE2
2023 AESENC KEY STATE3
2024 AESENC KEY STATE4
2025 movaps -0x10(TKEYP), KEY
2026 AESENC KEY STATE1
2027 AESENC KEY STATE2
2028 AESENC KEY STATE3
2029 AESENC KEY STATE4
2030 movaps (TKEYP), KEY
2031 AESENC KEY STATE1
2032 AESENC KEY STATE2
2033 AESENC KEY STATE3
2034 AESENC KEY STATE4
2035 movaps 0x10(TKEYP), KEY
2036 AESENC KEY STATE1
2037 AESENC KEY STATE2
2038 AESENC KEY STATE3
2039 AESENC KEY STATE4
2040 movaps 0x20(TKEYP), KEY
2041 AESENC KEY STATE1
2042 AESENC KEY STATE2
2043 AESENC KEY STATE3
2044 AESENC KEY STATE4
2045 movaps 0x30(TKEYP), KEY
2046 AESENC KEY STATE1
2047 AESENC KEY STATE2
2048 AESENC KEY STATE3
2049 AESENC KEY STATE4
2050 movaps 0x40(TKEYP), KEY
2051 AESENC KEY STATE1
2052 AESENC KEY STATE2
2053 AESENC KEY STATE3
2054 AESENC KEY STATE4
2055 movaps 0x50(TKEYP), KEY
2056 AESENC KEY STATE1
2057 AESENC KEY STATE2
2058 AESENC KEY STATE3
2059 AESENC KEY STATE4
2060 movaps 0x60(TKEYP), KEY
2061 AESENC KEY STATE1
2062 AESENC KEY STATE2
2063 AESENC KEY STATE3
2064 AESENC KEY STATE4
2065 movaps 0x70(TKEYP), KEY
2066 AESENCLAST KEY STATE1 # last round
2067 AESENCLAST KEY STATE2
2068 AESENCLAST KEY STATE3
2069 AESENCLAST KEY STATE4
2070 ret
2071
2072/*
2073 * void aesni_dec (struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
2074 */
2075ENTRY(aesni_dec)
2076#ifndef __x86_64__
2077 pushl KEYP
2078 pushl KLEN
2079 movl 12(%esp), KEYP
2080 movl 16(%esp), OUTP
2081 movl 20(%esp), INP
2082#endif
2083 mov 480(KEYP), KLEN # key length
2084 add $240, KEYP
2085 movups (INP), STATE # input
2086 call _aesni_dec1
2087 movups STATE, (OUTP) #output
2088#ifndef __x86_64__
2089 popl KLEN
2090 popl KEYP
2091#endif
2092 ret
2093
2094/*
2095 * _aesni_dec1: internal ABI
2096 * input:
2097 * KEYP: key struct pointer
2098 * KLEN: key length
2099 * STATE: initial state (input)
2100 * output:
2101 *	STATE:		final state (output)
2102 * changed:
2103 * KEY
2104 * TKEYP (T1)
2105 */
2106.align 4
2107_aesni_dec1:
2108 movaps (KEYP), KEY # key
2109 mov KEYP, TKEYP
2110 pxor KEY, STATE # round 0
2111 add $0x30, TKEYP
2112 cmp $24, KLEN
2113 jb .Ldec128
2114 lea 0x20(TKEYP), TKEYP
2115 je .Ldec192
2116 add $0x20, TKEYP
2117 movaps -0x60(TKEYP), KEY
2118 AESDEC KEY STATE
2119 movaps -0x50(TKEYP), KEY
2120 AESDEC KEY STATE
2121.align 4
2122.Ldec192:
2123 movaps -0x40(TKEYP), KEY
2124 AESDEC KEY STATE
2125 movaps -0x30(TKEYP), KEY
2126 AESDEC KEY STATE
2127.align 4
2128.Ldec128:
2129 movaps -0x20(TKEYP), KEY
2130 AESDEC KEY STATE
2131 movaps -0x10(TKEYP), KEY
2132 AESDEC KEY STATE
2133 movaps (TKEYP), KEY
2134 AESDEC KEY STATE
2135 movaps 0x10(TKEYP), KEY
2136 AESDEC KEY STATE
2137 movaps 0x20(TKEYP), KEY
2138 AESDEC KEY STATE
2139 movaps 0x30(TKEYP), KEY
2140 AESDEC KEY STATE
2141 movaps 0x40(TKEYP), KEY
2142 AESDEC KEY STATE
2143 movaps 0x50(TKEYP), KEY
2144 AESDEC KEY STATE
2145 movaps 0x60(TKEYP), KEY
2146 AESDEC KEY STATE
2147 movaps 0x70(TKEYP), KEY
2148 AESDECLAST KEY STATE
2149 ret
2150
2151/*
2152 * _aesni_dec4: internal ABI
2153 * input:
2154 * KEYP: key struct pointer
2155 * KLEN: key length
2156 * STATE1: initial state (input)
2157 * STATE2
2158 * STATE3
2159 * STATE4
2160 * output:
2161 *	STATE1:		final state (output)
2162 * STATE2
2163 * STATE3
2164 * STATE4
2165 * changed:
2166 * KEY
2167 * TKEYP (T1)
2168 */
2169.align 4
2170_aesni_dec4:
2171 movaps (KEYP), KEY # key
2172 mov KEYP, TKEYP
2173 pxor KEY, STATE1 # round 0
2174 pxor KEY, STATE2
2175 pxor KEY, STATE3
2176 pxor KEY, STATE4
2177 add $0x30, TKEYP
2178 cmp $24, KLEN
2179 jb .L4dec128
2180 lea 0x20(TKEYP), TKEYP
2181 je .L4dec192
2182 add $0x20, TKEYP
2183 movaps -0x60(TKEYP), KEY
2184 AESDEC KEY STATE1
2185 AESDEC KEY STATE2
2186 AESDEC KEY STATE3
2187 AESDEC KEY STATE4
2188 movaps -0x50(TKEYP), KEY
2189 AESDEC KEY STATE1
2190 AESDEC KEY STATE2
2191 AESDEC KEY STATE3
2192 AESDEC KEY STATE4
2193.align 4
2194.L4dec192:
2195 movaps -0x40(TKEYP), KEY
2196 AESDEC KEY STATE1
2197 AESDEC KEY STATE2
2198 AESDEC KEY STATE3
2199 AESDEC KEY STATE4
2200 movaps -0x30(TKEYP), KEY
2201 AESDEC KEY STATE1
2202 AESDEC KEY STATE2
2203 AESDEC KEY STATE3
2204 AESDEC KEY STATE4
2205.align 4
2206.L4dec128:
2207 movaps -0x20(TKEYP), KEY
2208 AESDEC KEY STATE1
2209 AESDEC KEY STATE2
2210 AESDEC KEY STATE3
2211 AESDEC KEY STATE4
2212 movaps -0x10(TKEYP), KEY
2213 AESDEC KEY STATE1
2214 AESDEC KEY STATE2
2215 AESDEC KEY STATE3
2216 AESDEC KEY STATE4
2217 movaps (TKEYP), KEY
2218 AESDEC KEY STATE1
2219 AESDEC KEY STATE2
2220 AESDEC KEY STATE3
2221 AESDEC KEY STATE4
2222 movaps 0x10(TKEYP), KEY
2223 AESDEC KEY STATE1
2224 AESDEC KEY STATE2
2225 AESDEC KEY STATE3
2226 AESDEC KEY STATE4
2227 movaps 0x20(TKEYP), KEY
2228 AESDEC KEY STATE1
2229 AESDEC KEY STATE2
2230 AESDEC KEY STATE3
2231 AESDEC KEY STATE4
2232 movaps 0x30(TKEYP), KEY
2233 AESDEC KEY STATE1
2234 AESDEC KEY STATE2
2235 AESDEC KEY STATE3
2236 AESDEC KEY STATE4
2237 movaps 0x40(TKEYP), KEY
2238 AESDEC KEY STATE1
2239 AESDEC KEY STATE2
2240 AESDEC KEY STATE3
2241 AESDEC KEY STATE4
2242 movaps 0x50(TKEYP), KEY
2243 AESDEC KEY STATE1
2244 AESDEC KEY STATE2
2245 AESDEC KEY STATE3
2246 AESDEC KEY STATE4
2247 movaps 0x60(TKEYP), KEY
2248 AESDEC KEY STATE1
2249 AESDEC KEY STATE2
2250 AESDEC KEY STATE3
2251 AESDEC KEY STATE4
2252 movaps 0x70(TKEYP), KEY
2253 AESDECLAST KEY STATE1 # last round
2254 AESDECLAST KEY STATE2
2255 AESDECLAST KEY STATE3
2256 AESDECLAST KEY STATE4
2257 ret
2258
2259/*
2260 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2261 * size_t len)
2262 */
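# ECB: blocks are independent, so 64 bytes are processed per iteration while
# possible, then 16 bytes at a time; a tail shorter than 16 bytes is ignored.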
2263ENTRY(aesni_ecb_enc)
2264#ifndef __x86_64__
2265 pushl LEN
2266 pushl KEYP
2267 pushl KLEN
2268 movl 16(%esp), KEYP
2269 movl 20(%esp), OUTP
2270 movl 24(%esp), INP
2271 movl 28(%esp), LEN
2272#endif
2273 test LEN, LEN # check length
2274 jz .Lecb_enc_ret
2275 mov 480(KEYP), KLEN
2276 cmp $16, LEN
2277 jb .Lecb_enc_ret
2278 cmp $64, LEN
2279 jb .Lecb_enc_loop1
2280.align 4
2281.Lecb_enc_loop4:
2282 movups (INP), STATE1
2283 movups 0x10(INP), STATE2
2284 movups 0x20(INP), STATE3
2285 movups 0x30(INP), STATE4
2286 call _aesni_enc4
2287 movups STATE1, (OUTP)
2288 movups STATE2, 0x10(OUTP)
2289 movups STATE3, 0x20(OUTP)
2290 movups STATE4, 0x30(OUTP)
2291 sub $64, LEN
2292 add $64, INP
2293 add $64, OUTP
2294 cmp $64, LEN
2295 jge .Lecb_enc_loop4
2296 cmp $16, LEN
2297 jb .Lecb_enc_ret
2298.align 4
2299.Lecb_enc_loop1:
2300 movups (INP), STATE1
2301 call _aesni_enc1
2302 movups STATE1, (OUTP)
2303 sub $16, LEN
2304 add $16, INP
2305 add $16, OUTP
2306 cmp $16, LEN
2307 jge .Lecb_enc_loop1
2308.Lecb_enc_ret:
2309#ifndef __x86_64__
2310 popl KLEN
2311 popl KEYP
2312 popl LEN
2313#endif
2314 ret
2315
2316/*
2317 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2318 * size_t len);
2319 */
2320ENTRY(aesni_ecb_dec)
2321#ifndef __x86_64__
2322 pushl LEN
2323 pushl KEYP
2324 pushl KLEN
2325 movl 16(%esp), KEYP
2326 movl 20(%esp), OUTP
2327 movl 24(%esp), INP
2328 movl 28(%esp), LEN
2329#endif
2330 test LEN, LEN
2331 jz .Lecb_dec_ret
2332 mov 480(KEYP), KLEN
2333 add $240, KEYP
2334 cmp $16, LEN
2335 jb .Lecb_dec_ret
2336 cmp $64, LEN
2337 jb .Lecb_dec_loop1
2338.align 4
2339.Lecb_dec_loop4:
2340 movups (INP), STATE1
2341 movups 0x10(INP), STATE2
2342 movups 0x20(INP), STATE3
2343 movups 0x30(INP), STATE4
2344 call _aesni_dec4
2345 movups STATE1, (OUTP)
2346 movups STATE2, 0x10(OUTP)
2347 movups STATE3, 0x20(OUTP)
2348 movups STATE4, 0x30(OUTP)
2349 sub $64, LEN
2350 add $64, INP
2351 add $64, OUTP
2352 cmp $64, LEN
2353 jge .Lecb_dec_loop4
2354 cmp $16, LEN
2355 jb .Lecb_dec_ret
2356.align 4
2357.Lecb_dec_loop1:
2358 movups (INP), STATE1
2359 call _aesni_dec1
2360 movups STATE1, (OUTP)
2361 sub $16, LEN
2362 add $16, INP
2363 add $16, OUTP
2364 cmp $16, LEN
2365 jge .Lecb_dec_loop1
2366.Lecb_dec_ret:
2367#ifndef __x86_64__
2368 popl KLEN
2369 popl KEYP
2370 popl LEN
2371#endif
2372 ret
2373
2374/*
2375 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2376 * size_t len, u8 *iv)
2377 */
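# CBC encryption is inherently serial (each block depends on the previous
# ciphertext), so only the one-block path is used here.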
2378ENTRY(aesni_cbc_enc)
2379#ifndef __x86_64__
2380 pushl IVP
2381 pushl LEN
2382 pushl KEYP
2383 pushl KLEN
2384 movl 20(%esp), KEYP
2385 movl 24(%esp), OUTP
2386 movl 28(%esp), INP
2387 movl 32(%esp), LEN
2388 movl 36(%esp), IVP
2389#endif
2390 cmp $16, LEN
2391 jb .Lcbc_enc_ret
2392 mov 480(KEYP), KLEN
2393 movups (IVP), STATE # load iv as initial state
2394.align 4
2395.Lcbc_enc_loop:
2396 movups (INP), IN # load input
2397 pxor IN, STATE
2398 call _aesni_enc1
2399 movups STATE, (OUTP) # store output
2400 sub $16, LEN
2401 add $16, INP
2402 add $16, OUTP
2403 cmp $16, LEN
2404 jge .Lcbc_enc_loop
2405 movups STATE, (IVP)
2406.Lcbc_enc_ret:
2407#ifndef __x86_64__
2408 popl KLEN
2409 popl KEYP
2410 popl LEN
2411 popl IVP
2412#endif
2413 ret
2414
2415/*
2416 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2417 * size_t len, u8 *iv)
2418 */
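# CBC decryption: each plaintext block is the decrypted ciphertext XORed with
# the previous ciphertext block; the 4-block path keeps the four input blocks
# around (or re-reads them on 32-bit, where registers are scarce) so the
# chaining values survive the call to _aesni_dec4.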
2419ENTRY(aesni_cbc_dec)
2420#ifndef __x86_64__
2421 pushl IVP
2422 pushl LEN
2423 pushl KEYP
2424 pushl KLEN
2425 movl 20(%esp), KEYP
2426 movl 24(%esp), OUTP
2427 movl 28(%esp), INP
2428 movl 32(%esp), LEN
2429 movl 36(%esp), IVP
2430#endif
2431 cmp $16, LEN
2432 jb .Lcbc_dec_just_ret
2433 mov 480(KEYP), KLEN
2434 add $240, KEYP
2435 movups (IVP), IV
2436 cmp $64, LEN
2437 jb .Lcbc_dec_loop1
2438.align 4
2439.Lcbc_dec_loop4:
2440 movups (INP), IN1
2441 movaps IN1, STATE1
2442 movups 0x10(INP), IN2
2443 movaps IN2, STATE2
2444#ifdef __x86_64__
2445 movups 0x20(INP), IN3
2446 movaps IN3, STATE3
2447 movups 0x30(INP), IN4
2448 movaps IN4, STATE4
2449#else
2450 movups 0x20(INP), IN1
2451 movaps IN1, STATE3
2452 movups 0x30(INP), IN2
2453 movaps IN2, STATE4
2454#endif
2455 call _aesni_dec4
2456 pxor IV, STATE1
2457#ifdef __x86_64__
2458 pxor IN1, STATE2
2459 pxor IN2, STATE3
2460 pxor IN3, STATE4
2461 movaps IN4, IV
2462#else
2463 pxor (INP), STATE2
2464 pxor 0x10(INP), STATE3
2465 pxor IN1, STATE4
2466 movaps IN2, IV
2467#endif
2468 movups STATE1, (OUTP)
2469 movups STATE2, 0x10(OUTP)
2470 movups STATE3, 0x20(OUTP)
2471 movups STATE4, 0x30(OUTP)
2472 sub $64, LEN
2473 add $64, INP
2474 add $64, OUTP
2475 cmp $64, LEN
2476 jge .Lcbc_dec_loop4
2477 cmp $16, LEN
2478 jb .Lcbc_dec_ret
2479.align 4
2480.Lcbc_dec_loop1:
2481 movups (INP), IN
2482 movaps IN, STATE
2483 call _aesni_dec1
2484 pxor IV, STATE
2485 movups STATE, (OUTP)
2486 movaps IN, IV
2487 sub $16, LEN
2488 add $16, INP
2489 add $16, OUTP
2490 cmp $16, LEN
2491 jge .Lcbc_dec_loop1
2492.Lcbc_dec_ret:
2493 movups IV, (IVP)
2494.Lcbc_dec_just_ret:
2495#ifndef __x86_64__
2496 popl KLEN
2497 popl KEYP
2498 popl LEN
2499 popl IVP
2500#endif
2501 ret
2502
2503#ifdef __x86_64__
2504.align 16
2505.Lbswap_mask:
2506 .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
2507
2508/*
2509 * _aesni_inc_init: internal ABI
2510 * setup registers used by _aesni_inc
2511 * input:
2512 * IV
2513 * output:
2514 * CTR: == IV, in little endian
2515 * TCTR_LOW: == lower qword of CTR
2516 * INC: == 1, in little endian
2517 * BSWAP_MASK == endian swapping mask
2518 */
2519.align 4
2520_aesni_inc_init:
2521 movaps .Lbswap_mask, BSWAP_MASK
2522 movaps IV, CTR
2523 PSHUFB_XMM BSWAP_MASK CTR
2524 mov $1, TCTR_LOW
2525 MOVQ_R64_XMM TCTR_LOW INC
2526 MOVQ_R64_XMM CTR TCTR_LOW
2527 ret
2528
2529/*
2530 * _aesni_inc: internal ABI
2531 * Increase IV by 1, IV is in big endian
2532 * input:
2533 * IV
2534 * CTR: == IV, in little endian
2535 * TCTR_LOW: == lower qword of CTR
2536 * INC: == 1, in little endian
2537 * BSWAP_MASK == endian swapping mask
2538 * output:
2539 *	IV:		incremented by 1
2540 * changed:
2541 * CTR: == output IV, in little endian
2542 * TCTR_LOW: == lower qword of CTR
2543 */
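# The counter is kept little endian in CTR, with its low qword mirrored in
# TCTR_LOW so a plain add/jnc detects the carry; on carry, INC is shifted up
# by 8 bytes and added again to propagate the carry into the high qword.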
2544.align 4
2545_aesni_inc:
2546 paddq INC, CTR
2547 add $1, TCTR_LOW
2548 jnc .Linc_low
2549 pslldq $8, INC
2550 paddq INC, CTR
2551 psrldq $8, INC
2552.Linc_low:
2553 movaps CTR, IV
2554 PSHUFB_XMM BSWAP_MASK IV
2555 ret
2556
2557/*
2558 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2559 * size_t len, u8 *iv)
2560 */
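# CTR mode: each counter block is encrypted and XORed with the input, so this
# one routine serves both encryption and decryption.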
2561ENTRY(aesni_ctr_enc)
2562 cmp $16, LEN
2563 jb .Lctr_enc_just_ret
2564 mov 480(KEYP), KLEN
2565 movups (IVP), IV
2566 call _aesni_inc_init
2567 cmp $64, LEN
2568 jb .Lctr_enc_loop1
2569.align 4
2570.Lctr_enc_loop4:
2571 movaps IV, STATE1
2572 call _aesni_inc
2573 movups (INP), IN1
2574 movaps IV, STATE2
2575 call _aesni_inc
2576 movups 0x10(INP), IN2
2577 movaps IV, STATE3
2578 call _aesni_inc
2579 movups 0x20(INP), IN3
2580 movaps IV, STATE4
2581 call _aesni_inc
2582 movups 0x30(INP), IN4
2583 call _aesni_enc4
2584 pxor IN1, STATE1
2585 movups STATE1, (OUTP)
2586 pxor IN2, STATE2
2587 movups STATE2, 0x10(OUTP)
2588 pxor IN3, STATE3
2589 movups STATE3, 0x20(OUTP)
2590 pxor IN4, STATE4
2591 movups STATE4, 0x30(OUTP)
2592 sub $64, LEN
2593 add $64, INP
2594 add $64, OUTP
2595 cmp $64, LEN
2596 jge .Lctr_enc_loop4
2597 cmp $16, LEN
2598 jb .Lctr_enc_ret
2599.align 4
2600.Lctr_enc_loop1:
2601 movaps IV, STATE
2602 call _aesni_inc
2603 movups (INP), IN
2604 call _aesni_enc1
2605 pxor IN, STATE
2606 movups STATE, (OUTP)
2607 sub $16, LEN
2608 add $16, INP
2609 add $16, OUTP
2610 cmp $16, LEN
2611 jge .Lctr_enc_loop1
2612.Lctr_enc_ret:
2613 movups IV, (IVP)
2614.Lctr_enc_just_ret:
2615 ret
2616#endif
34#include <asm/frame.h>
35#include <asm/nospec-branch.h>
36
37/*
38 * The following macros are used to move an (un)aligned 16 byte value to/from
39 * an XMM register. This can be done for either FP or integer values; for FP use
40 * movaps (move aligned packed single) or for integer use movdqa (move double quad
41 * aligned). It doesn't make a performance difference which instruction is used
42 * since Nehalem (original Core i7) was released. However, the movaps is a byte
43 * shorter, so that is the one we'll use for now. (same for unaligned).
44 */
45#define MOVADQ movaps
46#define MOVUDQ movups
47
48#ifdef __x86_64__
49
50# constants in mergeable sections, linker can reorder and merge
51.section .rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16
52.align 16
53.Lgf128mul_x_ble_mask:
54 .octa 0x00000000000000010000000000000087
55.section .rodata.cst16.POLY, "aM", @progbits, 16
56.align 16
57POLY: .octa 0xC2000000000000000000000000000001
58.section .rodata.cst16.TWOONE, "aM", @progbits, 16
59.align 16
60TWOONE: .octa 0x00000001000000000000000000000001
61
62.section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16
63.align 16
64SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F
65.section .rodata.cst16.MASK1, "aM", @progbits, 16
66.align 16
67MASK1: .octa 0x0000000000000000ffffffffffffffff
68.section .rodata.cst16.MASK2, "aM", @progbits, 16
69.align 16
70MASK2: .octa 0xffffffffffffffff0000000000000000
71.section .rodata.cst16.ONE, "aM", @progbits, 16
72.align 16
73ONE: .octa 0x00000000000000000000000000000001
74.section .rodata.cst16.F_MIN_MASK, "aM", @progbits, 16
75.align 16
76F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
77.section .rodata.cst16.dec, "aM", @progbits, 16
78.align 16
79dec: .octa 0x1
80.section .rodata.cst16.enc, "aM", @progbits, 16
81.align 16
82enc: .octa 0x2
83
84# order of these constants should not change.
85# more specifically, ALL_F should follow SHIFT_MASK,
86# and zero should follow ALL_F
87.section .rodata, "a", @progbits
88.align 16
89SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
90ALL_F: .octa 0xffffffffffffffffffffffffffffffff
91 .octa 0x00000000000000000000000000000000
92
93.text
94
95
96#define STACK_OFFSET 8*3
97
98#define AadHash 16*0
99#define AadLen 16*1
100#define InLen (16*1)+8
101#define PBlockEncKey 16*2
102#define OrigIV 16*3
103#define CurCount 16*4
104#define PBlockLen 16*5
105#define HashKey 16*6 // store HashKey <<1 mod poly here
106#define HashKey_2 16*7 // store HashKey^2 <<1 mod poly here
107#define HashKey_3 16*8 // store HashKey^3 <<1 mod poly here
108#define HashKey_4 16*9 // store HashKey^4 <<1 mod poly here
109#define HashKey_k 16*10 // store XOR of High 64 bits and Low 64
110 // bits of HashKey <<1 mod poly here
111 //(for Karatsuba purposes)
112#define HashKey_2_k 16*11 // store XOR of High 64 bits and Low 64
113 // bits of HashKey^2 <<1 mod poly here
114 // (for Karatsuba purposes)
115#define HashKey_3_k 16*12 // store XOR of High 64 bits and Low 64
116 // bits of HashKey^3 <<1 mod poly here
117 // (for Karatsuba purposes)
118#define HashKey_4_k 16*13 // store XOR of High 64 bits and Low 64
119 // bits of HashKey^4 <<1 mod poly here
120 // (for Karatsuba purposes)
121
122#define arg1 rdi
123#define arg2 rsi
124#define arg3 rdx
125#define arg4 rcx
126#define arg5 r8
127#define arg6 r9
128#define arg7 STACK_OFFSET+8(%rsp)
129#define arg8 STACK_OFFSET+16(%rsp)
130#define arg9 STACK_OFFSET+24(%rsp)
131#define arg10 STACK_OFFSET+32(%rsp)
132#define arg11 STACK_OFFSET+40(%rsp)
133#define keysize 2*15*16(%arg1)
134#endif
135
136
137#define STATE1 %xmm0
138#define STATE2 %xmm4
139#define STATE3 %xmm5
140#define STATE4 %xmm6
141#define STATE STATE1
142#define IN1 %xmm1
143#define IN2 %xmm7
144#define IN3 %xmm8
145#define IN4 %xmm9
146#define IN IN1
147#define KEY %xmm2
148#define IV %xmm3
149
150#define BSWAP_MASK %xmm10
151#define CTR %xmm11
152#define INC %xmm12
153
154#define GF128MUL_MASK %xmm10
155
156#ifdef __x86_64__
157#define AREG %rax
158#define KEYP %rdi
159#define OUTP %rsi
160#define UKEYP OUTP
161#define INP %rdx
162#define LEN %rcx
163#define IVP %r8
164#define KLEN %r9d
165#define T1 %r10
166#define TKEYP T1
167#define T2 %r11
168#define TCTR_LOW T2
169#else
170#define AREG %eax
171#define KEYP %edi
172#define OUTP AREG
173#define UKEYP OUTP
174#define INP %edx
175#define LEN %esi
176#define IVP %ebp
177#define KLEN %ebx
178#define T1 %ecx
179#define TKEYP T1
180#endif
181
182.macro FUNC_SAVE
183 push %r12
184 push %r13
185 push %r14
186#
187# states of %xmm registers %xmm6:%xmm15 not saved
188# all %xmm registers are clobbered
189#
190.endm
191
192
193.macro FUNC_RESTORE
194 pop %r14
195 pop %r13
196 pop %r12
197.endm
198
199# Precompute hashkeys.
200# Input: Hash subkey.
201# Output: HashKeys stored in gcm_context_data. Only needs to be called
202# once per key.
203# clobbers r12, and tmp xmm registers.
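# HashKey<<1 mod poly and its powers up to HashKey^4, together with the XORed
# high/low halves used by the Karatsuba multiplications, are computed once
# here so the 4-block GHASH loop can reuse them on every call.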
204.macro PRECOMPUTE SUBKEY TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 TMP7
205 mov \SUBKEY, %r12
206 movdqu (%r12), \TMP3
207 movdqa SHUF_MASK(%rip), \TMP2
208 PSHUFB_XMM \TMP2, \TMP3
209
210 # precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
211
212 movdqa \TMP3, \TMP2
213 psllq $1, \TMP3
214 psrlq $63, \TMP2
215 movdqa \TMP2, \TMP1
216 pslldq $8, \TMP2
217 psrldq $8, \TMP1
218 por \TMP2, \TMP3
219
220 # reduce HashKey<<1
221
222 pshufd $0x24, \TMP1, \TMP2
223 pcmpeqd TWOONE(%rip), \TMP2
224 pand POLY(%rip), \TMP2
225 pxor \TMP2, \TMP3
226 movdqa \TMP3, HashKey(%arg2)
227
228 movdqa \TMP3, \TMP5
229 pshufd $78, \TMP3, \TMP1
230 pxor \TMP3, \TMP1
231 movdqa \TMP1, HashKey_k(%arg2)
232
233 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
234# TMP5 = HashKey^2<<1 (mod poly)
235 movdqa \TMP5, HashKey_2(%arg2)
236# HashKey_2 = HashKey^2<<1 (mod poly)
237 pshufd $78, \TMP5, \TMP1
238 pxor \TMP5, \TMP1
239 movdqa \TMP1, HashKey_2_k(%arg2)
240
241 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
242# TMP5 = HashKey^3<<1 (mod poly)
243 movdqa \TMP5, HashKey_3(%arg2)
244 pshufd $78, \TMP5, \TMP1
245 pxor \TMP5, \TMP1
246 movdqa \TMP1, HashKey_3_k(%arg2)
247
248 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
249# TMP5 = HashKey^4<<1 (mod poly)
250 movdqa \TMP5, HashKey_4(%arg2)
251 pshufd $78, \TMP5, \TMP1
252 pxor \TMP5, \TMP1
253 movdqa \TMP1, HashKey_4_k(%arg2)
254.endm
255
256# GCM_INIT initializes a gcm_context struct to prepare for encoding/decoding.
257# Clobbers rax, r10-r13 and xmm0-xmm6, %xmm13
258.macro GCM_INIT Iv SUBKEY AAD AADLEN
259 mov \AADLEN, %r11
260 mov %r11, AadLen(%arg2) # ctx_data.aad_length = aad_length
261 xor %r11, %r11
262 mov %r11, InLen(%arg2) # ctx_data.in_length = 0
263 mov %r11, PBlockLen(%arg2) # ctx_data.partial_block_length = 0
264 mov %r11, PBlockEncKey(%arg2) # ctx_data.partial_block_enc_key = 0
265 mov \Iv, %rax
266 movdqu (%rax), %xmm0
267 movdqu %xmm0, OrigIV(%arg2) # ctx_data.orig_IV = iv
268
269 movdqa SHUF_MASK(%rip), %xmm2
270 PSHUFB_XMM %xmm2, %xmm0
271 movdqu %xmm0, CurCount(%arg2) # ctx_data.current_counter = iv
272
273 PRECOMPUTE \SUBKEY, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
274 movdqa HashKey(%arg2), %xmm13
275
276 CALC_AAD_HASH %xmm13, \AAD, \AADLEN, %xmm0, %xmm1, %xmm2, %xmm3, \
277 %xmm4, %xmm5, %xmm6
278.endm
279
280# GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context
281# struct has been initialized by GCM_INIT.
282# Requires the input data be at least 1 byte long because of READ_PARTIAL_BLOCK
283# Clobbers rax, r10-r13, and xmm0-xmm15
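# Flow: finish any partial block left over from a previous update call,
# encrypt/decrypt 0-3 blocks so the remainder is a multiple of 4, run the
# 4-blocks-at-a-time loop, then stash the state of a trailing partial block
# for the next call.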
284.macro GCM_ENC_DEC operation
285 movdqu AadHash(%arg2), %xmm8
286 movdqu HashKey(%arg2), %xmm13
287 add %arg5, InLen(%arg2)
288
289 xor %r11, %r11 # initialise the data pointer offset as zero
290 PARTIAL_BLOCK %arg3 %arg4 %arg5 %r11 %xmm8 \operation
291
292 sub %r11, %arg5 # sub partial block data used
293 mov %arg5, %r13 # save the number of bytes
294
295 and $-16, %r13 # %r13 = %r13 - (%r13 mod 16)
296 mov %r13, %r12
297 # Encrypt/Decrypt first few blocks
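	# the and below leaves %r12 = (number of complete 16-byte blocks mod 4)*16,
	# which picks how many blocks (0-3) to handle before the 4-block loop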
298
299 and $(3<<4), %r12
300 jz _initial_num_blocks_is_0_\@
301 cmp $(2<<4), %r12
302 jb _initial_num_blocks_is_1_\@
303 je _initial_num_blocks_is_2_\@
304_initial_num_blocks_is_3_\@:
305 INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
306%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, \operation
307 sub $48, %r13
308 jmp _initial_blocks_\@
309_initial_num_blocks_is_2_\@:
310 INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
311%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, \operation
312 sub $32, %r13
313 jmp _initial_blocks_\@
314_initial_num_blocks_is_1_\@:
315 INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
316%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, \operation
317 sub $16, %r13
318 jmp _initial_blocks_\@
319_initial_num_blocks_is_0_\@:
320 INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
321%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, \operation
322_initial_blocks_\@:
323
324 # Main loop - Encrypt/Decrypt remaining blocks
325
326 cmp $0, %r13
327 je _zero_cipher_left_\@
328 sub $64, %r13
329 je _four_cipher_left_\@
330_crypt_by_4_\@:
331 GHASH_4_ENCRYPT_4_PARALLEL_\operation %xmm9, %xmm10, %xmm11, %xmm12, \
332 %xmm13, %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, \
333 %xmm7, %xmm8, enc
334 add $64, %r11
335 sub $64, %r13
336 jne _crypt_by_4_\@
337_four_cipher_left_\@:
338 GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
339%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
340_zero_cipher_left_\@:
341 movdqu %xmm8, AadHash(%arg2)
342 movdqu %xmm0, CurCount(%arg2)
343
344 mov %arg5, %r13
345 and $15, %r13 # %r13 = arg5 (mod 16)
346 je _multiple_of_16_bytes_\@
347
348 mov %r13, PBlockLen(%arg2)
349
350 # Handle the last <16 Byte block separately
351 paddd ONE(%rip), %xmm0 # INCR CNT to get Yn
352 movdqu %xmm0, CurCount(%arg2)
353 movdqa SHUF_MASK(%rip), %xmm10
354 PSHUFB_XMM %xmm10, %xmm0
355
356 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # Encrypt(K, Yn)
357 movdqu %xmm0, PBlockEncKey(%arg2)
358
359 cmp $16, %arg5
360 jge _large_enough_update_\@
361
362 lea (%arg4,%r11,1), %r10
363 mov %r13, %r12
364 READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1
365 jmp _data_read_\@
366
367_large_enough_update_\@:
368 sub $16, %r11
369 add %r13, %r11
370
371 # receive the last <16 Byte block
372 movdqu (%arg4, %r11, 1), %xmm1
373
374 sub %r13, %r11
375 add $16, %r11
376
377 lea SHIFT_MASK+16(%rip), %r12
378 # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
379 # (r13 is the number of bytes in plaintext mod 16)
380 sub %r13, %r12
381 # get the appropriate shuffle mask
382 movdqu (%r12), %xmm2
383 # shift right 16-r13 bytes
384 PSHUFB_XMM %xmm2, %xmm1
385
386_data_read_\@:
387 lea ALL_F+16(%rip), %r12
388 sub %r13, %r12
389
390.ifc \operation, dec
391 movdqa %xmm1, %xmm2
392.endif
393 pxor %xmm1, %xmm0 # XOR Encrypt(K, Yn)
394 movdqu (%r12), %xmm1
395 # get the appropriate mask to mask out top 16-r13 bytes of xmm0
396 pand %xmm1, %xmm0 # mask out top 16-r13 bytes of xmm0
397.ifc \operation, dec
398 pand %xmm1, %xmm2
399 movdqa SHUF_MASK(%rip), %xmm10
400 PSHUFB_XMM %xmm10 ,%xmm2
401
402 pxor %xmm2, %xmm8
403.else
404 movdqa SHUF_MASK(%rip), %xmm10
405 PSHUFB_XMM %xmm10,%xmm0
406
407 pxor %xmm0, %xmm8
408.endif
409
410 movdqu %xmm8, AadHash(%arg2)
411.ifc \operation, enc
412 # GHASH computation for the last <16 byte block
413 movdqa SHUF_MASK(%rip), %xmm10
414 # shuffle xmm0 back to output as ciphertext
415 PSHUFB_XMM %xmm10, %xmm0
416.endif
417
418 # Output %r13 bytes
419 MOVQ_R64_XMM %xmm0, %rax
420 cmp $8, %r13
421 jle _less_than_8_bytes_left_\@
422 mov %rax, (%arg3 , %r11, 1)
423 add $8, %r11
424 psrldq $8, %xmm0
425 MOVQ_R64_XMM %xmm0, %rax
426 sub $8, %r13
427_less_than_8_bytes_left_\@:
428 mov %al, (%arg3, %r11, 1)
429 add $1, %r11
430 shr $8, %rax
431 sub $1, %r13
432 jne _less_than_8_bytes_left_\@
433_multiple_of_16_bytes_\@:
434.endm
435
436# GCM_COMPLETE Finishes update of tag of last partial block
437# Output: Authorization Tag (AUTH_TAG)
438# Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
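# Any outstanding partial block is folded into the hash first, then the
# 128-bit length block len(A)||len(C) is hashed and the result is XORed with
# Encrypt(K, Y0).  The tag is written out in 8/4/2/1 byte pieces so any
# auth_tag_len up to 16 bytes is supported.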
439.macro GCM_COMPLETE AUTHTAG AUTHTAGLEN
440 movdqu AadHash(%arg2), %xmm8
441 movdqu HashKey(%arg2), %xmm13
442
443 mov PBlockLen(%arg2), %r12
444
445 cmp $0, %r12
446 je _partial_done\@
447
448 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
449
450_partial_done\@:
451	mov AadLen(%arg2), %r12  # %r12 = aadLen (number of bytes)
452 shl $3, %r12 # convert into number of bits
453 movd %r12d, %xmm15 # len(A) in %xmm15
454 mov InLen(%arg2), %r12
455	shl	$3, %r12		  # len(C) in bits (*8)
456 MOVQ_R64_XMM %r12, %xmm1
457
458 pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000
459 pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C)
460 pxor %xmm15, %xmm8
461 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
462 # final GHASH computation
463 movdqa SHUF_MASK(%rip), %xmm10
464 PSHUFB_XMM %xmm10, %xmm8
465
466 movdqu OrigIV(%arg2), %xmm0 # %xmm0 = Y0
467 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Y0)
468 pxor %xmm8, %xmm0
469_return_T_\@:
470 mov \AUTHTAG, %r10 # %r10 = authTag
471 mov \AUTHTAGLEN, %r11 # %r11 = auth_tag_len
472 cmp $16, %r11
473 je _T_16_\@
474 cmp $8, %r11
475 jl _T_4_\@
476_T_8_\@:
477 MOVQ_R64_XMM %xmm0, %rax
478 mov %rax, (%r10)
479 add $8, %r10
480 sub $8, %r11
481 psrldq $8, %xmm0
482 cmp $0, %r11
483 je _return_T_done_\@
484_T_4_\@:
485 movd %xmm0, %eax
486 mov %eax, (%r10)
487 add $4, %r10
488 sub $4, %r11
489 psrldq $4, %xmm0
490 cmp $0, %r11
491 je _return_T_done_\@
492_T_123_\@:
493 movd %xmm0, %eax
494 cmp $2, %r11
495 jl _T_1_\@
496 mov %ax, (%r10)
497 cmp $2, %r11
498 je _return_T_done_\@
499 add $2, %r10
500 sar $16, %eax
501_T_1_\@:
502 mov %al, (%r10)
503 jmp _return_T_done_\@
504_T_16_\@:
505 movdqu %xmm0, (%r10)
506_return_T_done_\@:
507.endm
508
509#ifdef __x86_64__
510/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
511*
512*
513* Input: A and B (128-bits each, bit-reflected)
514* Output: C = A*B*x mod poly, (i.e. >>1 )
515* To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
516* GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
517*
518*/
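# Karatsuba over GF(2)[x]: with A = a1:a0 and B = b1:b0 (64-bit halves),
#   A*B = a1*b1<<128 ^ a0*b0 ^ ((a1^a0)*(b1^b0) ^ a1*b1 ^ a0*b0)<<64
# so three PCLMULQDQs replace four, followed by the shift-based reduction
# modulo the bit-reflected polynomial named above.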
519.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
520 movdqa \GH, \TMP1
521 pshufd $78, \GH, \TMP2
522 pshufd $78, \HK, \TMP3
523 pxor \GH, \TMP2 # TMP2 = a1+a0
524 pxor \HK, \TMP3 # TMP3 = b1+b0
525 PCLMULQDQ 0x11, \HK, \TMP1 # TMP1 = a1*b1
526 PCLMULQDQ 0x00, \HK, \GH # GH = a0*b0
527 PCLMULQDQ 0x00, \TMP3, \TMP2 # TMP2 = (a0+a1)*(b1+b0)
528 pxor \GH, \TMP2
529	pxor	   \TMP1, \TMP2		  # TMP2 = a1*b0 + a0*b1 (middle term)
530 movdqa \TMP2, \TMP3
531 pslldq $8, \TMP3 # left shift TMP3 2 DWs
532 psrldq $8, \TMP2 # right shift TMP2 2 DWs
533 pxor \TMP3, \GH
534	pxor	   \TMP2, \TMP1		  # TMP1:GH holds the result of GH*HK
535
536 # first phase of the reduction
537
538 movdqa \GH, \TMP2
539 movdqa \GH, \TMP3
540 movdqa \GH, \TMP4 # copy GH into TMP2,TMP3 and TMP4
541					  # in order to perform
542					  # independent shifts
543	pslld	   $31, \TMP2		  # packed left shift <<31
544	pslld	   $30, \TMP3		  # packed left shift <<30
545	pslld	   $25, \TMP4		  # packed left shift <<25
546 pxor \TMP3, \TMP2 # xor the shifted versions
547 pxor \TMP4, \TMP2
548 movdqa \TMP2, \TMP5
549 psrldq $4, \TMP5 # right shift TMP5 1 DW
550 pslldq $12, \TMP2 # left shift TMP2 3 DWs
551 pxor \TMP2, \GH
552
553 # second phase of the reduction
554
555 movdqa \GH,\TMP2 # copy GH into TMP2,TMP3 and TMP4
556					  # in order to perform
557 # independent shifts
558 movdqa \GH,\TMP3
559 movdqa \GH,\TMP4
560	psrld	   $1,\TMP2		  # packed right shift >>1
561	psrld	   $2,\TMP3		  # packed right shift >>2
562	psrld	   $7,\TMP4		  # packed right shift >>7
563 pxor \TMP3,\TMP2 # xor the shifted versions
564 pxor \TMP4,\TMP2
565 pxor \TMP5, \TMP2
566 pxor \TMP2, \GH
567	pxor	   \TMP1, \GH		  # result is in GH
568.endm
569
570# Reads DLEN bytes starting at DPTR and stores in XMMDst
571# where 0 < DLEN < 16
572# Clobbers %rax, DLEN and XMM1
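# If DLEN >= 8 the first 8 bytes are fetched with one qword load, then the
# remaining tail bytes are gathered one at a time, so the macro never reads
# past DPTR + DLEN.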
573.macro READ_PARTIAL_BLOCK DPTR DLEN XMM1 XMMDst
574 cmp $8, \DLEN
575 jl _read_lt8_\@
576 mov (\DPTR), %rax
577 MOVQ_R64_XMM %rax, \XMMDst
578 sub $8, \DLEN
579 jz _done_read_partial_block_\@
580 xor %eax, %eax
581_read_next_byte_\@:
582 shl $8, %rax
583 mov 7(\DPTR, \DLEN, 1), %al
584 dec \DLEN
585 jnz _read_next_byte_\@
586 MOVQ_R64_XMM %rax, \XMM1
587 pslldq $8, \XMM1
588 por \XMM1, \XMMDst
589 jmp _done_read_partial_block_\@
590_read_lt8_\@:
591 xor %eax, %eax
592_read_next_byte_lt8_\@:
593 shl $8, %rax
594 mov -1(\DPTR, \DLEN, 1), %al
595 dec \DLEN
596 jnz _read_next_byte_lt8_\@
597 MOVQ_R64_XMM %rax, \XMMDst
598_done_read_partial_block_\@:
599.endm
600
601# CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
602# clobbers r10-11, xmm14
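# Full 16-byte AAD blocks are byte-reflected and folded into the hash with
# GHASH_MUL; a trailing partial block is gathered with READ_PARTIAL_BLOCK and
# folded the same way.  The result is saved in AadHash(%arg2).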
603.macro CALC_AAD_HASH HASHKEY AAD AADLEN TMP1 TMP2 TMP3 TMP4 TMP5 \
604 TMP6 TMP7
605 MOVADQ SHUF_MASK(%rip), %xmm14
606 mov \AAD, %r10 # %r10 = AAD
607 mov \AADLEN, %r11 # %r11 = aadLen
608 pxor \TMP7, \TMP7
609 pxor \TMP6, \TMP6
610
611 cmp $16, %r11
612 jl _get_AAD_rest\@
613_get_AAD_blocks\@:
614 movdqu (%r10), \TMP7
615 PSHUFB_XMM %xmm14, \TMP7 # byte-reflect the AAD data
616 pxor \TMP7, \TMP6
617 GHASH_MUL \TMP6, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
618 add $16, %r10
619 sub $16, %r11
620 cmp $16, %r11
621 jge _get_AAD_blocks\@
622
623 movdqu \TMP6, \TMP7
624
625 /* read the last <16B of AAD */
626_get_AAD_rest\@:
627 cmp $0, %r11
628 je _get_AAD_done\@
629
630 READ_PARTIAL_BLOCK %r10, %r11, \TMP1, \TMP7
631 PSHUFB_XMM %xmm14, \TMP7 # byte-reflect the AAD data
632 pxor \TMP6, \TMP7
633 GHASH_MUL \TMP7, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
634 movdqu \TMP7, \TMP6
635
636_get_AAD_done\@:
637 movdqu \TMP6, AadHash(%arg2)
638.endm
639
640# PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks
641# between update calls.
642# Requires the input data be at least 1 byte long due to READ_PARTIAL_BLOCK
643# Outputs encrypted bytes, and updates hash and partial info in gcm_context_data
644# Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13
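# The saved Encrypt(K, Yn) from PBlockEncKey is byte-rotated so it lines up
# with the new input bytes, masked, and XORed with them; the AAD hash and
# PBlockLen are then updated.  If the partial block is still not full, only
# PBlockLen is advanced and the hash update is deferred.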
645.macro PARTIAL_BLOCK CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \
646 AAD_HASH operation
647 mov PBlockLen(%arg2), %r13
648 cmp $0, %r13
649 je _partial_block_done_\@ # Leave Macro if no partial blocks
650 # Read in input data without over reading
651 cmp $16, \PLAIN_CYPH_LEN
652 jl _fewer_than_16_bytes_\@
653 movups (\PLAIN_CYPH_IN), %xmm1 # If more than 16 bytes, just fill xmm
654 jmp _data_read_\@
655
656_fewer_than_16_bytes_\@:
657 lea (\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10
658 mov \PLAIN_CYPH_LEN, %r12
659 READ_PARTIAL_BLOCK %r10 %r12 %xmm0 %xmm1
660
661 mov PBlockLen(%arg2), %r13
662
663_data_read_\@: # Finished reading in data
664
665 movdqu PBlockEncKey(%arg2), %xmm9
666 movdqu HashKey(%arg2), %xmm13
667
668 lea SHIFT_MASK(%rip), %r12
669
670 # adjust the shuffle mask pointer to be able to shift r13 bytes
671	# (r13 is the number of bytes in plaintext mod 16)
672 add %r13, %r12
673 movdqu (%r12), %xmm2 # get the appropriate shuffle mask
674 PSHUFB_XMM %xmm2, %xmm9 # shift right r13 bytes
675
676.ifc \operation, dec
677 movdqa %xmm1, %xmm3
678 pxor %xmm1, %xmm9 # Cyphertext XOR E(K, Yn)
679
680 mov \PLAIN_CYPH_LEN, %r10
681 add %r13, %r10
682 # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
683 sub $16, %r10
684	# Determine if partial block is not being filled and
685 # shift mask accordingly
686 jge _no_extra_mask_1_\@
687 sub %r10, %r12
688_no_extra_mask_1_\@:
689
690 movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
691 # get the appropriate mask to mask out bottom r13 bytes of xmm9
692 pand %xmm1, %xmm9 # mask out bottom r13 bytes of xmm9
693
694 pand %xmm1, %xmm3
695 movdqa SHUF_MASK(%rip), %xmm10
696 PSHUFB_XMM %xmm10, %xmm3
697 PSHUFB_XMM %xmm2, %xmm3
698 pxor %xmm3, \AAD_HASH
699
700 cmp $0, %r10
701 jl _partial_incomplete_1_\@
702
703 # GHASH computation for the last <16 Byte block
704 GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
705 xor %rax,%rax
706
707 mov %rax, PBlockLen(%arg2)
708 jmp _dec_done_\@
709_partial_incomplete_1_\@:
710 add \PLAIN_CYPH_LEN, PBlockLen(%arg2)
711_dec_done_\@:
712 movdqu \AAD_HASH, AadHash(%arg2)
713.else
714 pxor %xmm1, %xmm9 # Plaintext XOR E(K, Yn)
715
716 mov \PLAIN_CYPH_LEN, %r10
717 add %r13, %r10
718 # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
719 sub $16, %r10
720	# Determine if partial block is not being filled and
721 # shift mask accordingly
722 jge _no_extra_mask_2_\@
723 sub %r10, %r12
724_no_extra_mask_2_\@:
725
726 movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
727 # get the appropriate mask to mask out bottom r13 bytes of xmm9
728 pand %xmm1, %xmm9
729
730 movdqa SHUF_MASK(%rip), %xmm1
731 PSHUFB_XMM %xmm1, %xmm9
732 PSHUFB_XMM %xmm2, %xmm9
733 pxor %xmm9, \AAD_HASH
734
735 cmp $0, %r10
736 jl _partial_incomplete_2_\@
737
738 # GHASH computation for the last <16 Byte block
739 GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
740 xor %rax,%rax
741
742 mov %rax, PBlockLen(%arg2)
743 jmp _encode_done_\@
744_partial_incomplete_2_\@:
745 add \PLAIN_CYPH_LEN, PBlockLen(%arg2)
746_encode_done_\@:
747 movdqu \AAD_HASH, AadHash(%arg2)
748
749 movdqa SHUF_MASK(%rip), %xmm10
750 # shuffle xmm9 back to output as ciphertext
751 PSHUFB_XMM %xmm10, %xmm9
752 PSHUFB_XMM %xmm2, %xmm9
753.endif
754 # output encrypted Bytes
755 cmp $0, %r10
756 jl _partial_fill_\@
757 mov %r13, %r12
758 mov $16, %r13
759 # Set r13 to be the number of bytes to write out
760 sub %r12, %r13
761 jmp _count_set_\@
762_partial_fill_\@:
763 mov \PLAIN_CYPH_LEN, %r13
764_count_set_\@:
765 movdqa %xmm9, %xmm0
766 MOVQ_R64_XMM %xmm0, %rax
767 cmp $8, %r13
768 jle _less_than_8_bytes_left_\@
769
770 mov %rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
771 add $8, \DATA_OFFSET
772 psrldq $8, %xmm0
773 MOVQ_R64_XMM %xmm0, %rax
774 sub $8, %r13
775_less_than_8_bytes_left_\@:
776 movb %al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
777 add $1, \DATA_OFFSET
778 shr $8, %rax
779 sub $1, %r13
780 jne _less_than_8_bytes_left_\@
781_partial_block_done_\@:
782.endm # PARTIAL_BLOCK
783
784/*
785* if a = number of total plaintext bytes
786* b = floor(a/16)
787* num_initial_blocks = b mod 4
788* encrypt the initial num_initial_blocks blocks and apply ghash on
789* the ciphertext
790* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
791* are clobbered
792* arg1, %arg2, %arg3 are used as a pointer only, not modified
793*/
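# %xmm\i receives the running AAD hash and \i_seq names the registers that
# carry the counter blocks, so the callers pass (5,678), (6,78), (7,8) or
# (8,0) for 3, 2, 1 or 0 initial blocks.  When at least 64 bytes remain, the
# first four counter blocks of the main loop are also encrypted here.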
794
795
796.macro INITIAL_BLOCKS_ENC_DEC TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
797 XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
798 MOVADQ SHUF_MASK(%rip), %xmm14
799
800	movdqu AadHash(%arg2), %xmm\i		    # %xmm\i = current AAD hash
801
802 # start AES for num_initial_blocks blocks
803
804 movdqu CurCount(%arg2), \XMM0 # XMM0 = Y0
805
806.if (\i == 5) || (\i == 6) || (\i == 7)
807
808 MOVADQ ONE(%RIP),\TMP1
809 MOVADQ 0(%arg1),\TMP2
810.irpc index, \i_seq
811 paddd \TMP1, \XMM0 # INCR Y0
812.ifc \operation, dec
813 movdqa \XMM0, %xmm\index
814.else
815 MOVADQ \XMM0, %xmm\index
816.endif
817 PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap
818 pxor \TMP2, %xmm\index
819.endr
820 lea 0x10(%arg1),%r10
821 mov keysize,%eax
822 shr $2,%eax # 128->4, 192->6, 256->8
823 add $5,%eax # 128->9, 192->11, 256->13
824
825aes_loop_initial_\@:
826 MOVADQ (%r10),\TMP1
827.irpc index, \i_seq
828 AESENC \TMP1, %xmm\index
829.endr
830 add $16,%r10
831 sub $1,%eax
832 jnz aes_loop_initial_\@
833
834 MOVADQ (%r10), \TMP1
835.irpc index, \i_seq
836 AESENCLAST \TMP1, %xmm\index # Last Round
837.endr
838.irpc index, \i_seq
839 movdqu (%arg4 , %r11, 1), \TMP1
840 pxor \TMP1, %xmm\index
841 movdqu %xmm\index, (%arg3 , %r11, 1)
842 # write back plaintext/ciphertext for num_initial_blocks
843 add $16, %r11
844
845.ifc \operation, dec
846 movdqa \TMP1, %xmm\index
847.endif
848 PSHUFB_XMM %xmm14, %xmm\index
849
850 # prepare plaintext/ciphertext for GHASH computation
851.endr
852.endif
853
854 # apply GHASH on num_initial_blocks blocks
855
856.if \i == 5
857 pxor %xmm5, %xmm6
858 GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
859 pxor %xmm6, %xmm7
860 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
861 pxor %xmm7, %xmm8
862 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
863.elseif \i == 6
864 pxor %xmm6, %xmm7
865 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
866 pxor %xmm7, %xmm8
867 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
868.elseif \i == 7
869 pxor %xmm7, %xmm8
870 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
871.endif
872 cmp $64, %r13
873 jl _initial_blocks_done\@
874 # no need for precomputed values
875/*
876*
877* Precomputations for HashKey parallel with encryption of first 4 blocks.
878* HashKey_i_k holds XORed values of the low and high parts of HashKey_i
879*/
880 MOVADQ ONE(%RIP),\TMP1
881 paddd \TMP1, \XMM0 # INCR Y0
882 MOVADQ \XMM0, \XMM1
883 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
884
885 paddd \TMP1, \XMM0 # INCR Y0
886 MOVADQ \XMM0, \XMM2
887 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
888
889 paddd \TMP1, \XMM0 # INCR Y0
890 MOVADQ \XMM0, \XMM3
891 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
892
893 paddd \TMP1, \XMM0 # INCR Y0
894 MOVADQ \XMM0, \XMM4
895 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
896
897 MOVADQ 0(%arg1),\TMP1
898 pxor \TMP1, \XMM1
899 pxor \TMP1, \XMM2
900 pxor \TMP1, \XMM3
901 pxor \TMP1, \XMM4
902.irpc index, 1234 # do 4 rounds
903 movaps 0x10*\index(%arg1), \TMP1
904 AESENC \TMP1, \XMM1
905 AESENC \TMP1, \XMM2
906 AESENC \TMP1, \XMM3
907 AESENC \TMP1, \XMM4
908.endr
909.irpc index, 56789 # do next 5 rounds
910 movaps 0x10*\index(%arg1), \TMP1
911 AESENC \TMP1, \XMM1
912 AESENC \TMP1, \XMM2
913 AESENC \TMP1, \XMM3
914 AESENC \TMP1, \XMM4
915.endr
916 lea 0xa0(%arg1),%r10
917 mov keysize,%eax
918 shr $2,%eax # 128->4, 192->6, 256->8
919 sub $4,%eax # 128->0, 192->2, 256->4
920 jz aes_loop_pre_done\@
921
922aes_loop_pre_\@:
923 MOVADQ (%r10),\TMP2
924.irpc index, 1234
925 AESENC \TMP2, %xmm\index
926.endr
927 add $16,%r10
928 sub $1,%eax
929 jnz aes_loop_pre_\@
930
931aes_loop_pre_done\@:
932 MOVADQ (%r10), \TMP2
933 AESENCLAST \TMP2, \XMM1
934 AESENCLAST \TMP2, \XMM2
935 AESENCLAST \TMP2, \XMM3
936 AESENCLAST \TMP2, \XMM4
937 movdqu 16*0(%arg4 , %r11 , 1), \TMP1
938 pxor \TMP1, \XMM1
939.ifc \operation, dec
940 movdqu \XMM1, 16*0(%arg3 , %r11 , 1)
941 movdqa \TMP1, \XMM1
942.endif
943 movdqu 16*1(%arg4 , %r11 , 1), \TMP1
944 pxor \TMP1, \XMM2
945.ifc \operation, dec
946 movdqu \XMM2, 16*1(%arg3 , %r11 , 1)
947 movdqa \TMP1, \XMM2
948.endif
949 movdqu 16*2(%arg4 , %r11 , 1), \TMP1
950 pxor \TMP1, \XMM3
951.ifc \operation, dec
952 movdqu \XMM3, 16*2(%arg3 , %r11 , 1)
953 movdqa \TMP1, \XMM3
954.endif
955 movdqu 16*3(%arg4 , %r11 , 1), \TMP1
956 pxor \TMP1, \XMM4
957.ifc \operation, dec
958 movdqu \XMM4, 16*3(%arg3 , %r11 , 1)
959 movdqa \TMP1, \XMM4
960.else
961 movdqu \XMM1, 16*0(%arg3 , %r11 , 1)
962 movdqu \XMM2, 16*1(%arg3 , %r11 , 1)
963 movdqu \XMM3, 16*2(%arg3 , %r11 , 1)
964 movdqu \XMM4, 16*3(%arg3 , %r11 , 1)
965.endif
966
967 add $64, %r11
968 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
969 pxor \XMMDst, \XMM1
970# combine GHASHed value with the corresponding ciphertext
971 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
972 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
973 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
974
975_initial_blocks_done\@:
976
977.endm
978
979/*
980* encrypt 4 blocks at a time
981* ghash the 4 previously encrypted ciphertext blocks
982* arg1, %arg3, %arg4 are used as pointers only, not modified
983* %r11 is the data offset value
984*/
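# The AES rounds for the four new counter blocks are interleaved with the
# PCLMULQDQ/Karatsuba GHASH of the four previously encrypted blocks, letting
# the two workloads overlap.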
985.macro GHASH_4_ENCRYPT_4_PARALLEL_ENC TMP1 TMP2 TMP3 TMP4 TMP5 \
986TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
987
988 movdqa \XMM1, \XMM5
989 movdqa \XMM2, \XMM6
990 movdqa \XMM3, \XMM7
991 movdqa \XMM4, \XMM8
992
993 movdqa SHUF_MASK(%rip), %xmm15
994 # multiply TMP5 * HashKey using karatsuba
995
996 movdqa \XMM5, \TMP4
997 pshufd $78, \XMM5, \TMP6
998 pxor \XMM5, \TMP6
999 paddd ONE(%rip), \XMM0 # INCR CNT
1000 movdqa HashKey_4(%arg2), \TMP5
1001 PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1
1002 movdqa \XMM0, \XMM1
1003 paddd ONE(%rip), \XMM0 # INCR CNT
1004 movdqa \XMM0, \XMM2
1005 paddd ONE(%rip), \XMM0 # INCR CNT
1006 movdqa \XMM0, \XMM3
1007 paddd ONE(%rip), \XMM0 # INCR CNT
1008 movdqa \XMM0, \XMM4
1009 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
1010 PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0
1011 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
1012 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
1013 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
1014
1015 pxor (%arg1), \XMM1
1016 pxor (%arg1), \XMM2
1017 pxor (%arg1), \XMM3
1018 pxor (%arg1), \XMM4
1019 movdqa HashKey_4_k(%arg2), \TMP5
1020 PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
1021 movaps 0x10(%arg1), \TMP1
1022 AESENC \TMP1, \XMM1 # Round 1
1023 AESENC \TMP1, \XMM2
1024 AESENC \TMP1, \XMM3
1025 AESENC \TMP1, \XMM4
1026 movaps 0x20(%arg1), \TMP1
1027 AESENC \TMP1, \XMM1 # Round 2
1028 AESENC \TMP1, \XMM2
1029 AESENC \TMP1, \XMM3
1030 AESENC \TMP1, \XMM4
1031 movdqa \XMM6, \TMP1
1032 pshufd $78, \XMM6, \TMP2
1033 pxor \XMM6, \TMP2
1034 movdqa HashKey_3(%arg2), \TMP5
1035 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
1036 movaps 0x30(%arg1), \TMP3
1037 AESENC \TMP3, \XMM1 # Round 3
1038 AESENC \TMP3, \XMM2
1039 AESENC \TMP3, \XMM3
1040 AESENC \TMP3, \XMM4
1041 PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0
1042 movaps 0x40(%arg1), \TMP3
1043 AESENC \TMP3, \XMM1 # Round 4
1044 AESENC \TMP3, \XMM2
1045 AESENC \TMP3, \XMM3
1046 AESENC \TMP3, \XMM4
1047 movdqa HashKey_3_k(%arg2), \TMP5
1048 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1049 movaps 0x50(%arg1), \TMP3
1050 AESENC \TMP3, \XMM1 # Round 5
1051 AESENC \TMP3, \XMM2
1052 AESENC \TMP3, \XMM3
1053 AESENC \TMP3, \XMM4
1054 pxor \TMP1, \TMP4
1055# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1056 pxor \XMM6, \XMM5
1057 pxor \TMP2, \TMP6
1058 movdqa \XMM7, \TMP1
1059 pshufd $78, \XMM7, \TMP2
1060 pxor \XMM7, \TMP2
1061 movdqa HashKey_2(%arg2), \TMP5
1062
1063 # Multiply TMP5 * HashKey using karatsuba
1064
1065 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1066 movaps 0x60(%arg1), \TMP3
1067 AESENC \TMP3, \XMM1 # Round 6
1068 AESENC \TMP3, \XMM2
1069 AESENC \TMP3, \XMM3
1070 AESENC \TMP3, \XMM4
1071 PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0
1072 movaps 0x70(%arg1), \TMP3
1073 AESENC \TMP3, \XMM1 # Round 7
1074 AESENC \TMP3, \XMM2
1075 AESENC \TMP3, \XMM3
1076 AESENC \TMP3, \XMM4
1077 movdqa HashKey_2_k(%arg2), \TMP5
1078 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1079 movaps 0x80(%arg1), \TMP3
1080 AESENC \TMP3, \XMM1 # Round 8
1081 AESENC \TMP3, \XMM2
1082 AESENC \TMP3, \XMM3
1083 AESENC \TMP3, \XMM4
1084 pxor \TMP1, \TMP4
1085# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1086 pxor \XMM7, \XMM5
1087 pxor \TMP2, \TMP6
1088
1089 # Multiply XMM8 * HashKey
1090 # XMM8 and TMP5 hold the values for the two operands
1091
1092 movdqa \XMM8, \TMP1
1093 pshufd $78, \XMM8, \TMP2
1094 pxor \XMM8, \TMP2
1095 movdqa HashKey(%arg2), \TMP5
1096 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1097 movaps 0x90(%arg1), \TMP3
1098 AESENC \TMP3, \XMM1 # Round 9
1099 AESENC \TMP3, \XMM2
1100 AESENC \TMP3, \XMM3
1101 AESENC \TMP3, \XMM4
1102 PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0
1103 lea 0xa0(%arg1),%r10
1104 mov keysize,%eax
1105 shr $2,%eax # 128->4, 192->6, 256->8
1106 sub $4,%eax # 128->0, 192->2, 256->4
1107 jz aes_loop_par_enc_done\@
1108
1109aes_loop_par_enc\@:
1110 MOVADQ (%r10),\TMP3
1111.irpc index, 1234
1112 AESENC \TMP3, %xmm\index
1113.endr
1114 add $16,%r10
1115 sub $1,%eax
1116 jnz aes_loop_par_enc\@
1117
1118aes_loop_par_enc_done\@:
1119 MOVADQ (%r10), \TMP3
1120 AESENCLAST \TMP3, \XMM1 # Round 10
1121 AESENCLAST \TMP3, \XMM2
1122 AESENCLAST \TMP3, \XMM3
1123 AESENCLAST \TMP3, \XMM4
1124 movdqa HashKey_k(%arg2), \TMP5
1125 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1126 movdqu (%arg4,%r11,1), \TMP3
1127 pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
1128 movdqu 16(%arg4,%r11,1), \TMP3
1129 pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK
1130 movdqu 32(%arg4,%r11,1), \TMP3
1131 pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK
1132 movdqu 48(%arg4,%r11,1), \TMP3
1133 pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK
1134 movdqu \XMM1, (%arg3,%r11,1) # Write to the ciphertext buffer
1135 movdqu \XMM2, 16(%arg3,%r11,1) # Write to the ciphertext buffer
1136 movdqu \XMM3, 32(%arg3,%r11,1) # Write to the ciphertext buffer
1137 movdqu \XMM4, 48(%arg3,%r11,1) # Write to the ciphertext buffer
1138 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
1139 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
1140 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
1141 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
1142
1143 pxor \TMP4, \TMP1
1144 pxor \XMM8, \XMM5
1145 pxor \TMP6, \TMP2
1146 pxor \TMP1, \TMP2
1147 pxor \XMM5, \TMP2
1148 movdqa \TMP2, \TMP3
1149 pslldq $8, \TMP3 # left shift TMP3 2 DWs
1150 psrldq $8, \TMP2 # right shift TMP2 2 DWs
1151 pxor \TMP3, \XMM5
1152 pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5
1153
1154 # first phase of reduction
1155
1156 movdqa \XMM5, \TMP2
1157 movdqa \XMM5, \TMP3
1158 movdqa \XMM5, \TMP4
1159# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
1160	pslld $31, \TMP2		  # packed left shift << 31
1161	pslld $30, \TMP3		  # packed left shift << 30
1162	pslld $25, \TMP4		  # packed left shift << 25
1163 pxor \TMP3, \TMP2 # xor the shifted versions
1164 pxor \TMP4, \TMP2
1165 movdqa \TMP2, \TMP5
1166 psrldq $4, \TMP5 # right shift T5 1 DW
1167 pslldq $12, \TMP2 # left shift T2 3 DWs
1168 pxor \TMP2, \XMM5
1169
1170 # second phase of reduction
1171
1172 movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
1173 movdqa \XMM5,\TMP3
1174 movdqa \XMM5,\TMP4
1175	psrld $1, \TMP2			  # packed right shift >>1
1176	psrld $2, \TMP3			  # packed right shift >>2
1177	psrld $7, \TMP4			  # packed right shift >>7
1178 pxor \TMP3,\TMP2 # xor the shifted versions
1179 pxor \TMP4,\TMP2
1180 pxor \TMP5, \TMP2
1181 pxor \TMP2, \XMM5
1182	pxor	  \TMP1, \XMM5		  # result is in XMM5
1183
1184 pxor \XMM5, \XMM1
1185.endm
1186
1187/*
1188* decrypt 4 blocks at a time
1189* ghash the 4 previously decrypted ciphertext blocks
1190* arg1, %arg3, %arg4 are used as pointers only, not modified
1191* %r11 is the data offset value
1192*/
1193.macro GHASH_4_ENCRYPT_4_PARALLEL_DEC TMP1 TMP2 TMP3 TMP4 TMP5 \
1194TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
1195
1196 movdqa \XMM1, \XMM5
1197 movdqa \XMM2, \XMM6
1198 movdqa \XMM3, \XMM7
1199 movdqa \XMM4, \XMM8
1200
1201 movdqa SHUF_MASK(%rip), %xmm15
1202 # multiply TMP5 * HashKey using karatsuba
1203
1204 movdqa \XMM5, \TMP4
1205 pshufd $78, \XMM5, \TMP6
1206 pxor \XMM5, \TMP6
1207 paddd ONE(%rip), \XMM0 # INCR CNT
1208 movdqa HashKey_4(%arg2), \TMP5
1209 PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1
1210 movdqa \XMM0, \XMM1
1211 paddd ONE(%rip), \XMM0 # INCR CNT
1212 movdqa \XMM0, \XMM2
1213 paddd ONE(%rip), \XMM0 # INCR CNT
1214 movdqa \XMM0, \XMM3
1215 paddd ONE(%rip), \XMM0 # INCR CNT
1216 movdqa \XMM0, \XMM4
1217 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
1218 PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0
1219 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
1220 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
1221 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
1222
1223 pxor (%arg1), \XMM1
1224 pxor (%arg1), \XMM2
1225 pxor (%arg1), \XMM3
1226 pxor (%arg1), \XMM4
1227 movdqa HashKey_4_k(%arg2), \TMP5
1228 PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
1229 movaps 0x10(%arg1), \TMP1
1230 AESENC \TMP1, \XMM1 # Round 1
1231 AESENC \TMP1, \XMM2
1232 AESENC \TMP1, \XMM3
1233 AESENC \TMP1, \XMM4
1234 movaps 0x20(%arg1), \TMP1
1235 AESENC \TMP1, \XMM1 # Round 2
1236 AESENC \TMP1, \XMM2
1237 AESENC \TMP1, \XMM3
1238 AESENC \TMP1, \XMM4
1239 movdqa \XMM6, \TMP1
1240 pshufd $78, \XMM6, \TMP2
1241 pxor \XMM6, \TMP2
1242 movdqa HashKey_3(%arg2), \TMP5
1243 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
1244 movaps 0x30(%arg1), \TMP3
1245 AESENC \TMP3, \XMM1 # Round 3
1246 AESENC \TMP3, \XMM2
1247 AESENC \TMP3, \XMM3
1248 AESENC \TMP3, \XMM4
1249 PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0
1250 movaps 0x40(%arg1), \TMP3
1251 AESENC \TMP3, \XMM1 # Round 4
1252 AESENC \TMP3, \XMM2
1253 AESENC \TMP3, \XMM3
1254 AESENC \TMP3, \XMM4
1255 movdqa HashKey_3_k(%arg2), \TMP5
1256 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1257 movaps 0x50(%arg1), \TMP3
1258 AESENC \TMP3, \XMM1 # Round 5
1259 AESENC \TMP3, \XMM2
1260 AESENC \TMP3, \XMM3
1261 AESENC \TMP3, \XMM4
1262 pxor \TMP1, \TMP4
1263# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1264 pxor \XMM6, \XMM5
1265 pxor \TMP2, \TMP6
1266 movdqa \XMM7, \TMP1
1267 pshufd $78, \XMM7, \TMP2
1268 pxor \XMM7, \TMP2
1269 movdqa HashKey_2(%arg2), \TMP5
1270
1271 # Multiply TMP5 * HashKey using Karatsuba
1272
1273 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1274 movaps 0x60(%arg1), \TMP3
1275 AESENC \TMP3, \XMM1 # Round 6
1276 AESENC \TMP3, \XMM2
1277 AESENC \TMP3, \XMM3
1278 AESENC \TMP3, \XMM4
1279 PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0
1280 movaps 0x70(%arg1), \TMP3
1281 AESENC \TMP3, \XMM1 # Round 7
1282 AESENC \TMP3, \XMM2
1283 AESENC \TMP3, \XMM3
1284 AESENC \TMP3, \XMM4
1285 movdqa HashKey_2_k(%arg2), \TMP5
1286 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1287 movaps 0x80(%arg1), \TMP3
1288 AESENC \TMP3, \XMM1 # Round 8
1289 AESENC \TMP3, \XMM2
1290 AESENC \TMP3, \XMM3
1291 AESENC \TMP3, \XMM4
1292 pxor \TMP1, \TMP4
1293# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1294 pxor \XMM7, \XMM5
1295 pxor \TMP2, \TMP6
1296
1297 # Multiply XMM8 * HashKey
1298 # XMM8 and TMP5 hold the values for the two operands
1299
1300 movdqa \XMM8, \TMP1
1301 pshufd $78, \XMM8, \TMP2
1302 pxor \XMM8, \TMP2
1303 movdqa HashKey(%arg2), \TMP5
1304 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1305 movaps 0x90(%arg1), \TMP3
1306 AESENC \TMP3, \XMM1 # Round 9
1307 AESENC \TMP3, \XMM2
1308 AESENC \TMP3, \XMM3
1309 AESENC \TMP3, \XMM4
1310 PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0
1311 lea 0xa0(%arg1),%r10
1312 mov keysize,%eax
1313 shr $2,%eax # 128->4, 192->6, 256->8
1314 sub $4,%eax # 128->0, 192->2, 256->4
1315 jz aes_loop_par_dec_done\@
1316
1317aes_loop_par_dec\@:
1318 MOVADQ (%r10),\TMP3
1319.irpc index, 1234
1320 AESENC \TMP3, %xmm\index
1321.endr
1322 add $16,%r10
1323 sub $1,%eax
1324 jnz aes_loop_par_dec\@
1325
1326aes_loop_par_dec_done\@:
1327 MOVADQ (%r10), \TMP3
1328 AESENCLAST \TMP3, \XMM1 # last round
1329 AESENCLAST \TMP3, \XMM2
1330 AESENCLAST \TMP3, \XMM3
1331 AESENCLAST \TMP3, \XMM4
1332 movdqa HashKey_k(%arg2), \TMP5
1333 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1334 movdqu (%arg4,%r11,1), \TMP3
1335 pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
1336 movdqu \XMM1, (%arg3,%r11,1) # Write to plaintext buffer
1337 movdqa \TMP3, \XMM1
1338 movdqu 16(%arg4,%r11,1), \TMP3
1339 pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK
1340 movdqu \XMM2, 16(%arg3,%r11,1) # Write to plaintext buffer
1341 movdqa \TMP3, \XMM2
1342 movdqu 32(%arg4,%r11,1), \TMP3
1343 pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK
1344 movdqu \XMM3, 32(%arg3,%r11,1) # Write to plaintext buffer
1345 movdqa \TMP3, \XMM3
1346 movdqu 48(%arg4,%r11,1), \TMP3
1347 pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK
1348 movdqu \XMM4, 48(%arg3,%r11,1) # Write to plaintext buffer
1349 movdqa \TMP3, \XMM4
1350 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
1351 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
1352 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
1353 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
1354
1355 pxor \TMP4, \TMP1
1356 pxor \XMM8, \XMM5
1357 pxor \TMP6, \TMP2
1358 pxor \TMP1, \TMP2
1359 pxor \XMM5, \TMP2
1360 movdqa \TMP2, \TMP3
1361 pslldq $8, \TMP3 # left shift TMP3 2 DWs
1362 psrldq $8, \TMP2 # right shift TMP2 2 DWs
1363 pxor \TMP3, \XMM5
1364 pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5
1365
1366 # first phase of reduction
1367
1368 movdqa \XMM5, \TMP2
1369 movdqa \XMM5, \TMP3
1370 movdqa \XMM5, \TMP4
1371# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
1372 pslld $31, \TMP2 # packed left shift << 31
1373 pslld $30, \TMP3 # packed left shift << 30
1374 pslld $25, \TMP4 # packed left shift << 25
1375 pxor \TMP3, \TMP2 # xor the shifted versions
1376 pxor \TMP4, \TMP2
1377 movdqa \TMP2, \TMP5
1378 psrldq $4, \TMP5 # right shift T5 1 DW
1379 pslldq $12, \TMP2 # left shift T2 3 DWs
1380 pxor \TMP2, \XMM5
1381
1382 # second phase of reduction
1383
1384 movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
1385 movdqa \XMM5,\TMP3
1386 movdqa \XMM5,\TMP4
1387 psrld $1, \TMP2 # packed right shift >>1
1388 psrld $2, \TMP3 # packed right shift >>2
1389 psrld $7, \TMP4 # packed right shift >>7
1390 pxor \TMP3,\TMP2 # xor the shifted versions
1391 pxor \TMP4,\TMP2
1392 pxor \TMP5, \TMP2
1393 pxor \TMP2, \XMM5
1394 pxor \TMP1, \XMM5 # result is in XMM5
1395
1396 pxor \XMM5, \XMM1
1397.endm
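
/*
 * Illustrative sketch (not part of the original code): the 4-wide macros
 * above compute the aggregated GHASH update
 *
 *	X_new = (C1 xor X_old)*H^4 xor C2*H^3 xor C3*H^2 xor C4*H
 *
 * over GF(2^128), where C1..C4 are the ciphertext blocks of the previous
 * iteration and X_old is the running hash already folded into C1.  This
 * is why HashKey..HashKey_4 are precomputed: the four carry-less
 * multiplies become independent, and a single two-phase reduction modulo
 * x^128 + x^127 + x^126 + x^121 + 1 is done per iteration.
 */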
1398
1399/* GHASH the last 4 ciphertext blocks. */
1400.macro GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
1401TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
1402
1403 # Multiply TMP6 * HashKey (using Karatsuba)
1404
1405 movdqa \XMM1, \TMP6
1406 pshufd $78, \XMM1, \TMP2
1407 pxor \XMM1, \TMP2
1408 movdqa HashKey_4(%arg2), \TMP5
1409 PCLMULQDQ 0x11, \TMP5, \TMP6 # TMP6 = a1*b1
1410 PCLMULQDQ 0x00, \TMP5, \XMM1 # XMM1 = a0*b0
1411 movdqa HashKey_4_k(%arg2), \TMP4
1412 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1413 movdqa \XMM1, \XMMDst
1414 movdqa \TMP2, \XMM1 # result in TMP6, XMMDst, XMM1
1415
1416 # Multiply TMP1 * HashKey (using Karatsuba)
1417
1418 movdqa \XMM2, \TMP1
1419 pshufd $78, \XMM2, \TMP2
1420 pxor \XMM2, \TMP2
1421 movdqa HashKey_3(%arg2), \TMP5
1422 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1423 PCLMULQDQ 0x00, \TMP5, \XMM2 # XMM2 = a0*b0
1424 movdqa HashKey_3_k(%arg2), \TMP4
1425 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1426 pxor \TMP1, \TMP6
1427 pxor \XMM2, \XMMDst
1428 pxor \TMP2, \XMM1
1429# results accumulated in TMP6, XMMDst, XMM1
1430
1431 # Multiply TMP1 * HashKey (using Karatsuba)
1432
1433 movdqa \XMM3, \TMP1
1434 pshufd $78, \XMM3, \TMP2
1435 pxor \XMM3, \TMP2
1436 movdqa HashKey_2(%arg2), \TMP5
1437 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1438 PCLMULQDQ 0x00, \TMP5, \XMM3 # XMM3 = a0*b0
1439 movdqa HashKey_2_k(%arg2), \TMP4
1440 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1441 pxor \TMP1, \TMP6
1442 pxor \XMM3, \XMMDst
1443 pxor \TMP2, \XMM1 # results accumulated in TMP6, XMMDst, XMM1
1444
1445 # Multiply TMP1 * HashKey (using Karatsuba)
1446 movdqa \XMM4, \TMP1
1447 pshufd $78, \XMM4, \TMP2
1448 pxor \XMM4, \TMP2
1449 movdqa HashKey(%arg2), \TMP5
1450 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1451 PCLMULQDQ 0x00, \TMP5, \XMM4 # XMM4 = a0*b0
1452 movdqa HashKey_k(%arg2), \TMP4
1453 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1454 pxor \TMP1, \TMP6
1455 pxor \XMM4, \XMMDst
1456 pxor \XMM1, \TMP2
1457 pxor \TMP6, \TMP2
1458 pxor \XMMDst, \TMP2
1459 # middle section of the temp results combined as in karatsuba algorithm
1460 movdqa \TMP2, \TMP4
1461 pslldq $8, \TMP4 # left shift TMP4 2 DWs
1462 psrldq $8, \TMP2 # right shift TMP2 2 DWs
1463 pxor \TMP4, \XMMDst
1464 pxor \TMP2, \TMP6
1465# TMP6:XMMDst holds the result of the accumulated carry-less multiplications
1466 # first phase of the reduction
1467 movdqa \XMMDst, \TMP2
1468 movdqa \XMMDst, \TMP3
1469 movdqa \XMMDst, \TMP4
1470# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
1471 pslld $31, \TMP2 # packed left shifting << 31
1472 pslld $30, \TMP3 # packed left shifting << 30
1473 pslld $25, \TMP4 # packed left shifting << 25
1474 pxor \TMP3, \TMP2 # xor the shifted versions
1475 pxor \TMP4, \TMP2
1476 movdqa \TMP2, \TMP7
1477 psrldq $4, \TMP7 # right shift TMP7 1 DW
1478 pslldq $12, \TMP2 # left shift TMP2 3 DWs
1479 pxor \TMP2, \XMMDst
1480
1481 # second phase of the reduction
1482 movdqa \XMMDst, \TMP2
1483 # make 3 copies of XMMDst for doing 3 shift operations
1484 movdqa \XMMDst, \TMP3
1485 movdqa \XMMDst, \TMP4
1486 psrld $1, \TMP2 # packed right shift >> 1
1487 psrld $2, \TMP3 # packed right shift >> 2
1488 psrld $7, \TMP4 # packed right shift >> 7
1489 pxor \TMP3, \TMP2 # xor the shifted versions
1490 pxor \TMP4, \TMP2
1491 pxor \TMP7, \TMP2
1492 pxor \TMP2, \XMMDst
1493 pxor \TMP6, \XMMDst # reduced result is in XMMDst
1494.endm
1495
1496
1497/* Encryption of a single block
1498* uses eax & r10
1499*/
1500
1501.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1
1502
1503 pxor (%arg1), \XMM0
1504 mov keysize,%eax
1505 shr $2,%eax # 128->4, 192->6, 256->8
1506 add $5,%eax # 128->9, 192->11, 256->13
1507 lea 16(%arg1), %r10 # get first expanded key address
1508
1509_esb_loop_\@:
1510 MOVADQ (%r10),\TMP1
1511 AESENC \TMP1,\XMM0
1512 add $16,%r10
1513 sub $1,%eax
1514 jnz _esb_loop_\@
1515
1516 MOVADQ (%r10),\TMP1
1517 AESENCLAST \TMP1,\XMM0
1518.endm
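
/*
 * Round-count computation used by ENCRYPT_SINGLE_BLOCK and the parallel
 * AES loops, as a C sketch (illustrative only; key_length is the byte
 * length stored in the AES context):
 *
 *	nrounds = key_length / 4 + 6;	// 16 -> 10, 24 -> 12, 32 -> 14
 *
 * The loops issue (nrounds - 1) AESENC rounds and finish with a single
 * AESENCLAST using the last expanded round key.
 */
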
1519/*****************************************************************************
1520* void aesni_gcm_dec(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1521* struct gcm_context_data *data
1522* // Context data
1523* u8 *out, // Plaintext output. Decrypt in-place is allowed.
1524* const u8 *in, // Ciphertext input
1525* u64 plaintext_len, // Length of data in bytes for decryption.
1526* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
1527* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1528* // concatenated with 0x00000001. 16-byte aligned pointer.
1529* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
1530* const u8 *aad, // Additional Authentication Data (AAD)
1531* u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1532* u8 *auth_tag, // Authenticated Tag output. The driver will compare this to the
1533* // given authentication tag and only return the plaintext if they match.
1534* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
1535* // (most likely), 12 or 8.
1536*
1537* Assumptions:
1538*
1539* keys:
1540* keys are pre-expanded and aligned to 16 bytes. we are using the first
1541* set of 11 keys in the data structure void *aes_ctx
1542*
1543* iv:
1544* 0 1 2 3
1545* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1546* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1547* | Salt (From the SA) |
1548* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1549* | Initialization Vector |
1550* | (This is the sequence number from IPSec header) |
1551* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1552* | 0x1 |
1553* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1554*
1555*
1556*
1557* AAD:
1558* AAD padded to 128 bits with 0
1559* for example, assume AAD is a u32 vector
1560*
1561* if AAD is 8 bytes:
1562* AAD[3] = {A0, A1};
1563* padded AAD in xmm register = {A1 A0 0 0}
1564*
1565* 0 1 2 3
1566* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1567* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1568* | SPI (A1) |
1569* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1570* | 32-bit Sequence Number (A0) |
1571* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1572* | 0x0 |
1573* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1574*
1575* AAD Format with 32-bit Sequence Number
1576*
1577* if AAD is 12 bytes:
1578* AAD[3] = {A0, A1, A2};
1579* padded AAD in xmm register = {A2 A1 A0 0}
1580*
1581* 0 1 2 3
1582* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1583* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1586* | SPI (A2) |
1587* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1588* | 64-bit Extended Sequence Number {A1,A0} |
1589* | |
1590* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1591* | 0x0 |
1592* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1593*
1594* AAD Format with 64-bit Extended Sequence Number
1595*
1596* poly = x^128 + x^127 + x^126 + x^121 + 1
1597*
1598*****************************************************************************/
1599ENTRY(aesni_gcm_dec)
1600 FUNC_SAVE
1601
1602 GCM_INIT %arg6, arg7, arg8, arg9
1603 GCM_ENC_DEC dec
1604 GCM_COMPLETE arg10, arg11
1605 FUNC_RESTORE
1606 ret
1607ENDPROC(aesni_gcm_dec)
1608
1609
1610/*****************************************************************************
1611* void aesni_gcm_enc(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1612* struct gcm_context_data *data
1613* // Context data
1614* u8 *out, // Ciphertext output. Encrypt in-place is allowed.
1615* const u8 *in, // Plaintext input
1616* u64 plaintext_len, // Length of data in bytes for encryption.
1617* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
1618* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1619* // concatenated with 0x00000001. 16-byte aligned pointer.
1620* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
1621* const u8 *aad, // Additional Authentication Data (AAD)
1622* u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1623* u8 *auth_tag, // Authenticated Tag output.
1624* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
1625* // 12 or 8.
1626*
1627* Assumptions:
1628*
1629* keys:
1630* keys are pre-expanded and aligned to 16 bytes. we are using the
1631* first set of 11 keys in the data structure void *aes_ctx
1632*
1633*
1634* iv:
1635* 0 1 2 3
1636* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1637* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1638* | Salt (From the SA) |
1639* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1640* | Initialization Vector |
1641* | (This is the sequence number from IPSec header) |
1642* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1643* | 0x1 |
1644* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1645*
1646*
1647*
1648* AAD:
1649* AAD padded to 128 bits with 0
1650* for example, assume AAD is a u32 vector
1651*
1652* if AAD is 8 bytes:
1653* AAD[3] = {A0, A1};
1654* padded AAD in xmm register = {A1 A0 0 0}
1655*
1656* 0 1 2 3
1657* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1658* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1659* | SPI (A1) |
1660* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1661* | 32-bit Sequence Number (A0) |
1662* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1663* | 0x0 |
1664* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1665*
1666* AAD Format with 32-bit Sequence Number
1667*
1668* if AAD is 12 bytes:
1669* AAD[3] = {A0, A1, A2};
1670* padded AAD in xmm register = {A2 A1 A0 0}
1671*
1672* 0 1 2 3
1673* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1674* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1675* | SPI (A2) |
1676* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1677* | 64-bit Extended Sequence Number {A1,A0} |
1678* | |
1679* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1680* | 0x0 |
1681* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1682*
1683* AAD Format with 64-bit Extended Sequence Number
1684*
1685* poly = x^128 + x^127 + x^126 + x^121 + 1
1686***************************************************************************/
1687ENTRY(aesni_gcm_enc)
1688 FUNC_SAVE
1689
1690 GCM_INIT %arg6, arg7, arg8, arg9
1691 GCM_ENC_DEC enc
1692
1693 GCM_COMPLETE arg10, arg11
1694 FUNC_RESTORE
1695 ret
1696ENDPROC(aesni_gcm_enc)
1697
1698/*****************************************************************************
1699* void aesni_gcm_init(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1700* struct gcm_context_data *data,
1701* // context data
1702* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
1703* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1704* // concatenated with 0x00000001. 16-byte aligned pointer.
1705* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
1706* const u8 *aad, // Additional Authentication Data (AAD)
1707* u64 aad_len) // Length of AAD in bytes.
1708*/
1709ENTRY(aesni_gcm_init)
1710 FUNC_SAVE
1711 GCM_INIT %arg3, %arg4, %arg5, %arg6
1712 FUNC_RESTORE
1713 ret
1714ENDPROC(aesni_gcm_init)
1715
1716/*****************************************************************************
1717* void aesni_gcm_enc_update(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1718* struct gcm_context_data *data,
1719* // context data
1720* u8 *out, // Ciphertext output. Encrypt in-place is allowed.
1721* const u8 *in, // Plaintext input
1722* u64 plaintext_len) // Length of data in bytes for encryption.
1723*/
1724ENTRY(aesni_gcm_enc_update)
1725 FUNC_SAVE
1726 GCM_ENC_DEC enc
1727 FUNC_RESTORE
1728 ret
1729ENDPROC(aesni_gcm_enc_update)
1730
1731/*****************************************************************************
1732* void aesni_gcm_dec_update(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1733* struct gcm_context_data *data,
1734* // context data
1735* u8 *out, // Plaintext output. Decrypt in-place is allowed.
1736* const u8 *in, // Ciphertext input
1737* u64 plaintext_len) // Length of data in bytes for decryption.
1738*/
1739ENTRY(aesni_gcm_dec_update)
1740 FUNC_SAVE
1741 GCM_ENC_DEC dec
1742 FUNC_RESTORE
1743 ret
1744ENDPROC(aesni_gcm_dec_update)
1745
1746/*****************************************************************************
1747* void aesni_gcm_finalize(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1748* struct gcm_context_data *data,
1749* // context data
1750* u8 *auth_tag, // Authenticated Tag output.
1751* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
1752* // 12 or 8.
1753*/
1754ENTRY(aesni_gcm_finalize)
1755 FUNC_SAVE
1756 GCM_COMPLETE %arg3 %arg4
1757 FUNC_RESTORE
1758 ret
1759ENDPROC(aesni_gcm_finalize)
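
/*
 * Typical use of the streaming interface above from C, as a rough sketch
 * (illustrative only; the real callers are in the C glue code and also
 * deal with scatterlists, alignment and partial blocks):
 *
 *	struct gcm_context_data data;
 *
 *	kernel_fpu_begin();
 *	aesni_gcm_init(aes_ctx, &data, iv, hash_subkey, aad, aad_len);
 *	aesni_gcm_enc_update(aes_ctx, &data, dst, src, len);
 *	aesni_gcm_finalize(aes_ctx, &data, auth_tag, auth_tag_len);
 *	kernel_fpu_end();
 *
 * Decryption follows the same pattern with aesni_gcm_dec_update(); the
 * caller then compares the computed tag against the received one.
 */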
1760
1761#endif
1762
1763
1764.align 4
1765_key_expansion_128:
1766_key_expansion_256a:
1767 pshufd $0b11111111, %xmm1, %xmm1
1768 shufps $0b00010000, %xmm0, %xmm4
1769 pxor %xmm4, %xmm0
1770 shufps $0b10001100, %xmm0, %xmm4
1771 pxor %xmm4, %xmm0
1772 pxor %xmm1, %xmm0
1773 movaps %xmm0, (TKEYP)
1774 add $0x10, TKEYP
1775 ret
1776ENDPROC(_key_expansion_128)
1777ENDPROC(_key_expansion_256a)
1778
1779.align 4
1780_key_expansion_192a:
1781 pshufd $0b01010101, %xmm1, %xmm1
1782 shufps $0b00010000, %xmm0, %xmm4
1783 pxor %xmm4, %xmm0
1784 shufps $0b10001100, %xmm0, %xmm4
1785 pxor %xmm4, %xmm0
1786 pxor %xmm1, %xmm0
1787
1788 movaps %xmm2, %xmm5
1789 movaps %xmm2, %xmm6
1790 pslldq $4, %xmm5
1791 pshufd $0b11111111, %xmm0, %xmm3
1792 pxor %xmm3, %xmm2
1793 pxor %xmm5, %xmm2
1794
1795 movaps %xmm0, %xmm1
1796 shufps $0b01000100, %xmm0, %xmm6
1797 movaps %xmm6, (TKEYP)
1798 shufps $0b01001110, %xmm2, %xmm1
1799 movaps %xmm1, 0x10(TKEYP)
1800 add $0x20, TKEYP
1801 ret
1802ENDPROC(_key_expansion_192a)
1803
1804.align 4
1805_key_expansion_192b:
1806 pshufd $0b01010101, %xmm1, %xmm1
1807 shufps $0b00010000, %xmm0, %xmm4
1808 pxor %xmm4, %xmm0
1809 shufps $0b10001100, %xmm0, %xmm4
1810 pxor %xmm4, %xmm0
1811 pxor %xmm1, %xmm0
1812
1813 movaps %xmm2, %xmm5
1814 pslldq $4, %xmm5
1815 pshufd $0b11111111, %xmm0, %xmm3
1816 pxor %xmm3, %xmm2
1817 pxor %xmm5, %xmm2
1818
1819 movaps %xmm0, (TKEYP)
1820 add $0x10, TKEYP
1821 ret
1822ENDPROC(_key_expansion_192b)
1823
1824.align 4
1825_key_expansion_256b:
1826 pshufd $0b10101010, %xmm1, %xmm1
1827 shufps $0b00010000, %xmm2, %xmm4
1828 pxor %xmm4, %xmm2
1829 shufps $0b10001100, %xmm2, %xmm4
1830 pxor %xmm4, %xmm2
1831 pxor %xmm1, %xmm2
1832 movaps %xmm2, (TKEYP)
1833 add $0x10, TKEYP
1834 ret
1835ENDPROC(_key_expansion_256b)
1836
1837/*
1838 * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
1839 * unsigned int key_len)
1840 */
1841ENTRY(aesni_set_key)
1842 FRAME_BEGIN
1843#ifndef __x86_64__
1844 pushl KEYP
1845 movl (FRAME_OFFSET+8)(%esp), KEYP # ctx
1846 movl (FRAME_OFFSET+12)(%esp), UKEYP # in_key
1847 movl (FRAME_OFFSET+16)(%esp), %edx # key_len
1848#endif
1849 movups (UKEYP), %xmm0 # user key (first 16 bytes)
1850 movaps %xmm0, (KEYP)
1851 lea 0x10(KEYP), TKEYP # key addr
1852 movl %edx, 480(KEYP)
1853 pxor %xmm4, %xmm4 # xmm4 is assumed 0 in _key_expansion_x
1854 cmp $24, %dl
1855 jb .Lenc_key128
1856 je .Lenc_key192
1857 movups 0x10(UKEYP), %xmm2 # other user key
1858 movaps %xmm2, (TKEYP)
1859 add $0x10, TKEYP
1860 AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
1861 call _key_expansion_256a
1862 AESKEYGENASSIST 0x1 %xmm0 %xmm1
1863 call _key_expansion_256b
1864 AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2
1865 call _key_expansion_256a
1866 AESKEYGENASSIST 0x2 %xmm0 %xmm1
1867 call _key_expansion_256b
1868 AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3
1869 call _key_expansion_256a
1870 AESKEYGENASSIST 0x4 %xmm0 %xmm1
1871 call _key_expansion_256b
1872 AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4
1873 call _key_expansion_256a
1874 AESKEYGENASSIST 0x8 %xmm0 %xmm1
1875 call _key_expansion_256b
1876 AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5
1877 call _key_expansion_256a
1878 AESKEYGENASSIST 0x10 %xmm0 %xmm1
1879 call _key_expansion_256b
1880 AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6
1881 call _key_expansion_256a
1882 AESKEYGENASSIST 0x20 %xmm0 %xmm1
1883 call _key_expansion_256b
1884 AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7
1885 call _key_expansion_256a
1886 jmp .Ldec_key
1887.Lenc_key192:
1888 movq 0x10(UKEYP), %xmm2 # other user key
1889 AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
1890 call _key_expansion_192a
1891 AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2
1892 call _key_expansion_192b
1893 AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3
1894 call _key_expansion_192a
1895 AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4
1896 call _key_expansion_192b
1897 AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5
1898 call _key_expansion_192a
1899 AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6
1900 call _key_expansion_192b
1901 AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7
1902 call _key_expansion_192a
1903 AESKEYGENASSIST 0x80 %xmm2 %xmm1 # round 8
1904 call _key_expansion_192b
1905 jmp .Ldec_key
1906.Lenc_key128:
1907 AESKEYGENASSIST 0x1 %xmm0 %xmm1 # round 1
1908 call _key_expansion_128
1909 AESKEYGENASSIST 0x2 %xmm0 %xmm1 # round 2
1910 call _key_expansion_128
1911 AESKEYGENASSIST 0x4 %xmm0 %xmm1 # round 3
1912 call _key_expansion_128
1913 AESKEYGENASSIST 0x8 %xmm0 %xmm1 # round 4
1914 call _key_expansion_128
1915 AESKEYGENASSIST 0x10 %xmm0 %xmm1 # round 5
1916 call _key_expansion_128
1917 AESKEYGENASSIST 0x20 %xmm0 %xmm1 # round 6
1918 call _key_expansion_128
1919 AESKEYGENASSIST 0x40 %xmm0 %xmm1 # round 7
1920 call _key_expansion_128
1921 AESKEYGENASSIST 0x80 %xmm0 %xmm1 # round 8
1922 call _key_expansion_128
1923 AESKEYGENASSIST 0x1b %xmm0 %xmm1 # round 9
1924 call _key_expansion_128
1925 AESKEYGENASSIST 0x36 %xmm0 %xmm1 # round 10
1926 call _key_expansion_128
1927.Ldec_key:
1928 sub $0x10, TKEYP
1929 movaps (KEYP), %xmm0
1930 movaps (TKEYP), %xmm1
1931 movaps %xmm0, 240(TKEYP)
1932 movaps %xmm1, 240(KEYP)
1933 add $0x10, KEYP
1934 lea 240-16(TKEYP), UKEYP
1935.align 4
1936.Ldec_key_loop:
1937 movaps (KEYP), %xmm0
1938 AESIMC %xmm0 %xmm1
1939 movaps %xmm1, (UKEYP)
1940 add $0x10, KEYP
1941 sub $0x10, UKEYP
1942 cmp TKEYP, KEYP
1943 jb .Ldec_key_loop
1944 xor AREG, AREG
1945#ifndef __x86_64__
1946 popl KEYP
1947#endif
1948 FRAME_END
1949 ret
1950ENDPROC(aesni_set_key)
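
/*
 * Context layout assumed by the 240/480 offsets above, as a sketch (see
 * struct crypto_aes_ctx in the kernel headers):
 *
 *	struct crypto_aes_ctx {
 *		u32 key_enc[60];	// 240 bytes: encryption round keys
 *		u32 key_dec[60];	// 240 bytes: decryption round keys
 *		u32 key_length;		// 16, 24 or 32 (offset 480)
 *	};
 *
 * aesni_set_key() expands the user key into key_enc, then builds key_dec
 * by running AESIMC over the encryption round keys in reverse order; the
 * first and last round keys are copied over unchanged.
 */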
1951
1952/*
1953 * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
1954 */
1955ENTRY(aesni_enc)
1956 FRAME_BEGIN
1957#ifndef __x86_64__
1958 pushl KEYP
1959 pushl KLEN
1960 movl (FRAME_OFFSET+12)(%esp), KEYP # ctx
1961 movl (FRAME_OFFSET+16)(%esp), OUTP # dst
1962 movl (FRAME_OFFSET+20)(%esp), INP # src
1963#endif
1964 movl 480(KEYP), KLEN # key length
1965 movups (INP), STATE # input
1966 call _aesni_enc1
1967 movups STATE, (OUTP) # output
1968#ifndef __x86_64__
1969 popl KLEN
1970 popl KEYP
1971#endif
1972 FRAME_END
1973 ret
1974ENDPROC(aesni_enc)
1975
1976/*
1977 * _aesni_enc1: internal ABI
1978 * input:
1979 * KEYP: key struct pointer
1980 * KLEN: key length
1981 * STATE: initial state (input)
1982 * output:
1983 * STATE: final state (output)
1984 * changed:
1985 * KEY
1986 * TKEYP (T1)
1987 */
1988.align 4
1989_aesni_enc1:
1990 movaps (KEYP), KEY # key
1991 mov KEYP, TKEYP
1992 pxor KEY, STATE # round 0
1993 add $0x30, TKEYP
1994 cmp $24, KLEN
1995 jb .Lenc128
1996 lea 0x20(TKEYP), TKEYP
1997 je .Lenc192
1998 add $0x20, TKEYP
1999 movaps -0x60(TKEYP), KEY
2000 AESENC KEY STATE
2001 movaps -0x50(TKEYP), KEY
2002 AESENC KEY STATE
2003.align 4
2004.Lenc192:
2005 movaps -0x40(TKEYP), KEY
2006 AESENC KEY STATE
2007 movaps -0x30(TKEYP), KEY
2008 AESENC KEY STATE
2009.align 4
2010.Lenc128:
2011 movaps -0x20(TKEYP), KEY
2012 AESENC KEY STATE
2013 movaps -0x10(TKEYP), KEY
2014 AESENC KEY STATE
2015 movaps (TKEYP), KEY
2016 AESENC KEY STATE
2017 movaps 0x10(TKEYP), KEY
2018 AESENC KEY STATE
2019 movaps 0x20(TKEYP), KEY
2020 AESENC KEY STATE
2021 movaps 0x30(TKEYP), KEY
2022 AESENC KEY STATE
2023 movaps 0x40(TKEYP), KEY
2024 AESENC KEY STATE
2025 movaps 0x50(TKEYP), KEY
2026 AESENC KEY STATE
2027 movaps 0x60(TKEYP), KEY
2028 AESENC KEY STATE
2029 movaps 0x70(TKEYP), KEY
2030 AESENCLAST KEY STATE
2031 ret
2032ENDPROC(_aesni_enc1)
2033
2034/*
2035 * _aesni_enc4: internal ABI
2036 * input:
2037 * KEYP: key struct pointer
2038 * KLEN: key length
2039 * STATE1: initial state (input)
2040 * STATE2
2041 * STATE3
2042 * STATE4
2043 * output:
2044 * STATE1: final state (output)
2045 * STATE2
2046 * STATE3
2047 * STATE4
2048 * changed:
2049 * KEY
2050 * TKEYP (T1)
2051 */
2052.align 4
2053_aesni_enc4:
2054 movaps (KEYP), KEY # key
2055 mov KEYP, TKEYP
2056 pxor KEY, STATE1 # round 0
2057 pxor KEY, STATE2
2058 pxor KEY, STATE3
2059 pxor KEY, STATE4
2060 add $0x30, TKEYP
2061 cmp $24, KLEN
2062 jb .L4enc128
2063 lea 0x20(TKEYP), TKEYP
2064 je .L4enc192
2065 add $0x20, TKEYP
2066 movaps -0x60(TKEYP), KEY
2067 AESENC KEY STATE1
2068 AESENC KEY STATE2
2069 AESENC KEY STATE3
2070 AESENC KEY STATE4
2071 movaps -0x50(TKEYP), KEY
2072 AESENC KEY STATE1
2073 AESENC KEY STATE2
2074 AESENC KEY STATE3
2075 AESENC KEY STATE4
2076#.align 4
2077.L4enc192:
2078 movaps -0x40(TKEYP), KEY
2079 AESENC KEY STATE1
2080 AESENC KEY STATE2
2081 AESENC KEY STATE3
2082 AESENC KEY STATE4
2083 movaps -0x30(TKEYP), KEY
2084 AESENC KEY STATE1
2085 AESENC KEY STATE2
2086 AESENC KEY STATE3
2087 AESENC KEY STATE4
2088#.align 4
2089.L4enc128:
2090 movaps -0x20(TKEYP), KEY
2091 AESENC KEY STATE1
2092 AESENC KEY STATE2
2093 AESENC KEY STATE3
2094 AESENC KEY STATE4
2095 movaps -0x10(TKEYP), KEY
2096 AESENC KEY STATE1
2097 AESENC KEY STATE2
2098 AESENC KEY STATE3
2099 AESENC KEY STATE4
2100 movaps (TKEYP), KEY
2101 AESENC KEY STATE1
2102 AESENC KEY STATE2
2103 AESENC KEY STATE3
2104 AESENC KEY STATE4
2105 movaps 0x10(TKEYP), KEY
2106 AESENC KEY STATE1
2107 AESENC KEY STATE2
2108 AESENC KEY STATE3
2109 AESENC KEY STATE4
2110 movaps 0x20(TKEYP), KEY
2111 AESENC KEY STATE1
2112 AESENC KEY STATE2
2113 AESENC KEY STATE3
2114 AESENC KEY STATE4
2115 movaps 0x30(TKEYP), KEY
2116 AESENC KEY STATE1
2117 AESENC KEY STATE2
2118 AESENC KEY STATE3
2119 AESENC KEY STATE4
2120 movaps 0x40(TKEYP), KEY
2121 AESENC KEY STATE1
2122 AESENC KEY STATE2
2123 AESENC KEY STATE3
2124 AESENC KEY STATE4
2125 movaps 0x50(TKEYP), KEY
2126 AESENC KEY STATE1
2127 AESENC KEY STATE2
2128 AESENC KEY STATE3
2129 AESENC KEY STATE4
2130 movaps 0x60(TKEYP), KEY
2131 AESENC KEY STATE1
2132 AESENC KEY STATE2
2133 AESENC KEY STATE3
2134 AESENC KEY STATE4
2135 movaps 0x70(TKEYP), KEY
2136 AESENCLAST KEY STATE1 # last round
2137 AESENCLAST KEY STATE2
2138 AESENCLAST KEY STATE3
2139 AESENCLAST KEY STATE4
2140 ret
2141ENDPROC(_aesni_enc4)
2142
2143/*
2144 * void aesni_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
2145 */
2146ENTRY(aesni_dec)
2147 FRAME_BEGIN
2148#ifndef __x86_64__
2149 pushl KEYP
2150 pushl KLEN
2151 movl (FRAME_OFFSET+12)(%esp), KEYP # ctx
2152 movl (FRAME_OFFSET+16)(%esp), OUTP # dst
2153 movl (FRAME_OFFSET+20)(%esp), INP # src
2154#endif
2155 mov 480(KEYP), KLEN # key length
2156 add $240, KEYP
2157 movups (INP), STATE # input
2158 call _aesni_dec1
2159 movups STATE, (OUTP) #output
2160#ifndef __x86_64__
2161 popl KLEN
2162 popl KEYP
2163#endif
2164 FRAME_END
2165 ret
2166ENDPROC(aesni_dec)
2167
2168/*
2169 * _aesni_dec1: internal ABI
2170 * input:
2171 * KEYP: key struct pointer
2172 * KLEN: key length
2173 * STATE: initial state (input)
2174 * output:
2175 * STATE: final state (output)
2176 * changed:
2177 * KEY
2178 * TKEYP (T1)
2179 */
2180.align 4
2181_aesni_dec1:
2182 movaps (KEYP), KEY # key
2183 mov KEYP, TKEYP
2184 pxor KEY, STATE # round 0
2185 add $0x30, TKEYP
2186 cmp $24, KLEN
2187 jb .Ldec128
2188 lea 0x20(TKEYP), TKEYP
2189 je .Ldec192
2190 add $0x20, TKEYP
2191 movaps -0x60(TKEYP), KEY
2192 AESDEC KEY STATE
2193 movaps -0x50(TKEYP), KEY
2194 AESDEC KEY STATE
2195.align 4
2196.Ldec192:
2197 movaps -0x40(TKEYP), KEY
2198 AESDEC KEY STATE
2199 movaps -0x30(TKEYP), KEY
2200 AESDEC KEY STATE
2201.align 4
2202.Ldec128:
2203 movaps -0x20(TKEYP), KEY
2204 AESDEC KEY STATE
2205 movaps -0x10(TKEYP), KEY
2206 AESDEC KEY STATE
2207 movaps (TKEYP), KEY
2208 AESDEC KEY STATE
2209 movaps 0x10(TKEYP), KEY
2210 AESDEC KEY STATE
2211 movaps 0x20(TKEYP), KEY
2212 AESDEC KEY STATE
2213 movaps 0x30(TKEYP), KEY
2214 AESDEC KEY STATE
2215 movaps 0x40(TKEYP), KEY
2216 AESDEC KEY STATE
2217 movaps 0x50(TKEYP), KEY
2218 AESDEC KEY STATE
2219 movaps 0x60(TKEYP), KEY
2220 AESDEC KEY STATE
2221 movaps 0x70(TKEYP), KEY
2222 AESDECLAST KEY STATE
2223 ret
2224ENDPROC(_aesni_dec1)
2225
2226/*
2227 * _aesni_dec4: internal ABI
2228 * input:
2229 * KEYP: key struct pointer
2230 * KLEN: key length
2231 * STATE1: initial state (input)
2232 * STATE2
2233 * STATE3
2234 * STATE4
2235 * output:
2236 * STATE1: final state (output)
2237 * STATE2
2238 * STATE3
2239 * STATE4
2240 * changed:
2241 * KEY
2242 * TKEYP (T1)
2243 */
2244.align 4
2245_aesni_dec4:
2246 movaps (KEYP), KEY # key
2247 mov KEYP, TKEYP
2248 pxor KEY, STATE1 # round 0
2249 pxor KEY, STATE2
2250 pxor KEY, STATE3
2251 pxor KEY, STATE4
2252 add $0x30, TKEYP
2253 cmp $24, KLEN
2254 jb .L4dec128
2255 lea 0x20(TKEYP), TKEYP
2256 je .L4dec192
2257 add $0x20, TKEYP
2258 movaps -0x60(TKEYP), KEY
2259 AESDEC KEY STATE1
2260 AESDEC KEY STATE2
2261 AESDEC KEY STATE3
2262 AESDEC KEY STATE4
2263 movaps -0x50(TKEYP), KEY
2264 AESDEC KEY STATE1
2265 AESDEC KEY STATE2
2266 AESDEC KEY STATE3
2267 AESDEC KEY STATE4
2268.align 4
2269.L4dec192:
2270 movaps -0x40(TKEYP), KEY
2271 AESDEC KEY STATE1
2272 AESDEC KEY STATE2
2273 AESDEC KEY STATE3
2274 AESDEC KEY STATE4
2275 movaps -0x30(TKEYP), KEY
2276 AESDEC KEY STATE1
2277 AESDEC KEY STATE2
2278 AESDEC KEY STATE3
2279 AESDEC KEY STATE4
2280.align 4
2281.L4dec128:
2282 movaps -0x20(TKEYP), KEY
2283 AESDEC KEY STATE1
2284 AESDEC KEY STATE2
2285 AESDEC KEY STATE3
2286 AESDEC KEY STATE4
2287 movaps -0x10(TKEYP), KEY
2288 AESDEC KEY STATE1
2289 AESDEC KEY STATE2
2290 AESDEC KEY STATE3
2291 AESDEC KEY STATE4
2292 movaps (TKEYP), KEY
2293 AESDEC KEY STATE1
2294 AESDEC KEY STATE2
2295 AESDEC KEY STATE3
2296 AESDEC KEY STATE4
2297 movaps 0x10(TKEYP), KEY
2298 AESDEC KEY STATE1
2299 AESDEC KEY STATE2
2300 AESDEC KEY STATE3
2301 AESDEC KEY STATE4
2302 movaps 0x20(TKEYP), KEY
2303 AESDEC KEY STATE1
2304 AESDEC KEY STATE2
2305 AESDEC KEY STATE3
2306 AESDEC KEY STATE4
2307 movaps 0x30(TKEYP), KEY
2308 AESDEC KEY STATE1
2309 AESDEC KEY STATE2
2310 AESDEC KEY STATE3
2311 AESDEC KEY STATE4
2312 movaps 0x40(TKEYP), KEY
2313 AESDEC KEY STATE1
2314 AESDEC KEY STATE2
2315 AESDEC KEY STATE3
2316 AESDEC KEY STATE4
2317 movaps 0x50(TKEYP), KEY
2318 AESDEC KEY STATE1
2319 AESDEC KEY STATE2
2320 AESDEC KEY STATE3
2321 AESDEC KEY STATE4
2322 movaps 0x60(TKEYP), KEY
2323 AESDEC KEY STATE1
2324 AESDEC KEY STATE2
2325 AESDEC KEY STATE3
2326 AESDEC KEY STATE4
2327 movaps 0x70(TKEYP), KEY
2328 AESDECLAST KEY STATE1 # last round
2329 AESDECLAST KEY STATE2
2330 AESDECLAST KEY STATE3
2331 AESDECLAST KEY STATE4
2332 ret
2333ENDPROC(_aesni_dec4)
2334
2335/*
2336 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2337 * size_t len)
2338 */
2339ENTRY(aesni_ecb_enc)
2340 FRAME_BEGIN
2341#ifndef __x86_64__
2342 pushl LEN
2343 pushl KEYP
2344 pushl KLEN
2345 movl (FRAME_OFFSET+16)(%esp), KEYP # ctx
2346 movl (FRAME_OFFSET+20)(%esp), OUTP # dst
2347 movl (FRAME_OFFSET+24)(%esp), INP # src
2348 movl (FRAME_OFFSET+28)(%esp), LEN # len
2349#endif
2350 test LEN, LEN # check length
2351 jz .Lecb_enc_ret
2352 mov 480(KEYP), KLEN
2353 cmp $16, LEN
2354 jb .Lecb_enc_ret
2355 cmp $64, LEN
2356 jb .Lecb_enc_loop1
2357.align 4
2358.Lecb_enc_loop4:
2359 movups (INP), STATE1
2360 movups 0x10(INP), STATE2
2361 movups 0x20(INP), STATE3
2362 movups 0x30(INP), STATE4
2363 call _aesni_enc4
2364 movups STATE1, (OUTP)
2365 movups STATE2, 0x10(OUTP)
2366 movups STATE3, 0x20(OUTP)
2367 movups STATE4, 0x30(OUTP)
2368 sub $64, LEN
2369 add $64, INP
2370 add $64, OUTP
2371 cmp $64, LEN
2372 jge .Lecb_enc_loop4
2373 cmp $16, LEN
2374 jb .Lecb_enc_ret
2375.align 4
2376.Lecb_enc_loop1:
2377 movups (INP), STATE1
2378 call _aesni_enc1
2379 movups STATE1, (OUTP)
2380 sub $16, LEN
2381 add $16, INP
2382 add $16, OUTP
2383 cmp $16, LEN
2384 jge .Lecb_enc_loop1
2385.Lecb_enc_ret:
2386#ifndef __x86_64__
2387 popl KLEN
2388 popl KEYP
2389 popl LEN
2390#endif
2391 FRAME_END
2392 ret
2393ENDPROC(aesni_ecb_enc)
2394
2395/*
2396 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2397 * size_t len);
2398 */
2399ENTRY(aesni_ecb_dec)
2400 FRAME_BEGIN
2401#ifndef __x86_64__
2402 pushl LEN
2403 pushl KEYP
2404 pushl KLEN
2405 movl (FRAME_OFFSET+16)(%esp), KEYP # ctx
2406 movl (FRAME_OFFSET+20)(%esp), OUTP # dst
2407 movl (FRAME_OFFSET+24)(%esp), INP # src
2408 movl (FRAME_OFFSET+28)(%esp), LEN # len
2409#endif
2410 test LEN, LEN
2411 jz .Lecb_dec_ret
2412 mov 480(KEYP), KLEN
2413 add $240, KEYP
2414 cmp $16, LEN
2415 jb .Lecb_dec_ret
2416 cmp $64, LEN
2417 jb .Lecb_dec_loop1
2418.align 4
2419.Lecb_dec_loop4:
2420 movups (INP), STATE1
2421 movups 0x10(INP), STATE2
2422 movups 0x20(INP), STATE3
2423 movups 0x30(INP), STATE4
2424 call _aesni_dec4
2425 movups STATE1, (OUTP)
2426 movups STATE2, 0x10(OUTP)
2427 movups STATE3, 0x20(OUTP)
2428 movups STATE4, 0x30(OUTP)
2429 sub $64, LEN
2430 add $64, INP
2431 add $64, OUTP
2432 cmp $64, LEN
2433 jge .Lecb_dec_loop4
2434 cmp $16, LEN
2435 jb .Lecb_dec_ret
2436.align 4
2437.Lecb_dec_loop1:
2438 movups (INP), STATE1
2439 call _aesni_dec1
2440 movups STATE1, (OUTP)
2441 sub $16, LEN
2442 add $16, INP
2443 add $16, OUTP
2444 cmp $16, LEN
2445 jge .Lecb_dec_loop1
2446.Lecb_dec_ret:
2447#ifndef __x86_64__
2448 popl KLEN
2449 popl KEYP
2450 popl LEN
2451#endif
2452 FRAME_END
2453 ret
2454ENDPROC(aesni_ecb_dec)
2455
2456/*
2457 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2458 * size_t len, u8 *iv)
2459 */
2460ENTRY(aesni_cbc_enc)
2461 FRAME_BEGIN
2462#ifndef __x86_64__
2463 pushl IVP
2464 pushl LEN
2465 pushl KEYP
2466 pushl KLEN
2467 movl (FRAME_OFFSET+20)(%esp), KEYP # ctx
2468 movl (FRAME_OFFSET+24)(%esp), OUTP # dst
2469 movl (FRAME_OFFSET+28)(%esp), INP # src
2470 movl (FRAME_OFFSET+32)(%esp), LEN # len
2471 movl (FRAME_OFFSET+36)(%esp), IVP # iv
2472#endif
2473 cmp $16, LEN
2474 jb .Lcbc_enc_ret
2475 mov 480(KEYP), KLEN
2476 movups (IVP), STATE # load iv as initial state
2477.align 4
2478.Lcbc_enc_loop:
2479 movups (INP), IN # load input
2480 pxor IN, STATE
2481 call _aesni_enc1
2482 movups STATE, (OUTP) # store output
2483 sub $16, LEN
2484 add $16, INP
2485 add $16, OUTP
2486 cmp $16, LEN
2487 jge .Lcbc_enc_loop
2488 movups STATE, (IVP)
2489.Lcbc_enc_ret:
2490#ifndef __x86_64__
2491 popl KLEN
2492 popl KEYP
2493 popl LEN
2494 popl IVP
2495#endif
2496 FRAME_END
2497 ret
2498ENDPROC(aesni_cbc_enc)
2499
2500/*
2501 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2502 * size_t len, u8 *iv)
2503 */
2504ENTRY(aesni_cbc_dec)
2505 FRAME_BEGIN
2506#ifndef __x86_64__
2507 pushl IVP
2508 pushl LEN
2509 pushl KEYP
2510 pushl KLEN
2511 movl (FRAME_OFFSET+20)(%esp), KEYP # ctx
2512 movl (FRAME_OFFSET+24)(%esp), OUTP # dst
2513 movl (FRAME_OFFSET+28)(%esp), INP # src
2514 movl (FRAME_OFFSET+32)(%esp), LEN # len
2515 movl (FRAME_OFFSET+36)(%esp), IVP # iv
2516#endif
2517 cmp $16, LEN
2518 jb .Lcbc_dec_just_ret
2519 mov 480(KEYP), KLEN
2520 add $240, KEYP
2521 movups (IVP), IV
2522 cmp $64, LEN
2523 jb .Lcbc_dec_loop1
2524.align 4
2525.Lcbc_dec_loop4:
2526 movups (INP), IN1
2527 movaps IN1, STATE1
2528 movups 0x10(INP), IN2
2529 movaps IN2, STATE2
2530#ifdef __x86_64__
2531 movups 0x20(INP), IN3
2532 movaps IN3, STATE3
2533 movups 0x30(INP), IN4
2534 movaps IN4, STATE4
2535#else
2536 movups 0x20(INP), IN1
2537 movaps IN1, STATE3
2538 movups 0x30(INP), IN2
2539 movaps IN2, STATE4
2540#endif
2541 call _aesni_dec4
2542 pxor IV, STATE1
2543#ifdef __x86_64__
2544 pxor IN1, STATE2
2545 pxor IN2, STATE3
2546 pxor IN3, STATE4
2547 movaps IN4, IV
2548#else
2549 pxor IN1, STATE4
2550 movaps IN2, IV
2551 movups (INP), IN1
2552 pxor IN1, STATE2
2553 movups 0x10(INP), IN2
2554 pxor IN2, STATE3
2555#endif
2556 movups STATE1, (OUTP)
2557 movups STATE2, 0x10(OUTP)
2558 movups STATE3, 0x20(OUTP)
2559 movups STATE4, 0x30(OUTP)
2560 sub $64, LEN
2561 add $64, INP
2562 add $64, OUTP
2563 cmp $64, LEN
2564 jge .Lcbc_dec_loop4
2565 cmp $16, LEN
2566 jb .Lcbc_dec_ret
2567.align 4
2568.Lcbc_dec_loop1:
2569 movups (INP), IN
2570 movaps IN, STATE
2571 call _aesni_dec1
2572 pxor IV, STATE
2573 movups STATE, (OUTP)
2574 movaps IN, IV
2575 sub $16, LEN
2576 add $16, INP
2577 add $16, OUTP
2578 cmp $16, LEN
2579 jge .Lcbc_dec_loop1
2580.Lcbc_dec_ret:
2581 movups IV, (IVP)
2582.Lcbc_dec_just_ret:
2583#ifndef __x86_64__
2584 popl KLEN
2585 popl KEYP
2586 popl LEN
2587 popl IVP
2588#endif
2589 FRAME_END
2590 ret
2591ENDPROC(aesni_cbc_dec)
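
/*
 * CBC relations implemented by the two routines above (sketch):
 *
 *	C[0] = IV
 *	C[i] = E_K(P[i] xor C[i-1])	// aesni_cbc_enc
 *	P[i] = D_K(C[i]) xor C[i-1]	// aesni_cbc_dec
 *
 * Encryption is inherently serial; decryption can be done four blocks at
 * a time because each block only depends on the previous ciphertext.
 */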
2592
2593#ifdef __x86_64__
2594.pushsection .rodata
2595.align 16
2596.Lbswap_mask:
2597 .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
2598.popsection
2599
2600/*
2601 * _aesni_inc_init: internal ABI
2602 * setup registers used by _aesni_inc
2603 * input:
2604 * IV
2605 * output:
2606 * CTR: == IV, in little endian
2607 * TCTR_LOW: == lower qword of CTR
2608 * INC: == 1, in little endian
2609 * BSWAP_MASK == endian swapping mask
2610 */
2611.align 4
2612_aesni_inc_init:
2613 movaps .Lbswap_mask, BSWAP_MASK
2614 movaps IV, CTR
2615 PSHUFB_XMM BSWAP_MASK CTR
2616 mov $1, TCTR_LOW
2617 MOVQ_R64_XMM TCTR_LOW INC
2618 MOVQ_R64_XMM CTR TCTR_LOW
2619 ret
2620ENDPROC(_aesni_inc_init)
2621
2622/*
2623 * _aesni_inc: internal ABI
2624 * Increase IV by 1, IV is in big endian
2625 * input:
2626 * IV
2627 * CTR: == IV, in little endian
2628 * TCTR_LOW: == lower qword of CTR
2629 * INC: == 1, in little endian
2630 * BSWAP_MASK == endian swapping mask
2631 * output:
2632 * IV: incremented by 1
2633 * changed:
2634 * CTR: == output IV, in little endian
2635 * TCTR_LOW: == lower qword of CTR
2636 */
2637.align 4
2638_aesni_inc:
2639 paddq INC, CTR
2640 add $1, TCTR_LOW
2641 jnc .Linc_low
2642 pslldq $8, INC
2643 paddq INC, CTR
2644 psrldq $8, INC
2645.Linc_low:
2646 movaps CTR, IV
2647 PSHUFB_XMM BSWAP_MASK IV
2648 ret
2649ENDPROC(_aesni_inc)
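
/*
 * Conceptually, _aesni_inc is a big-endian 128-bit increment of the
 * counter block.  A plain C sketch (illustrative only; ctr128_inc is a
 * hypothetical helper, not a kernel API):
 *
 *	static void ctr128_inc(u8 ctr[16])
 *	{
 *		int i;
 *
 *		for (i = 15; i >= 0; i--)
 *			if (++ctr[i])
 *				break;		// stop once no carry remains
 *	}
 *
 * The assembly keeps the counter byte-swapped in CTR so the low qword can
 * be bumped with a 64-bit add; the high qword is touched only when
 * TCTR_LOW wraps around.
 */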
2650
2651/*
2652 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2653 * size_t len, u8 *iv)
2654 */
2655ENTRY(aesni_ctr_enc)
2656 FRAME_BEGIN
2657 cmp $16, LEN
2658 jb .Lctr_enc_just_ret
2659 mov 480(KEYP), KLEN
2660 movups (IVP), IV
2661 call _aesni_inc_init
2662 cmp $64, LEN
2663 jb .Lctr_enc_loop1
2664.align 4
2665.Lctr_enc_loop4:
2666 movaps IV, STATE1
2667 call _aesni_inc
2668 movups (INP), IN1
2669 movaps IV, STATE2
2670 call _aesni_inc
2671 movups 0x10(INP), IN2
2672 movaps IV, STATE3
2673 call _aesni_inc
2674 movups 0x20(INP), IN3
2675 movaps IV, STATE4
2676 call _aesni_inc
2677 movups 0x30(INP), IN4
2678 call _aesni_enc4
2679 pxor IN1, STATE1
2680 movups STATE1, (OUTP)
2681 pxor IN2, STATE2
2682 movups STATE2, 0x10(OUTP)
2683 pxor IN3, STATE3
2684 movups STATE3, 0x20(OUTP)
2685 pxor IN4, STATE4
2686 movups STATE4, 0x30(OUTP)
2687 sub $64, LEN
2688 add $64, INP
2689 add $64, OUTP
2690 cmp $64, LEN
2691 jge .Lctr_enc_loop4
2692 cmp $16, LEN
2693 jb .Lctr_enc_ret
2694.align 4
2695.Lctr_enc_loop1:
2696 movaps IV, STATE
2697 call _aesni_inc
2698 movups (INP), IN
2699 call _aesni_enc1
2700 pxor IN, STATE
2701 movups STATE, (OUTP)
2702 sub $16, LEN
2703 add $16, INP
2704 add $16, OUTP
2705 cmp $16, LEN
2706 jge .Lctr_enc_loop1
2707.Lctr_enc_ret:
2708 movups IV, (IVP)
2709.Lctr_enc_just_ret:
2710 FRAME_END
2711 ret
2712ENDPROC(aesni_ctr_enc)
2713
2714/*
2715 * _aesni_gf128mul_x_ble: internal ABI
2716 * Multiply in GF(2^128) for XTS IVs
2717 * input:
2718 * IV: current IV
2719 * GF128MUL_MASK == mask with 0x87 and 0x01
2720 * output:
2721 * IV: next IV
2722 * changed:
2723 * CTR: == temporary value
2724 */
2725#define _aesni_gf128mul_x_ble() \
2726 pshufd $0x13, IV, CTR; \
2727 paddq IV, IV; \
2728 psrad $31, CTR; \
2729 pand GF128MUL_MASK, CTR; \
2730 pxor CTR, IV;
2731
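/*
 * C sketch of the tweak update performed by _aesni_gf128mul_x_ble()
 * (illustrative only; gf128mul_x_ble below is a local sketch, with t[0]
 * the low and t[1] the high 64 bits of the little-endian tweak):
 *
 *	static void gf128mul_x_ble(u64 t[2])
 *	{
 *		u64 carry = t[1] >> 63;		// bit 127 of the tweak
 *
 *		t[1] = (t[1] << 1) | (t[0] >> 63);
 *		t[0] = (t[0] << 1) ^ (carry ? 0x87 : 0);
 *	}
 */
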
2732/*
2733 * void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2734 * bool enc, u8 *iv)
2735 */
2736ENTRY(aesni_xts_crypt8)
2737 FRAME_BEGIN
2738 cmpb $0, %cl
2739 movl $0, %ecx
2740 movl $240, %r10d
2741 leaq _aesni_enc4, %r11
2742 leaq _aesni_dec4, %rax
2743 cmovel %r10d, %ecx
2744 cmoveq %rax, %r11
2745
2746 movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
2747 movups (IVP), IV
2748
2749 mov 480(KEYP), KLEN
2750 addq %rcx, KEYP
2751
2752 movdqa IV, STATE1
2753 movdqu 0x00(INP), INC
2754 pxor INC, STATE1
2755 movdqu IV, 0x00(OUTP)
2756
2757 _aesni_gf128mul_x_ble()
2758 movdqa IV, STATE2
2759 movdqu 0x10(INP), INC
2760 pxor INC, STATE2
2761 movdqu IV, 0x10(OUTP)
2762
2763 _aesni_gf128mul_x_ble()
2764 movdqa IV, STATE3
2765 movdqu 0x20(INP), INC
2766 pxor INC, STATE3
2767 movdqu IV, 0x20(OUTP)
2768
2769 _aesni_gf128mul_x_ble()
2770 movdqa IV, STATE4
2771 movdqu 0x30(INP), INC
2772 pxor INC, STATE4
2773 movdqu IV, 0x30(OUTP)
2774
2775 CALL_NOSPEC %r11
2776
2777 movdqu 0x00(OUTP), INC
2778 pxor INC, STATE1
2779 movdqu STATE1, 0x00(OUTP)
2780
2781 _aesni_gf128mul_x_ble()
2782 movdqa IV, STATE1
2783 movdqu 0x40(INP), INC
2784 pxor INC, STATE1
2785 movdqu IV, 0x40(OUTP)
2786
2787 movdqu 0x10(OUTP), INC
2788 pxor INC, STATE2
2789 movdqu STATE2, 0x10(OUTP)
2790
2791 _aesni_gf128mul_x_ble()
2792 movdqa IV, STATE2
2793 movdqu 0x50(INP), INC
2794 pxor INC, STATE2
2795 movdqu IV, 0x50(OUTP)
2796
2797 movdqu 0x20(OUTP), INC
2798 pxor INC, STATE3
2799 movdqu STATE3, 0x20(OUTP)
2800
2801 _aesni_gf128mul_x_ble()
2802 movdqa IV, STATE3
2803 movdqu 0x60(INP), INC
2804 pxor INC, STATE3
2805 movdqu IV, 0x60(OUTP)
2806
2807 movdqu 0x30(OUTP), INC
2808 pxor INC, STATE4
2809 movdqu STATE4, 0x30(OUTP)
2810
2811 _aesni_gf128mul_x_ble()
2812 movdqa IV, STATE4
2813 movdqu 0x70(INP), INC
2814 pxor INC, STATE4
2815 movdqu IV, 0x70(OUTP)
2816
2817 _aesni_gf128mul_x_ble()
2818 movups IV, (IVP)
2819
2820 CALL_NOSPEC %r11
2821
2822 movdqu 0x40(OUTP), INC
2823 pxor INC, STATE1
2824 movdqu STATE1, 0x40(OUTP)
2825
2826 movdqu 0x50(OUTP), INC
2827 pxor INC, STATE2
2828 movdqu STATE2, 0x50(OUTP)
2829
2830 movdqu 0x60(OUTP), INC
2831 pxor INC, STATE3
2832 movdqu STATE3, 0x60(OUTP)
2833
2834 movdqu 0x70(OUTP), INC
2835 pxor INC, STATE4
2836 movdqu STATE4, 0x70(OUTP)
2837
2838 FRAME_END
2839 ret
2840ENDPROC(aesni_xts_crypt8)
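
/*
 * Per-block XTS relation implemented above (sketch; decryption uses D_K1
 * the same way):
 *
 *	C[j] = E_K1(P[j] xor T[j]) xor T[j],	T[j+1] = T[j] * x
 *
 * aesni_xts_crypt8() handles eight consecutive blocks: the tweaks are
 * parked in the output buffer, XORed into the states around the chosen
 * _aesni_enc4/_aesni_dec4 call, and the ninth tweak is written back
 * through IVP for the next call.
 */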
2841
2842#endif