1/*
2 * Implement AES algorithm in Intel AES-NI instructions.
3 *
4 * The white paper of AES-NI instructions can be downloaded from:
5 * http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
6 *
7 * Copyright (C) 2008, Intel Corp.
8 * Author: Huang Ying <ying.huang@intel.com>
9 * Vinodh Gopal <vinodh.gopal@intel.com>
10 * Kahraman Akdemir
11 *
12 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
13 * interface for 64-bit kernels.
14 * Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
15 * Aidan O'Mahony (aidan.o.mahony@intel.com)
16 * Adrian Hoban <adrian.hoban@intel.com>
17 * James Guilford (james.guilford@intel.com)
18 * Gabriele Paoloni <gabriele.paoloni@intel.com>
19 * Tadeusz Struk (tadeusz.struk@intel.com)
20 * Wajdi Feghali (wajdi.k.feghali@intel.com)
21 * Copyright (c) 2010, Intel Corporation.
22 *
23 * Ported x86_64 version to x86:
24 * Author: Mathias Krause <minipli@googlemail.com>
25 *
26 * This program is free software; you can redistribute it and/or modify
27 * it under the terms of the GNU General Public License as published by
28 * the Free Software Foundation; either version 2 of the License, or
29 * (at your option) any later version.
30 */
31
32#include <linux/linkage.h>
33#include <asm/inst.h>
34
35#ifdef __x86_64__
36.data
37POLY: .octa 0xC2000000000000000000000000000001
38TWOONE: .octa 0x00000001000000000000000000000001
39
40# order of these constants should not change.
41# more specifically, ALL_F should follow SHIFT_MASK,
42# and ZERO should follow ALL_F
43
44SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F
45MASK1: .octa 0x0000000000000000ffffffffffffffff
46MASK2: .octa 0xffffffffffffffff0000000000000000
47SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
48ALL_F: .octa 0xffffffffffffffffffffffffffffffff
49ZERO: .octa 0x00000000000000000000000000000000
50ONE: .octa 0x00000000000000000000000000000001
51F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
52dec: .octa 0x1
53enc: .octa 0x2
54
55
56.text
57
58
59#define STACK_OFFSET 8*3
60#define HashKey 16*0 // store HashKey <<1 mod poly here
61#define HashKey_2 16*1 // store HashKey^2 <<1 mod poly here
62#define HashKey_3 16*2 // store HashKey^3 <<1 mod poly here
63#define HashKey_4 16*3 // store HashKey^4 <<1 mod poly here
64#define HashKey_k 16*4 // store XOR of High 64 bits and Low 64
65 // bits of HashKey <<1 mod poly here
66 //(for Karatsuba purposes)
67#define HashKey_2_k 16*5 // store XOR of High 64 bits and Low 64
68 // bits of HashKey^2 <<1 mod poly here
69 // (for Karatsuba purposes)
70#define HashKey_3_k 16*6 // store XOR of High 64 bits and Low 64
71 // bits of HashKey^3 <<1 mod poly here
72 // (for Karatsuba purposes)
73#define HashKey_4_k 16*7 // store XOR of High 64 bits and Low 64
74 // bits of HashKey^4 <<1 mod poly here
75 // (for Karatsuba purposes)
76#define VARIABLE_OFFSET 16*8
77
78#define arg1 rdi
79#define arg2 rsi
80#define arg3 rdx
81#define arg4 rcx
82#define arg5 r8
83#define arg6 r9
84#define arg7 STACK_OFFSET+8(%r14)
85#define arg8 STACK_OFFSET+16(%r14)
86#define arg9 STACK_OFFSET+24(%r14)
87#define arg10 STACK_OFFSET+32(%r14)
88#endif
89
90
91#define STATE1 %xmm0
92#define STATE2 %xmm4
93#define STATE3 %xmm5
94#define STATE4 %xmm6
95#define STATE STATE1
96#define IN1 %xmm1
97#define IN2 %xmm7
98#define IN3 %xmm8
99#define IN4 %xmm9
100#define IN IN1
101#define KEY %xmm2
102#define IV %xmm3
103
104#define BSWAP_MASK %xmm10
105#define CTR %xmm11
106#define INC %xmm12
107
108#ifdef __x86_64__
109#define AREG %rax
110#define KEYP %rdi
111#define OUTP %rsi
112#define UKEYP OUTP
113#define INP %rdx
114#define LEN %rcx
115#define IVP %r8
116#define KLEN %r9d
117#define T1 %r10
118#define TKEYP T1
119#define T2 %r11
120#define TCTR_LOW T2
121#else
122#define AREG %eax
123#define KEYP %edi
124#define OUTP AREG
125#define UKEYP OUTP
126#define INP %edx
127#define LEN %esi
128#define IVP %ebp
129#define KLEN %ebx
130#define T1 %ecx
131#define TKEYP T1
132#endif
133
134
135#ifdef __x86_64__
136/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
137*
138*
139* Input: A and B (128-bits each, bit-reflected)
140* Output: C = A*B*x mod poly, (i.e. >>1 )
141* To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
142* GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
143*
144*/
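/*
* For reference only: a minimal, bit-at-a-time C sketch of the same GF(2^128)
* multiplication, using the shift-and-xor method from the GCM specification
* on the conventional MSB-first byte order (u8/memcpy as in kernel code; any
* equivalent types work). The macro below computes the equivalent product on
* bit-reflected operands with PCLMULQDQ, a Karatsuba split and a two-phase
* reduction; this sketch is only meant to make the underlying math concrete.
*
*	static void ghash_mul_ref(const u8 x[16], const u8 y[16], u8 z[16])
*	{
*		u8 v[16], acc[16] = { 0 };
*		int i, j, lsb;
*
*		memcpy(v, y, 16);
*		for (i = 0; i < 128; i++) {
*			if (x[i / 8] & (0x80 >> (i % 8)))	// bit i of X set
*				for (j = 0; j < 16; j++)
*					acc[j] ^= v[j];		// acc ^= V
*			lsb = v[15] & 1;			// bit shifted out of V
*			for (j = 15; j > 0; j--)		// V >>= 1
*				v[j] = (v[j] >> 1) | (v[j - 1] << 7);
*			v[0] >>= 1;
*			if (lsb)
*				v[0] ^= 0xe1;	// reduce by x^128+x^7+x^2+x+1
*		}
*		memcpy(z, acc, 16);
*	}
*/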
145.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
146 movdqa \GH, \TMP1
147 pshufd $78, \GH, \TMP2
148 pshufd $78, \HK, \TMP3
149 pxor \GH, \TMP2 # TMP2 = a1+a0
150 pxor \HK, \TMP3 # TMP3 = b1+b0
151 PCLMULQDQ 0x11, \HK, \TMP1 # TMP1 = a1*b1
152 PCLMULQDQ 0x00, \HK, \GH # GH = a0*b0
153 PCLMULQDQ 0x00, \TMP3, \TMP2 # TMP2 = (a0+a1)*(b1+b0)
154 pxor \GH, \TMP2
155	pxor	\TMP1, \TMP2		# TMP2 = (a1*b0)+(a0*b1)
156 movdqa \TMP2, \TMP3
157 pslldq $8, \TMP3 # left shift TMP3 2 DWs
158 psrldq $8, \TMP2 # right shift TMP2 2 DWs
159 pxor \TMP3, \GH
160	pxor	\TMP2, \TMP1		# TMP1:GH holds the result of GH*HK
161
162 # first phase of the reduction
163
164 movdqa \GH, \TMP2
165 movdqa \GH, \TMP3
166 movdqa \GH, \TMP4 # copy GH into TMP2,TMP3 and TMP4
167					# in order to perform
168 # independent shifts
169	pslld	$31, \TMP2		# packed left shift <<31
170	pslld	$30, \TMP3		# packed left shift <<30
171	pslld	$25, \TMP4		# packed left shift <<25
172 pxor \TMP3, \TMP2 # xor the shifted versions
173 pxor \TMP4, \TMP2
174 movdqa \TMP2, \TMP5
175 psrldq $4, \TMP5 # right shift TMP5 1 DW
176 pslldq $12, \TMP2 # left shift TMP2 3 DWs
177 pxor \TMP2, \GH
178
179 # second phase of the reduction
180
181 movdqa \GH,\TMP2 # copy GH into TMP2,TMP3 and TMP4
182					# in order to perform
183 # independent shifts
184 movdqa \GH,\TMP3
185 movdqa \GH,\TMP4
186	psrld	$1,\TMP2		# packed right shift >>1
187	psrld	$2,\TMP3		# packed right shift >>2
188	psrld	$7,\TMP4		# packed right shift >>7
189 pxor \TMP3,\TMP2 # xor the shifted versions
190 pxor \TMP4,\TMP2
191 pxor \TMP5, \TMP2
192 pxor \TMP2, \GH
193	pxor	\TMP1, \GH		# result is in GH
194.endm
195
196/*
197* if a = number of total plaintext bytes
198* b = floor(a/16)
199* num_initial_blocks = b mod 4
200* encrypt the initial num_initial_blocks blocks and apply ghash on
201* the ciphertext
202* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
203* are clobbered
204* %arg1, %arg2, %arg3, %r14 are used as pointers only, not modified
205*/
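/*
* Illustration only (assumed helper, not part of this file): the number of
* initial blocks handled here, and hence which INITIAL_BLOCKS_* call site the
* entry code selects, is simply
*
*	static unsigned int gcm_num_initial_blocks(u64 plaintext_len)
*	{
*		return (plaintext_len / 16) % 4;	// b mod 4, with b = floor(a/16)
*	}
*
* which matches the "and $(3<<4), %r12" dispatch in aesni_gcm_enc/dec below.
*/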
206
207
208.macro INITIAL_BLOCKS_DEC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
209XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
210 mov arg7, %r10 # %r10 = AAD
211 mov arg8, %r12 # %r12 = aadLen
212 mov %r12, %r11
213 pxor %xmm\i, %xmm\i
214_get_AAD_loop\num_initial_blocks\operation:
215 movd (%r10), \TMP1
216 pslldq $12, \TMP1
217 psrldq $4, %xmm\i
218 pxor \TMP1, %xmm\i
219 add $4, %r10
220 sub $4, %r12
221 jne _get_AAD_loop\num_initial_blocks\operation
222 cmp $16, %r11
223 je _get_AAD_loop2_done\num_initial_blocks\operation
224 mov $16, %r12
225_get_AAD_loop2\num_initial_blocks\operation:
226 psrldq $4, %xmm\i
227 sub $4, %r12
228 cmp %r11, %r12
229 jne _get_AAD_loop2\num_initial_blocks\operation
230_get_AAD_loop2_done\num_initial_blocks\operation:
231 movdqa SHUF_MASK(%rip), %xmm14
232 PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
233
234 xor %r11, %r11 # initialise the data pointer offset as zero
235
236 # start AES for num_initial_blocks blocks
237
238 mov %arg5, %rax # %rax = *Y0
239 movdqu (%rax), \XMM0 # XMM0 = Y0
240 movdqa SHUF_MASK(%rip), %xmm14
241 PSHUFB_XMM %xmm14, \XMM0
242
243.if (\i == 5) || (\i == 6) || (\i == 7)
244.irpc index, \i_seq
245 paddd ONE(%rip), \XMM0 # INCR Y0
246 movdqa \XMM0, %xmm\index
247 movdqa SHUF_MASK(%rip), %xmm14
248 PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap
249
250.endr
251.irpc index, \i_seq
252 pxor 16*0(%arg1), %xmm\index
253.endr
254.irpc index, \i_seq
255 movaps 0x10(%rdi), \TMP1
256 AESENC \TMP1, %xmm\index # Round 1
257.endr
258.irpc index, \i_seq
259 movaps 0x20(%arg1), \TMP1
260 AESENC \TMP1, %xmm\index # Round 2
261.endr
262.irpc index, \i_seq
263 movaps 0x30(%arg1), \TMP1
264	AESENC	   \TMP1, %xmm\index	# Round 3
265.endr
266.irpc index, \i_seq
267 movaps 0x40(%arg1), \TMP1
268	AESENC	   \TMP1, %xmm\index	# Round 4
269.endr
270.irpc index, \i_seq
271 movaps 0x50(%arg1), \TMP1
272	AESENC	   \TMP1, %xmm\index	# Round 5
273.endr
274.irpc index, \i_seq
275 movaps 0x60(%arg1), \TMP1
276	AESENC	   \TMP1, %xmm\index	# Round 6
277.endr
278.irpc index, \i_seq
279 movaps 0x70(%arg1), \TMP1
280	AESENC	   \TMP1, %xmm\index	# Round 7
281.endr
282.irpc index, \i_seq
283 movaps 0x80(%arg1), \TMP1
284	AESENC	   \TMP1, %xmm\index	# Round 8
285.endr
286.irpc index, \i_seq
287 movaps 0x90(%arg1), \TMP1
288	AESENC	   \TMP1, %xmm\index	# Round 9
289.endr
290.irpc index, \i_seq
291 movaps 0xa0(%arg1), \TMP1
292 AESENCLAST \TMP1, %xmm\index # Round 10
293.endr
294.irpc index, \i_seq
295 movdqu (%arg3 , %r11, 1), \TMP1
296 pxor \TMP1, %xmm\index
297 movdqu %xmm\index, (%arg2 , %r11, 1)
298 # write back plaintext/ciphertext for num_initial_blocks
299 add $16, %r11
300
301 movdqa \TMP1, %xmm\index
302 movdqa SHUF_MASK(%rip), %xmm14
303 PSHUFB_XMM %xmm14, %xmm\index
304
305 # prepare plaintext/ciphertext for GHASH computation
306.endr
307.endif
308 GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
309 # apply GHASH on num_initial_blocks blocks
310
311.if \i == 5
312 pxor %xmm5, %xmm6
313 GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
314 pxor %xmm6, %xmm7
315 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
316 pxor %xmm7, %xmm8
317 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
318.elseif \i == 6
319 pxor %xmm6, %xmm7
320 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
321 pxor %xmm7, %xmm8
322 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
323.elseif \i == 7
324 pxor %xmm7, %xmm8
325 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
326.endif
327 cmp $64, %r13
328 jl _initial_blocks_done\num_initial_blocks\operation
329 # no need for precomputed values
330/*
331*
332* Precomputations for HashKey done in parallel with the encryption of the first 4 blocks.
333* HashKey_i_k holds the XOR of the low and high 64 bits of HashKey^i (for Karatsuba).
334*/
335 paddd ONE(%rip), \XMM0 # INCR Y0
336 movdqa \XMM0, \XMM1
337 movdqa SHUF_MASK(%rip), %xmm14
338 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
339
340 paddd ONE(%rip), \XMM0 # INCR Y0
341 movdqa \XMM0, \XMM2
342 movdqa SHUF_MASK(%rip), %xmm14
343 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
344
345 paddd ONE(%rip), \XMM0 # INCR Y0
346 movdqa \XMM0, \XMM3
347 movdqa SHUF_MASK(%rip), %xmm14
348 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
349
350 paddd ONE(%rip), \XMM0 # INCR Y0
351 movdqa \XMM0, \XMM4
352 movdqa SHUF_MASK(%rip), %xmm14
353 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
354
355 pxor 16*0(%arg1), \XMM1
356 pxor 16*0(%arg1), \XMM2
357 pxor 16*0(%arg1), \XMM3
358 pxor 16*0(%arg1), \XMM4
359 movdqa \TMP3, \TMP5
360 pshufd $78, \TMP3, \TMP1
361 pxor \TMP3, \TMP1
362 movdqa \TMP1, HashKey_k(%rsp)
363 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
364# TMP5 = HashKey^2<<1 (mod poly)
365 movdqa \TMP5, HashKey_2(%rsp)
366# HashKey_2 = HashKey^2<<1 (mod poly)
367 pshufd $78, \TMP5, \TMP1
368 pxor \TMP5, \TMP1
369 movdqa \TMP1, HashKey_2_k(%rsp)
370.irpc index, 1234 # do 4 rounds
371 movaps 0x10*\index(%arg1), \TMP1
372 AESENC \TMP1, \XMM1
373 AESENC \TMP1, \XMM2
374 AESENC \TMP1, \XMM3
375 AESENC \TMP1, \XMM4
376.endr
377 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
378# TMP5 = HashKey^3<<1 (mod poly)
379 movdqa \TMP5, HashKey_3(%rsp)
380 pshufd $78, \TMP5, \TMP1
381 pxor \TMP5, \TMP1
382 movdqa \TMP1, HashKey_3_k(%rsp)
383.irpc index, 56789 # do next 5 rounds
384 movaps 0x10*\index(%arg1), \TMP1
385 AESENC \TMP1, \XMM1
386 AESENC \TMP1, \XMM2
387 AESENC \TMP1, \XMM3
388 AESENC \TMP1, \XMM4
389.endr
390 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
391# TMP5 = HashKey^4<<1 (mod poly)
392 movdqa \TMP5, HashKey_4(%rsp)
393 pshufd $78, \TMP5, \TMP1
394 pxor \TMP5, \TMP1
395 movdqa \TMP1, HashKey_4_k(%rsp)
396 movaps 0xa0(%arg1), \TMP2
397 AESENCLAST \TMP2, \XMM1
398 AESENCLAST \TMP2, \XMM2
399 AESENCLAST \TMP2, \XMM3
400 AESENCLAST \TMP2, \XMM4
401 movdqu 16*0(%arg3 , %r11 , 1), \TMP1
402 pxor \TMP1, \XMM1
403 movdqu \XMM1, 16*0(%arg2 , %r11 , 1)
404 movdqa \TMP1, \XMM1
405 movdqu 16*1(%arg3 , %r11 , 1), \TMP1
406 pxor \TMP1, \XMM2
407 movdqu \XMM2, 16*1(%arg2 , %r11 , 1)
408 movdqa \TMP1, \XMM2
409 movdqu 16*2(%arg3 , %r11 , 1), \TMP1
410 pxor \TMP1, \XMM3
411 movdqu \XMM3, 16*2(%arg2 , %r11 , 1)
412 movdqa \TMP1, \XMM3
413 movdqu 16*3(%arg3 , %r11 , 1), \TMP1
414 pxor \TMP1, \XMM4
415 movdqu \XMM4, 16*3(%arg2 , %r11 , 1)
416 movdqa \TMP1, \XMM4
417 add $64, %r11
418 movdqa SHUF_MASK(%rip), %xmm14
419 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
420 pxor \XMMDst, \XMM1
421# combine GHASHed value with the corresponding ciphertext
422 movdqa SHUF_MASK(%rip), %xmm14
423 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
424 movdqa SHUF_MASK(%rip), %xmm14
425 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
426 movdqa SHUF_MASK(%rip), %xmm14
427 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
428
429_initial_blocks_done\num_initial_blocks\operation:
430
431.endm
432
433
434/*
435* if a = number of total plaintext bytes
436* b = floor(a/16)
437* num_initial_blocks = b mod 4
438* encrypt the initial num_initial_blocks blocks and apply ghash on
439* the ciphertext
440* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
441* are clobbered
442* %arg1, %arg2, %arg3, %r14 are used as pointers only, not modified
443*/
444
445
446.macro INITIAL_BLOCKS_ENC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
447XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
448 mov arg7, %r10 # %r10 = AAD
449 mov arg8, %r12 # %r12 = aadLen
450 mov %r12, %r11
451 pxor %xmm\i, %xmm\i
452_get_AAD_loop\num_initial_blocks\operation:
453 movd (%r10), \TMP1
454 pslldq $12, \TMP1
455 psrldq $4, %xmm\i
456 pxor \TMP1, %xmm\i
457 add $4, %r10
458 sub $4, %r12
459 jne _get_AAD_loop\num_initial_blocks\operation
460 cmp $16, %r11
461 je _get_AAD_loop2_done\num_initial_blocks\operation
462 mov $16, %r12
463_get_AAD_loop2\num_initial_blocks\operation:
464 psrldq $4, %xmm\i
465 sub $4, %r12
466 cmp %r11, %r12
467 jne _get_AAD_loop2\num_initial_blocks\operation
468_get_AAD_loop2_done\num_initial_blocks\operation:
469 movdqa SHUF_MASK(%rip), %xmm14
470 PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
471
472 xor %r11, %r11 # initialise the data pointer offset as zero
473
474 # start AES for num_initial_blocks blocks
475
476 mov %arg5, %rax # %rax = *Y0
477 movdqu (%rax), \XMM0 # XMM0 = Y0
478 movdqa SHUF_MASK(%rip), %xmm14
479 PSHUFB_XMM %xmm14, \XMM0
480
481.if (\i == 5) || (\i == 6) || (\i == 7)
482.irpc index, \i_seq
483 paddd ONE(%rip), \XMM0 # INCR Y0
484 movdqa \XMM0, %xmm\index
485 movdqa SHUF_MASK(%rip), %xmm14
486 PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap
487
488.endr
489.irpc index, \i_seq
490 pxor 16*0(%arg1), %xmm\index
491.endr
492.irpc index, \i_seq
493 movaps 0x10(%rdi), \TMP1
494 AESENC \TMP1, %xmm\index # Round 1
495.endr
496.irpc index, \i_seq
497 movaps 0x20(%arg1), \TMP1
498 AESENC \TMP1, %xmm\index # Round 2
499.endr
500.irpc index, \i_seq
501 movaps 0x30(%arg1), \TMP1
502	AESENC	   \TMP1, %xmm\index	# Round 3
503.endr
504.irpc index, \i_seq
505 movaps 0x40(%arg1), \TMP1
506	AESENC	   \TMP1, %xmm\index	# Round 4
507.endr
508.irpc index, \i_seq
509 movaps 0x50(%arg1), \TMP1
510	AESENC	   \TMP1, %xmm\index	# Round 5
511.endr
512.irpc index, \i_seq
513 movaps 0x60(%arg1), \TMP1
514	AESENC	   \TMP1, %xmm\index	# Round 6
515.endr
516.irpc index, \i_seq
517 movaps 0x70(%arg1), \TMP1
518	AESENC	   \TMP1, %xmm\index	# Round 7
519.endr
520.irpc index, \i_seq
521 movaps 0x80(%arg1), \TMP1
522	AESENC	   \TMP1, %xmm\index	# Round 8
523.endr
524.irpc index, \i_seq
525 movaps 0x90(%arg1), \TMP1
526	AESENC	   \TMP1, %xmm\index	# Round 9
527.endr
528.irpc index, \i_seq
529 movaps 0xa0(%arg1), \TMP1
530 AESENCLAST \TMP1, %xmm\index # Round 10
531.endr
532.irpc index, \i_seq
533 movdqu (%arg3 , %r11, 1), \TMP1
534 pxor \TMP1, %xmm\index
535 movdqu %xmm\index, (%arg2 , %r11, 1)
536 # write back plaintext/ciphertext for num_initial_blocks
537 add $16, %r11
538
539 movdqa SHUF_MASK(%rip), %xmm14
540 PSHUFB_XMM %xmm14, %xmm\index
541
542 # prepare plaintext/ciphertext for GHASH computation
543.endr
544.endif
545 GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
546 # apply GHASH on num_initial_blocks blocks
547
548.if \i == 5
549 pxor %xmm5, %xmm6
550 GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
551 pxor %xmm6, %xmm7
552 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
553 pxor %xmm7, %xmm8
554 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
555.elseif \i == 6
556 pxor %xmm6, %xmm7
557 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
558 pxor %xmm7, %xmm8
559 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
560.elseif \i == 7
561 pxor %xmm7, %xmm8
562 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
563.endif
564 cmp $64, %r13
565 jl _initial_blocks_done\num_initial_blocks\operation
566 # no need for precomputed values
567/*
568*
569* Precomputations for HashKey done in parallel with the encryption of the first 4 blocks.
570* HashKey_i_k holds the XOR of the low and high 64 bits of HashKey^i (for Karatsuba).
571*/
572 paddd ONE(%rip), \XMM0 # INCR Y0
573 movdqa \XMM0, \XMM1
574 movdqa SHUF_MASK(%rip), %xmm14
575 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
576
577 paddd ONE(%rip), \XMM0 # INCR Y0
578 movdqa \XMM0, \XMM2
579 movdqa SHUF_MASK(%rip), %xmm14
580 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
581
582 paddd ONE(%rip), \XMM0 # INCR Y0
583 movdqa \XMM0, \XMM3
584 movdqa SHUF_MASK(%rip), %xmm14
585 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
586
587 paddd ONE(%rip), \XMM0 # INCR Y0
588 movdqa \XMM0, \XMM4
589 movdqa SHUF_MASK(%rip), %xmm14
590 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
591
592 pxor 16*0(%arg1), \XMM1
593 pxor 16*0(%arg1), \XMM2
594 pxor 16*0(%arg1), \XMM3
595 pxor 16*0(%arg1), \XMM4
596 movdqa \TMP3, \TMP5
597 pshufd $78, \TMP3, \TMP1
598 pxor \TMP3, \TMP1
599 movdqa \TMP1, HashKey_k(%rsp)
600 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
601# TMP5 = HashKey^2<<1 (mod poly)
602 movdqa \TMP5, HashKey_2(%rsp)
603# HashKey_2 = HashKey^2<<1 (mod poly)
604 pshufd $78, \TMP5, \TMP1
605 pxor \TMP5, \TMP1
606 movdqa \TMP1, HashKey_2_k(%rsp)
607.irpc index, 1234 # do 4 rounds
608 movaps 0x10*\index(%arg1), \TMP1
609 AESENC \TMP1, \XMM1
610 AESENC \TMP1, \XMM2
611 AESENC \TMP1, \XMM3
612 AESENC \TMP1, \XMM4
613.endr
614 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
615# TMP5 = HashKey^3<<1 (mod poly)
616 movdqa \TMP5, HashKey_3(%rsp)
617 pshufd $78, \TMP5, \TMP1
618 pxor \TMP5, \TMP1
619 movdqa \TMP1, HashKey_3_k(%rsp)
620.irpc index, 56789 # do next 5 rounds
621 movaps 0x10*\index(%arg1), \TMP1
622 AESENC \TMP1, \XMM1
623 AESENC \TMP1, \XMM2
624 AESENC \TMP1, \XMM3
625 AESENC \TMP1, \XMM4
626.endr
627 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
628# TMP5 = HashKey^4<<1 (mod poly)
629 movdqa \TMP5, HashKey_4(%rsp)
630 pshufd $78, \TMP5, \TMP1
631 pxor \TMP5, \TMP1
632 movdqa \TMP1, HashKey_4_k(%rsp)
633 movaps 0xa0(%arg1), \TMP2
634 AESENCLAST \TMP2, \XMM1
635 AESENCLAST \TMP2, \XMM2
636 AESENCLAST \TMP2, \XMM3
637 AESENCLAST \TMP2, \XMM4
638 movdqu 16*0(%arg3 , %r11 , 1), \TMP1
639 pxor \TMP1, \XMM1
640 movdqu 16*1(%arg3 , %r11 , 1), \TMP1
641 pxor \TMP1, \XMM2
642 movdqu 16*2(%arg3 , %r11 , 1), \TMP1
643 pxor \TMP1, \XMM3
644 movdqu 16*3(%arg3 , %r11 , 1), \TMP1
645 pxor \TMP1, \XMM4
646 movdqu \XMM1, 16*0(%arg2 , %r11 , 1)
647 movdqu \XMM2, 16*1(%arg2 , %r11 , 1)
648 movdqu \XMM3, 16*2(%arg2 , %r11 , 1)
649 movdqu \XMM4, 16*3(%arg2 , %r11 , 1)
650
651 add $64, %r11
652 movdqa SHUF_MASK(%rip), %xmm14
653 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
654 pxor \XMMDst, \XMM1
655# combine GHASHed value with the corresponding ciphertext
656 movdqa SHUF_MASK(%rip), %xmm14
657 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
658 movdqa SHUF_MASK(%rip), %xmm14
659 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
660 movdqa SHUF_MASK(%rip), %xmm14
661 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
662
663_initial_blocks_done\num_initial_blocks\operation:
664
665.endm
666
667/*
668* encrypt 4 blocks at a time
669* ghash the 4 previously encrypted ciphertext blocks
670* arg1, %arg2, %arg3 are used as pointers only, not modified
671* %r11 is the data offset value
672*/
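/*
* The hashing uses the aggregated (4-blocks-at-a-time) reduction from the
* white paper referenced at the top of this file; as an illustration of the
* math, each invocation computes
*
*	X_(i+4) = ((X_i ^ C_(i+1)) * H^4) ^ (C_(i+2) * H^3) ^
*		  (C_(i+3) * H^2) ^ (C_(i+4) * H)
*
* where the XOR of the previous hash value X_i into the first block is done
* by the final "pxor \XMM5, \XMM1" of the previous invocation (or by
* INITIAL_BLOCKS_* on the first pass through the main loop).
*/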
673.macro GHASH_4_ENCRYPT_4_PARALLEL_ENC TMP1 TMP2 TMP3 TMP4 TMP5 \
674TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
675
676 movdqa \XMM1, \XMM5
677 movdqa \XMM2, \XMM6
678 movdqa \XMM3, \XMM7
679 movdqa \XMM4, \XMM8
680
681 movdqa SHUF_MASK(%rip), %xmm15
682 # multiply TMP5 * HashKey using karatsuba
683
684 movdqa \XMM5, \TMP4
685 pshufd $78, \XMM5, \TMP6
686 pxor \XMM5, \TMP6
687 paddd ONE(%rip), \XMM0 # INCR CNT
688 movdqa HashKey_4(%rsp), \TMP5
689 PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1
690 movdqa \XMM0, \XMM1
691 paddd ONE(%rip), \XMM0 # INCR CNT
692 movdqa \XMM0, \XMM2
693 paddd ONE(%rip), \XMM0 # INCR CNT
694 movdqa \XMM0, \XMM3
695 paddd ONE(%rip), \XMM0 # INCR CNT
696 movdqa \XMM0, \XMM4
697 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
698 PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0
699 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
700 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
701 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
702
703 pxor (%arg1), \XMM1
704 pxor (%arg1), \XMM2
705 pxor (%arg1), \XMM3
706 pxor (%arg1), \XMM4
707 movdqa HashKey_4_k(%rsp), \TMP5
708 PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
709 movaps 0x10(%arg1), \TMP1
710 AESENC \TMP1, \XMM1 # Round 1
711 AESENC \TMP1, \XMM2
712 AESENC \TMP1, \XMM3
713 AESENC \TMP1, \XMM4
714 movaps 0x20(%arg1), \TMP1
715 AESENC \TMP1, \XMM1 # Round 2
716 AESENC \TMP1, \XMM2
717 AESENC \TMP1, \XMM3
718 AESENC \TMP1, \XMM4
719 movdqa \XMM6, \TMP1
720 pshufd $78, \XMM6, \TMP2
721 pxor \XMM6, \TMP2
722 movdqa HashKey_3(%rsp), \TMP5
723 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
724 movaps 0x30(%arg1), \TMP3
725 AESENC \TMP3, \XMM1 # Round 3
726 AESENC \TMP3, \XMM2
727 AESENC \TMP3, \XMM3
728 AESENC \TMP3, \XMM4
729 PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0
730 movaps 0x40(%arg1), \TMP3
731 AESENC \TMP3, \XMM1 # Round 4
732 AESENC \TMP3, \XMM2
733 AESENC \TMP3, \XMM3
734 AESENC \TMP3, \XMM4
735 movdqa HashKey_3_k(%rsp), \TMP5
736 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
737 movaps 0x50(%arg1), \TMP3
738 AESENC \TMP3, \XMM1 # Round 5
739 AESENC \TMP3, \XMM2
740 AESENC \TMP3, \XMM3
741 AESENC \TMP3, \XMM4
742 pxor \TMP1, \TMP4
743# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
744 pxor \XMM6, \XMM5
745 pxor \TMP2, \TMP6
746 movdqa \XMM7, \TMP1
747 pshufd $78, \XMM7, \TMP2
748 pxor \XMM7, \TMP2
749 movdqa HashKey_2(%rsp ), \TMP5
750
751 # Multiply TMP5 * HashKey using karatsuba
752
753 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
754 movaps 0x60(%arg1), \TMP3
755 AESENC \TMP3, \XMM1 # Round 6
756 AESENC \TMP3, \XMM2
757 AESENC \TMP3, \XMM3
758 AESENC \TMP3, \XMM4
759 PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0
760 movaps 0x70(%arg1), \TMP3
761 AESENC \TMP3, \XMM1 # Round 7
762 AESENC \TMP3, \XMM2
763 AESENC \TMP3, \XMM3
764 AESENC \TMP3, \XMM4
765 movdqa HashKey_2_k(%rsp), \TMP5
766 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
767 movaps 0x80(%arg1), \TMP3
768 AESENC \TMP3, \XMM1 # Round 8
769 AESENC \TMP3, \XMM2
770 AESENC \TMP3, \XMM3
771 AESENC \TMP3, \XMM4
772 pxor \TMP1, \TMP4
773# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
774 pxor \XMM7, \XMM5
775 pxor \TMP2, \TMP6
776
777 # Multiply XMM8 * HashKey
778 # XMM8 and TMP5 hold the values for the two operands
779
780 movdqa \XMM8, \TMP1
781 pshufd $78, \XMM8, \TMP2
782 pxor \XMM8, \TMP2
783 movdqa HashKey(%rsp), \TMP5
784 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
785 movaps 0x90(%arg1), \TMP3
786 AESENC \TMP3, \XMM1 # Round 9
787 AESENC \TMP3, \XMM2
788 AESENC \TMP3, \XMM3
789 AESENC \TMP3, \XMM4
790 PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0
791 movaps 0xa0(%arg1), \TMP3
792 AESENCLAST \TMP3, \XMM1 # Round 10
793 AESENCLAST \TMP3, \XMM2
794 AESENCLAST \TMP3, \XMM3
795 AESENCLAST \TMP3, \XMM4
796 movdqa HashKey_k(%rsp), \TMP5
797 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
798 movdqu (%arg3,%r11,1), \TMP3
799 pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
800 movdqu 16(%arg3,%r11,1), \TMP3
801 pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK
802 movdqu 32(%arg3,%r11,1), \TMP3
803 pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK
804 movdqu 48(%arg3,%r11,1), \TMP3
805 pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK
806 movdqu \XMM1, (%arg2,%r11,1) # Write to the ciphertext buffer
807 movdqu \XMM2, 16(%arg2,%r11,1) # Write to the ciphertext buffer
808 movdqu \XMM3, 32(%arg2,%r11,1) # Write to the ciphertext buffer
809 movdqu \XMM4, 48(%arg2,%r11,1) # Write to the ciphertext buffer
810 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
811 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
812 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
813 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
814
815 pxor \TMP4, \TMP1
816 pxor \XMM8, \XMM5
817 pxor \TMP6, \TMP2
818 pxor \TMP1, \TMP2
819 pxor \XMM5, \TMP2
820 movdqa \TMP2, \TMP3
821 pslldq $8, \TMP3 # left shift TMP3 2 DWs
822 psrldq $8, \TMP2 # right shift TMP2 2 DWs
823 pxor \TMP3, \XMM5
824 pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5
825
826 # first phase of reduction
827
828 movdqa \XMM5, \TMP2
829 movdqa \XMM5, \TMP3
830 movdqa \XMM5, \TMP4
831# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
832	pslld	$31, \TMP2		# packed left shift << 31
833	pslld	$30, \TMP3		# packed left shift << 30
834	pslld	$25, \TMP4		# packed left shift << 25
835 pxor \TMP3, \TMP2 # xor the shifted versions
836 pxor \TMP4, \TMP2
837 movdqa \TMP2, \TMP5
838 psrldq $4, \TMP5 # right shift T5 1 DW
839 pslldq $12, \TMP2 # left shift T2 3 DWs
840 pxor \TMP2, \XMM5
841
842 # second phase of reduction
843
844 movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
845 movdqa \XMM5,\TMP3
846 movdqa \XMM5,\TMP4
847	psrld	$1, \TMP2		# packed right shift >>1
848	psrld	$2, \TMP3		# packed right shift >>2
849	psrld	$7, \TMP4		# packed right shift >>7
850 pxor \TMP3,\TMP2 # xor the shifted versions
851 pxor \TMP4,\TMP2
852 pxor \TMP5, \TMP2
853 pxor \TMP2, \XMM5
854	pxor	\TMP1, \XMM5		# result is in XMM5
855
856 pxor \XMM5, \XMM1
857.endm
858
859/*
860* decrypt 4 blocks at a time
861* ghash the 4 previously decrypted ciphertext blocks
862* arg1, %arg2, %arg3 are used as pointers only, not modified
863* %r11 is the data offset value
864*/
865.macro GHASH_4_ENCRYPT_4_PARALLEL_DEC TMP1 TMP2 TMP3 TMP4 TMP5 \
866TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
867
868 movdqa \XMM1, \XMM5
869 movdqa \XMM2, \XMM6
870 movdqa \XMM3, \XMM7
871 movdqa \XMM4, \XMM8
872
873 movdqa SHUF_MASK(%rip), %xmm15
874 # multiply TMP5 * HashKey using karatsuba
875
876 movdqa \XMM5, \TMP4
877 pshufd $78, \XMM5, \TMP6
878 pxor \XMM5, \TMP6
879 paddd ONE(%rip), \XMM0 # INCR CNT
880 movdqa HashKey_4(%rsp), \TMP5
881 PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1
882 movdqa \XMM0, \XMM1
883 paddd ONE(%rip), \XMM0 # INCR CNT
884 movdqa \XMM0, \XMM2
885 paddd ONE(%rip), \XMM0 # INCR CNT
886 movdqa \XMM0, \XMM3
887 paddd ONE(%rip), \XMM0 # INCR CNT
888 movdqa \XMM0, \XMM4
889 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
890 PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0
891 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
892 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
893 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
894
895 pxor (%arg1), \XMM1
896 pxor (%arg1), \XMM2
897 pxor (%arg1), \XMM3
898 pxor (%arg1), \XMM4
899 movdqa HashKey_4_k(%rsp), \TMP5
900 PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
901 movaps 0x10(%arg1), \TMP1
902 AESENC \TMP1, \XMM1 # Round 1
903 AESENC \TMP1, \XMM2
904 AESENC \TMP1, \XMM3
905 AESENC \TMP1, \XMM4
906 movaps 0x20(%arg1), \TMP1
907 AESENC \TMP1, \XMM1 # Round 2
908 AESENC \TMP1, \XMM2
909 AESENC \TMP1, \XMM3
910 AESENC \TMP1, \XMM4
911 movdqa \XMM6, \TMP1
912 pshufd $78, \XMM6, \TMP2
913 pxor \XMM6, \TMP2
914 movdqa HashKey_3(%rsp), \TMP5
915 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
916 movaps 0x30(%arg1), \TMP3
917 AESENC \TMP3, \XMM1 # Round 3
918 AESENC \TMP3, \XMM2
919 AESENC \TMP3, \XMM3
920 AESENC \TMP3, \XMM4
921 PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0
922 movaps 0x40(%arg1), \TMP3
923 AESENC \TMP3, \XMM1 # Round 4
924 AESENC \TMP3, \XMM2
925 AESENC \TMP3, \XMM3
926 AESENC \TMP3, \XMM4
927 movdqa HashKey_3_k(%rsp), \TMP5
928 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
929 movaps 0x50(%arg1), \TMP3
930 AESENC \TMP3, \XMM1 # Round 5
931 AESENC \TMP3, \XMM2
932 AESENC \TMP3, \XMM3
933 AESENC \TMP3, \XMM4
934 pxor \TMP1, \TMP4
935# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
936 pxor \XMM6, \XMM5
937 pxor \TMP2, \TMP6
938 movdqa \XMM7, \TMP1
939 pshufd $78, \XMM7, \TMP2
940 pxor \XMM7, \TMP2
941 movdqa HashKey_2(%rsp ), \TMP5
942
943 # Multiply TMP5 * HashKey using karatsuba
944
945 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
946 movaps 0x60(%arg1), \TMP3
947 AESENC \TMP3, \XMM1 # Round 6
948 AESENC \TMP3, \XMM2
949 AESENC \TMP3, \XMM3
950 AESENC \TMP3, \XMM4
951 PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0
952 movaps 0x70(%arg1), \TMP3
953 AESENC \TMP3, \XMM1 # Round 7
954 AESENC \TMP3, \XMM2
955 AESENC \TMP3, \XMM3
956 AESENC \TMP3, \XMM4
957 movdqa HashKey_2_k(%rsp), \TMP5
958 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
959 movaps 0x80(%arg1), \TMP3
960 AESENC \TMP3, \XMM1 # Round 8
961 AESENC \TMP3, \XMM2
962 AESENC \TMP3, \XMM3
963 AESENC \TMP3, \XMM4
964 pxor \TMP1, \TMP4
965# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
966 pxor \XMM7, \XMM5
967 pxor \TMP2, \TMP6
968
969 # Multiply XMM8 * HashKey
970 # XMM8 and TMP5 hold the values for the two operands
971
972 movdqa \XMM8, \TMP1
973 pshufd $78, \XMM8, \TMP2
974 pxor \XMM8, \TMP2
975 movdqa HashKey(%rsp), \TMP5
976 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
977 movaps 0x90(%arg1), \TMP3
978 AESENC \TMP3, \XMM1 # Round 9
979 AESENC \TMP3, \XMM2
980 AESENC \TMP3, \XMM3
981 AESENC \TMP3, \XMM4
982 PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0
983 movaps 0xa0(%arg1), \TMP3
984 AESENCLAST \TMP3, \XMM1 # Round 10
985 AESENCLAST \TMP3, \XMM2
986 AESENCLAST \TMP3, \XMM3
987 AESENCLAST \TMP3, \XMM4
988 movdqa HashKey_k(%rsp), \TMP5
989 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
990 movdqu (%arg3,%r11,1), \TMP3
991 pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
992 movdqu \XMM1, (%arg2,%r11,1) # Write to plaintext buffer
993 movdqa \TMP3, \XMM1
994 movdqu 16(%arg3,%r11,1), \TMP3
995 pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK
996 movdqu \XMM2, 16(%arg2,%r11,1) # Write to plaintext buffer
997 movdqa \TMP3, \XMM2
998 movdqu 32(%arg3,%r11,1), \TMP3
999 pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK
1000 movdqu \XMM3, 32(%arg2,%r11,1) # Write to plaintext buffer
1001 movdqa \TMP3, \XMM3
1002 movdqu 48(%arg3,%r11,1), \TMP3
1003 pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK
1004 movdqu \XMM4, 48(%arg2,%r11,1) # Write to plaintext buffer
1005 movdqa \TMP3, \XMM4
1006 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
1007 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
1008 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
1009 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
1010
1011 pxor \TMP4, \TMP1
1012 pxor \XMM8, \XMM5
1013 pxor \TMP6, \TMP2
1014 pxor \TMP1, \TMP2
1015 pxor \XMM5, \TMP2
1016 movdqa \TMP2, \TMP3
1017 pslldq $8, \TMP3 # left shift TMP3 2 DWs
1018 psrldq $8, \TMP2 # right shift TMP2 2 DWs
1019 pxor \TMP3, \XMM5
1020 pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5
1021
1022 # first phase of reduction
1023
1024 movdqa \XMM5, \TMP2
1025 movdqa \XMM5, \TMP3
1026 movdqa \XMM5, \TMP4
1027# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
1028	pslld	$31, \TMP2		# packed left shift << 31
1029	pslld	$30, \TMP3		# packed left shift << 30
1030	pslld	$25, \TMP4		# packed left shift << 25
1031 pxor \TMP3, \TMP2 # xor the shifted versions
1032 pxor \TMP4, \TMP2
1033 movdqa \TMP2, \TMP5
1034 psrldq $4, \TMP5 # right shift T5 1 DW
1035 pslldq $12, \TMP2 # left shift T2 3 DWs
1036 pxor \TMP2, \XMM5
1037
1038 # second phase of reduction
1039
1040 movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
1041 movdqa \XMM5,\TMP3
1042 movdqa \XMM5,\TMP4
1043	psrld	$1, \TMP2		# packed right shift >>1
1044	psrld	$2, \TMP3		# packed right shift >>2
1045	psrld	$7, \TMP4		# packed right shift >>7
1046 pxor \TMP3,\TMP2 # xor the shifted versions
1047 pxor \TMP4,\TMP2
1048 pxor \TMP5, \TMP2
1049 pxor \TMP2, \XMM5
1050	pxor	\TMP1, \XMM5		# result is in XMM5
1051
1052 pxor \XMM5, \XMM1
1053.endm
1054
1055/* GHASH the last 4 ciphertext blocks. */
1056.macro GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
1057TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
1058
1059	# Multiply XMM1 * HashKey^4 (using Karatsuba)
1060
1061 movdqa \XMM1, \TMP6
1062 pshufd $78, \XMM1, \TMP2
1063 pxor \XMM1, \TMP2
1064 movdqa HashKey_4(%rsp), \TMP5
1065 PCLMULQDQ 0x11, \TMP5, \TMP6 # TMP6 = a1*b1
1066 PCLMULQDQ 0x00, \TMP5, \XMM1 # XMM1 = a0*b0
1067 movdqa HashKey_4_k(%rsp), \TMP4
1068 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1069 movdqa \XMM1, \XMMDst
1070 movdqa \TMP2, \XMM1 # result in TMP6, XMMDst, XMM1
1071
1072	# Multiply XMM2 * HashKey^3 (using Karatsuba)
1073
1074 movdqa \XMM2, \TMP1
1075 pshufd $78, \XMM2, \TMP2
1076 pxor \XMM2, \TMP2
1077 movdqa HashKey_3(%rsp), \TMP5
1078 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1079 PCLMULQDQ 0x00, \TMP5, \XMM2 # XMM2 = a0*b0
1080 movdqa HashKey_3_k(%rsp), \TMP4
1081 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1082 pxor \TMP1, \TMP6
1083 pxor \XMM2, \XMMDst
1084 pxor \TMP2, \XMM1
1085# results accumulated in TMP6, XMMDst, XMM1
1086
1087	# Multiply XMM3 * HashKey^2 (using Karatsuba)
1088
1089 movdqa \XMM3, \TMP1
1090 pshufd $78, \XMM3, \TMP2
1091 pxor \XMM3, \TMP2
1092 movdqa HashKey_2(%rsp), \TMP5
1093 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1094 PCLMULQDQ 0x00, \TMP5, \XMM3 # XMM3 = a0*b0
1095 movdqa HashKey_2_k(%rsp), \TMP4
1096 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1097 pxor \TMP1, \TMP6
1098 pxor \XMM3, \XMMDst
1099 pxor \TMP2, \XMM1 # results accumulated in TMP6, XMMDst, XMM1
1100
1101	# Multiply XMM4 * HashKey (using Karatsuba)
1102 movdqa \XMM4, \TMP1
1103 pshufd $78, \XMM4, \TMP2
1104 pxor \XMM4, \TMP2
1105 movdqa HashKey(%rsp), \TMP5
1106 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1107 PCLMULQDQ 0x00, \TMP5, \XMM4 # XMM4 = a0*b0
1108 movdqa HashKey_k(%rsp), \TMP4
1109 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1110 pxor \TMP1, \TMP6
1111 pxor \XMM4, \XMMDst
1112 pxor \XMM1, \TMP2
1113 pxor \TMP6, \TMP2
1114 pxor \XMMDst, \TMP2
1115 # middle section of the temp results combined as in karatsuba algorithm
1116 movdqa \TMP2, \TMP4
1117 pslldq $8, \TMP4 # left shift TMP4 2 DWs
1118 psrldq $8, \TMP2 # right shift TMP2 2 DWs
1119 pxor \TMP4, \XMMDst
1120 pxor \TMP2, \TMP6
1121# TMP6:XMMDst holds the result of the accumulated carry-less multiplications
1122 # first phase of the reduction
1123 movdqa \XMMDst, \TMP2
1124 movdqa \XMMDst, \TMP3
1125 movdqa \XMMDst, \TMP4
1126# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
1127	pslld	$31, \TMP2		# packed left shifting << 31
1128	pslld	$30, \TMP3		# packed left shifting << 30
1129	pslld	$25, \TMP4		# packed left shifting << 25
1130 pxor \TMP3, \TMP2 # xor the shifted versions
1131 pxor \TMP4, \TMP2
1132 movdqa \TMP2, \TMP7
1133 psrldq $4, \TMP7 # right shift TMP7 1 DW
1134 pslldq $12, \TMP2 # left shift TMP2 3 DWs
1135 pxor \TMP2, \XMMDst
1136
1137 # second phase of the reduction
1138 movdqa \XMMDst, \TMP2
1139 # make 3 copies of XMMDst for doing 3 shift operations
1140 movdqa \XMMDst, \TMP3
1141 movdqa \XMMDst, \TMP4
1142	psrld	$1, \TMP2		# packed right shift >> 1
1143	psrld	$2, \TMP3		# packed right shift >> 2
1144	psrld	$7, \TMP4		# packed right shift >> 7
1145 pxor \TMP3, \TMP2 # xor the shifted versions
1146 pxor \TMP4, \TMP2
1147 pxor \TMP7, \TMP2
1148 pxor \TMP2, \XMMDst
1149 pxor \TMP6, \XMMDst # reduced result is in XMMDst
1150.endm
1151
1152/* Encryption of a single block */
1153.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1
1154
1155 pxor (%arg1), \XMM0
1156 movaps 16(%arg1), \TMP1
1157 AESENC \TMP1, \XMM0
1158 movaps 32(%arg1), \TMP1
1159 AESENC \TMP1, \XMM0
1160 movaps 48(%arg1), \TMP1
1161 AESENC \TMP1, \XMM0
1162 movaps 64(%arg1), \TMP1
1163 AESENC \TMP1, \XMM0
1164 movaps 80(%arg1), \TMP1
1165 AESENC \TMP1, \XMM0
1166 movaps 96(%arg1), \TMP1
1167 AESENC \TMP1, \XMM0
1168 movaps 112(%arg1), \TMP1
1169 AESENC \TMP1, \XMM0
1170 movaps 128(%arg1), \TMP1
1171 AESENC \TMP1, \XMM0
1172 movaps 144(%arg1), \TMP1
1173 AESENC \TMP1, \XMM0
1174 movaps 160(%arg1), \TMP1
1175 AESENCLAST \TMP1, \XMM0
1176.endm
1177
1178
1179/*****************************************************************************
1180* void aesni_gcm_dec(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1181* u8 *out, // Plaintext output. Encrypt in-place is allowed.
1182* const u8 *in, // Ciphertext input
1183* u64 plaintext_len, // Length of data in bytes for decryption.
1184* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
1185* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1186* // concatenated with 0x00000001. 16-byte aligned pointer.
1187* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
1188* const u8 *aad, // Additional Authentication Data (AAD)
1189* u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1190* u8 *auth_tag, // Authenticated Tag output. The driver will compare this to the
1191* // given authentication tag and only return the plaintext if they match.
1192* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
1193* // (most likely), 12 or 8.
1194*
1195* Assumptions:
1196*
1197* keys:
1198* keys are pre-expanded and aligned to 16 bytes. we are using the first
1199* set of 11 keys in the data structure void *aes_ctx
1200*
1201* iv:
1202* 0 1 2 3
1203* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1204* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1205* | Salt (From the SA) |
1206* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1207* | Initialization Vector |
1208* | (This is the sequence number from IPSec header) |
1209* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1210* | 0x1 |
1211* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1212*
1213*
1214*
1215* AAD:
1216* AAD padded to 128 bits with 0
1217* for example, assume AAD is a u32 vector
1218*
1219* if AAD is 8 bytes:
1220* AAD[3] = {A0, A1};
1221* padded AAD in xmm register = {A1 A0 0 0}
1222*
1223* 0 1 2 3
1224* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1225* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1226* | SPI (A1) |
1227* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1228* | 32-bit Sequence Number (A0) |
1229* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1230* | 0x0 |
1231* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1232*
1233* AAD Format with 32-bit Sequence Number
1234*
1235* if AAD is 12 bytes:
1236* AAD[3] = {A0, A1, A2};
1237* padded AAD in xmm register = {A2 A1 A0 0}
1238*
1239* 0 1 2 3
1240* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1241* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1244* | SPI (A2) |
1245* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1246* | 64-bit Extended Sequence Number {A1,A0} |
1247* | |
1248* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1249* | 0x0 |
1250* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1251*
1252* AAD Format with 64-bit Extended Sequence Number
1253*
1254* aadLen:
1255* from the definition of the spec, aadLen can only be 8 or 12 bytes.
1256* The code supports 16 too but for other sizes, the code will fail.
1257*
1258* TLen:
1259* from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
1260* For other sizes, the code will fail.
1261*
1262* poly = x^128 + x^127 + x^126 + x^121 + 1
1263*
1264*****************************************************************************/
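/*
* Hedged caller sketch (illustration only; every name other than
* aesni_gcm_dec is hypothetical). It shows how the pre-counter block j0
* described above is laid out before the call: 4-byte salt || 8-byte IV ||
* 0x00000001 in a 16-byte aligned buffer, with hash_subkey = E_K(0^128).
*
*	u8 j0[16] __aligned(16);
*	u8 tag[16];
*
*	memcpy(j0, salt, 4);		// salt from the Security Association
*	memcpy(j0 + 4, esp_iv, 8);	// IV from the ESP payload
*	j0[12] = 0; j0[13] = 0; j0[14] = 0; j0[15] = 1;
*
*	aesni_gcm_dec(aes_ctx, plaintext, ciphertext, ciphertext_len,
*		      j0, hash_subkey, aad, aad_len, tag, 16);
*	// the caller then compares 'tag' with the ICV received on the wire
*/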
1265
1266ENTRY(aesni_gcm_dec)
1267 push %r12
1268 push %r13
1269 push %r14
1270 mov %rsp, %r14
1271/*
1272* states of %xmm registers %xmm6:%xmm15 not saved
1273* all %xmm registers are clobbered
1274*/
1275 sub $VARIABLE_OFFSET, %rsp
1276 and $~63, %rsp # align rsp to 64 bytes
1277 mov %arg6, %r12
1278 movdqu (%r12), %xmm13 # %xmm13 = HashKey
1279 movdqa SHUF_MASK(%rip), %xmm2
1280 PSHUFB_XMM %xmm2, %xmm13
1281
1282
1283# Precompute HashKey<<1 (mod poly) from the hash key (required for GHASH)
1284
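# Hedged C sketch of what the shift-and-reduce sequence below computes,
# treating the byte-swapped subkey as a 128-bit value hi:lo (helper name is
# hypothetical, for illustration only):
#
#	static void hashkey_shl1_modpoly(u64 *hi, u64 *lo)
#	{
#		u64 carry = *hi >> 63;			/* bit shifted out */
#		*hi = (*hi << 1) | (*lo >> 63);
#		*lo = *lo << 1;
#		if (carry) {				/* reduce with POLY */
#			*hi ^= 0xC200000000000000ULL;
#			*lo ^= 0x0000000000000001ULL;
#		}
#	}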
1285 movdqa %xmm13, %xmm2
1286 psllq $1, %xmm13
1287 psrlq $63, %xmm2
1288 movdqa %xmm2, %xmm1
1289 pslldq $8, %xmm2
1290 psrldq $8, %xmm1
1291 por %xmm2, %xmm13
1292
1293 # Reduction
1294
1295 pshufd $0x24, %xmm1, %xmm2
1296 pcmpeqd TWOONE(%rip), %xmm2
1297 pand POLY(%rip), %xmm2
1298 pxor %xmm2, %xmm13 # %xmm13 holds the HashKey<<1 (mod poly)
1299
1300
1301 # Decrypt first few blocks
1302
1303 movdqa %xmm13, HashKey(%rsp) # store HashKey<<1 (mod poly)
1304 mov %arg4, %r13 # save the number of bytes of plaintext/ciphertext
1305 and $-16, %r13 # %r13 = %r13 - (%r13 mod 16)
1306 mov %r13, %r12
1307 and $(3<<4), %r12
1308 jz _initial_num_blocks_is_0_decrypt
1309 cmp $(2<<4), %r12
1310 jb _initial_num_blocks_is_1_decrypt
1311 je _initial_num_blocks_is_2_decrypt
1312_initial_num_blocks_is_3_decrypt:
1313 INITIAL_BLOCKS_DEC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1314%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, dec
1315 sub $48, %r13
1316 jmp _initial_blocks_decrypted
1317_initial_num_blocks_is_2_decrypt:
1318 INITIAL_BLOCKS_DEC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1319%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, dec
1320 sub $32, %r13
1321 jmp _initial_blocks_decrypted
1322_initial_num_blocks_is_1_decrypt:
1323 INITIAL_BLOCKS_DEC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1324%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, dec
1325 sub $16, %r13
1326 jmp _initial_blocks_decrypted
1327_initial_num_blocks_is_0_decrypt:
1328 INITIAL_BLOCKS_DEC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1329%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, dec
1330_initial_blocks_decrypted:
1331 cmp $0, %r13
1332 je _zero_cipher_left_decrypt
1333 sub $64, %r13
1334 je _four_cipher_left_decrypt
1335_decrypt_by_4:
1336 GHASH_4_ENCRYPT_4_PARALLEL_DEC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
1337%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, dec
1338 add $64, %r11
1339 sub $64, %r13
1340 jne _decrypt_by_4
1341_four_cipher_left_decrypt:
1342 GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
1343%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
1344_zero_cipher_left_decrypt:
1345 mov %arg4, %r13
1346 and $15, %r13 # %r13 = arg4 (mod 16)
1347 je _multiple_of_16_bytes_decrypt
1348
1349 # Handle the last <16 byte block separately
1350
1351 paddd ONE(%rip), %xmm0 # increment CNT to get Yn
1352 movdqa SHUF_MASK(%rip), %xmm10
1353 PSHUFB_XMM %xmm10, %xmm0
1354
1355 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Yn)
1356 sub $16, %r11
1357 add %r13, %r11
1358 movdqu (%arg3,%r11,1), %xmm1 # receive the last <16 byte block
1359 lea SHIFT_MASK+16(%rip), %r12
1360 sub %r13, %r12
1361# adjust the shuffle mask pointer to be able to shift 16-%r13 bytes
1362# (%r13 is the number of bytes in plaintext mod 16)
1363 movdqu (%r12), %xmm2 # get the appropriate shuffle mask
1364	PSHUFB_XMM %xmm2, %xmm1		# right shift 16-%r13 bytes
1365
1366 movdqa %xmm1, %xmm2
1367 pxor %xmm1, %xmm0 # Ciphertext XOR E(K, Yn)
1368 movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
1369 # get the appropriate mask to mask out top 16-%r13 bytes of %xmm0
1370 pand %xmm1, %xmm0 # mask out top 16-%r13 bytes of %xmm0
1371 pand %xmm1, %xmm2
1372 movdqa SHUF_MASK(%rip), %xmm10
1373 PSHUFB_XMM %xmm10 ,%xmm2
1374
1375 pxor %xmm2, %xmm8
1376 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1377 # GHASH computation for the last <16 byte block
1378 sub %r13, %r11
1379 add $16, %r11
1380
1381 # output %r13 bytes
1382 MOVQ_R64_XMM %xmm0, %rax
1383 cmp $8, %r13
1384 jle _less_than_8_bytes_left_decrypt
1385 mov %rax, (%arg2 , %r11, 1)
1386 add $8, %r11
1387 psrldq $8, %xmm0
1388 MOVQ_R64_XMM %xmm0, %rax
1389 sub $8, %r13
1390_less_than_8_bytes_left_decrypt:
1391 mov %al, (%arg2, %r11, 1)
1392 add $1, %r11
1393 shr $8, %rax
1394 sub $1, %r13
1395 jne _less_than_8_bytes_left_decrypt
1396_multiple_of_16_bytes_decrypt:
1397	mov	arg8, %r12		  # %r12 = aadLen (number of bytes)
1398 shl $3, %r12 # convert into number of bits
1399 movd %r12d, %xmm15 # len(A) in %xmm15
1400	shl	$3, %arg4		  # len(C) in bits (*8)
1401 MOVQ_R64_XMM %arg4, %xmm1
1402 pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000
1403 pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C)
1404 pxor %xmm15, %xmm8
1405 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1406 # final GHASH computation
1407 movdqa SHUF_MASK(%rip), %xmm10
1408 PSHUFB_XMM %xmm10, %xmm8
1409
1410 mov %arg5, %rax # %rax = *Y0
1411 movdqu (%rax), %xmm0 # %xmm0 = Y0
1412 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Y0)
1413 pxor %xmm8, %xmm0
1414_return_T_decrypt:
1415 mov arg9, %r10 # %r10 = authTag
1416 mov arg10, %r11 # %r11 = auth_tag_len
1417 cmp $16, %r11
1418 je _T_16_decrypt
1419 cmp $12, %r11
1420 je _T_12_decrypt
1421_T_8_decrypt:
1422 MOVQ_R64_XMM %xmm0, %rax
1423 mov %rax, (%r10)
1424 jmp _return_T_done_decrypt
1425_T_12_decrypt:
1426 MOVQ_R64_XMM %xmm0, %rax
1427 mov %rax, (%r10)
1428 psrldq $8, %xmm0
1429 movd %xmm0, %eax
1430 mov %eax, 8(%r10)
1431 jmp _return_T_done_decrypt
1432_T_16_decrypt:
1433 movdqu %xmm0, (%r10)
1434_return_T_done_decrypt:
1435 mov %r14, %rsp
1436 pop %r14
1437 pop %r13
1438 pop %r12
1439 ret
1440
1441
1442/*****************************************************************************
1443* void aesni_gcm_enc(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1444* u8 *out, // Ciphertext output. Encrypt in-place is allowed.
1445* const u8 *in, // Plaintext input
1446* u64 plaintext_len, // Length of data in bytes for encryption.
1447* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
1448* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1449* // concatenated with 0x00000001. 16-byte aligned pointer.
1450* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
1451* const u8 *aad, // Additional Authentication Data (AAD)
1452* u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1453* u8 *auth_tag, // Authenticated Tag output.
1454* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
1455* // 12 or 8.
1456*
1457* Assumptions:
1458*
1459* keys:
1460* keys are pre-expanded and aligned to 16 bytes. we are using the
1461* first set of 11 keys in the data structure void *aes_ctx
1462*
1463*
1464* iv:
1465* 0 1 2 3
1466* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1467* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1468* | Salt (From the SA) |
1469* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1470* | Initialization Vector |
1471* | (This is the sequence number from IPSec header) |
1472* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1473* | 0x1 |
1474* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1475*
1476*
1477*
1478* AAD:
1479* AAD padded to 128 bits with 0
1480* for example, assume AAD is a u32 vector
1481*
1482* if AAD is 8 bytes:
1483* AAD[3] = {A0, A1};
1484* padded AAD in xmm register = {A1 A0 0 0}
1485*
1486* 0 1 2 3
1487* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1488* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1489* | SPI (A1) |
1490* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1491* | 32-bit Sequence Number (A0) |
1492* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1493* | 0x0 |
1494* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1495*
1496* AAD Format with 32-bit Sequence Number
1497*
1498* if AAD is 12 bytes:
1499* AAD[3] = {A0, A1, A2};
1500* padded AAD in xmm register = {A2 A1 A0 0}
1501*
1502* 0 1 2 3
1503* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1504* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1505* | SPI (A2) |
1506* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1507* | 64-bit Extended Sequence Number {A1,A0} |
1508* | |
1509* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1510* | 0x0 |
1511* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1512*
1513* AAD Format with 64-bit Extended Sequence Number
1514*
1515* aadLen:
1516* from the definition of the spec, aadLen can only be 8 or 12 bytes.
1517* The code supports 16 too but for other sizes, the code will fail.
1518*
1519* TLen:
1520* from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
1521* For other sizes, the code will fail.
1522*
1523* poly = x^128 + x^127 + x^126 + x^121 + 1
1524***************************************************************************/
1525ENTRY(aesni_gcm_enc)
1526 push %r12
1527 push %r13
1528 push %r14
1529 mov %rsp, %r14
1530#
1531# states of %xmm registers %xmm6:%xmm15 not saved
1532# all %xmm registers are clobbered
1533#
1534 sub $VARIABLE_OFFSET, %rsp
1535 and $~63, %rsp
1536 mov %arg6, %r12
1537 movdqu (%r12), %xmm13
1538 movdqa SHUF_MASK(%rip), %xmm2
1539 PSHUFB_XMM %xmm2, %xmm13
1540
1541
1542# precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
1543
1544 movdqa %xmm13, %xmm2
1545 psllq $1, %xmm13
1546 psrlq $63, %xmm2
1547 movdqa %xmm2, %xmm1
1548 pslldq $8, %xmm2
1549 psrldq $8, %xmm1
1550 por %xmm2, %xmm13
1551
1552 # reduce HashKey<<1
1553
1554 pshufd $0x24, %xmm1, %xmm2
1555 pcmpeqd TWOONE(%rip), %xmm2
1556 pand POLY(%rip), %xmm2
1557 pxor %xmm2, %xmm13
1558	movdqa	%xmm13, HashKey(%rsp)	# store HashKey<<1 (mod poly)
1559	mov	%arg4, %r13		# save the number of bytes of plaintext/ciphertext
1560 and $-16, %r13
1561 mov %r13, %r12
1562
1563 # Encrypt first few blocks
1564
1565 and $(3<<4), %r12
1566 jz _initial_num_blocks_is_0_encrypt
1567 cmp $(2<<4), %r12
1568 jb _initial_num_blocks_is_1_encrypt
1569 je _initial_num_blocks_is_2_encrypt
1570_initial_num_blocks_is_3_encrypt:
1571 INITIAL_BLOCKS_ENC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1572%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, enc
1573 sub $48, %r13
1574 jmp _initial_blocks_encrypted
1575_initial_num_blocks_is_2_encrypt:
1576 INITIAL_BLOCKS_ENC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1577%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, enc
1578 sub $32, %r13
1579 jmp _initial_blocks_encrypted
1580_initial_num_blocks_is_1_encrypt:
1581 INITIAL_BLOCKS_ENC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1582%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, enc
1583 sub $16, %r13
1584 jmp _initial_blocks_encrypted
1585_initial_num_blocks_is_0_encrypt:
1586 INITIAL_BLOCKS_ENC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1587%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, enc
1588_initial_blocks_encrypted:
1589
1590 # Main loop - Encrypt remaining blocks
1591
1592 cmp $0, %r13
1593 je _zero_cipher_left_encrypt
1594 sub $64, %r13
1595 je _four_cipher_left_encrypt
1596_encrypt_by_4_encrypt:
1597 GHASH_4_ENCRYPT_4_PARALLEL_ENC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
1598%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, enc
1599 add $64, %r11
1600 sub $64, %r13
1601 jne _encrypt_by_4_encrypt
1602_four_cipher_left_encrypt:
1603 GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
1604%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
1605_zero_cipher_left_encrypt:
1606 mov %arg4, %r13
1607 and $15, %r13 # %r13 = arg4 (mod 16)
1608 je _multiple_of_16_bytes_encrypt
1609
1610 # Handle the last <16 Byte block separately
1611 paddd ONE(%rip), %xmm0 # INCR CNT to get Yn
1612 movdqa SHUF_MASK(%rip), %xmm10
1613 PSHUFB_XMM %xmm10, %xmm0
1614
1615
1616 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # Encrypt(K, Yn)
1617 sub $16, %r11
1618 add %r13, %r11
1619 movdqu (%arg3,%r11,1), %xmm1 # receive the last <16 byte blocks
1620 lea SHIFT_MASK+16(%rip), %r12
1621 sub %r13, %r12
1622 # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
1623 # (%r13 is the number of bytes in plaintext mod 16)
1624 movdqu (%r12), %xmm2 # get the appropriate shuffle mask
1625	PSHUFB_XMM %xmm2, %xmm1		# shift right 16-r13 bytes
1626 pxor %xmm1, %xmm0 # Plaintext XOR Encrypt(K, Yn)
1627 movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
1628 # get the appropriate mask to mask out top 16-r13 bytes of xmm0
1629 pand %xmm1, %xmm0 # mask out top 16-r13 bytes of xmm0
1630 movdqa SHUF_MASK(%rip), %xmm10
1631 PSHUFB_XMM %xmm10,%xmm0
1632
1633 pxor %xmm0, %xmm8
1634 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1635 # GHASH computation for the last <16 byte block
1636 sub %r13, %r11
1637 add $16, %r11
1638
1639 movdqa SHUF_MASK(%rip), %xmm10
1640 PSHUFB_XMM %xmm10, %xmm0
1641
1642 # shuffle xmm0 back to output as ciphertext
1643
1644 # Output %r13 bytes
1645 MOVQ_R64_XMM %xmm0, %rax
1646 cmp $8, %r13
1647 jle _less_than_8_bytes_left_encrypt
1648 mov %rax, (%arg2 , %r11, 1)
1649 add $8, %r11
1650 psrldq $8, %xmm0
1651 MOVQ_R64_XMM %xmm0, %rax
1652 sub $8, %r13
1653_less_than_8_bytes_left_encrypt:
1654 mov %al, (%arg2, %r11, 1)
1655 add $1, %r11
1656 shr $8, %rax
1657 sub $1, %r13
1658 jne _less_than_8_bytes_left_encrypt
1659_multiple_of_16_bytes_encrypt:
1660	mov	arg8, %r12		# %r12 = aadLen (number of bytes)
1661 shl $3, %r12
1662 movd %r12d, %xmm15 # len(A) in %xmm15
1663	shl	$3, %arg4			# len(C) in bits (*8)
1664 MOVQ_R64_XMM %arg4, %xmm1
1665 pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000
1666 pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C)
1667 pxor %xmm15, %xmm8
1668 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1669 # final GHASH computation
1670 movdqa SHUF_MASK(%rip), %xmm10
1671 PSHUFB_XMM %xmm10, %xmm8 # perform a 16 byte swap
1672
1673 mov %arg5, %rax # %rax = *Y0
1674 movdqu (%rax), %xmm0 # %xmm0 = Y0
1675 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm15 # Encrypt(K, Y0)
1676 pxor %xmm8, %xmm0
1677_return_T_encrypt:
1678 mov arg9, %r10 # %r10 = authTag
1679 mov arg10, %r11 # %r11 = auth_tag_len
1680 cmp $16, %r11
1681 je _T_16_encrypt
1682 cmp $12, %r11
1683 je _T_12_encrypt
1684_T_8_encrypt:
1685 MOVQ_R64_XMM %xmm0, %rax
1686 mov %rax, (%r10)
1687 jmp _return_T_done_encrypt
1688_T_12_encrypt:
1689 MOVQ_R64_XMM %xmm0, %rax
1690 mov %rax, (%r10)
1691 psrldq $8, %xmm0
1692 movd %xmm0, %eax
1693 mov %eax, 8(%r10)
1694 jmp _return_T_done_encrypt
1695_T_16_encrypt:
1696 movdqu %xmm0, (%r10)
1697_return_T_done_encrypt:
1698 mov %r14, %rsp
1699 pop %r14
1700 pop %r13
1701 pop %r12
1702 ret
1703
1704#endif
1705
1706
1707_key_expansion_128:
1708_key_expansion_256a:
1709 pshufd $0b11111111, %xmm1, %xmm1
1710 shufps $0b00010000, %xmm0, %xmm4
1711 pxor %xmm4, %xmm0
1712 shufps $0b10001100, %xmm0, %xmm4
1713 pxor %xmm4, %xmm0
1714 pxor %xmm1, %xmm0
1715 movaps %xmm0, (TKEYP)
1716 add $0x10, TKEYP
1717 ret
1718
1719.align 4
1720_key_expansion_192a:
1721 pshufd $0b01010101, %xmm1, %xmm1
1722 shufps $0b00010000, %xmm0, %xmm4
1723 pxor %xmm4, %xmm0
1724 shufps $0b10001100, %xmm0, %xmm4
1725 pxor %xmm4, %xmm0
1726 pxor %xmm1, %xmm0
1727
1728 movaps %xmm2, %xmm5
1729 movaps %xmm2, %xmm6
1730 pslldq $4, %xmm5
1731 pshufd $0b11111111, %xmm0, %xmm3
1732 pxor %xmm3, %xmm2
1733 pxor %xmm5, %xmm2
1734
1735 movaps %xmm0, %xmm1
1736 shufps $0b01000100, %xmm0, %xmm6
1737 movaps %xmm6, (TKEYP)
1738 shufps $0b01001110, %xmm2, %xmm1
1739 movaps %xmm1, 0x10(TKEYP)
1740 add $0x20, TKEYP
1741 ret
1742
1743.align 4
1744_key_expansion_192b:
1745 pshufd $0b01010101, %xmm1, %xmm1
1746 shufps $0b00010000, %xmm0, %xmm4
1747 pxor %xmm4, %xmm0
1748 shufps $0b10001100, %xmm0, %xmm4
1749 pxor %xmm4, %xmm0
1750 pxor %xmm1, %xmm0
1751
1752 movaps %xmm2, %xmm5
1753 pslldq $4, %xmm5
1754 pshufd $0b11111111, %xmm0, %xmm3
1755 pxor %xmm3, %xmm2
1756 pxor %xmm5, %xmm2
1757
1758 movaps %xmm0, (TKEYP)
1759 add $0x10, TKEYP
1760 ret
1761
1762.align 4
1763_key_expansion_256b:
1764 pshufd $0b10101010, %xmm1, %xmm1
1765 shufps $0b00010000, %xmm2, %xmm4
1766 pxor %xmm4, %xmm2
1767 shufps $0b10001100, %xmm2, %xmm4
1768 pxor %xmm4, %xmm2
1769 pxor %xmm1, %xmm2
1770 movaps %xmm2, (TKEYP)
1771 add $0x10, TKEYP
1772 ret
1773
1774/*
1775 * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
1776 * unsigned int key_len)
1777 */
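/*
 * The fixed offsets used below (240 and 480) assume a context laid out
 * like the kernel's struct crypto_aes_ctx: the expanded encryption
 * schedule first, the decryption (inverse) schedule at +240, and the key
 * length in bytes at +480.  A rough C picture of that assumption (field
 * names are illustrative):
 *
 *	#include <stdint.h>
 *
 *	struct aes_ctx_layout {
 *		uint32_t key_enc[60];	// 240 bytes, up to 15 round keys
 *		uint32_t key_dec[60];	// 240 bytes, AESIMC-transformed keys
 *		uint32_t key_length;	// 16, 24 or 32, read via 480(KEYP)
 *	};
 */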
1778ENTRY(aesni_set_key)
1779#ifndef __x86_64__
1780 pushl KEYP
1781 movl 8(%esp), KEYP # ctx
1782 movl 12(%esp), UKEYP # in_key
1783 movl 16(%esp), %edx # key_len
1784#endif
1785 movups (UKEYP), %xmm0 # user key (first 16 bytes)
1786 movaps %xmm0, (KEYP)
1787 lea 0x10(KEYP), TKEYP # key addr
1788 movl %edx, 480(KEYP)
1789 pxor %xmm4, %xmm4 # xmm4 is assumed 0 in _key_expansion_x
1790 cmp $24, %dl
1791 jb .Lenc_key128
1792 je .Lenc_key192
1793 movups 0x10(UKEYP), %xmm2 # other user key
1794 movaps %xmm2, (TKEYP)
1795 add $0x10, TKEYP
1796 AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
1797 call _key_expansion_256a
1798 AESKEYGENASSIST 0x1 %xmm0 %xmm1
1799 call _key_expansion_256b
1800 AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2
1801 call _key_expansion_256a
1802 AESKEYGENASSIST 0x2 %xmm0 %xmm1
1803 call _key_expansion_256b
1804 AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3
1805 call _key_expansion_256a
1806 AESKEYGENASSIST 0x4 %xmm0 %xmm1
1807 call _key_expansion_256b
1808 AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4
1809 call _key_expansion_256a
1810 AESKEYGENASSIST 0x8 %xmm0 %xmm1
1811 call _key_expansion_256b
1812 AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5
1813 call _key_expansion_256a
1814 AESKEYGENASSIST 0x10 %xmm0 %xmm1
1815 call _key_expansion_256b
1816 AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6
1817 call _key_expansion_256a
1818 AESKEYGENASSIST 0x20 %xmm0 %xmm1
1819 call _key_expansion_256b
1820 AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7
1821 call _key_expansion_256a
1822 jmp .Ldec_key
1823.Lenc_key192:
1824 movq 0x10(UKEYP), %xmm2 # other user key
1825 AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
1826 call _key_expansion_192a
1827 AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2
1828 call _key_expansion_192b
1829 AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3
1830 call _key_expansion_192a
1831 AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4
1832 call _key_expansion_192b
1833 AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5
1834 call _key_expansion_192a
1835 AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6
1836 call _key_expansion_192b
1837 AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7
1838 call _key_expansion_192a
1839 AESKEYGENASSIST 0x80 %xmm2 %xmm1 # round 8
1840 call _key_expansion_192b
1841 jmp .Ldec_key
1842.Lenc_key128:
1843 AESKEYGENASSIST 0x1 %xmm0 %xmm1 # round 1
1844 call _key_expansion_128
1845 AESKEYGENASSIST 0x2 %xmm0 %xmm1 # round 2
1846 call _key_expansion_128
1847 AESKEYGENASSIST 0x4 %xmm0 %xmm1 # round 3
1848 call _key_expansion_128
1849 AESKEYGENASSIST 0x8 %xmm0 %xmm1 # round 4
1850 call _key_expansion_128
1851 AESKEYGENASSIST 0x10 %xmm0 %xmm1 # round 5
1852 call _key_expansion_128
1853 AESKEYGENASSIST 0x20 %xmm0 %xmm1 # round 6
1854 call _key_expansion_128
1855 AESKEYGENASSIST 0x40 %xmm0 %xmm1 # round 7
1856 call _key_expansion_128
1857 AESKEYGENASSIST 0x80 %xmm0 %xmm1 # round 8
1858 call _key_expansion_128
1859 AESKEYGENASSIST 0x1b %xmm0 %xmm1 # round 9
1860 call _key_expansion_128
1861 AESKEYGENASSIST 0x36 %xmm0 %xmm1 # round 10
1862 call _key_expansion_128
1863.Ldec_key:
1864 sub $0x10, TKEYP
1865 movaps (KEYP), %xmm0
1866 movaps (TKEYP), %xmm1
1867 movaps %xmm0, 240(TKEYP)
1868 movaps %xmm1, 240(KEYP)
1869 add $0x10, KEYP
1870 lea 240-16(TKEYP), UKEYP
1871.align 4
1872.Ldec_key_loop:
1873 movaps (KEYP), %xmm0
1874 AESIMC %xmm0 %xmm1
1875 movaps %xmm1, (UKEYP)
1876 add $0x10, KEYP
1877 sub $0x10, UKEYP
1878 cmp TKEYP, KEYP
1879 jb .Ldec_key_loop
1880 xor AREG, AREG
1881#ifndef __x86_64__
1882 popl KEYP
1883#endif
1884 ret
1885
1886/*
1887 * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
1888 */
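/*
 * A hedged sketch of the C caller side for the single-block entry points
 * (in the kernel, the C glue code additionally brackets these calls with
 * FPU context save/restore):
 *
 *	struct crypto_aes_ctx ctx;	// 16-byte aligned storage
 *	u8 out[16], in[16], user_key[16];
 *
 *	aesni_set_key(&ctx, user_key, sizeof(user_key));
 *	aesni_enc(&ctx, out, in);	// encrypt one 16-byte block
 */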
1889ENTRY(aesni_enc)
1890#ifndef __x86_64__
1891 pushl KEYP
1892 pushl KLEN
1893 movl 12(%esp), KEYP
1894 movl 16(%esp), OUTP
1895 movl 20(%esp), INP
1896#endif
1897 movl 480(KEYP), KLEN # key length
1898 movups (INP), STATE # input
1899 call _aesni_enc1
1900 movups STATE, (OUTP) # output
1901#ifndef __x86_64__
1902 popl KLEN
1903 popl KEYP
1904#endif
1905 ret
1906
1907/*
1908 * _aesni_enc1: internal ABI
1909 * input:
1910 * KEYP: key struct pointer
1911 *	KLEN:		key length
1912 * STATE: initial state (input)
1913 * output:
1914 *	STATE:		final state (output)
1915 * changed:
1916 * KEY
1917 * TKEYP (T1)
1918 */
1919.align 4
1920_aesni_enc1:
1921 movaps (KEYP), KEY # key
1922 mov KEYP, TKEYP
1923 pxor KEY, STATE # round 0
1924 add $0x30, TKEYP
1925 cmp $24, KLEN
1926 jb .Lenc128
1927 lea 0x20(TKEYP), TKEYP
1928 je .Lenc192
1929 add $0x20, TKEYP
1930 movaps -0x60(TKEYP), KEY
1931 AESENC KEY STATE
1932 movaps -0x50(TKEYP), KEY
1933 AESENC KEY STATE
1934.align 4
1935.Lenc192:
1936 movaps -0x40(TKEYP), KEY
1937 AESENC KEY STATE
1938 movaps -0x30(TKEYP), KEY
1939 AESENC KEY STATE
1940.align 4
1941.Lenc128:
1942 movaps -0x20(TKEYP), KEY
1943 AESENC KEY STATE
1944 movaps -0x10(TKEYP), KEY
1945 AESENC KEY STATE
1946 movaps (TKEYP), KEY
1947 AESENC KEY STATE
1948 movaps 0x10(TKEYP), KEY
1949 AESENC KEY STATE
1950 movaps 0x20(TKEYP), KEY
1951 AESENC KEY STATE
1952 movaps 0x30(TKEYP), KEY
1953 AESENC KEY STATE
1954 movaps 0x40(TKEYP), KEY
1955 AESENC KEY STATE
1956 movaps 0x50(TKEYP), KEY
1957 AESENC KEY STATE
1958 movaps 0x60(TKEYP), KEY
1959 AESENC KEY STATE
1960 movaps 0x70(TKEYP), KEY
1961 AESENCLAST KEY STATE
1962 ret
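/*
 * The dispatch above (and in _aesni_enc4 and the decrypt twins) selects
 * 10, 12 or 14 rounds from the key length in bytes; the negative offsets
 * from TKEYP let one tail handle every key size.  A hedged scalar model,
 * with the per-round helpers left as assumed prototypes standing in for
 * AESENC/AESENCLAST:
 *
 *	#include <stdint.h>
 *
 *	void xor_block(uint8_t state[16], const uint8_t rk[16]);      // assumed
 *	void aes_round(uint8_t state[16], const uint8_t rk[16]);      // assumed: AESENC
 *	void aes_last_round(uint8_t state[16], const uint8_t rk[16]); // assumed: AESENCLAST
 *
 *	static void aes_enc_model(uint8_t state[16],
 *				  const uint8_t *round_keys, int key_len)
 *	{
 *		int rounds = key_len / 4 + 6;	// 16 -> 10, 24 -> 12, 32 -> 14
 *		int r;
 *
 *		xor_block(state, round_keys);			// round 0
 *		for (r = 1; r < rounds; r++)
 *			aes_round(state, round_keys + 16 * r);
 *		aes_last_round(state, round_keys + 16 * rounds);
 *	}
 */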
1963
1964/*
1965 * _aesni_enc4: internal ABI
1966 * input:
1967 * KEYP: key struct pointer
1968 *	KLEN:		key length
1969 * STATE1: initial state (input)
1970 * STATE2
1971 * STATE3
1972 * STATE4
1973 * output:
1974 *	STATE1:		final state (output)
1975 * STATE2
1976 * STATE3
1977 * STATE4
1978 * changed:
1979 * KEY
1980 * TKEYP (T1)
1981 */
1982.align 4
1983_aesni_enc4:
1984 movaps (KEYP), KEY # key
1985 mov KEYP, TKEYP
1986 pxor KEY, STATE1 # round 0
1987 pxor KEY, STATE2
1988 pxor KEY, STATE3
1989 pxor KEY, STATE4
1990 add $0x30, TKEYP
1991 cmp $24, KLEN
1992 jb .L4enc128
1993 lea 0x20(TKEYP), TKEYP
1994 je .L4enc192
1995 add $0x20, TKEYP
1996 movaps -0x60(TKEYP), KEY
1997 AESENC KEY STATE1
1998 AESENC KEY STATE2
1999 AESENC KEY STATE3
2000 AESENC KEY STATE4
2001 movaps -0x50(TKEYP), KEY
2002 AESENC KEY STATE1
2003 AESENC KEY STATE2
2004 AESENC KEY STATE3
2005 AESENC KEY STATE4
2006#.align 4
2007.L4enc192:
2008 movaps -0x40(TKEYP), KEY
2009 AESENC KEY STATE1
2010 AESENC KEY STATE2
2011 AESENC KEY STATE3
2012 AESENC KEY STATE4
2013 movaps -0x30(TKEYP), KEY
2014 AESENC KEY STATE1
2015 AESENC KEY STATE2
2016 AESENC KEY STATE3
2017 AESENC KEY STATE4
2018#.align 4
2019.L4enc128:
2020 movaps -0x20(TKEYP), KEY
2021 AESENC KEY STATE1
2022 AESENC KEY STATE2
2023 AESENC KEY STATE3
2024 AESENC KEY STATE4
2025 movaps -0x10(TKEYP), KEY
2026 AESENC KEY STATE1
2027 AESENC KEY STATE2
2028 AESENC KEY STATE3
2029 AESENC KEY STATE4
2030 movaps (TKEYP), KEY
2031 AESENC KEY STATE1
2032 AESENC KEY STATE2
2033 AESENC KEY STATE3
2034 AESENC KEY STATE4
2035 movaps 0x10(TKEYP), KEY
2036 AESENC KEY STATE1
2037 AESENC KEY STATE2
2038 AESENC KEY STATE3
2039 AESENC KEY STATE4
2040 movaps 0x20(TKEYP), KEY
2041 AESENC KEY STATE1
2042 AESENC KEY STATE2
2043 AESENC KEY STATE3
2044 AESENC KEY STATE4
2045 movaps 0x30(TKEYP), KEY
2046 AESENC KEY STATE1
2047 AESENC KEY STATE2
2048 AESENC KEY STATE3
2049 AESENC KEY STATE4
2050 movaps 0x40(TKEYP), KEY
2051 AESENC KEY STATE1
2052 AESENC KEY STATE2
2053 AESENC KEY STATE3
2054 AESENC KEY STATE4
2055 movaps 0x50(TKEYP), KEY
2056 AESENC KEY STATE1
2057 AESENC KEY STATE2
2058 AESENC KEY STATE3
2059 AESENC KEY STATE4
2060 movaps 0x60(TKEYP), KEY
2061 AESENC KEY STATE1
2062 AESENC KEY STATE2
2063 AESENC KEY STATE3
2064 AESENC KEY STATE4
2065 movaps 0x70(TKEYP), KEY
2066 AESENCLAST KEY STATE1 # last round
2067 AESENCLAST KEY STATE2
2068 AESENCLAST KEY STATE3
2069 AESENCLAST KEY STATE4
2070 ret
2071
2072/*
2073 * void aesni_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
2074 */
2075ENTRY(aesni_dec)
2076#ifndef __x86_64__
2077 pushl KEYP
2078 pushl KLEN
2079 movl 12(%esp), KEYP
2080 movl 16(%esp), OUTP
2081 movl 20(%esp), INP
2082#endif
2083 mov 480(KEYP), KLEN # key length
2084 add $240, KEYP
2085 movups (INP), STATE # input
2086 call _aesni_dec1
2087 movups STATE, (OUTP) #output
2088#ifndef __x86_64__
2089 popl KLEN
2090 popl KEYP
2091#endif
2092 ret
2093
2094/*
2095 * _aesni_dec1: internal ABI
2096 * input:
2097 * KEYP: key struct pointer
2098 * KLEN: key length
2099 * STATE: initial state (input)
2100 * output:
2101 *	STATE:		final state (output)
2102 * changed:
2103 * KEY
2104 * TKEYP (T1)
2105 */
2106.align 4
2107_aesni_dec1:
2108 movaps (KEYP), KEY # key
2109 mov KEYP, TKEYP
2110 pxor KEY, STATE # round 0
2111 add $0x30, TKEYP
2112 cmp $24, KLEN
2113 jb .Ldec128
2114 lea 0x20(TKEYP), TKEYP
2115 je .Ldec192
2116 add $0x20, TKEYP
2117 movaps -0x60(TKEYP), KEY
2118 AESDEC KEY STATE
2119 movaps -0x50(TKEYP), KEY
2120 AESDEC KEY STATE
2121.align 4
2122.Ldec192:
2123 movaps -0x40(TKEYP), KEY
2124 AESDEC KEY STATE
2125 movaps -0x30(TKEYP), KEY
2126 AESDEC KEY STATE
2127.align 4
2128.Ldec128:
2129 movaps -0x20(TKEYP), KEY
2130 AESDEC KEY STATE
2131 movaps -0x10(TKEYP), KEY
2132 AESDEC KEY STATE
2133 movaps (TKEYP), KEY
2134 AESDEC KEY STATE
2135 movaps 0x10(TKEYP), KEY
2136 AESDEC KEY STATE
2137 movaps 0x20(TKEYP), KEY
2138 AESDEC KEY STATE
2139 movaps 0x30(TKEYP), KEY
2140 AESDEC KEY STATE
2141 movaps 0x40(TKEYP), KEY
2142 AESDEC KEY STATE
2143 movaps 0x50(TKEYP), KEY
2144 AESDEC KEY STATE
2145 movaps 0x60(TKEYP), KEY
2146 AESDEC KEY STATE
2147 movaps 0x70(TKEYP), KEY
2148 AESDECLAST KEY STATE
2149 ret
2150
2151/*
2152 * _aesni_dec4: internal ABI
2153 * input:
2154 * KEYP: key struct pointer
2155 * KLEN: key length
2156 * STATE1: initial state (input)
2157 * STATE2
2158 * STATE3
2159 * STATE4
2160 * output:
2161 *	STATE1:		final state (output)
2162 * STATE2
2163 * STATE3
2164 * STATE4
2165 * changed:
2166 * KEY
2167 * TKEYP (T1)
2168 */
2169.align 4
2170_aesni_dec4:
2171 movaps (KEYP), KEY # key
2172 mov KEYP, TKEYP
2173 pxor KEY, STATE1 # round 0
2174 pxor KEY, STATE2
2175 pxor KEY, STATE3
2176 pxor KEY, STATE4
2177 add $0x30, TKEYP
2178 cmp $24, KLEN
2179 jb .L4dec128
2180 lea 0x20(TKEYP), TKEYP
2181 je .L4dec192
2182 add $0x20, TKEYP
2183 movaps -0x60(TKEYP), KEY
2184 AESDEC KEY STATE1
2185 AESDEC KEY STATE2
2186 AESDEC KEY STATE3
2187 AESDEC KEY STATE4
2188 movaps -0x50(TKEYP), KEY
2189 AESDEC KEY STATE1
2190 AESDEC KEY STATE2
2191 AESDEC KEY STATE3
2192 AESDEC KEY STATE4
2193.align 4
2194.L4dec192:
2195 movaps -0x40(TKEYP), KEY
2196 AESDEC KEY STATE1
2197 AESDEC KEY STATE2
2198 AESDEC KEY STATE3
2199 AESDEC KEY STATE4
2200 movaps -0x30(TKEYP), KEY
2201 AESDEC KEY STATE1
2202 AESDEC KEY STATE2
2203 AESDEC KEY STATE3
2204 AESDEC KEY STATE4
2205.align 4
2206.L4dec128:
2207 movaps -0x20(TKEYP), KEY
2208 AESDEC KEY STATE1
2209 AESDEC KEY STATE2
2210 AESDEC KEY STATE3
2211 AESDEC KEY STATE4
2212 movaps -0x10(TKEYP), KEY
2213 AESDEC KEY STATE1
2214 AESDEC KEY STATE2
2215 AESDEC KEY STATE3
2216 AESDEC KEY STATE4
2217 movaps (TKEYP), KEY
2218 AESDEC KEY STATE1
2219 AESDEC KEY STATE2
2220 AESDEC KEY STATE3
2221 AESDEC KEY STATE4
2222 movaps 0x10(TKEYP), KEY
2223 AESDEC KEY STATE1
2224 AESDEC KEY STATE2
2225 AESDEC KEY STATE3
2226 AESDEC KEY STATE4
2227 movaps 0x20(TKEYP), KEY
2228 AESDEC KEY STATE1
2229 AESDEC KEY STATE2
2230 AESDEC KEY STATE3
2231 AESDEC KEY STATE4
2232 movaps 0x30(TKEYP), KEY
2233 AESDEC KEY STATE1
2234 AESDEC KEY STATE2
2235 AESDEC KEY STATE3
2236 AESDEC KEY STATE4
2237 movaps 0x40(TKEYP), KEY
2238 AESDEC KEY STATE1
2239 AESDEC KEY STATE2
2240 AESDEC KEY STATE3
2241 AESDEC KEY STATE4
2242 movaps 0x50(TKEYP), KEY
2243 AESDEC KEY STATE1
2244 AESDEC KEY STATE2
2245 AESDEC KEY STATE3
2246 AESDEC KEY STATE4
2247 movaps 0x60(TKEYP), KEY
2248 AESDEC KEY STATE1
2249 AESDEC KEY STATE2
2250 AESDEC KEY STATE3
2251 AESDEC KEY STATE4
2252 movaps 0x70(TKEYP), KEY
2253 AESDECLAST KEY STATE1 # last round
2254 AESDECLAST KEY STATE2
2255 AESDECLAST KEY STATE3
2256 AESDECLAST KEY STATE4
2257 ret
2258
2259/*
2260 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2261 *		      size_t len)
2262 */
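/*
 * The loop below (shared shape with the other ECB/CBC walkers) handles
 * four blocks per iteration while at least 64 bytes remain, then single
 * blocks; anything shorter than 16 bytes is left untouched.  A hedged C
 * model of that walk, with the block helpers passed in as assumed
 * function pointers:
 *
 *	#include <stddef.h>
 *	#include <stdint.h>
 *
 *	static void ecb_walk_model(uint8_t *dst, const uint8_t *src,
 *				   size_t len,
 *				   void (*blk4)(uint8_t *, const uint8_t *),
 *				   void (*blk1)(uint8_t *, const uint8_t *))
 *	{
 *		while (len >= 64) {		// .Lecb_enc_loop4
 *			blk4(dst, src);
 *			src += 64; dst += 64; len -= 64;
 *		}
 *		while (len >= 16) {		// .Lecb_enc_loop1
 *			blk1(dst, src);
 *			src += 16; dst += 16; len -= 16;
 *		}
 *	}
 */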
2263ENTRY(aesni_ecb_enc)
2264#ifndef __x86_64__
2265 pushl LEN
2266 pushl KEYP
2267 pushl KLEN
2268 movl 16(%esp), KEYP
2269 movl 20(%esp), OUTP
2270 movl 24(%esp), INP
2271 movl 28(%esp), LEN
2272#endif
2273 test LEN, LEN # check length
2274 jz .Lecb_enc_ret
2275 mov 480(KEYP), KLEN
2276 cmp $16, LEN
2277 jb .Lecb_enc_ret
2278 cmp $64, LEN
2279 jb .Lecb_enc_loop1
2280.align 4
2281.Lecb_enc_loop4:
2282 movups (INP), STATE1
2283 movups 0x10(INP), STATE2
2284 movups 0x20(INP), STATE3
2285 movups 0x30(INP), STATE4
2286 call _aesni_enc4
2287 movups STATE1, (OUTP)
2288 movups STATE2, 0x10(OUTP)
2289 movups STATE3, 0x20(OUTP)
2290 movups STATE4, 0x30(OUTP)
2291 sub $64, LEN
2292 add $64, INP
2293 add $64, OUTP
2294 cmp $64, LEN
2295 jge .Lecb_enc_loop4
2296 cmp $16, LEN
2297 jb .Lecb_enc_ret
2298.align 4
2299.Lecb_enc_loop1:
2300 movups (INP), STATE1
2301 call _aesni_enc1
2302 movups STATE1, (OUTP)
2303 sub $16, LEN
2304 add $16, INP
2305 add $16, OUTP
2306 cmp $16, LEN
2307 jge .Lecb_enc_loop1
2308.Lecb_enc_ret:
2309#ifndef __x86_64__
2310 popl KLEN
2311 popl KEYP
2312 popl LEN
2313#endif
2314 ret
2315
2316/*
2317 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2318 *		      size_t len);
2319 */
2320ENTRY(aesni_ecb_dec)
2321#ifndef __x86_64__
2322 pushl LEN
2323 pushl KEYP
2324 pushl KLEN
2325 movl 16(%esp), KEYP
2326 movl 20(%esp), OUTP
2327 movl 24(%esp), INP
2328 movl 28(%esp), LEN
2329#endif
2330 test LEN, LEN
2331 jz .Lecb_dec_ret
2332 mov 480(KEYP), KLEN
2333 add $240, KEYP
2334 cmp $16, LEN
2335 jb .Lecb_dec_ret
2336 cmp $64, LEN
2337 jb .Lecb_dec_loop1
2338.align 4
2339.Lecb_dec_loop4:
2340 movups (INP), STATE1
2341 movups 0x10(INP), STATE2
2342 movups 0x20(INP), STATE3
2343 movups 0x30(INP), STATE4
2344 call _aesni_dec4
2345 movups STATE1, (OUTP)
2346 movups STATE2, 0x10(OUTP)
2347 movups STATE3, 0x20(OUTP)
2348 movups STATE4, 0x30(OUTP)
2349 sub $64, LEN
2350 add $64, INP
2351 add $64, OUTP
2352 cmp $64, LEN
2353 jge .Lecb_dec_loop4
2354 cmp $16, LEN
2355 jb .Lecb_dec_ret
2356.align 4
2357.Lecb_dec_loop1:
2358 movups (INP), STATE1
2359 call _aesni_dec1
2360 movups STATE1, (OUTP)
2361 sub $16, LEN
2362 add $16, INP
2363 add $16, OUTP
2364 cmp $16, LEN
2365 jge .Lecb_dec_loop1
2366.Lecb_dec_ret:
2367#ifndef __x86_64__
2368 popl KLEN
2369 popl KEYP
2370 popl LEN
2371#endif
2372 ret
2373
2374/*
2375 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2376 *		      size_t len, u8 *iv)
2377 */
2378ENTRY(aesni_cbc_enc)
2379#ifndef __x86_64__
2380 pushl IVP
2381 pushl LEN
2382 pushl KEYP
2383 pushl KLEN
2384 movl 20(%esp), KEYP
2385 movl 24(%esp), OUTP
2386 movl 28(%esp), INP
2387 movl 32(%esp), LEN
2388 movl 36(%esp), IVP
2389#endif
2390 cmp $16, LEN
2391 jb .Lcbc_enc_ret
2392 mov 480(KEYP), KLEN
2393 movups (IVP), STATE # load iv as initial state
2394.align 4
2395.Lcbc_enc_loop:
2396 movups (INP), IN # load input
2397 pxor IN, STATE
2398 call _aesni_enc1
2399 movups STATE, (OUTP) # store output
2400 sub $16, LEN
2401 add $16, INP
2402 add $16, OUTP
2403 cmp $16, LEN
2404 jge .Lcbc_enc_loop
2405 movups STATE, (IVP)
2406.Lcbc_enc_ret:
2407#ifndef __x86_64__
2408 popl KLEN
2409 popl KEYP
2410 popl LEN
2411 popl IVP
2412#endif
2413 ret
2414
2415/*
2416 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2417 *		      size_t len, u8 *iv)
2418 */
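/*
 * CBC decryption chains as P[i] = D(K, C[i]) XOR C[i-1] with C[-1] = IV,
 * and the IV written back is the last ciphertext block; that is why the
 * code below keeps copies of the input blocks (IN1..IN4) until after the
 * XOR, which also makes in-place decryption safe.  A hedged
 * one-block-at-a-time C model, with the raw block decrypt passed in as
 * an assumed function pointer:
 *
 *	#include <stddef.h>
 *	#include <stdint.h>
 *	#include <string.h>
 *
 *	static void cbc_dec_model(uint8_t *dst, const uint8_t *src,
 *				  size_t len, uint8_t iv[16],
 *				  void (*dec1)(uint8_t *, const uint8_t *))
 *	{
 *		uint8_t prev[16], cur[16];
 *		size_t i;
 *
 *		memcpy(prev, iv, 16);
 *		while (len >= 16) {
 *			memcpy(cur, src, 16);	// keep C[i] for the next block
 *			dec1(dst, src);
 *			for (i = 0; i < 16; i++)
 *				dst[i] ^= prev[i];
 *			memcpy(prev, cur, 16);
 *			src += 16; dst += 16; len -= 16;
 *		}
 *		memcpy(iv, prev, 16);		// chain into the next call
 *	}
 */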
2419ENTRY(aesni_cbc_dec)
2420#ifndef __x86_64__
2421 pushl IVP
2422 pushl LEN
2423 pushl KEYP
2424 pushl KLEN
2425 movl 20(%esp), KEYP
2426 movl 24(%esp), OUTP
2427 movl 28(%esp), INP
2428 movl 32(%esp), LEN
2429 movl 36(%esp), IVP
2430#endif
2431 cmp $16, LEN
2432 jb .Lcbc_dec_just_ret
2433 mov 480(KEYP), KLEN
2434 add $240, KEYP
2435 movups (IVP), IV
2436 cmp $64, LEN
2437 jb .Lcbc_dec_loop1
2438.align 4
2439.Lcbc_dec_loop4:
2440 movups (INP), IN1
2441 movaps IN1, STATE1
2442 movups 0x10(INP), IN2
2443 movaps IN2, STATE2
2444#ifdef __x86_64__
2445 movups 0x20(INP), IN3
2446 movaps IN3, STATE3
2447 movups 0x30(INP), IN4
2448 movaps IN4, STATE4
2449#else
2450 movups 0x20(INP), IN1
2451 movaps IN1, STATE3
2452 movups 0x30(INP), IN2
2453 movaps IN2, STATE4
2454#endif
2455 call _aesni_dec4
2456 pxor IV, STATE1
2457#ifdef __x86_64__
2458 pxor IN1, STATE2
2459 pxor IN2, STATE3
2460 pxor IN3, STATE4
2461 movaps IN4, IV
2462#else
2463 pxor (INP), STATE2
2464 pxor 0x10(INP), STATE3
2465 pxor IN1, STATE4
2466 movaps IN2, IV
2467#endif
2468 movups STATE1, (OUTP)
2469 movups STATE2, 0x10(OUTP)
2470 movups STATE3, 0x20(OUTP)
2471 movups STATE4, 0x30(OUTP)
2472 sub $64, LEN
2473 add $64, INP
2474 add $64, OUTP
2475 cmp $64, LEN
2476 jge .Lcbc_dec_loop4
2477 cmp $16, LEN
2478 jb .Lcbc_dec_ret
2479.align 4
2480.Lcbc_dec_loop1:
2481 movups (INP), IN
2482 movaps IN, STATE
2483 call _aesni_dec1
2484 pxor IV, STATE
2485 movups STATE, (OUTP)
2486 movaps IN, IV
2487 sub $16, LEN
2488 add $16, INP
2489 add $16, OUTP
2490 cmp $16, LEN
2491 jge .Lcbc_dec_loop1
2492.Lcbc_dec_ret:
2493 movups IV, (IVP)
2494.Lcbc_dec_just_ret:
2495#ifndef __x86_64__
2496 popl KLEN
2497 popl KEYP
2498 popl LEN
2499 popl IVP
2500#endif
2501 ret
2502
2503#ifdef __x86_64__
2504.align 16
2505.Lbswap_mask:
2506 .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
2507
2508/*
2509 * _aesni_inc_init: internal ABI
2510 * set up registers used by _aesni_inc
2511 * input:
2512 * IV
2513 * output:
2514 * CTR: == IV, in little endian
2515 * TCTR_LOW: == lower qword of CTR
2516 * INC: == 1, in little endian
2517 * BSWAP_MASK == endian swapping mask
2518 */
2519.align 4
2520_aesni_inc_init:
2521 movaps .Lbswap_mask, BSWAP_MASK
2522 movaps IV, CTR
2523 PSHUFB_XMM BSWAP_MASK CTR
2524 mov $1, TCTR_LOW
2525 MOVQ_R64_XMM TCTR_LOW INC
2526 MOVQ_R64_XMM CTR TCTR_LOW
2527 ret
2528
2529/*
2530 * _aesni_inc: internal ABI
2531 * Increment IV by 1; IV is in big endian
2532 * input:
2533 * IV
2534 * CTR: == IV, in little endian
2535 * TCTR_LOW: == lower qword of CTR
2536 * INC: == 1, in little endian
2537 * BSWAP_MASK == endian swapping mask
2538 * output:
2539 *	IV: incremented by 1
2540 * changed:
2541 * CTR: == output IV, in little endian
2542 * TCTR_LOW: == lower qword of CTR
2543 */
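/*
 * In C terms _aesni_inc is a 128-bit big-endian counter increment: the
 * byte-swapped (little endian) copy in CTR is bumped by one in its low
 * qword, and the carry, detected via TCTR_LOW and the CPU carry flag
 * rather than a compare, is propagated into the high qword before
 * swapping back into IV.  A minimal sketch on a little-endian pair of
 * qwords:
 *
 *	#include <stdint.h>
 *
 *	static void ctr128_inc_model(uint64_t ctr[2])	// ctr[0] = low qword
 *	{
 *		ctr[0]++;
 *		if (ctr[0] == 0)	// low half wrapped: carry into high
 *			ctr[1]++;
 *	}
 */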
2544.align 4
2545_aesni_inc:
2546 paddq INC, CTR
2547 add $1, TCTR_LOW
2548 jnc .Linc_low
2549 pslldq $8, INC
2550 paddq INC, CTR
2551 psrldq $8, INC
2552.Linc_low:
2553 movaps CTR, IV
2554 PSHUFB_XMM BSWAP_MASK IV
2555 ret
2556
2557/*
2558 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2559 *		      size_t len, u8 *iv)
2560 */
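/*
 * CTR mode only ever runs the cipher in the encrypt direction: each
 * output block is input XOR E(K, counter), with the counter incremented
 * per block, so this same entry point serves both encryption and
 * decryption.  A hedged sketch of the per-block combine step:
 *
 *	#include <stdint.h>
 *
 *	static void ctr_xor_model(uint8_t *dst, const uint8_t *src,
 *				  const uint8_t keystream[16])
 *	{
 *		int i;
 *
 *		for (i = 0; i < 16; i++)
 *			dst[i] = src[i] ^ keystream[i];
 *	}
 */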
2561ENTRY(aesni_ctr_enc)
2562 cmp $16, LEN
2563 jb .Lctr_enc_just_ret
2564 mov 480(KEYP), KLEN
2565 movups (IVP), IV
2566 call _aesni_inc_init
2567 cmp $64, LEN
2568 jb .Lctr_enc_loop1
2569.align 4
2570.Lctr_enc_loop4:
2571 movaps IV, STATE1
2572 call _aesni_inc
2573 movups (INP), IN1
2574 movaps IV, STATE2
2575 call _aesni_inc
2576 movups 0x10(INP), IN2
2577 movaps IV, STATE3
2578 call _aesni_inc
2579 movups 0x20(INP), IN3
2580 movaps IV, STATE4
2581 call _aesni_inc
2582 movups 0x30(INP), IN4
2583 call _aesni_enc4
2584 pxor IN1, STATE1
2585 movups STATE1, (OUTP)
2586 pxor IN2, STATE2
2587 movups STATE2, 0x10(OUTP)
2588 pxor IN3, STATE3
2589 movups STATE3, 0x20(OUTP)
2590 pxor IN4, STATE4
2591 movups STATE4, 0x30(OUTP)
2592 sub $64, LEN
2593 add $64, INP
2594 add $64, OUTP
2595 cmp $64, LEN
2596 jge .Lctr_enc_loop4
2597 cmp $16, LEN
2598 jb .Lctr_enc_ret
2599.align 4
2600.Lctr_enc_loop1:
2601 movaps IV, STATE
2602 call _aesni_inc
2603 movups (INP), IN
2604 call _aesni_enc1
2605 pxor IN, STATE
2606 movups STATE, (OUTP)
2607 sub $16, LEN
2608 add $16, INP
2609 add $16, OUTP
2610 cmp $16, LEN
2611 jge .Lctr_enc_loop1
2612.Lctr_enc_ret:
2613 movups IV, (IVP)
2614.Lctr_enc_just_ret:
2615 ret
2616#endif
31
32#include <linux/linkage.h>
33#include <asm/inst.h>
34#include <asm/frame.h>
35
36/*
37 * The following macros are used to move an (un)aligned 16 byte value to/from
38 * an XMM register. This can done for either FP or integer values, for FP use
39 * movaps (move aligned packed single) or integer use movdqa (move double quad
40 * aligned). It doesn't make a performance difference which instruction is used
41 * since Nehalem (original Core i7) was released. However, the movaps is a byte
42 * shorter, so that is the one we'll use for now. (same for unaligned).
43 */
44#define MOVADQ movaps
45#define MOVUDQ movups
46
47#ifdef __x86_64__
48
49.data
50.align 16
51.Lgf128mul_x_ble_mask:
52 .octa 0x00000000000000010000000000000087
53POLY: .octa 0xC2000000000000000000000000000001
54TWOONE: .octa 0x00000001000000000000000000000001
55
56# order of these constants should not change.
57# more specifically, ALL_F should follow SHIFT_MASK,
58# and ZERO should follow ALL_F
59
60SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F
61MASK1: .octa 0x0000000000000000ffffffffffffffff
62MASK2: .octa 0xffffffffffffffff0000000000000000
63SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
64ALL_F: .octa 0xffffffffffffffffffffffffffffffff
65ZERO: .octa 0x00000000000000000000000000000000
66ONE: .octa 0x00000000000000000000000000000001
67F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
68dec: .octa 0x1
69enc: .octa 0x2
70
71
72.text
73
74
75#define STACK_OFFSET 8*3
76#define HashKey 16*0 // store HashKey <<1 mod poly here
77#define HashKey_2 16*1 // store HashKey^2 <<1 mod poly here
78#define HashKey_3 16*2 // store HashKey^3 <<1 mod poly here
79#define HashKey_4 16*3 // store HashKey^4 <<1 mod poly here
80#define HashKey_k 16*4 // store XOR of High 64 bits and Low 64
81 // bits of HashKey <<1 mod poly here
82 //(for Karatsuba purposes)
83#define HashKey_2_k 16*5 // store XOR of High 64 bits and Low 64
84 // bits of HashKey^2 <<1 mod poly here
85 // (for Karatsuba purposes)
86#define HashKey_3_k 16*6 // store XOR of High 64 bits and Low 64
87 // bits of HashKey^3 <<1 mod poly here
88 // (for Karatsuba purposes)
89#define HashKey_4_k 16*7 // store XOR of High 64 bits and Low 64
90 // bits of HashKey^4 <<1 mod poly here
91 // (for Karatsuba purposes)
92#define VARIABLE_OFFSET 16*8
93
94#define arg1 rdi
95#define arg2 rsi
96#define arg3 rdx
97#define arg4 rcx
98#define arg5 r8
99#define arg6 r9
100#define arg7 STACK_OFFSET+8(%r14)
101#define arg8 STACK_OFFSET+16(%r14)
102#define arg9 STACK_OFFSET+24(%r14)
103#define arg10 STACK_OFFSET+32(%r14)
104#define keysize 2*15*16(%arg1)
105#endif
106
107
108#define STATE1 %xmm0
109#define STATE2 %xmm4
110#define STATE3 %xmm5
111#define STATE4 %xmm6
112#define STATE STATE1
113#define IN1 %xmm1
114#define IN2 %xmm7
115#define IN3 %xmm8
116#define IN4 %xmm9
117#define IN IN1
118#define KEY %xmm2
119#define IV %xmm3
120
121#define BSWAP_MASK %xmm10
122#define CTR %xmm11
123#define INC %xmm12
124
125#define GF128MUL_MASK %xmm10
126
127#ifdef __x86_64__
128#define AREG %rax
129#define KEYP %rdi
130#define OUTP %rsi
131#define UKEYP OUTP
132#define INP %rdx
133#define LEN %rcx
134#define IVP %r8
135#define KLEN %r9d
136#define T1 %r10
137#define TKEYP T1
138#define T2 %r11
139#define TCTR_LOW T2
140#else
141#define AREG %eax
142#define KEYP %edi
143#define OUTP AREG
144#define UKEYP OUTP
145#define INP %edx
146#define LEN %esi
147#define IVP %ebp
148#define KLEN %ebx
149#define T1 %ecx
150#define TKEYP T1
151#endif
152
153
154#ifdef __x86_64__
155/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
156*
157*
158* Input: A and B (128-bits each, bit-reflected)
159* Output: C = A*B*x mod poly, (i.e. >>1 )
160* To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
161* GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
162*
163*/
164.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
165 movdqa \GH, \TMP1
166 pshufd $78, \GH, \TMP2
167 pshufd $78, \HK, \TMP3
168 pxor \GH, \TMP2 # TMP2 = a1+a0
169 pxor \HK, \TMP3 # TMP3 = b1+b0
170 PCLMULQDQ 0x11, \HK, \TMP1 # TMP1 = a1*b1
171 PCLMULQDQ 0x00, \HK, \GH # GH = a0*b0
172 PCLMULQDQ 0x00, \TMP3, \TMP2 # TMP2 = (a0+a1)*(b1+b0)
173 pxor \GH, \TMP2
174 pxor \TMP1, \TMP2 # TMP2 = (a0*b0)+(a1*b0)
175 movdqa \TMP2, \TMP3
176 pslldq $8, \TMP3 # left shift TMP3 2 DWs
177 psrldq $8, \TMP2 # right shift TMP2 2 DWs
178 pxor \TMP3, \GH
179 pxor \TMP2, \TMP1 # TMP2:GH holds the result of GH*HK
180
181 # first phase of the reduction
182
183 movdqa \GH, \TMP2
184 movdqa \GH, \TMP3
185 movdqa \GH, \TMP4 # copy GH into TMP2,TMP3 and TMP4
186 # in in order to perform
187 # independent shifts
188 pslld $31, \TMP2 # packed right shift <<31
189 pslld $30, \TMP3 # packed right shift <<30
190 pslld $25, \TMP4 # packed right shift <<25
191 pxor \TMP3, \TMP2 # xor the shifted versions
192 pxor \TMP4, \TMP2
193 movdqa \TMP2, \TMP5
194 psrldq $4, \TMP5 # right shift TMP5 1 DW
195 pslldq $12, \TMP2 # left shift TMP2 3 DWs
196 pxor \TMP2, \GH
197
198 # second phase of the reduction
199
200 movdqa \GH,\TMP2 # copy GH into TMP2,TMP3 and TMP4
201 # in in order to perform
202 # independent shifts
203 movdqa \GH,\TMP3
204 movdqa \GH,\TMP4
205 psrld $1,\TMP2 # packed left shift >>1
206 psrld $2,\TMP3 # packed left shift >>2
207 psrld $7,\TMP4 # packed left shift >>7
208 pxor \TMP3,\TMP2 # xor the shifted versions
209 pxor \TMP4,\TMP2
210 pxor \TMP5, \TMP2
211 pxor \TMP2, \GH
212 pxor \TMP1, \GH # result is in TMP1
213.endm
214
215/*
216* if a = number of total plaintext bytes
217* b = floor(a/16)
218* num_initial_blocks = b mod 4
219* encrypt the initial num_initial_blocks blocks and apply ghash on
220* the ciphertext
221* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
222* are clobbered
223* arg1, %arg2, %arg3, %r14 are used as a pointer only, not modified
224*/
225
226
227.macro INITIAL_BLOCKS_DEC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
228XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
229 MOVADQ SHUF_MASK(%rip), %xmm14
230 mov arg7, %r10 # %r10 = AAD
231 mov arg8, %r12 # %r12 = aadLen
232 mov %r12, %r11
233 pxor %xmm\i, %xmm\i
234
235_get_AAD_loop\num_initial_blocks\operation:
236 movd (%r10), \TMP1
237 pslldq $12, \TMP1
238 psrldq $4, %xmm\i
239 pxor \TMP1, %xmm\i
240 add $4, %r10
241 sub $4, %r12
242 jne _get_AAD_loop\num_initial_blocks\operation
243
244 cmp $16, %r11
245 je _get_AAD_loop2_done\num_initial_blocks\operation
246
247 mov $16, %r12
248_get_AAD_loop2\num_initial_blocks\operation:
249 psrldq $4, %xmm\i
250 sub $4, %r12
251 cmp %r11, %r12
252 jne _get_AAD_loop2\num_initial_blocks\operation
253
254_get_AAD_loop2_done\num_initial_blocks\operation:
255 PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
256
257 xor %r11, %r11 # initialise the data pointer offset as zero
258
259 # start AES for num_initial_blocks blocks
260
261 mov %arg5, %rax # %rax = *Y0
262 movdqu (%rax), \XMM0 # XMM0 = Y0
263 PSHUFB_XMM %xmm14, \XMM0
264
265.if (\i == 5) || (\i == 6) || (\i == 7)
266 MOVADQ ONE(%RIP),\TMP1
267 MOVADQ (%arg1),\TMP2
268.irpc index, \i_seq
269 paddd \TMP1, \XMM0 # INCR Y0
270 movdqa \XMM0, %xmm\index
271 PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap
272 pxor \TMP2, %xmm\index
273.endr
274 lea 0x10(%arg1),%r10
275 mov keysize,%eax
276 shr $2,%eax # 128->4, 192->6, 256->8
277 add $5,%eax # 128->9, 192->11, 256->13
278
279aes_loop_initial_dec\num_initial_blocks:
280 MOVADQ (%r10),\TMP1
281.irpc index, \i_seq
282 AESENC \TMP1, %xmm\index
283.endr
284 add $16,%r10
285 sub $1,%eax
286 jnz aes_loop_initial_dec\num_initial_blocks
287
288 MOVADQ (%r10), \TMP1
289.irpc index, \i_seq
290 AESENCLAST \TMP1, %xmm\index # Last Round
291.endr
292.irpc index, \i_seq
293 movdqu (%arg3 , %r11, 1), \TMP1
294 pxor \TMP1, %xmm\index
295 movdqu %xmm\index, (%arg2 , %r11, 1)
296 # write back plaintext/ciphertext for num_initial_blocks
297 add $16, %r11
298
299 movdqa \TMP1, %xmm\index
300 PSHUFB_XMM %xmm14, %xmm\index
301 # prepare plaintext/ciphertext for GHASH computation
302.endr
303.endif
304 GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
305 # apply GHASH on num_initial_blocks blocks
306
307.if \i == 5
308 pxor %xmm5, %xmm6
309 GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
310 pxor %xmm6, %xmm7
311 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
312 pxor %xmm7, %xmm8
313 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
314.elseif \i == 6
315 pxor %xmm6, %xmm7
316 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
317 pxor %xmm7, %xmm8
318 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
319.elseif \i == 7
320 pxor %xmm7, %xmm8
321 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
322.endif
323 cmp $64, %r13
324 jl _initial_blocks_done\num_initial_blocks\operation
325 # no need for precomputed values
326/*
327*
328* Precomputations for HashKey parallel with encryption of first 4 blocks.
329* Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
330*/
331 MOVADQ ONE(%rip), \TMP1
332 paddd \TMP1, \XMM0 # INCR Y0
333 MOVADQ \XMM0, \XMM1
334 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
335
336 paddd \TMP1, \XMM0 # INCR Y0
337 MOVADQ \XMM0, \XMM2
338 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
339
340 paddd \TMP1, \XMM0 # INCR Y0
341 MOVADQ \XMM0, \XMM3
342 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
343
344 paddd \TMP1, \XMM0 # INCR Y0
345 MOVADQ \XMM0, \XMM4
346 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
347
348 MOVADQ 0(%arg1),\TMP1
349 pxor \TMP1, \XMM1
350 pxor \TMP1, \XMM2
351 pxor \TMP1, \XMM3
352 pxor \TMP1, \XMM4
353 movdqa \TMP3, \TMP5
354 pshufd $78, \TMP3, \TMP1
355 pxor \TMP3, \TMP1
356 movdqa \TMP1, HashKey_k(%rsp)
357 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
358# TMP5 = HashKey^2<<1 (mod poly)
359 movdqa \TMP5, HashKey_2(%rsp)
360# HashKey_2 = HashKey^2<<1 (mod poly)
361 pshufd $78, \TMP5, \TMP1
362 pxor \TMP5, \TMP1
363 movdqa \TMP1, HashKey_2_k(%rsp)
364.irpc index, 1234 # do 4 rounds
365 movaps 0x10*\index(%arg1), \TMP1
366 AESENC \TMP1, \XMM1
367 AESENC \TMP1, \XMM2
368 AESENC \TMP1, \XMM3
369 AESENC \TMP1, \XMM4
370.endr
371 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
372# TMP5 = HashKey^3<<1 (mod poly)
373 movdqa \TMP5, HashKey_3(%rsp)
374 pshufd $78, \TMP5, \TMP1
375 pxor \TMP5, \TMP1
376 movdqa \TMP1, HashKey_3_k(%rsp)
377.irpc index, 56789 # do next 5 rounds
378 movaps 0x10*\index(%arg1), \TMP1
379 AESENC \TMP1, \XMM1
380 AESENC \TMP1, \XMM2
381 AESENC \TMP1, \XMM3
382 AESENC \TMP1, \XMM4
383.endr
384 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
385# TMP5 = HashKey^3<<1 (mod poly)
386 movdqa \TMP5, HashKey_4(%rsp)
387 pshufd $78, \TMP5, \TMP1
388 pxor \TMP5, \TMP1
389 movdqa \TMP1, HashKey_4_k(%rsp)
390 lea 0xa0(%arg1),%r10
391 mov keysize,%eax
392 shr $2,%eax # 128->4, 192->6, 256->8
393 sub $4,%eax # 128->0, 192->2, 256->4
394 jz aes_loop_pre_dec_done\num_initial_blocks
395
396aes_loop_pre_dec\num_initial_blocks:
397 MOVADQ (%r10),\TMP2
398.irpc index, 1234
399 AESENC \TMP2, %xmm\index
400.endr
401 add $16,%r10
402 sub $1,%eax
403 jnz aes_loop_pre_dec\num_initial_blocks
404
405aes_loop_pre_dec_done\num_initial_blocks:
406 MOVADQ (%r10), \TMP2
407 AESENCLAST \TMP2, \XMM1
408 AESENCLAST \TMP2, \XMM2
409 AESENCLAST \TMP2, \XMM3
410 AESENCLAST \TMP2, \XMM4
411 movdqu 16*0(%arg3 , %r11 , 1), \TMP1
412 pxor \TMP1, \XMM1
413 movdqu \XMM1, 16*0(%arg2 , %r11 , 1)
414 movdqa \TMP1, \XMM1
415 movdqu 16*1(%arg3 , %r11 , 1), \TMP1
416 pxor \TMP1, \XMM2
417 movdqu \XMM2, 16*1(%arg2 , %r11 , 1)
418 movdqa \TMP1, \XMM2
419 movdqu 16*2(%arg3 , %r11 , 1), \TMP1
420 pxor \TMP1, \XMM3
421 movdqu \XMM3, 16*2(%arg2 , %r11 , 1)
422 movdqa \TMP1, \XMM3
423 movdqu 16*3(%arg3 , %r11 , 1), \TMP1
424 pxor \TMP1, \XMM4
425 movdqu \XMM4, 16*3(%arg2 , %r11 , 1)
426 movdqa \TMP1, \XMM4
427 add $64, %r11
428 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
429 pxor \XMMDst, \XMM1
430# combine GHASHed value with the corresponding ciphertext
431 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
432 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
433 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
434
435_initial_blocks_done\num_initial_blocks\operation:
436
437.endm
438
439
440/*
441* if a = number of total plaintext bytes
442* b = floor(a/16)
443* num_initial_blocks = b mod 4
444* encrypt the initial num_initial_blocks blocks and apply ghash on
445* the ciphertext
446* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
447* are clobbered
448* arg1, %arg2, %arg3, %r14 are used as a pointer only, not modified
449*/
450
451
452.macro INITIAL_BLOCKS_ENC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
453XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
454 MOVADQ SHUF_MASK(%rip), %xmm14
455 mov arg7, %r10 # %r10 = AAD
456 mov arg8, %r12 # %r12 = aadLen
457 mov %r12, %r11
458 pxor %xmm\i, %xmm\i
459_get_AAD_loop\num_initial_blocks\operation:
460 movd (%r10), \TMP1
461 pslldq $12, \TMP1
462 psrldq $4, %xmm\i
463 pxor \TMP1, %xmm\i
464 add $4, %r10
465 sub $4, %r12
466 jne _get_AAD_loop\num_initial_blocks\operation
467 cmp $16, %r11
468 je _get_AAD_loop2_done\num_initial_blocks\operation
469 mov $16, %r12
470_get_AAD_loop2\num_initial_blocks\operation:
471 psrldq $4, %xmm\i
472 sub $4, %r12
473 cmp %r11, %r12
474 jne _get_AAD_loop2\num_initial_blocks\operation
475_get_AAD_loop2_done\num_initial_blocks\operation:
476 PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
477
478 xor %r11, %r11 # initialise the data pointer offset as zero
479
480 # start AES for num_initial_blocks blocks
481
482 mov %arg5, %rax # %rax = *Y0
483 movdqu (%rax), \XMM0 # XMM0 = Y0
484 PSHUFB_XMM %xmm14, \XMM0
485
486.if (\i == 5) || (\i == 6) || (\i == 7)
487
488 MOVADQ ONE(%RIP),\TMP1
489 MOVADQ 0(%arg1),\TMP2
490.irpc index, \i_seq
491 paddd \TMP1, \XMM0 # INCR Y0
492 MOVADQ \XMM0, %xmm\index
493 PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap
494 pxor \TMP2, %xmm\index
495.endr
496 lea 0x10(%arg1),%r10
497 mov keysize,%eax
498 shr $2,%eax # 128->4, 192->6, 256->8
499 add $5,%eax # 128->9, 192->11, 256->13
500
501aes_loop_initial_enc\num_initial_blocks:
502 MOVADQ (%r10),\TMP1
503.irpc index, \i_seq
504 AESENC \TMP1, %xmm\index
505.endr
506 add $16,%r10
507 sub $1,%eax
508 jnz aes_loop_initial_enc\num_initial_blocks
509
510 MOVADQ (%r10), \TMP1
511.irpc index, \i_seq
512 AESENCLAST \TMP1, %xmm\index # Last Round
513.endr
514.irpc index, \i_seq
515 movdqu (%arg3 , %r11, 1), \TMP1
516 pxor \TMP1, %xmm\index
517 movdqu %xmm\index, (%arg2 , %r11, 1)
518 # write back plaintext/ciphertext for num_initial_blocks
519 add $16, %r11
520 PSHUFB_XMM %xmm14, %xmm\index
521
522 # prepare plaintext/ciphertext for GHASH computation
523.endr
524.endif
525 GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
526 # apply GHASH on num_initial_blocks blocks
527
528.if \i == 5
529 pxor %xmm5, %xmm6
530 GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
531 pxor %xmm6, %xmm7
532 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
533 pxor %xmm7, %xmm8
534 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
535.elseif \i == 6
536 pxor %xmm6, %xmm7
537 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
538 pxor %xmm7, %xmm8
539 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
540.elseif \i == 7
541 pxor %xmm7, %xmm8
542 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
543.endif
544 cmp $64, %r13
545 jl _initial_blocks_done\num_initial_blocks\operation
546 # no need for precomputed values
547/*
548*
549* Precomputations for HashKey parallel with encryption of first 4 blocks.
550* Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
551*/
552 MOVADQ ONE(%RIP),\TMP1
553 paddd \TMP1, \XMM0 # INCR Y0
554 MOVADQ \XMM0, \XMM1
555 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
556
557 paddd \TMP1, \XMM0 # INCR Y0
558 MOVADQ \XMM0, \XMM2
559 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
560
561 paddd \TMP1, \XMM0 # INCR Y0
562 MOVADQ \XMM0, \XMM3
563 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
564
565 paddd \TMP1, \XMM0 # INCR Y0
566 MOVADQ \XMM0, \XMM4
567 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
568
569 MOVADQ 0(%arg1),\TMP1
570 pxor \TMP1, \XMM1
571 pxor \TMP1, \XMM2
572 pxor \TMP1, \XMM3
573 pxor \TMP1, \XMM4
574 movdqa \TMP3, \TMP5
575 pshufd $78, \TMP3, \TMP1
576 pxor \TMP3, \TMP1
577 movdqa \TMP1, HashKey_k(%rsp)
578 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
579# TMP5 = HashKey^2<<1 (mod poly)
580 movdqa \TMP5, HashKey_2(%rsp)
581# HashKey_2 = HashKey^2<<1 (mod poly)
582 pshufd $78, \TMP5, \TMP1
583 pxor \TMP5, \TMP1
584 movdqa \TMP1, HashKey_2_k(%rsp)
585.irpc index, 1234 # do 4 rounds
586 movaps 0x10*\index(%arg1), \TMP1
587 AESENC \TMP1, \XMM1
588 AESENC \TMP1, \XMM2
589 AESENC \TMP1, \XMM3
590 AESENC \TMP1, \XMM4
591.endr
592 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
593# TMP5 = HashKey^3<<1 (mod poly)
594 movdqa \TMP5, HashKey_3(%rsp)
595 pshufd $78, \TMP5, \TMP1
596 pxor \TMP5, \TMP1
597 movdqa \TMP1, HashKey_3_k(%rsp)
598.irpc index, 56789 # do next 5 rounds
599 movaps 0x10*\index(%arg1), \TMP1
600 AESENC \TMP1, \XMM1
601 AESENC \TMP1, \XMM2
602 AESENC \TMP1, \XMM3
603 AESENC \TMP1, \XMM4
604.endr
605 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
606# TMP5 = HashKey^3<<1 (mod poly)
607 movdqa \TMP5, HashKey_4(%rsp)
608 pshufd $78, \TMP5, \TMP1
609 pxor \TMP5, \TMP1
610 movdqa \TMP1, HashKey_4_k(%rsp)
611 lea 0xa0(%arg1),%r10
612 mov keysize,%eax
613 shr $2,%eax # 128->4, 192->6, 256->8
614 sub $4,%eax # 128->0, 192->2, 256->4
615 jz aes_loop_pre_enc_done\num_initial_blocks
616
617aes_loop_pre_enc\num_initial_blocks:
618 MOVADQ (%r10),\TMP2
619.irpc index, 1234
620 AESENC \TMP2, %xmm\index
621.endr
622 add $16,%r10
623 sub $1,%eax
624 jnz aes_loop_pre_enc\num_initial_blocks
625
626aes_loop_pre_enc_done\num_initial_blocks:
627 MOVADQ (%r10), \TMP2
628 AESENCLAST \TMP2, \XMM1
629 AESENCLAST \TMP2, \XMM2
630 AESENCLAST \TMP2, \XMM3
631 AESENCLAST \TMP2, \XMM4
632 movdqu 16*0(%arg3 , %r11 , 1), \TMP1
633 pxor \TMP1, \XMM1
634 movdqu 16*1(%arg3 , %r11 , 1), \TMP1
635 pxor \TMP1, \XMM2
636 movdqu 16*2(%arg3 , %r11 , 1), \TMP1
637 pxor \TMP1, \XMM3
638 movdqu 16*3(%arg3 , %r11 , 1), \TMP1
639 pxor \TMP1, \XMM4
640 movdqu \XMM1, 16*0(%arg2 , %r11 , 1)
641 movdqu \XMM2, 16*1(%arg2 , %r11 , 1)
642 movdqu \XMM3, 16*2(%arg2 , %r11 , 1)
643 movdqu \XMM4, 16*3(%arg2 , %r11 , 1)
644
645 add $64, %r11
646 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
647 pxor \XMMDst, \XMM1
648# combine GHASHed value with the corresponding ciphertext
649 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
650 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
651 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
652
653_initial_blocks_done\num_initial_blocks\operation:
654
655.endm
656
657/*
658* encrypt 4 blocks at a time
659* ghash the 4 previously encrypted ciphertext blocks
660* arg1, %arg2, %arg3 are used as pointers only, not modified
661* %r11 is the data offset value
662*/
663.macro GHASH_4_ENCRYPT_4_PARALLEL_ENC TMP1 TMP2 TMP3 TMP4 TMP5 \
664TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
665
666 movdqa \XMM1, \XMM5
667 movdqa \XMM2, \XMM6
668 movdqa \XMM3, \XMM7
669 movdqa \XMM4, \XMM8
670
671 movdqa SHUF_MASK(%rip), %xmm15
672 # multiply TMP5 * HashKey using karatsuba
673
674 movdqa \XMM5, \TMP4
675 pshufd $78, \XMM5, \TMP6
676 pxor \XMM5, \TMP6
677 paddd ONE(%rip), \XMM0 # INCR CNT
678 movdqa HashKey_4(%rsp), \TMP5
679 PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1
680 movdqa \XMM0, \XMM1
681 paddd ONE(%rip), \XMM0 # INCR CNT
682 movdqa \XMM0, \XMM2
683 paddd ONE(%rip), \XMM0 # INCR CNT
684 movdqa \XMM0, \XMM3
685 paddd ONE(%rip), \XMM0 # INCR CNT
686 movdqa \XMM0, \XMM4
687 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
688 PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0
689 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
690 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
691 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
692
693 pxor (%arg1), \XMM1
694 pxor (%arg1), \XMM2
695 pxor (%arg1), \XMM3
696 pxor (%arg1), \XMM4
697 movdqa HashKey_4_k(%rsp), \TMP5
698 PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
699 movaps 0x10(%arg1), \TMP1
700 AESENC \TMP1, \XMM1 # Round 1
701 AESENC \TMP1, \XMM2
702 AESENC \TMP1, \XMM3
703 AESENC \TMP1, \XMM4
704 movaps 0x20(%arg1), \TMP1
705 AESENC \TMP1, \XMM1 # Round 2
706 AESENC \TMP1, \XMM2
707 AESENC \TMP1, \XMM3
708 AESENC \TMP1, \XMM4
709 movdqa \XMM6, \TMP1
710 pshufd $78, \XMM6, \TMP2
711 pxor \XMM6, \TMP2
712 movdqa HashKey_3(%rsp), \TMP5
713 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
714 movaps 0x30(%arg1), \TMP3
715 AESENC \TMP3, \XMM1 # Round 3
716 AESENC \TMP3, \XMM2
717 AESENC \TMP3, \XMM3
718 AESENC \TMP3, \XMM4
719 PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0
720 movaps 0x40(%arg1), \TMP3
721 AESENC \TMP3, \XMM1 # Round 4
722 AESENC \TMP3, \XMM2
723 AESENC \TMP3, \XMM3
724 AESENC \TMP3, \XMM4
725 movdqa HashKey_3_k(%rsp), \TMP5
726 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
727 movaps 0x50(%arg1), \TMP3
728 AESENC \TMP3, \XMM1 # Round 5
729 AESENC \TMP3, \XMM2
730 AESENC \TMP3, \XMM3
731 AESENC \TMP3, \XMM4
732 pxor \TMP1, \TMP4
733# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
734 pxor \XMM6, \XMM5
735 pxor \TMP2, \TMP6
736 movdqa \XMM7, \TMP1
737 pshufd $78, \XMM7, \TMP2
738 pxor \XMM7, \TMP2
739 movdqa HashKey_2(%rsp ), \TMP5
740
741 # Multiply TMP5 * HashKey using karatsuba
742
743 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
744 movaps 0x60(%arg1), \TMP3
745 AESENC \TMP3, \XMM1 # Round 6
746 AESENC \TMP3, \XMM2
747 AESENC \TMP3, \XMM3
748 AESENC \TMP3, \XMM4
749 PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0
750 movaps 0x70(%arg1), \TMP3
751 AESENC \TMP3, \XMM1 # Round 7
752 AESENC \TMP3, \XMM2
753 AESENC \TMP3, \XMM3
754 AESENC \TMP3, \XMM4
755 movdqa HashKey_2_k(%rsp), \TMP5
756 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
757 movaps 0x80(%arg1), \TMP3
758 AESENC \TMP3, \XMM1 # Round 8
759 AESENC \TMP3, \XMM2
760 AESENC \TMP3, \XMM3
761 AESENC \TMP3, \XMM4
762 pxor \TMP1, \TMP4
763# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
764 pxor \XMM7, \XMM5
765 pxor \TMP2, \TMP6
766
767 # Multiply XMM8 * HashKey
768 # XMM8 and TMP5 hold the values for the two operands
769
770 movdqa \XMM8, \TMP1
771 pshufd $78, \XMM8, \TMP2
772 pxor \XMM8, \TMP2
773 movdqa HashKey(%rsp), \TMP5
774 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
775 movaps 0x90(%arg1), \TMP3
776 AESENC \TMP3, \XMM1 # Round 9
777 AESENC \TMP3, \XMM2
778 AESENC \TMP3, \XMM3
779 AESENC \TMP3, \XMM4
780 PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0
781 lea 0xa0(%arg1),%r10
782 mov keysize,%eax
783 shr $2,%eax # 128->4, 192->6, 256->8
784 sub $4,%eax # 128->0, 192->2, 256->4
785 jz aes_loop_par_enc_done
786
787aes_loop_par_enc:
788 MOVADQ (%r10),\TMP3
789.irpc index, 1234
790 AESENC \TMP3, %xmm\index
791.endr
792 add $16,%r10
793 sub $1,%eax
794 jnz aes_loop_par_enc
795
796aes_loop_par_enc_done:
797 MOVADQ (%r10), \TMP3
798 AESENCLAST \TMP3, \XMM1 # Round 10
799 AESENCLAST \TMP3, \XMM2
800 AESENCLAST \TMP3, \XMM3
801 AESENCLAST \TMP3, \XMM4
802 movdqa HashKey_k(%rsp), \TMP5
803 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
804 movdqu (%arg3,%r11,1), \TMP3
805 pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
806 movdqu 16(%arg3,%r11,1), \TMP3
807 pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK
808 movdqu 32(%arg3,%r11,1), \TMP3
809 pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK
810 movdqu 48(%arg3,%r11,1), \TMP3
811 pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK
812 movdqu \XMM1, (%arg2,%r11,1) # Write to the ciphertext buffer
813 movdqu \XMM2, 16(%arg2,%r11,1) # Write to the ciphertext buffer
814 movdqu \XMM3, 32(%arg2,%r11,1) # Write to the ciphertext buffer
815 movdqu \XMM4, 48(%arg2,%r11,1) # Write to the ciphertext buffer
816 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
817 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
818 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
819 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
820
821 pxor \TMP4, \TMP1
822 pxor \XMM8, \XMM5
823 pxor \TMP6, \TMP2
824 pxor \TMP1, \TMP2
825 pxor \XMM5, \TMP2
826 movdqa \TMP2, \TMP3
827 pslldq $8, \TMP3 # left shift TMP3 2 DWs
828 psrldq $8, \TMP2 # right shift TMP2 2 DWs
829 pxor \TMP3, \XMM5
830 pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5
831
832 # first phase of reduction
833
834 movdqa \XMM5, \TMP2
835 movdqa \XMM5, \TMP3
836 movdqa \XMM5, \TMP4
837# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
838 pslld $31, \TMP2 # packed right shift << 31
839 pslld $30, \TMP3 # packed right shift << 30
840 pslld $25, \TMP4 # packed right shift << 25
841 pxor \TMP3, \TMP2 # xor the shifted versions
842 pxor \TMP4, \TMP2
843 movdqa \TMP2, \TMP5
844 psrldq $4, \TMP5 # right shift T5 1 DW
845 pslldq $12, \TMP2 # left shift T2 3 DWs
846 pxor \TMP2, \XMM5
847
848 # second phase of reduction
849
850 movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
851 movdqa \XMM5,\TMP3
852 movdqa \XMM5,\TMP4
853 psrld $1, \TMP2 # packed left shift >>1
854 psrld $2, \TMP3 # packed left shift >>2
855 psrld $7, \TMP4 # packed left shift >>7
856 pxor \TMP3,\TMP2 # xor the shifted versions
857 pxor \TMP4,\TMP2
858 pxor \TMP5, \TMP2
859 pxor \TMP2, \XMM5
860 pxor \TMP1, \XMM5 # result is in TMP1
861
862 pxor \XMM5, \XMM1
863.endm
864
865/*
866* decrypt 4 blocks at a time
867* ghash the 4 previously decrypted ciphertext blocks
868* arg1, %arg2, %arg3 are used as pointers only, not modified
869* %r11 is the data offset value
870*/
871.macro GHASH_4_ENCRYPT_4_PARALLEL_DEC TMP1 TMP2 TMP3 TMP4 TMP5 \
872TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
873
874 movdqa \XMM1, \XMM5
875 movdqa \XMM2, \XMM6
876 movdqa \XMM3, \XMM7
877 movdqa \XMM4, \XMM8
878
879 movdqa SHUF_MASK(%rip), %xmm15
880 # multiply TMP5 * HashKey using karatsuba
881
882 movdqa \XMM5, \TMP4
883 pshufd $78, \XMM5, \TMP6
884 pxor \XMM5, \TMP6
885 paddd ONE(%rip), \XMM0 # INCR CNT
886 movdqa HashKey_4(%rsp), \TMP5
887 PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1
888 movdqa \XMM0, \XMM1
889 paddd ONE(%rip), \XMM0 # INCR CNT
890 movdqa \XMM0, \XMM2
891 paddd ONE(%rip), \XMM0 # INCR CNT
892 movdqa \XMM0, \XMM3
893 paddd ONE(%rip), \XMM0 # INCR CNT
894 movdqa \XMM0, \XMM4
895 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
896 PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0
897 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
898 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
899 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
900
901 pxor (%arg1), \XMM1
902 pxor (%arg1), \XMM2
903 pxor (%arg1), \XMM3
904 pxor (%arg1), \XMM4
905 movdqa HashKey_4_k(%rsp), \TMP5
906 PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
907 movaps 0x10(%arg1), \TMP1
908 AESENC \TMP1, \XMM1 # Round 1
909 AESENC \TMP1, \XMM2
910 AESENC \TMP1, \XMM3
911 AESENC \TMP1, \XMM4
912 movaps 0x20(%arg1), \TMP1
913 AESENC \TMP1, \XMM1 # Round 2
914 AESENC \TMP1, \XMM2
915 AESENC \TMP1, \XMM3
916 AESENC \TMP1, \XMM4
917 movdqa \XMM6, \TMP1
918 pshufd $78, \XMM6, \TMP2
919 pxor \XMM6, \TMP2
920 movdqa HashKey_3(%rsp), \TMP5
921 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
922 movaps 0x30(%arg1), \TMP3
923 AESENC \TMP3, \XMM1 # Round 3
924 AESENC \TMP3, \XMM2
925 AESENC \TMP3, \XMM3
926 AESENC \TMP3, \XMM4
927 PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0
928 movaps 0x40(%arg1), \TMP3
929 AESENC \TMP3, \XMM1 # Round 4
930 AESENC \TMP3, \XMM2
931 AESENC \TMP3, \XMM3
932 AESENC \TMP3, \XMM4
933 movdqa HashKey_3_k(%rsp), \TMP5
934 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
935 movaps 0x50(%arg1), \TMP3
936 AESENC \TMP3, \XMM1 # Round 5
937 AESENC \TMP3, \XMM2
938 AESENC \TMP3, \XMM3
939 AESENC \TMP3, \XMM4
940 pxor \TMP1, \TMP4
941# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
942 pxor \XMM6, \XMM5
943 pxor \TMP2, \TMP6
944 movdqa \XMM7, \TMP1
945 pshufd $78, \XMM7, \TMP2
946 pxor \XMM7, \TMP2
947 movdqa HashKey_2(%rsp ), \TMP5
948
949 # Multiply TMP5 * HashKey using karatsuba
950
951 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
952 movaps 0x60(%arg1), \TMP3
953 AESENC \TMP3, \XMM1 # Round 6
954 AESENC \TMP3, \XMM2
955 AESENC \TMP3, \XMM3
956 AESENC \TMP3, \XMM4
957 PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0
958 movaps 0x70(%arg1), \TMP3
959 AESENC \TMP3, \XMM1 # Round 7
960 AESENC \TMP3, \XMM2
961 AESENC \TMP3, \XMM3
962 AESENC \TMP3, \XMM4
963 movdqa HashKey_2_k(%rsp), \TMP5
964 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
965 movaps 0x80(%arg1), \TMP3
966 AESENC \TMP3, \XMM1 # Round 8
967 AESENC \TMP3, \XMM2
968 AESENC \TMP3, \XMM3
969 AESENC \TMP3, \XMM4
970 pxor \TMP1, \TMP4
971# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
972 pxor \XMM7, \XMM5
973 pxor \TMP2, \TMP6
974
975 # Multiply XMM8 * HashKey
976 # XMM8 and TMP5 hold the values for the two operands
977
978 movdqa \XMM8, \TMP1
979 pshufd $78, \XMM8, \TMP2
980 pxor \XMM8, \TMP2
981 movdqa HashKey(%rsp), \TMP5
982 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
983 movaps 0x90(%arg1), \TMP3
984 AESENC \TMP3, \XMM1 # Round 9
985 AESENC \TMP3, \XMM2
986 AESENC \TMP3, \XMM3
987 AESENC \TMP3, \XMM4
988 PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0
989 lea 0xa0(%arg1),%r10
990 mov keysize,%eax
991 shr $2,%eax # 128->4, 192->6, 256->8
992 sub $4,%eax # 128->0, 192->2, 256->4
993 jz aes_loop_par_dec_done
994
995aes_loop_par_dec:
996 MOVADQ (%r10),\TMP3
997.irpc index, 1234
998 AESENC \TMP3, %xmm\index
999.endr
1000 add $16,%r10
1001 sub $1,%eax
1002 jnz aes_loop_par_dec
1003
1004aes_loop_par_dec_done:
1005 MOVADQ (%r10), \TMP3
1006 AESENCLAST \TMP3, \XMM1 # last round
1007 AESENCLAST \TMP3, \XMM2
1008 AESENCLAST \TMP3, \XMM3
1009 AESENCLAST \TMP3, \XMM4
1010 movdqa HashKey_k(%rsp), \TMP5
1011 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1012 movdqu (%arg3,%r11,1), \TMP3
1013 pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
1014 movdqu \XMM1, (%arg2,%r11,1) # Write to plaintext buffer
1015 movdqa \TMP3, \XMM1
1016 movdqu 16(%arg3,%r11,1), \TMP3
1017 pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK
1018 movdqu \XMM2, 16(%arg2,%r11,1) # Write to plaintext buffer
1019 movdqa \TMP3, \XMM2
1020 movdqu 32(%arg3,%r11,1), \TMP3
1021 pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK
1022 movdqu \XMM3, 32(%arg2,%r11,1) # Write to plaintext buffer
1023 movdqa \TMP3, \XMM3
1024 movdqu 48(%arg3,%r11,1), \TMP3
1025 pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK
1026 movdqu \XMM4, 48(%arg2,%r11,1) # Write to plaintext buffer
1027 movdqa \TMP3, \XMM4
1028 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
1029 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
1030 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
1031 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
1032
1033 pxor \TMP4, \TMP1
1034 pxor \XMM8, \XMM5
1035 pxor \TMP6, \TMP2
1036 pxor \TMP1, \TMP2
1037 pxor \XMM5, \TMP2
1038 movdqa \TMP2, \TMP3
1039 pslldq $8, \TMP3 # left shift TMP3 2 DWs
1040 psrldq $8, \TMP2 # right shift TMP2 2 DWs
1041 pxor \TMP3, \XMM5
1042 pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5
1043
1044 # first phase of reduction
1045
1046 movdqa \XMM5, \TMP2
1047 movdqa \XMM5, \TMP3
1048 movdqa \XMM5, \TMP4
1049# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
1050 pslld $31, \TMP2 # packed right shift << 31
1051 pslld $30, \TMP3 # packed right shift << 30
1052 pslld $25, \TMP4 # packed right shift << 25
1053 pxor \TMP3, \TMP2 # xor the shifted versions
1054 pxor \TMP4, \TMP2
1055 movdqa \TMP2, \TMP5
1056 psrldq $4, \TMP5 # right shift T5 1 DW
1057 pslldq $12, \TMP2 # left shift T2 3 DWs
1058 pxor \TMP2, \XMM5
1059
1060 # second phase of reduction
1061
1062 movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
1063 movdqa \XMM5,\TMP3
1064 movdqa \XMM5,\TMP4
1065 psrld $1, \TMP2 # packed left shift >>1
1066 psrld $2, \TMP3 # packed left shift >>2
1067 psrld $7, \TMP4 # packed left shift >>7
1068 pxor \TMP3,\TMP2 # xor the shifted versions
1069 pxor \TMP4,\TMP2
1070 pxor \TMP5, \TMP2
1071 pxor \TMP2, \XMM5
1072 pxor \TMP1, \XMM5 # result is in TMP1
1073
1074 pxor \XMM5, \XMM1
1075.endm
1076
1077/* GHASH the last 4 ciphertext blocks. */
1078.macro GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
1079TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
1080
1081 # Multiply TMP6 * HashKey (using Karatsuba)
1082
1083 movdqa \XMM1, \TMP6
1084 pshufd $78, \XMM1, \TMP2
1085 pxor \XMM1, \TMP2
1086 movdqa HashKey_4(%rsp), \TMP5
1087 PCLMULQDQ 0x11, \TMP5, \TMP6 # TMP6 = a1*b1
1088 PCLMULQDQ 0x00, \TMP5, \XMM1 # XMM1 = a0*b0
1089 movdqa HashKey_4_k(%rsp), \TMP4
1090 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1091 movdqa \XMM1, \XMMDst
1092 movdqa \TMP2, \XMM1 # result in TMP6, XMMDst, XMM1
1093
1094 # Multiply TMP1 * HashKey (using Karatsuba)
1095
1096 movdqa \XMM2, \TMP1
1097 pshufd $78, \XMM2, \TMP2
1098 pxor \XMM2, \TMP2
1099 movdqa HashKey_3(%rsp), \TMP5
1100 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1101 PCLMULQDQ 0x00, \TMP5, \XMM2 # XMM2 = a0*b0
1102 movdqa HashKey_3_k(%rsp), \TMP4
1103 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1104 pxor \TMP1, \TMP6
1105 pxor \XMM2, \XMMDst
1106 pxor \TMP2, \XMM1
1107# results accumulated in TMP6, XMMDst, XMM1
1108
1109 # Multiply TMP1 * HashKey (using Karatsuba)
1110
1111 movdqa \XMM3, \TMP1
1112 pshufd $78, \XMM3, \TMP2
1113 pxor \XMM3, \TMP2
1114 movdqa HashKey_2(%rsp), \TMP5
1115 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1116 PCLMULQDQ 0x00, \TMP5, \XMM3 # XMM3 = a0*b0
1117 movdqa HashKey_2_k(%rsp), \TMP4
1118 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1119 pxor \TMP1, \TMP6
1120 pxor \XMM3, \XMMDst
1121 pxor \TMP2, \XMM1 # results accumulated in TMP6, XMMDst, XMM1
1122
1123 # Multiply TMP1 * HashKey (using Karatsuba)
1124 movdqa \XMM4, \TMP1
1125 pshufd $78, \XMM4, \TMP2
1126 pxor \XMM4, \TMP2
1127 movdqa HashKey(%rsp), \TMP5
1128 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1129 PCLMULQDQ 0x00, \TMP5, \XMM4 # XMM4 = a0*b0
1130 movdqa HashKey_k(%rsp), \TMP4
1131 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1132 pxor \TMP1, \TMP6
1133 pxor \XMM4, \XMMDst
1134 pxor \XMM1, \TMP2
1135 pxor \TMP6, \TMP2
1136 pxor \XMMDst, \TMP2
1137 # middle section of the temp results combined as in karatsuba algorithm
1138 movdqa \TMP2, \TMP4
1139 pslldq $8, \TMP4 # left shift TMP4 2 DWs
1140 psrldq $8, \TMP2 # right shift TMP2 2 DWs
1141 pxor \TMP4, \XMMDst
1142 pxor \TMP2, \TMP6
1143# TMP6:XMMDst holds the result of the accumulated carry-less multiplications
1144 # first phase of the reduction
1145 movdqa \XMMDst, \TMP2
1146 movdqa \XMMDst, \TMP3
1147 movdqa \XMMDst, \TMP4
1148# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
1149 pslld $31, \TMP2 # packed right shifting << 31
1150 pslld $30, \TMP3 # packed right shifting << 30
1151 pslld $25, \TMP4 # packed right shifting << 25
1152 pxor \TMP3, \TMP2 # xor the shifted versions
1153 pxor \TMP4, \TMP2
1154 movdqa \TMP2, \TMP7
1155 psrldq $4, \TMP7 # right shift TMP7 1 DW
1156 pslldq $12, \TMP2 # left shift TMP2 3 DWs
1157 pxor \TMP2, \XMMDst
1158
1159 # second phase of the reduction
1160 movdqa \XMMDst, \TMP2
1161 # make 3 copies of XMMDst for doing 3 shift operations
1162 movdqa \XMMDst, \TMP3
1163 movdqa \XMMDst, \TMP4
1164 psrld $1, \TMP2 # packed left shift >> 1
1165 psrld $2, \TMP3 # packed left shift >> 2
1166 psrld $7, \TMP4 # packed left shift >> 7
1167 pxor \TMP3, \TMP2 # xor the shifted versions
1168 pxor \TMP4, \TMP2
1169 pxor \TMP7, \TMP2
1170 pxor \TMP2, \XMMDst
1171 pxor \TMP6, \XMMDst # reduced result is in XMMDst
1172.endm
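/*
 * For reference, the identity behind the three-multiply Karatsuba blocks
 * above (all additions are XORs, since we work in GF(2)): writing
 * a = a1*x^64 + a0 and b = b1*x^64 + b0,
 *
 *   a*b = a1*b1*x^128 + ((a1+a0)*(b1+b0) + a1*b1 + a0*b0)*x^64 + a0*b0
 *
 * so each 128x128-bit carry-less product needs only three PCLMULQDQs; the
 * precomputed HashKey_*_k values on the stack are the (b1+b0) halves.
 */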
1173
1174
1175/* Encryption of a single block
1176* uses eax & r10
1177*/
1178
1179.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1
1180
1181 pxor (%arg1), \XMM0
1182 mov keysize,%eax
1183 shr $2,%eax # 128->4, 192->6, 256->8
1184 add $5,%eax # 128->9, 192->11, 256->13
1185 lea 16(%arg1), %r10 # get first expanded key address
1186
1187_esb_loop_\@:
1188 MOVADQ (%r10),\TMP1
1189 AESENC \TMP1,\XMM0
1190 add $16,%r10
1191 sub $1,%eax
1192 jnz _esb_loop_\@
1193
1194 MOVADQ (%r10),\TMP1
1195 AESENCLAST \TMP1,\XMM0
1196.endm
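/*
 * Worked example of the round count above: AES-128 stores keysize = 16
 * (bytes), so (16 >> 2) + 5 = 9 AESENC rounds run in _esb_loop_ and the
 * 10th round is the final AESENCLAST; AES-192 and AES-256 give 11 and 13
 * looped rounds respectively.
 */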
1197/*****************************************************************************
1198* void aesni_gcm_dec(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1199* u8 *out, // Plaintext output. Encrypt in-place is allowed.
1200* const u8 *in, // Ciphertext input
1201* u64 plaintext_len, // Length of data in bytes for decryption.
1202* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
1203* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1204* // concatenated with 0x00000001. 16-byte aligned pointer.
1205* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
1206* const u8 *aad, // Additional Authentication Data (AAD)
1207* u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1208* u8 *auth_tag, // Authenticated Tag output. The driver will compare this to the
1209* // given authentication tag and only return the plaintext if they match.
1210* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
1211* // (most likely), 12 or 8.
1212*
1213* Assumptions:
1214*
1215* keys:
1216* keys are pre-expanded and aligned to 16 bytes. We are using the first
1217* set of 11 keys in the data structure void *aes_ctx
1218*
1219* iv:
1220* 0 1 2 3
1221* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1222* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1223* | Salt (From the SA) |
1224* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1225* | Initialization Vector |
1226* | (This is the sequence number from IPSec header) |
1227* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1228* | 0x1 |
1229* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1230*
1231*
1232*
1233* AAD:
1234* AAD padded to 128 bits with 0
1235* for example, assume AAD is a u32 vector
1236*
1237* if AAD is 8 bytes:
1238* AAD[3] = {A0, A1};
1239* padded AAD in xmm register = {A1 A0 0 0}
1240*
1241* 0 1 2 3
1242* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1243* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1244* | SPI (A1) |
1245* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1246* | 32-bit Sequence Number (A0) |
1247* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1248* | 0x0 |
1249* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1250*
1251* AAD Format with 32-bit Sequence Number
1252*
1253* if AAD is 12 bytes:
1254* AAD[3] = {A0, A1, A2};
1255* padded AAD in xmm register = {A2 A1 A0 0}
1256*
1257* 0 1 2 3
1258* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1259* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1262* | SPI (A2) |
1263* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1264* | 64-bit Extended Sequence Number {A1,A0} |
1265* | |
1266* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1267* | 0x0 |
1268* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1269*
1270* AAD Format with 64-bit Extended Sequence Number
1271*
1272* aadLen:
1273* from the definition of the spec, aadLen can only be 8 or 12 bytes.
1274* The code supports 16 too but for other sizes, the code will fail.
1275*
1276* TLen:
1277* from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
1278* For other sizes, the code will fail.
1279*
1280* poly = x^128 + x^127 + x^126 + x^121 + 1
1281*
1282*****************************************************************************/
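/*
 * Illustrative call from C, matching the prototype documented above; the
 * buffer names and the 64-byte length are made up for the example, only the
 * argument order and meaning come from this comment block:
 *
 *	aesni_gcm_dec(aes_ctx, plaintext, ciphertext, 64,
 *		      iv, hash_subkey, aad, 8,
 *		      computed_tag, 16);
 *
 * On return computed_tag holds the tag calculated over the ciphertext and
 * AAD; the caller compares it against the received tag before releasing
 * the plaintext.
 */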
1283ENTRY(aesni_gcm_dec)
1284 push %r12
1285 push %r13
1286 push %r14
1287 mov %rsp, %r14
1288/*
1289* states of %xmm registers %xmm6:%xmm15 not saved
1290* all %xmm registers are clobbered
1291*/
1292 sub $VARIABLE_OFFSET, %rsp
1293 and $~63, %rsp # align rsp to 64 bytes
1294 mov %arg6, %r12
1295 movdqu (%r12), %xmm13 # %xmm13 = HashKey
1296 movdqa SHUF_MASK(%rip), %xmm2
1297 PSHUFB_XMM %xmm2, %xmm13
1298
1299
1300# Precompute HashKey<<1 (mod poly) from the hash key (required for GHASH)
1301
1302 movdqa %xmm13, %xmm2
1303 psllq $1, %xmm13
1304 psrlq $63, %xmm2
1305 movdqa %xmm2, %xmm1
1306 pslldq $8, %xmm2
1307 psrldq $8, %xmm1
1308 por %xmm2, %xmm13
1309
1310 # Reduction
1311
1312 pshufd $0x24, %xmm1, %xmm2
1313 pcmpeqd TWOONE(%rip), %xmm2
1314 pand POLY(%rip), %xmm2
1315 pxor %xmm2, %xmm13 # %xmm13 holds the HashKey<<1 (mod poly)
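# The pshufd/pcmpeqd/pand sequence above is a branchless conditional
# reduction: when the bit shifted out of the 128-bit register was set, the
# compare produces a mask whose pand with POLY yields POLY (x^128 is
# congruent to x^127+x^126+x^121+1 modulo the GHASH polynomial), otherwise
# it yields zero, so the final pxor folds the overflow back into the field.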
1316
1317
1318 # Decrypt first few blocks
1319
1320 movdqa %xmm13, HashKey(%rsp) # store HashKey<<1 (mod poly)
1321 mov %arg4, %r13 # save the number of bytes of plaintext/ciphertext
1322 and $-16, %r13 # %r13 = %r13 - (%r13 mod 16)
1323 mov %r13, %r12
1324 and $(3<<4), %r12
1325 jz _initial_num_blocks_is_0_decrypt
1326 cmp $(2<<4), %r12
1327 jb _initial_num_blocks_is_1_decrypt
1328 je _initial_num_blocks_is_2_decrypt
1329_initial_num_blocks_is_3_decrypt:
1330 INITIAL_BLOCKS_DEC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1331%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, dec
1332 sub $48, %r13
1333 jmp _initial_blocks_decrypted
1334_initial_num_blocks_is_2_decrypt:
1335 INITIAL_BLOCKS_DEC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1336%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, dec
1337 sub $32, %r13
1338 jmp _initial_blocks_decrypted
1339_initial_num_blocks_is_1_decrypt:
1340 INITIAL_BLOCKS_DEC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1341%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, dec
1342 sub $16, %r13
1343 jmp _initial_blocks_decrypted
1344_initial_num_blocks_is_0_decrypt:
1345 INITIAL_BLOCKS_DEC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1346%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, dec
1347_initial_blocks_decrypted:
1348 cmp $0, %r13
1349 je _zero_cipher_left_decrypt
1350 sub $64, %r13
1351 je _four_cipher_left_decrypt
1352_decrypt_by_4:
1353 GHASH_4_ENCRYPT_4_PARALLEL_DEC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
1354%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, dec
1355 add $64, %r11
1356 sub $64, %r13
1357 jne _decrypt_by_4
1358_four_cipher_left_decrypt:
1359 GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
1360%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
1361_zero_cipher_left_decrypt:
1362 mov %arg4, %r13
1363 and $15, %r13 # %r13 = arg4 (mod 16)
1364 je _multiple_of_16_bytes_decrypt
1365
1366 # Handle the last <16 byte block separately
1367
1368 paddd ONE(%rip), %xmm0 # increment CNT to get Yn
1369 movdqa SHUF_MASK(%rip), %xmm10
1370 PSHUFB_XMM %xmm10, %xmm0
1371
1372 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Yn)
1373 sub $16, %r11
1374 add %r13, %r11
1375 movdqu (%arg3,%r11,1), %xmm1 # receive the last <16 byte block
1376 lea SHIFT_MASK+16(%rip), %r12
1377 sub %r13, %r12
1378# adjust the shuffle mask pointer to be able to shift 16-%r13 bytes
1379# (%r13 is the number of bytes in plaintext mod 16)
1380 movdqu (%r12), %xmm2 # get the appropriate shuffle mask
1381	PSHUFB_XMM %xmm2, %xmm1            # right shift 16-%r13 bytes
1382
1383 movdqa %xmm1, %xmm2
1384 pxor %xmm1, %xmm0 # Ciphertext XOR E(K, Yn)
1385 movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
1386 # get the appropriate mask to mask out top 16-%r13 bytes of %xmm0
1387 pand %xmm1, %xmm0 # mask out top 16-%r13 bytes of %xmm0
1388 pand %xmm1, %xmm2
1389 movdqa SHUF_MASK(%rip), %xmm10
1390 PSHUFB_XMM %xmm10 ,%xmm2
1391
1392 pxor %xmm2, %xmm8
1393 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1394 # GHASH computation for the last <16 byte block
1395 sub %r13, %r11
1396 add $16, %r11
1397
1398 # output %r13 bytes
1399 MOVQ_R64_XMM %xmm0, %rax
1400 cmp $8, %r13
1401 jle _less_than_8_bytes_left_decrypt
1402 mov %rax, (%arg2 , %r11, 1)
1403 add $8, %r11
1404 psrldq $8, %xmm0
1405 MOVQ_R64_XMM %xmm0, %rax
1406 sub $8, %r13
1407_less_than_8_bytes_left_decrypt:
1408 mov %al, (%arg2, %r11, 1)
1409 add $1, %r11
1410 shr $8, %rax
1411 sub $1, %r13
1412 jne _less_than_8_bytes_left_decrypt
1413_multiple_of_16_bytes_decrypt:
1414	mov	arg8, %r12		  # %r12 = aadLen (number of bytes)
1415 shl $3, %r12 # convert into number of bits
1416 movd %r12d, %xmm15 # len(A) in %xmm15
1417	shl	$3, %arg4		  # len(C) in bits (*8)
1418 MOVQ_R64_XMM %arg4, %xmm1
1419 pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000
1420 pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C)
1421 pxor %xmm15, %xmm8
1422 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1423 # final GHASH computation
1424 movdqa SHUF_MASK(%rip), %xmm10
1425 PSHUFB_XMM %xmm10, %xmm8
1426
1427 mov %arg5, %rax # %rax = *Y0
1428 movdqu (%rax), %xmm0 # %xmm0 = Y0
1429 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Y0)
1430 pxor %xmm8, %xmm0
1431_return_T_decrypt:
1432 mov arg9, %r10 # %r10 = authTag
1433 mov arg10, %r11 # %r11 = auth_tag_len
1434 cmp $16, %r11
1435 je _T_16_decrypt
1436 cmp $12, %r11
1437 je _T_12_decrypt
1438_T_8_decrypt:
1439 MOVQ_R64_XMM %xmm0, %rax
1440 mov %rax, (%r10)
1441 jmp _return_T_done_decrypt
1442_T_12_decrypt:
1443 MOVQ_R64_XMM %xmm0, %rax
1444 mov %rax, (%r10)
1445 psrldq $8, %xmm0
1446 movd %xmm0, %eax
1447 mov %eax, 8(%r10)
1448 jmp _return_T_done_decrypt
1449_T_16_decrypt:
1450 movdqu %xmm0, (%r10)
1451_return_T_done_decrypt:
1452 mov %r14, %rsp
1453 pop %r14
1454 pop %r13
1455 pop %r12
1456 ret
1457ENDPROC(aesni_gcm_dec)
1458
1459
1460/*****************************************************************************
1461* void aesni_gcm_enc(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1462* u8 *out, // Ciphertext output. Encrypt in-place is allowed.
1463* const u8 *in, // Plaintext input
1464* u64 plaintext_len, // Length of data in bytes for encryption.
1465* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
1466* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1467* // concatenated with 0x00000001. 16-byte aligned pointer.
1468* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
1469* const u8 *aad, // Additional Authentication Data (AAD)
1470* u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1471* u8 *auth_tag, // Authenticated Tag output.
1472* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
1473* // 12 or 8.
1474*
1475* Assumptions:
1476*
1477* keys:
1478* keys are pre-expanded and aligned to 16 bytes. We are using the
1479* first set of 11 keys in the data structure void *aes_ctx
1480*
1481*
1482* iv:
1483* 0 1 2 3
1484* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1485* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1486* | Salt (From the SA) |
1487* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1488* | Initialization Vector |
1489* | (This is the sequence number from IPSec header) |
1490* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1491* | 0x1 |
1492* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1493*
1494*
1495*
1496* AAD:
1497* AAD padded to 128 bits with 0
1498* for example, assume AAD is a u32 vector
1499*
1500* if AAD is 8 bytes:
1501* AAD[3] = {A0, A1};
1502* padded AAD in xmm register = {A1 A0 0 0}
1503*
1504* 0 1 2 3
1505* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1506* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1507* | SPI (A1) |
1508* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1509* | 32-bit Sequence Number (A0) |
1510* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1511* | 0x0 |
1512* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1513*
1514* AAD Format with 32-bit Sequence Number
1515*
1516* if AAD is 12 bytes:
1517* AAD[3] = {A0, A1, A2};
1518* padded AAD in xmm register = {A2 A1 A0 0}
1519*
1520* 0 1 2 3
1521* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1522* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1523* | SPI (A2) |
1524* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1525* | 64-bit Extended Sequence Number {A1,A0} |
1526* | |
1527* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1528* | 0x0 |
1529* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1530*
1531* AAD Format with 64-bit Extended Sequence Number
1532*
1533* aadLen:
1534* from the definition of the spec, aadLen can only be 8 or 12 bytes.
1535* The code supports 16 too but for other sizes, the code will fail.
1536*
1537* TLen:
1538* from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
1539* For other sizes, the code will fail.
1540*
1541* poly = x^128 + x^127 + x^126 + x^121 + 1
1542***************************************************************************/
1543ENTRY(aesni_gcm_enc)
1544 push %r12
1545 push %r13
1546 push %r14
1547 mov %rsp, %r14
1548#
1549# states of %xmm registers %xmm6:%xmm15 not saved
1550# all %xmm registers are clobbered
1551#
1552 sub $VARIABLE_OFFSET, %rsp
1553 and $~63, %rsp
1554 mov %arg6, %r12
1555 movdqu (%r12), %xmm13
1556 movdqa SHUF_MASK(%rip), %xmm2
1557 PSHUFB_XMM %xmm2, %xmm13
1558
1559
1560# precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
1561
1562 movdqa %xmm13, %xmm2
1563 psllq $1, %xmm13
1564 psrlq $63, %xmm2
1565 movdqa %xmm2, %xmm1
1566 pslldq $8, %xmm2
1567 psrldq $8, %xmm1
1568 por %xmm2, %xmm13
1569
1570 # reduce HashKey<<1
1571
1572 pshufd $0x24, %xmm1, %xmm2
1573 pcmpeqd TWOONE(%rip), %xmm2
1574 pand POLY(%rip), %xmm2
1575	pxor	%xmm2, %xmm13		  # %xmm13 holds the HashKey<<1 (mod poly)
1576	movdqa	%xmm13, HashKey(%rsp)	  # store HashKey<<1 (mod poly)
1577	mov	%arg4, %r13		  # %r13 = number of bytes of plaintext/ciphertext
1578 and $-16, %r13
1579 mov %r13, %r12
1580
1581 # Encrypt first few blocks
1582
1583 and $(3<<4), %r12
1584 jz _initial_num_blocks_is_0_encrypt
1585 cmp $(2<<4), %r12
1586 jb _initial_num_blocks_is_1_encrypt
1587 je _initial_num_blocks_is_2_encrypt
1588_initial_num_blocks_is_3_encrypt:
1589 INITIAL_BLOCKS_ENC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1590%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, enc
1591 sub $48, %r13
1592 jmp _initial_blocks_encrypted
1593_initial_num_blocks_is_2_encrypt:
1594 INITIAL_BLOCKS_ENC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1595%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, enc
1596 sub $32, %r13
1597 jmp _initial_blocks_encrypted
1598_initial_num_blocks_is_1_encrypt:
1599 INITIAL_BLOCKS_ENC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1600%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, enc
1601 sub $16, %r13
1602 jmp _initial_blocks_encrypted
1603_initial_num_blocks_is_0_encrypt:
1604 INITIAL_BLOCKS_ENC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1605%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, enc
1606_initial_blocks_encrypted:
1607
1608 # Main loop - Encrypt remaining blocks
1609
1610 cmp $0, %r13
1611 je _zero_cipher_left_encrypt
1612 sub $64, %r13
1613 je _four_cipher_left_encrypt
1614_encrypt_by_4_encrypt:
1615 GHASH_4_ENCRYPT_4_PARALLEL_ENC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
1616%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, enc
1617 add $64, %r11
1618 sub $64, %r13
1619 jne _encrypt_by_4_encrypt
1620_four_cipher_left_encrypt:
1621 GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
1622%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
1623_zero_cipher_left_encrypt:
1624 mov %arg4, %r13
1625 and $15, %r13 # %r13 = arg4 (mod 16)
1626 je _multiple_of_16_bytes_encrypt
1627
1628 # Handle the last <16 Byte block separately
1629 paddd ONE(%rip), %xmm0 # INCR CNT to get Yn
1630 movdqa SHUF_MASK(%rip), %xmm10
1631 PSHUFB_XMM %xmm10, %xmm0
1632
1633
1634 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # Encrypt(K, Yn)
1635 sub $16, %r11
1636 add %r13, %r11
1637 movdqu (%arg3,%r11,1), %xmm1 # receive the last <16 byte blocks
1638 lea SHIFT_MASK+16(%rip), %r12
1639 sub %r13, %r12
1640 # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
1641 # (%r13 is the number of bytes in plaintext mod 16)
1642 movdqu (%r12), %xmm2 # get the appropriate shuffle mask
1643	PSHUFB_XMM	%xmm2, %xmm1		# shift right 16-r13 bytes
1644 pxor %xmm1, %xmm0 # Plaintext XOR Encrypt(K, Yn)
1645 movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
1646 # get the appropriate mask to mask out top 16-r13 bytes of xmm0
1647 pand %xmm1, %xmm0 # mask out top 16-r13 bytes of xmm0
1648 movdqa SHUF_MASK(%rip), %xmm10
1649 PSHUFB_XMM %xmm10,%xmm0
1650
1651 pxor %xmm0, %xmm8
1652 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1653 # GHASH computation for the last <16 byte block
1654 sub %r13, %r11
1655 add $16, %r11
1656
1657 movdqa SHUF_MASK(%rip), %xmm10
1658 PSHUFB_XMM %xmm10, %xmm0
1659
1660 # shuffle xmm0 back to output as ciphertext
1661
1662 # Output %r13 bytes
1663 MOVQ_R64_XMM %xmm0, %rax
1664 cmp $8, %r13
1665 jle _less_than_8_bytes_left_encrypt
1666 mov %rax, (%arg2 , %r11, 1)
1667 add $8, %r11
1668 psrldq $8, %xmm0
1669 MOVQ_R64_XMM %xmm0, %rax
1670 sub $8, %r13
1671_less_than_8_bytes_left_encrypt:
1672 mov %al, (%arg2, %r11, 1)
1673 add $1, %r11
1674 shr $8, %rax
1675 sub $1, %r13
1676 jne _less_than_8_bytes_left_encrypt
1677_multiple_of_16_bytes_encrypt:
1678	mov	arg8, %r12		  # %r12 = aadLen (number of bytes)
1679 shl $3, %r12
1680 movd %r12d, %xmm15 # len(A) in %xmm15
1681	shl	$3, %arg4		  # len(C) in bits (*8)
1682 MOVQ_R64_XMM %arg4, %xmm1
1683 pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000
1684 pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C)
1685 pxor %xmm15, %xmm8
1686 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1687 # final GHASH computation
1688 movdqa SHUF_MASK(%rip), %xmm10
1689 PSHUFB_XMM %xmm10, %xmm8 # perform a 16 byte swap
1690
1691 mov %arg5, %rax # %rax = *Y0
1692 movdqu (%rax), %xmm0 # %xmm0 = Y0
1693 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm15 # Encrypt(K, Y0)
1694 pxor %xmm8, %xmm0
1695_return_T_encrypt:
1696 mov arg9, %r10 # %r10 = authTag
1697 mov arg10, %r11 # %r11 = auth_tag_len
1698 cmp $16, %r11
1699 je _T_16_encrypt
1700 cmp $12, %r11
1701 je _T_12_encrypt
1702_T_8_encrypt:
1703 MOVQ_R64_XMM %xmm0, %rax
1704 mov %rax, (%r10)
1705 jmp _return_T_done_encrypt
1706_T_12_encrypt:
1707 MOVQ_R64_XMM %xmm0, %rax
1708 mov %rax, (%r10)
1709 psrldq $8, %xmm0
1710 movd %xmm0, %eax
1711 mov %eax, 8(%r10)
1712 jmp _return_T_done_encrypt
1713_T_16_encrypt:
1714 movdqu %xmm0, (%r10)
1715_return_T_done_encrypt:
1716 mov %r14, %rsp
1717 pop %r14
1718 pop %r13
1719 pop %r12
1720 ret
1721ENDPROC(aesni_gcm_enc)
1722
1723#endif
1724
1725
1726.align 4
1727_key_expansion_128:
1728_key_expansion_256a:
1729 pshufd $0b11111111, %xmm1, %xmm1
1730 shufps $0b00010000, %xmm0, %xmm4
1731 pxor %xmm4, %xmm0
1732 shufps $0b10001100, %xmm0, %xmm4
1733 pxor %xmm4, %xmm0
1734 pxor %xmm1, %xmm0
1735 movaps %xmm0, (TKEYP)
1736 add $0x10, TKEYP
1737 ret
1738ENDPROC(_key_expansion_128)
1739ENDPROC(_key_expansion_256a)
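/*
 * For readers more used to intrinsics, a minimal sketch (assumed helper
 * name, not a kernel interface) of what one _key_expansion_128 step
 * computes; the shufps pair above performs the same prefix-XOR using the
 * zeroed %xmm4:
 *
 *	#include <immintrin.h>
 *
 *	// prev = previous round key,
 *	// assist = _mm_aeskeygenassist_si128(prev, rcon)
 *	static __m128i aes128_expand_step(__m128i prev, __m128i assist)
 *	{
 *		// broadcast RotWord(SubWord(w3)) ^ rcon to all four words
 *		assist = _mm_shuffle_epi32(assist, 0xff);
 *		// prefix-XOR the four words of the previous round key
 *		prev = _mm_xor_si128(prev, _mm_slli_si128(prev, 4));
 *		prev = _mm_xor_si128(prev, _mm_slli_si128(prev, 4));
 *		prev = _mm_xor_si128(prev, _mm_slli_si128(prev, 4));
 *		return _mm_xor_si128(prev, assist);	// next round key
 *	}
 */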
1740
1741.align 4
1742_key_expansion_192a:
1743 pshufd $0b01010101, %xmm1, %xmm1
1744 shufps $0b00010000, %xmm0, %xmm4
1745 pxor %xmm4, %xmm0
1746 shufps $0b10001100, %xmm0, %xmm4
1747 pxor %xmm4, %xmm0
1748 pxor %xmm1, %xmm0
1749
1750 movaps %xmm2, %xmm5
1751 movaps %xmm2, %xmm6
1752 pslldq $4, %xmm5
1753 pshufd $0b11111111, %xmm0, %xmm3
1754 pxor %xmm3, %xmm2
1755 pxor %xmm5, %xmm2
1756
1757 movaps %xmm0, %xmm1
1758 shufps $0b01000100, %xmm0, %xmm6
1759 movaps %xmm6, (TKEYP)
1760 shufps $0b01001110, %xmm2, %xmm1
1761 movaps %xmm1, 0x10(TKEYP)
1762 add $0x20, TKEYP
1763 ret
1764ENDPROC(_key_expansion_192a)
1765
1766.align 4
1767_key_expansion_192b:
1768 pshufd $0b01010101, %xmm1, %xmm1
1769 shufps $0b00010000, %xmm0, %xmm4
1770 pxor %xmm4, %xmm0
1771 shufps $0b10001100, %xmm0, %xmm4
1772 pxor %xmm4, %xmm0
1773 pxor %xmm1, %xmm0
1774
1775 movaps %xmm2, %xmm5
1776 pslldq $4, %xmm5
1777 pshufd $0b11111111, %xmm0, %xmm3
1778 pxor %xmm3, %xmm2
1779 pxor %xmm5, %xmm2
1780
1781 movaps %xmm0, (TKEYP)
1782 add $0x10, TKEYP
1783 ret
1784ENDPROC(_key_expansion_192b)
1785
1786.align 4
1787_key_expansion_256b:
1788 pshufd $0b10101010, %xmm1, %xmm1
1789 shufps $0b00010000, %xmm2, %xmm4
1790 pxor %xmm4, %xmm2
1791 shufps $0b10001100, %xmm2, %xmm4
1792 pxor %xmm4, %xmm2
1793 pxor %xmm1, %xmm2
1794 movaps %xmm2, (TKEYP)
1795 add $0x10, TKEYP
1796 ret
1797ENDPROC(_key_expansion_256b)
1798
1799/*
1800 * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
1801 * unsigned int key_len)
1802 */
1803ENTRY(aesni_set_key)
1804 FRAME_BEGIN
1805#ifndef __x86_64__
1806 pushl KEYP
1807 movl (FRAME_OFFSET+8)(%esp), KEYP # ctx
1808 movl (FRAME_OFFSET+12)(%esp), UKEYP # in_key
1809 movl (FRAME_OFFSET+16)(%esp), %edx # key_len
1810#endif
1811 movups (UKEYP), %xmm0 # user key (first 16 bytes)
1812 movaps %xmm0, (KEYP)
1813 lea 0x10(KEYP), TKEYP # key addr
1814 movl %edx, 480(KEYP)
1815 pxor %xmm4, %xmm4 # xmm4 is assumed 0 in _key_expansion_x
1816 cmp $24, %dl
1817 jb .Lenc_key128
1818 je .Lenc_key192
1819 movups 0x10(UKEYP), %xmm2 # other user key
1820 movaps %xmm2, (TKEYP)
1821 add $0x10, TKEYP
1822 AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
1823 call _key_expansion_256a
1824 AESKEYGENASSIST 0x1 %xmm0 %xmm1
1825 call _key_expansion_256b
1826 AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2
1827 call _key_expansion_256a
1828 AESKEYGENASSIST 0x2 %xmm0 %xmm1
1829 call _key_expansion_256b
1830 AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3
1831 call _key_expansion_256a
1832 AESKEYGENASSIST 0x4 %xmm0 %xmm1
1833 call _key_expansion_256b
1834 AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4
1835 call _key_expansion_256a
1836 AESKEYGENASSIST 0x8 %xmm0 %xmm1
1837 call _key_expansion_256b
1838 AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5
1839 call _key_expansion_256a
1840 AESKEYGENASSIST 0x10 %xmm0 %xmm1
1841 call _key_expansion_256b
1842 AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6
1843 call _key_expansion_256a
1844 AESKEYGENASSIST 0x20 %xmm0 %xmm1
1845 call _key_expansion_256b
1846 AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7
1847 call _key_expansion_256a
1848 jmp .Ldec_key
1849.Lenc_key192:
1850 movq 0x10(UKEYP), %xmm2 # other user key
1851 AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
1852 call _key_expansion_192a
1853 AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2
1854 call _key_expansion_192b
1855 AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3
1856 call _key_expansion_192a
1857 AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4
1858 call _key_expansion_192b
1859 AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5
1860 call _key_expansion_192a
1861 AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6
1862 call _key_expansion_192b
1863 AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7
1864 call _key_expansion_192a
1865 AESKEYGENASSIST 0x80 %xmm2 %xmm1 # round 8
1866 call _key_expansion_192b
1867 jmp .Ldec_key
1868.Lenc_key128:
1869 AESKEYGENASSIST 0x1 %xmm0 %xmm1 # round 1
1870 call _key_expansion_128
1871 AESKEYGENASSIST 0x2 %xmm0 %xmm1 # round 2
1872 call _key_expansion_128
1873 AESKEYGENASSIST 0x4 %xmm0 %xmm1 # round 3
1874 call _key_expansion_128
1875 AESKEYGENASSIST 0x8 %xmm0 %xmm1 # round 4
1876 call _key_expansion_128
1877 AESKEYGENASSIST 0x10 %xmm0 %xmm1 # round 5
1878 call _key_expansion_128
1879 AESKEYGENASSIST 0x20 %xmm0 %xmm1 # round 6
1880 call _key_expansion_128
1881 AESKEYGENASSIST 0x40 %xmm0 %xmm1 # round 7
1882 call _key_expansion_128
1883 AESKEYGENASSIST 0x80 %xmm0 %xmm1 # round 8
1884 call _key_expansion_128
1885 AESKEYGENASSIST 0x1b %xmm0 %xmm1 # round 9
1886 call _key_expansion_128
1887 AESKEYGENASSIST 0x36 %xmm0 %xmm1 # round 10
1888 call _key_expansion_128
1889.Ldec_key:
1890 sub $0x10, TKEYP
1891 movaps (KEYP), %xmm0
1892 movaps (TKEYP), %xmm1
1893 movaps %xmm0, 240(TKEYP)
1894 movaps %xmm1, 240(KEYP)
1895 add $0x10, KEYP
1896 lea 240-16(TKEYP), UKEYP
1897.align 4
1898.Ldec_key_loop:
1899 movaps (KEYP), %xmm0
1900 AESIMC %xmm0 %xmm1
1901 movaps %xmm1, (UKEYP)
1902 add $0x10, KEYP
1903 sub $0x10, UKEYP
1904 cmp TKEYP, KEYP
1905 jb .Ldec_key_loop
1906 xor AREG, AREG
1907#ifndef __x86_64__
1908 popl KEYP
1909#endif
1910 FRAME_END
1911 ret
1912ENDPROC(aesni_set_key)
1913
1914/*
1915 * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
1916 */
1917ENTRY(aesni_enc)
1918 FRAME_BEGIN
1919#ifndef __x86_64__
1920 pushl KEYP
1921 pushl KLEN
1922 movl (FRAME_OFFSET+12)(%esp), KEYP # ctx
1923 movl (FRAME_OFFSET+16)(%esp), OUTP # dst
1924 movl (FRAME_OFFSET+20)(%esp), INP # src
1925#endif
1926 movl 480(KEYP), KLEN # key length
1927 movups (INP), STATE # input
1928 call _aesni_enc1
1929 movups STATE, (OUTP) # output
1930#ifndef __x86_64__
1931 popl KLEN
1932 popl KEYP
1933#endif
1934 FRAME_END
1935 ret
1936ENDPROC(aesni_enc)
1937
1938/*
1939 * _aesni_enc1: internal ABI
1940 * input:
1941 * KEYP: key struct pointer
1942 *	KLEN:		key length
1943 * STATE: initial state (input)
1944 * output:
1945 *	STATE:		final state (output)
1946 * changed:
1947 * KEY
1948 * TKEYP (T1)
1949 */
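/*
 * Note: the three key sizes share one unrolled tail below; depending on
 * KLEN the code jumps to .Lenc128 or .Lenc192, or falls through for
 * 256-bit keys, with TKEYP biased so that the negative offsets address the
 * extra early round keys.
 */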
1950.align 4
1951_aesni_enc1:
1952 movaps (KEYP), KEY # key
1953 mov KEYP, TKEYP
1954 pxor KEY, STATE # round 0
1955 add $0x30, TKEYP
1956 cmp $24, KLEN
1957 jb .Lenc128
1958 lea 0x20(TKEYP), TKEYP
1959 je .Lenc192
1960 add $0x20, TKEYP
1961 movaps -0x60(TKEYP), KEY
1962 AESENC KEY STATE
1963 movaps -0x50(TKEYP), KEY
1964 AESENC KEY STATE
1965.align 4
1966.Lenc192:
1967 movaps -0x40(TKEYP), KEY
1968 AESENC KEY STATE
1969 movaps -0x30(TKEYP), KEY
1970 AESENC KEY STATE
1971.align 4
1972.Lenc128:
1973 movaps -0x20(TKEYP), KEY
1974 AESENC KEY STATE
1975 movaps -0x10(TKEYP), KEY
1976 AESENC KEY STATE
1977 movaps (TKEYP), KEY
1978 AESENC KEY STATE
1979 movaps 0x10(TKEYP), KEY
1980 AESENC KEY STATE
1981 movaps 0x20(TKEYP), KEY
1982 AESENC KEY STATE
1983 movaps 0x30(TKEYP), KEY
1984 AESENC KEY STATE
1985 movaps 0x40(TKEYP), KEY
1986 AESENC KEY STATE
1987 movaps 0x50(TKEYP), KEY
1988 AESENC KEY STATE
1989 movaps 0x60(TKEYP), KEY
1990 AESENC KEY STATE
1991 movaps 0x70(TKEYP), KEY
1992 AESENCLAST KEY STATE
1993 ret
1994ENDPROC(_aesni_enc1)
1995
1996/*
1997 * _aesni_enc4: internal ABI
1998 * input:
1999 * KEYP: key struct pointer
2000 *	KLEN:		key length
2001 * STATE1: initial state (input)
2002 * STATE2
2003 * STATE3
2004 * STATE4
2005 * output:
2006 *	STATE1:		final state (output)
2007 * STATE2
2008 * STATE3
2009 * STATE4
2010 * changed:
2011 * KEY
2012 * TKEYP (T1)
2013 */
2014.align 4
2015_aesni_enc4:
2016 movaps (KEYP), KEY # key
2017 mov KEYP, TKEYP
2018 pxor KEY, STATE1 # round 0
2019 pxor KEY, STATE2
2020 pxor KEY, STATE3
2021 pxor KEY, STATE4
2022 add $0x30, TKEYP
2023 cmp $24, KLEN
2024 jb .L4enc128
2025 lea 0x20(TKEYP), TKEYP
2026 je .L4enc192
2027 add $0x20, TKEYP
2028 movaps -0x60(TKEYP), KEY
2029 AESENC KEY STATE1
2030 AESENC KEY STATE2
2031 AESENC KEY STATE3
2032 AESENC KEY STATE4
2033 movaps -0x50(TKEYP), KEY
2034 AESENC KEY STATE1
2035 AESENC KEY STATE2
2036 AESENC KEY STATE3
2037 AESENC KEY STATE4
2038#.align 4
2039.L4enc192:
2040 movaps -0x40(TKEYP), KEY
2041 AESENC KEY STATE1
2042 AESENC KEY STATE2
2043 AESENC KEY STATE3
2044 AESENC KEY STATE4
2045 movaps -0x30(TKEYP), KEY
2046 AESENC KEY STATE1
2047 AESENC KEY STATE2
2048 AESENC KEY STATE3
2049 AESENC KEY STATE4
2050#.align 4
2051.L4enc128:
2052 movaps -0x20(TKEYP), KEY
2053 AESENC KEY STATE1
2054 AESENC KEY STATE2
2055 AESENC KEY STATE3
2056 AESENC KEY STATE4
2057 movaps -0x10(TKEYP), KEY
2058 AESENC KEY STATE1
2059 AESENC KEY STATE2
2060 AESENC KEY STATE3
2061 AESENC KEY STATE4
2062 movaps (TKEYP), KEY
2063 AESENC KEY STATE1
2064 AESENC KEY STATE2
2065 AESENC KEY STATE3
2066 AESENC KEY STATE4
2067 movaps 0x10(TKEYP), KEY
2068 AESENC KEY STATE1
2069 AESENC KEY STATE2
2070 AESENC KEY STATE3
2071 AESENC KEY STATE4
2072 movaps 0x20(TKEYP), KEY
2073 AESENC KEY STATE1
2074 AESENC KEY STATE2
2075 AESENC KEY STATE3
2076 AESENC KEY STATE4
2077 movaps 0x30(TKEYP), KEY
2078 AESENC KEY STATE1
2079 AESENC KEY STATE2
2080 AESENC KEY STATE3
2081 AESENC KEY STATE4
2082 movaps 0x40(TKEYP), KEY
2083 AESENC KEY STATE1
2084 AESENC KEY STATE2
2085 AESENC KEY STATE3
2086 AESENC KEY STATE4
2087 movaps 0x50(TKEYP), KEY
2088 AESENC KEY STATE1
2089 AESENC KEY STATE2
2090 AESENC KEY STATE3
2091 AESENC KEY STATE4
2092 movaps 0x60(TKEYP), KEY
2093 AESENC KEY STATE1
2094 AESENC KEY STATE2
2095 AESENC KEY STATE3
2096 AESENC KEY STATE4
2097 movaps 0x70(TKEYP), KEY
2098 AESENCLAST KEY STATE1 # last round
2099 AESENCLAST KEY STATE2
2100 AESENCLAST KEY STATE3
2101 AESENCLAST KEY STATE4
2102 ret
2103ENDPROC(_aesni_enc4)
2104
2105/*
2106 * void aesni_dec (struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
2107 */
2108ENTRY(aesni_dec)
2109 FRAME_BEGIN
2110#ifndef __x86_64__
2111 pushl KEYP
2112 pushl KLEN
2113 movl (FRAME_OFFSET+12)(%esp), KEYP # ctx
2114 movl (FRAME_OFFSET+16)(%esp), OUTP # dst
2115 movl (FRAME_OFFSET+20)(%esp), INP # src
2116#endif
2117 mov 480(KEYP), KLEN # key length
2118 add $240, KEYP
2119 movups (INP), STATE # input
2120 call _aesni_dec1
2121 movups STATE, (OUTP) #output
2122#ifndef __x86_64__
2123 popl KLEN
2124 popl KEYP
2125#endif
2126 FRAME_END
2127 ret
2128ENDPROC(aesni_dec)
2129
2130/*
2131 * _aesni_dec1: internal ABI
2132 * input:
2133 * KEYP: key struct pointer
2134 * KLEN: key length
2135 * STATE: initial state (input)
2136 * output:
2137 *	STATE:		final state (output)
2138 * changed:
2139 * KEY
2140 * TKEYP (T1)
2141 */
2142.align 4
2143_aesni_dec1:
2144 movaps (KEYP), KEY # key
2145 mov KEYP, TKEYP
2146 pxor KEY, STATE # round 0
2147 add $0x30, TKEYP
2148 cmp $24, KLEN
2149 jb .Ldec128
2150 lea 0x20(TKEYP), TKEYP
2151 je .Ldec192
2152 add $0x20, TKEYP
2153 movaps -0x60(TKEYP), KEY
2154 AESDEC KEY STATE
2155 movaps -0x50(TKEYP), KEY
2156 AESDEC KEY STATE
2157.align 4
2158.Ldec192:
2159 movaps -0x40(TKEYP), KEY
2160 AESDEC KEY STATE
2161 movaps -0x30(TKEYP), KEY
2162 AESDEC KEY STATE
2163.align 4
2164.Ldec128:
2165 movaps -0x20(TKEYP), KEY
2166 AESDEC KEY STATE
2167 movaps -0x10(TKEYP), KEY
2168 AESDEC KEY STATE
2169 movaps (TKEYP), KEY
2170 AESDEC KEY STATE
2171 movaps 0x10(TKEYP), KEY
2172 AESDEC KEY STATE
2173 movaps 0x20(TKEYP), KEY
2174 AESDEC KEY STATE
2175 movaps 0x30(TKEYP), KEY
2176 AESDEC KEY STATE
2177 movaps 0x40(TKEYP), KEY
2178 AESDEC KEY STATE
2179 movaps 0x50(TKEYP), KEY
2180 AESDEC KEY STATE
2181 movaps 0x60(TKEYP), KEY
2182 AESDEC KEY STATE
2183 movaps 0x70(TKEYP), KEY
2184 AESDECLAST KEY STATE
2185 ret
2186ENDPROC(_aesni_dec1)
2187
2188/*
2189 * _aesni_dec4: internal ABI
2190 * input:
2191 * KEYP: key struct pointer
2192 * KLEN: key length
2193 * STATE1: initial state (input)
2194 * STATE2
2195 * STATE3
2196 * STATE4
2197 * output:
2198 *	STATE1:		final state (output)
2199 * STATE2
2200 * STATE3
2201 * STATE4
2202 * changed:
2203 * KEY
2204 * TKEYP (T1)
2205 */
2206.align 4
2207_aesni_dec4:
2208 movaps (KEYP), KEY # key
2209 mov KEYP, TKEYP
2210 pxor KEY, STATE1 # round 0
2211 pxor KEY, STATE2
2212 pxor KEY, STATE3
2213 pxor KEY, STATE4
2214 add $0x30, TKEYP
2215 cmp $24, KLEN
2216 jb .L4dec128
2217 lea 0x20(TKEYP), TKEYP
2218 je .L4dec192
2219 add $0x20, TKEYP
2220 movaps -0x60(TKEYP), KEY
2221 AESDEC KEY STATE1
2222 AESDEC KEY STATE2
2223 AESDEC KEY STATE3
2224 AESDEC KEY STATE4
2225 movaps -0x50(TKEYP), KEY
2226 AESDEC KEY STATE1
2227 AESDEC KEY STATE2
2228 AESDEC KEY STATE3
2229 AESDEC KEY STATE4
2230.align 4
2231.L4dec192:
2232 movaps -0x40(TKEYP), KEY
2233 AESDEC KEY STATE1
2234 AESDEC KEY STATE2
2235 AESDEC KEY STATE3
2236 AESDEC KEY STATE4
2237 movaps -0x30(TKEYP), KEY
2238 AESDEC KEY STATE1
2239 AESDEC KEY STATE2
2240 AESDEC KEY STATE3
2241 AESDEC KEY STATE4
2242.align 4
2243.L4dec128:
2244 movaps -0x20(TKEYP), KEY
2245 AESDEC KEY STATE1
2246 AESDEC KEY STATE2
2247 AESDEC KEY STATE3
2248 AESDEC KEY STATE4
2249 movaps -0x10(TKEYP), KEY
2250 AESDEC KEY STATE1
2251 AESDEC KEY STATE2
2252 AESDEC KEY STATE3
2253 AESDEC KEY STATE4
2254 movaps (TKEYP), KEY
2255 AESDEC KEY STATE1
2256 AESDEC KEY STATE2
2257 AESDEC KEY STATE3
2258 AESDEC KEY STATE4
2259 movaps 0x10(TKEYP), KEY
2260 AESDEC KEY STATE1
2261 AESDEC KEY STATE2
2262 AESDEC KEY STATE3
2263 AESDEC KEY STATE4
2264 movaps 0x20(TKEYP), KEY
2265 AESDEC KEY STATE1
2266 AESDEC KEY STATE2
2267 AESDEC KEY STATE3
2268 AESDEC KEY STATE4
2269 movaps 0x30(TKEYP), KEY
2270 AESDEC KEY STATE1
2271 AESDEC KEY STATE2
2272 AESDEC KEY STATE3
2273 AESDEC KEY STATE4
2274 movaps 0x40(TKEYP), KEY
2275 AESDEC KEY STATE1
2276 AESDEC KEY STATE2
2277 AESDEC KEY STATE3
2278 AESDEC KEY STATE4
2279 movaps 0x50(TKEYP), KEY
2280 AESDEC KEY STATE1
2281 AESDEC KEY STATE2
2282 AESDEC KEY STATE3
2283 AESDEC KEY STATE4
2284 movaps 0x60(TKEYP), KEY
2285 AESDEC KEY STATE1
2286 AESDEC KEY STATE2
2287 AESDEC KEY STATE3
2288 AESDEC KEY STATE4
2289 movaps 0x70(TKEYP), KEY
2290 AESDECLAST KEY STATE1 # last round
2291 AESDECLAST KEY STATE2
2292 AESDECLAST KEY STATE3
2293 AESDECLAST KEY STATE4
2294 ret
2295ENDPROC(_aesni_dec4)
2296
2297/*
2298 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2299 * size_t len)
2300 */
2301ENTRY(aesni_ecb_enc)
2302 FRAME_BEGIN
2303#ifndef __x86_64__
2304 pushl LEN
2305 pushl KEYP
2306 pushl KLEN
2307 movl (FRAME_OFFSET+16)(%esp), KEYP # ctx
2308 movl (FRAME_OFFSET+20)(%esp), OUTP # dst
2309 movl (FRAME_OFFSET+24)(%esp), INP # src
2310 movl (FRAME_OFFSET+28)(%esp), LEN # len
2311#endif
2312 test LEN, LEN # check length
2313 jz .Lecb_enc_ret
2314 mov 480(KEYP), KLEN
2315 cmp $16, LEN
2316 jb .Lecb_enc_ret
2317 cmp $64, LEN
2318 jb .Lecb_enc_loop1
2319.align 4
2320.Lecb_enc_loop4:
2321 movups (INP), STATE1
2322 movups 0x10(INP), STATE2
2323 movups 0x20(INP), STATE3
2324 movups 0x30(INP), STATE4
2325 call _aesni_enc4
2326 movups STATE1, (OUTP)
2327 movups STATE2, 0x10(OUTP)
2328 movups STATE3, 0x20(OUTP)
2329 movups STATE4, 0x30(OUTP)
2330 sub $64, LEN
2331 add $64, INP
2332 add $64, OUTP
2333 cmp $64, LEN
2334 jge .Lecb_enc_loop4
2335 cmp $16, LEN
2336 jb .Lecb_enc_ret
2337.align 4
2338.Lecb_enc_loop1:
2339 movups (INP), STATE1
2340 call _aesni_enc1
2341 movups STATE1, (OUTP)
2342 sub $16, LEN
2343 add $16, INP
2344 add $16, OUTP
2345 cmp $16, LEN
2346 jge .Lecb_enc_loop1
2347.Lecb_enc_ret:
2348#ifndef __x86_64__
2349 popl KLEN
2350 popl KEYP
2351 popl LEN
2352#endif
2353 FRAME_END
2354 ret
2355ENDPROC(aesni_ecb_enc)
2356
2357/*
2358 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2359 * size_t len);
2360 */
2361ENTRY(aesni_ecb_dec)
2362 FRAME_BEGIN
2363#ifndef __x86_64__
2364 pushl LEN
2365 pushl KEYP
2366 pushl KLEN
2367 movl (FRAME_OFFSET+16)(%esp), KEYP # ctx
2368 movl (FRAME_OFFSET+20)(%esp), OUTP # dst
2369 movl (FRAME_OFFSET+24)(%esp), INP # src
2370 movl (FRAME_OFFSET+28)(%esp), LEN # len
2371#endif
2372 test LEN, LEN
2373 jz .Lecb_dec_ret
2374 mov 480(KEYP), KLEN
2375 add $240, KEYP
2376 cmp $16, LEN
2377 jb .Lecb_dec_ret
2378 cmp $64, LEN
2379 jb .Lecb_dec_loop1
2380.align 4
2381.Lecb_dec_loop4:
2382 movups (INP), STATE1
2383 movups 0x10(INP), STATE2
2384 movups 0x20(INP), STATE3
2385 movups 0x30(INP), STATE4
2386 call _aesni_dec4
2387 movups STATE1, (OUTP)
2388 movups STATE2, 0x10(OUTP)
2389 movups STATE3, 0x20(OUTP)
2390 movups STATE4, 0x30(OUTP)
2391 sub $64, LEN
2392 add $64, INP
2393 add $64, OUTP
2394 cmp $64, LEN
2395 jge .Lecb_dec_loop4
2396 cmp $16, LEN
2397 jb .Lecb_dec_ret
2398.align 4
2399.Lecb_dec_loop1:
2400 movups (INP), STATE1
2401 call _aesni_dec1
2402 movups STATE1, (OUTP)
2403 sub $16, LEN
2404 add $16, INP
2405 add $16, OUTP
2406 cmp $16, LEN
2407 jge .Lecb_dec_loop1
2408.Lecb_dec_ret:
2409#ifndef __x86_64__
2410 popl KLEN
2411 popl KEYP
2412 popl LEN
2413#endif
2414 FRAME_END
2415 ret
2416ENDPROC(aesni_ecb_dec)
2417
2418/*
2419 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2420 * size_t len, u8 *iv)
2421 */
2422ENTRY(aesni_cbc_enc)
2423 FRAME_BEGIN
2424#ifndef __x86_64__
2425 pushl IVP
2426 pushl LEN
2427 pushl KEYP
2428 pushl KLEN
2429 movl (FRAME_OFFSET+20)(%esp), KEYP # ctx
2430 movl (FRAME_OFFSET+24)(%esp), OUTP # dst
2431 movl (FRAME_OFFSET+28)(%esp), INP # src
2432 movl (FRAME_OFFSET+32)(%esp), LEN # len
2433 movl (FRAME_OFFSET+36)(%esp), IVP # iv
2434#endif
2435 cmp $16, LEN
2436 jb .Lcbc_enc_ret
2437 mov 480(KEYP), KLEN
2438 movups (IVP), STATE # load iv as initial state
2439.align 4
2440.Lcbc_enc_loop:
2441 movups (INP), IN # load input
2442 pxor IN, STATE
2443 call _aesni_enc1
2444 movups STATE, (OUTP) # store output
2445 sub $16, LEN
2446 add $16, INP
2447 add $16, OUTP
2448 cmp $16, LEN
2449 jge .Lcbc_enc_loop
2450 movups STATE, (IVP)
2451.Lcbc_enc_ret:
2452#ifndef __x86_64__
2453 popl KLEN
2454 popl KEYP
2455 popl LEN
2456 popl IVP
2457#endif
2458 FRAME_END
2459 ret
2460ENDPROC(aesni_cbc_enc)
2461
2462/*
2463 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2464 * size_t len, u8 *iv)
2465 */
2466ENTRY(aesni_cbc_dec)
2467 FRAME_BEGIN
2468#ifndef __x86_64__
2469 pushl IVP
2470 pushl LEN
2471 pushl KEYP
2472 pushl KLEN
2473 movl (FRAME_OFFSET+20)(%esp), KEYP # ctx
2474 movl (FRAME_OFFSET+24)(%esp), OUTP # dst
2475 movl (FRAME_OFFSET+28)(%esp), INP # src
2476 movl (FRAME_OFFSET+32)(%esp), LEN # len
2477 movl (FRAME_OFFSET+36)(%esp), IVP # iv
2478#endif
2479 cmp $16, LEN
2480 jb .Lcbc_dec_just_ret
2481 mov 480(KEYP), KLEN
2482 add $240, KEYP
2483 movups (IVP), IV
2484 cmp $64, LEN
2485 jb .Lcbc_dec_loop1
2486.align 4
2487.Lcbc_dec_loop4:
2488 movups (INP), IN1
2489 movaps IN1, STATE1
2490 movups 0x10(INP), IN2
2491 movaps IN2, STATE2
2492#ifdef __x86_64__
2493 movups 0x20(INP), IN3
2494 movaps IN3, STATE3
2495 movups 0x30(INP), IN4
2496 movaps IN4, STATE4
2497#else
2498 movups 0x20(INP), IN1
2499 movaps IN1, STATE3
2500 movups 0x30(INP), IN2
2501 movaps IN2, STATE4
2502#endif
2503 call _aesni_dec4
2504 pxor IV, STATE1
2505#ifdef __x86_64__
2506 pxor IN1, STATE2
2507 pxor IN2, STATE3
2508 pxor IN3, STATE4
2509 movaps IN4, IV
2510#else
2511 pxor IN1, STATE4
2512 movaps IN2, IV
2513 movups (INP), IN1
2514 pxor IN1, STATE2
2515 movups 0x10(INP), IN2
2516 pxor IN2, STATE3
2517#endif
2518 movups STATE1, (OUTP)
2519 movups STATE2, 0x10(OUTP)
2520 movups STATE3, 0x20(OUTP)
2521 movups STATE4, 0x30(OUTP)
2522 sub $64, LEN
2523 add $64, INP
2524 add $64, OUTP
2525 cmp $64, LEN
2526 jge .Lcbc_dec_loop4
2527 cmp $16, LEN
2528 jb .Lcbc_dec_ret
2529.align 4
2530.Lcbc_dec_loop1:
2531 movups (INP), IN
2532 movaps IN, STATE
2533 call _aesni_dec1
2534 pxor IV, STATE
2535 movups STATE, (OUTP)
2536 movaps IN, IV
2537 sub $16, LEN
2538 add $16, INP
2539 add $16, OUTP
2540 cmp $16, LEN
2541 jge .Lcbc_dec_loop1
2542.Lcbc_dec_ret:
2543 movups IV, (IVP)
2544.Lcbc_dec_just_ret:
2545#ifndef __x86_64__
2546 popl KLEN
2547 popl KEYP
2548 popl LEN
2549 popl IVP
2550#endif
2551 FRAME_END
2552 ret
2553ENDPROC(aesni_cbc_dec)
2554
2555#ifdef __x86_64__
2556.pushsection .rodata
2557.align 16
2558.Lbswap_mask:
2559 .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
2560.popsection
2561
2562/*
2563 * _aesni_inc_init: internal ABI
2564 * setup registers used by _aesni_inc
2565 * input:
2566 * IV
2567 * output:
2568 * CTR: == IV, in little endian
2569 * TCTR_LOW: == lower qword of CTR
2570 * INC: == 1, in little endian
2571 * BSWAP_MASK == endian swapping mask
2572 */
2573.align 4
2574_aesni_inc_init:
2575 movaps .Lbswap_mask, BSWAP_MASK
2576 movaps IV, CTR
2577 PSHUFB_XMM BSWAP_MASK CTR
2578 mov $1, TCTR_LOW
2579 MOVQ_R64_XMM TCTR_LOW INC
2580 MOVQ_R64_XMM CTR TCTR_LOW
2581 ret
2582ENDPROC(_aesni_inc_init)
2583
2584/*
2585 * _aesni_inc: internal ABI
2586 * Increase IV by 1, IV is in big endian
2587 * input:
2588 * IV
2589 * CTR: == IV, in little endian
2590 * TCTR_LOW: == lower qword of CTR
2591 * INC: == 1, in little endian
2592 * BSWAP_MASK == endian swapping mask
2593 * output:
2594 *	IV:		Increased by 1
2595 * changed:
2596 * CTR: == output IV, in little endian
2597 * TCTR_LOW: == lower qword of CTR
2598 */
2599.align 4
2600_aesni_inc:
2601 paddq INC, CTR
2602 add $1, TCTR_LOW
2603 jnc .Linc_low
2604 pslldq $8, INC
2605 paddq INC, CTR
2606 psrldq $8, INC
2607.Linc_low:
2608 movaps CTR, IV
2609 PSHUFB_XMM BSWAP_MASK IV
2610 ret
2611ENDPROC(_aesni_inc)
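/*
 * Rough C equivalent of the externally visible effect of _aesni_inc on the
 * big-endian IV block (the asm instead keeps a byte-swapped copy in CTR and
 * only re-swaps on output); a sketch, not kernel code:
 *
 *	static void ctr128_inc(unsigned char iv[16])
 *	{
 *		int i;
 *
 *		for (i = 15; i >= 0; i--)
 *			if (++iv[i])	// stop once a byte did not wrap
 *				break;
 *	}
 */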
2612
2613/*
2614 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2615 * size_t len, u8 *iv)
2616 */
2617ENTRY(aesni_ctr_enc)
2618 FRAME_BEGIN
2619 cmp $16, LEN
2620 jb .Lctr_enc_just_ret
2621 mov 480(KEYP), KLEN
2622 movups (IVP), IV
2623 call _aesni_inc_init
2624 cmp $64, LEN
2625 jb .Lctr_enc_loop1
2626.align 4
2627.Lctr_enc_loop4:
2628 movaps IV, STATE1
2629 call _aesni_inc
2630 movups (INP), IN1
2631 movaps IV, STATE2
2632 call _aesni_inc
2633 movups 0x10(INP), IN2
2634 movaps IV, STATE3
2635 call _aesni_inc
2636 movups 0x20(INP), IN3
2637 movaps IV, STATE4
2638 call _aesni_inc
2639 movups 0x30(INP), IN4
2640 call _aesni_enc4
2641 pxor IN1, STATE1
2642 movups STATE1, (OUTP)
2643 pxor IN2, STATE2
2644 movups STATE2, 0x10(OUTP)
2645 pxor IN3, STATE3
2646 movups STATE3, 0x20(OUTP)
2647 pxor IN4, STATE4
2648 movups STATE4, 0x30(OUTP)
2649 sub $64, LEN
2650 add $64, INP
2651 add $64, OUTP
2652 cmp $64, LEN
2653 jge .Lctr_enc_loop4
2654 cmp $16, LEN
2655 jb .Lctr_enc_ret
2656.align 4
2657.Lctr_enc_loop1:
2658 movaps IV, STATE
2659 call _aesni_inc
2660 movups (INP), IN
2661 call _aesni_enc1
2662 pxor IN, STATE
2663 movups STATE, (OUTP)
2664 sub $16, LEN
2665 add $16, INP
2666 add $16, OUTP
2667 cmp $16, LEN
2668 jge .Lctr_enc_loop1
2669.Lctr_enc_ret:
2670 movups IV, (IVP)
2671.Lctr_enc_just_ret:
2672 FRAME_END
2673 ret
2674ENDPROC(aesni_ctr_enc)
2675
2676/*
2677 * _aesni_gf128mul_x_ble: internal ABI
2678 * Multiply in GF(2^128) for XTS IVs
2679 * input:
2680 * IV: current IV
2681 * GF128MUL_MASK == mask with 0x87 and 0x01
2682 * output:
2683 * IV: next IV
2684 * changed:
2685 * CTR: == temporary value
2686 */
2687#define _aesni_gf128mul_x_ble() \
2688 pshufd $0x13, IV, CTR; \
2689 paddq IV, IV; \
2690 psrad $31, CTR; \
2691 pand GF128MUL_MASK, CTR; \
2692 pxor CTR, IV;
2693
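/*
 * Plain-C sketch of what the macro above computes, assuming u64 t[2] holds
 * the tweak with t[0] the low half and t[1] the high half (illustrative
 * only):
 *
 *	u64 carry = t[1] >> 63;			 // bit leaving GF(2^128)
 *
 *	t[1] = (t[1] << 1) | (t[0] >> 63);	 // shift 128-bit tweak left
 *	t[0] = (t[0] << 1) ^ (carry ? 0x87 : 0); // reduce by x^128+x^7+x^2+x+1
 */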
2694/*
2695 * void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2696 * bool enc, u8 *iv)
2697 */
2698ENTRY(aesni_xts_crypt8)
2699 FRAME_BEGIN
2700 cmpb $0, %cl
2701 movl $0, %ecx
2702 movl $240, %r10d
2703 leaq _aesni_enc4, %r11
2704 leaq _aesni_dec4, %rax
2705 cmovel %r10d, %ecx
2706 cmoveq %rax, %r11
2707
2708 movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
2709 movups (IVP), IV
2710
2711 mov 480(KEYP), KLEN
2712 addq %rcx, KEYP
2713
2714 movdqa IV, STATE1
2715 movdqu 0x00(INP), INC
2716 pxor INC, STATE1
2717 movdqu IV, 0x00(OUTP)
2718
2719 _aesni_gf128mul_x_ble()
2720 movdqa IV, STATE2
2721 movdqu 0x10(INP), INC
2722 pxor INC, STATE2
2723 movdqu IV, 0x10(OUTP)
2724
2725 _aesni_gf128mul_x_ble()
2726 movdqa IV, STATE3
2727 movdqu 0x20(INP), INC
2728 pxor INC, STATE3
2729 movdqu IV, 0x20(OUTP)
2730
2731 _aesni_gf128mul_x_ble()
2732 movdqa IV, STATE4
2733 movdqu 0x30(INP), INC
2734 pxor INC, STATE4
2735 movdqu IV, 0x30(OUTP)
2736
2737 call *%r11
2738
2739 movdqu 0x00(OUTP), INC
2740 pxor INC, STATE1
2741 movdqu STATE1, 0x00(OUTP)
2742
2743 _aesni_gf128mul_x_ble()
2744 movdqa IV, STATE1
2745 movdqu 0x40(INP), INC
2746 pxor INC, STATE1
2747 movdqu IV, 0x40(OUTP)
2748
2749 movdqu 0x10(OUTP), INC
2750 pxor INC, STATE2
2751 movdqu STATE2, 0x10(OUTP)
2752
2753 _aesni_gf128mul_x_ble()
2754 movdqa IV, STATE2
2755 movdqu 0x50(INP), INC
2756 pxor INC, STATE2
2757 movdqu IV, 0x50(OUTP)
2758
2759 movdqu 0x20(OUTP), INC
2760 pxor INC, STATE3
2761 movdqu STATE3, 0x20(OUTP)
2762
2763 _aesni_gf128mul_x_ble()
2764 movdqa IV, STATE3
2765 movdqu 0x60(INP), INC
2766 pxor INC, STATE3
2767 movdqu IV, 0x60(OUTP)
2768
2769 movdqu 0x30(OUTP), INC
2770 pxor INC, STATE4
2771 movdqu STATE4, 0x30(OUTP)
2772
2773 _aesni_gf128mul_x_ble()
2774 movdqa IV, STATE4
2775 movdqu 0x70(INP), INC
2776 pxor INC, STATE4
2777 movdqu IV, 0x70(OUTP)
2778
2779 _aesni_gf128mul_x_ble()
2780 movups IV, (IVP)
2781
2782 call *%r11
2783
2784 movdqu 0x40(OUTP), INC
2785 pxor INC, STATE1
2786 movdqu STATE1, 0x40(OUTP)
2787
2788 movdqu 0x50(OUTP), INC
2789 pxor INC, STATE2
2790 movdqu STATE2, 0x50(OUTP)
2791
2792 movdqu 0x60(OUTP), INC
2793 pxor INC, STATE3
2794 movdqu STATE3, 0x60(OUTP)
2795
2796 movdqu 0x70(OUTP), INC
2797 pxor INC, STATE4
2798 movdqu STATE4, 0x70(OUTP)
2799
2800 FRAME_END
2801 ret
2802ENDPROC(aesni_xts_crypt8)
2803
2804#endif