1/*
2 * Implement AES algorithm in Intel AES-NI instructions.
3 *
4 * The white paper of AES-NI instructions can be downloaded from:
5 * http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
6 *
7 * Copyright (C) 2008, Intel Corp.
8 * Author: Huang Ying <ying.huang@intel.com>
9 * Vinodh Gopal <vinodh.gopal@intel.com>
10 * Kahraman Akdemir
11 *
12 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
13 * interface for 64-bit kernels.
14 * Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
15 * Aidan O'Mahony (aidan.o.mahony@intel.com)
16 * Adrian Hoban <adrian.hoban@intel.com>
17 * James Guilford (james.guilford@intel.com)
18 * Gabriele Paoloni <gabriele.paoloni@intel.com>
19 * Tadeusz Struk (tadeusz.struk@intel.com)
20 * Wajdi Feghali (wajdi.k.feghali@intel.com)
21 * Copyright (c) 2010, Intel Corporation.
22 *
23 * Ported x86_64 version to x86:
24 * Author: Mathias Krause <minipli@googlemail.com>
25 *
26 * This program is free software; you can redistribute it and/or modify
27 * it under the terms of the GNU General Public License as published by
28 * the Free Software Foundation; either version 2 of the License, or
29 * (at your option) any later version.
30 */
31
32#include <linux/linkage.h>
33#include <asm/inst.h>
34
35#ifdef __x86_64__
36.data
37POLY: .octa 0xC2000000000000000000000000000001
38TWOONE: .octa 0x00000001000000000000000000000001
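# POLY encodes the GCM reduction polynomial x^128 + x^127 + x^126 + x^121 + 1
# (GHASH operates on bit-reflected data, see the GHASH_MUL comments below);
# TWOONE is only used while reducing HashKey<<1 (mod poly) in
# aesni_gcm_enc/aesni_gcm_dec.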
39
40# order of these constants should not change.
41# more specifically, ALL_F should follow SHIFT_MASK,
42# and ZERO should follow ALL_F
43
44SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F
45MASK1: .octa 0x0000000000000000ffffffffffffffff
46MASK2: .octa 0xffffffffffffffff0000000000000000
47SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
48ALL_F: .octa 0xffffffffffffffffffffffffffffffff
49ZERO: .octa 0x00000000000000000000000000000000
50ONE: .octa 0x00000000000000000000000000000001
51F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
52dec: .octa 0x1
53enc: .octa 0x2
54
55
56.text
57
58
59#define STACK_OFFSET 8*3
60#define HashKey 16*0 // store HashKey <<1 mod poly here
61#define HashKey_2 16*1 // store HashKey^2 <<1 mod poly here
62#define HashKey_3 16*2 // store HashKey^3 <<1 mod poly here
63#define HashKey_4 16*3 // store HashKey^4 <<1 mod poly here
64#define HashKey_k 16*4 // store XOR of High 64 bits and Low 64
65 // bits of HashKey <<1 mod poly here
66 //(for Karatsuba purposes)
67#define HashKey_2_k 16*5 // store XOR of High 64 bits and Low 64
68 // bits of HashKey^2 <<1 mod poly here
69 // (for Karatsuba purposes)
70#define HashKey_3_k 16*6 // store XOR of High 64 bits and Low 64
71 // bits of HashKey^3 <<1 mod poly here
72 // (for Karatsuba purposes)
73#define HashKey_4_k 16*7 // store XOR of High 64 bits and Low 64
74 // bits of HashKey^4 <<1 mod poly here
75 // (for Karatsuba purposes)
76#define VARIABLE_OFFSET 16*8
77
78#define arg1 rdi
79#define arg2 rsi
80#define arg3 rdx
81#define arg4 rcx
82#define arg5 r8
83#define arg6 r9
84#define arg7 STACK_OFFSET+8(%r14)
85#define arg8 STACK_OFFSET+16(%r14)
86#define arg9 STACK_OFFSET+24(%r14)
87#define arg10 STACK_OFFSET+32(%r14)
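/*
* Note: with the x86_64 SysV calling convention the first six arguments are
* passed in %rdi, %rsi, %rdx, %rcx, %r8 and %r9; arg7-arg10 live on the
* caller's stack and are reached through %r14, which the GCM routines set to
* the value of %rsp on entry. STACK_OFFSET (8*3) skips the three registers
* pushed on entry and the additional +8 skips the return address.
*/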
88#endif
89
90
91#define STATE1 %xmm0
92#define STATE2 %xmm4
93#define STATE3 %xmm5
94#define STATE4 %xmm6
95#define STATE STATE1
96#define IN1 %xmm1
97#define IN2 %xmm7
98#define IN3 %xmm8
99#define IN4 %xmm9
100#define IN IN1
101#define KEY %xmm2
102#define IV %xmm3
103
104#define BSWAP_MASK %xmm10
105#define CTR %xmm11
106#define INC %xmm12
107
108#ifdef __x86_64__
109#define AREG %rax
110#define KEYP %rdi
111#define OUTP %rsi
112#define UKEYP OUTP
113#define INP %rdx
114#define LEN %rcx
115#define IVP %r8
116#define KLEN %r9d
117#define T1 %r10
118#define TKEYP T1
119#define T2 %r11
120#define TCTR_LOW T2
121#else
122#define AREG %eax
123#define KEYP %edi
124#define OUTP AREG
125#define UKEYP OUTP
126#define INP %edx
127#define LEN %esi
128#define IVP %ebp
129#define KLEN %ebx
130#define T1 %ecx
131#define TKEYP T1
132#endif
133
134
135#ifdef __x86_64__
136/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
137*
138*
139* Input: A and B (128-bits each, bit-reflected)
140* Output: C = A*B*x mod poly, (i.e. >>1 )
141* To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
142* GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
143*
144*/
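/*
* Background note on the Karatsuba split used below: writing the two 128-bit
* operands as GH = a1*x^64 + a0 and HK = b1*x^64 + b0 over GF(2)[x],
*
*     GH*HK = a1*b1*x^128 + ((a1^a0)*(b1^b0) ^ a1*b1 ^ a0*b0)*x^64 + a0*b0
*
* so each block needs only three PCLMULQDQ multiplications, followed by the
* two-phase reduction modulo x^128 + x^127 + x^126 + x^121 + 1.
*/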
145.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
146 movdqa \GH, \TMP1
147 pshufd $78, \GH, \TMP2
148 pshufd $78, \HK, \TMP3
149 pxor \GH, \TMP2 # TMP2 = a1+a0
150 pxor \HK, \TMP3 # TMP3 = b1+b0
151 PCLMULQDQ 0x11, \HK, \TMP1 # TMP1 = a1*b1
152 PCLMULQDQ 0x00, \HK, \GH # GH = a0*b0
153 PCLMULQDQ 0x00, \TMP3, \TMP2 # TMP2 = (a0+a1)*(b1+b0)
        pxor       \GH, \TMP2
        pxor       \TMP1, \TMP2           # TMP2 = a1*b0 + a0*b1 (Karatsuba middle term)
156 movdqa \TMP2, \TMP3
157 pslldq $8, \TMP3 # left shift TMP3 2 DWs
158 psrldq $8, \TMP2 # right shift TMP2 2 DWs
159 pxor \TMP3, \GH
        pxor       \TMP2, \TMP1           # TMP1:GH holds the result of GH*HK
161
162 # first phase of the reduction
163
164 movdqa \GH, \TMP2
165 movdqa \GH, \TMP3
        movdqa     \GH, \TMP4             # copy GH into TMP2, TMP3 and TMP4
                                          # in order to perform
                                          # independent shifts
        pslld      $31, \TMP2             # packed left shift <<31
        pslld      $30, \TMP3             # packed left shift <<30
        pslld      $25, \TMP4             # packed left shift <<25
172 pxor \TMP3, \TMP2 # xor the shifted versions
173 pxor \TMP4, \TMP2
174 movdqa \TMP2, \TMP5
175 psrldq $4, \TMP5 # right shift TMP5 1 DW
176 pslldq $12, \TMP2 # left shift TMP2 3 DWs
177 pxor \TMP2, \GH
178
179 # second phase of the reduction
180
        movdqa     \GH,\TMP2              # copy GH into TMP2, TMP3 and TMP4
                                          # in order to perform
                                          # independent shifts
        movdqa     \GH,\TMP3
        movdqa     \GH,\TMP4
        psrld      $1,\TMP2               # packed right shift >>1
        psrld      $2,\TMP3               # packed right shift >>2
        psrld      $7,\TMP4               # packed right shift >>7
189 pxor \TMP3,\TMP2 # xor the shifted versions
190 pxor \TMP4,\TMP2
191 pxor \TMP5, \TMP2
192 pxor \TMP2, \GH
        pxor       \TMP1, \GH             # result is in GH
194.endm
195
196/*
197* if a = number of total plaintext bytes
198* b = floor(a/16)
199* num_initial_blocks = b mod 4
200* encrypt the initial num_initial_blocks blocks and apply ghash on
201* the ciphertext
202* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
203* are clobbered
* arg1, %arg2, %arg3, %r14 are used as pointers only, not modified
205*/
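/*
* Worked example (illustrative): for a 100-byte payload, b = floor(100/16) = 6
* full blocks, so num_initial_blocks = 6 mod 4 = 2; the remaining 4 full
* blocks go through the parallel main loop and the 4-byte tail through the
* <16-byte path.
*/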
206
207
208.macro INITIAL_BLOCKS_DEC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
209XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
210 mov arg7, %r10 # %r10 = AAD
211 mov arg8, %r12 # %r12 = aadLen
212 mov %r12, %r11
213 pxor %xmm\i, %xmm\i
214_get_AAD_loop\num_initial_blocks\operation:
215 movd (%r10), \TMP1
216 pslldq $12, \TMP1
217 psrldq $4, %xmm\i
218 pxor \TMP1, %xmm\i
219 add $4, %r10
220 sub $4, %r12
221 jne _get_AAD_loop\num_initial_blocks\operation
222 cmp $16, %r11
223 je _get_AAD_loop2_done\num_initial_blocks\operation
224 mov $16, %r12
225_get_AAD_loop2\num_initial_blocks\operation:
226 psrldq $4, %xmm\i
227 sub $4, %r12
228 cmp %r11, %r12
229 jne _get_AAD_loop2\num_initial_blocks\operation
230_get_AAD_loop2_done\num_initial_blocks\operation:
231 movdqa SHUF_MASK(%rip), %xmm14
232 PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
233
234 xor %r11, %r11 # initialise the data pointer offset as zero
235
236 # start AES for num_initial_blocks blocks
237
238 mov %arg5, %rax # %rax = *Y0
239 movdqu (%rax), \XMM0 # XMM0 = Y0
240 movdqa SHUF_MASK(%rip), %xmm14
241 PSHUFB_XMM %xmm14, \XMM0
242
243.if (\i == 5) || (\i == 6) || (\i == 7)
244.irpc index, \i_seq
245 paddd ONE(%rip), \XMM0 # INCR Y0
246 movdqa \XMM0, %xmm\index
247 movdqa SHUF_MASK(%rip), %xmm14
248 PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap
249
250.endr
251.irpc index, \i_seq
252 pxor 16*0(%arg1), %xmm\index
253.endr
254.irpc index, \i_seq
255 movaps 0x10(%rdi), \TMP1
256 AESENC \TMP1, %xmm\index # Round 1
257.endr
258.irpc index, \i_seq
259 movaps 0x20(%arg1), \TMP1
260 AESENC \TMP1, %xmm\index # Round 2
261.endr
262.irpc index, \i_seq
263 movaps 0x30(%arg1), \TMP1
        AESENC     \TMP1, %xmm\index      # Round 3
265.endr
266.irpc index, \i_seq
267 movaps 0x40(%arg1), \TMP1
        AESENC     \TMP1, %xmm\index      # Round 4
269.endr
270.irpc index, \i_seq
271 movaps 0x50(%arg1), \TMP1
        AESENC     \TMP1, %xmm\index      # Round 5
273.endr
274.irpc index, \i_seq
275 movaps 0x60(%arg1), \TMP1
        AESENC     \TMP1, %xmm\index      # Round 6
277.endr
278.irpc index, \i_seq
279 movaps 0x70(%arg1), \TMP1
        AESENC     \TMP1, %xmm\index      # Round 7
281.endr
282.irpc index, \i_seq
283 movaps 0x80(%arg1), \TMP1
        AESENC     \TMP1, %xmm\index      # Round 8
285.endr
286.irpc index, \i_seq
287 movaps 0x90(%arg1), \TMP1
        AESENC     \TMP1, %xmm\index      # Round 9
289.endr
290.irpc index, \i_seq
291 movaps 0xa0(%arg1), \TMP1
292 AESENCLAST \TMP1, %xmm\index # Round 10
293.endr
294.irpc index, \i_seq
295 movdqu (%arg3 , %r11, 1), \TMP1
296 pxor \TMP1, %xmm\index
297 movdqu %xmm\index, (%arg2 , %r11, 1)
298 # write back plaintext/ciphertext for num_initial_blocks
299 add $16, %r11
300
301 movdqa \TMP1, %xmm\index
302 movdqa SHUF_MASK(%rip), %xmm14
303 PSHUFB_XMM %xmm14, %xmm\index
304
305 # prepare plaintext/ciphertext for GHASH computation
306.endr
307.endif
308 GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
309 # apply GHASH on num_initial_blocks blocks
310
311.if \i == 5
312 pxor %xmm5, %xmm6
313 GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
314 pxor %xmm6, %xmm7
315 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
316 pxor %xmm7, %xmm8
317 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
318.elseif \i == 6
319 pxor %xmm6, %xmm7
320 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
321 pxor %xmm7, %xmm8
322 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
323.elseif \i == 7
324 pxor %xmm7, %xmm8
325 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
326.endif
327 cmp $64, %r13
328 jl _initial_blocks_done\num_initial_blocks\operation
329 # no need for precomputed values
330/*
331*
332* Precomputations for HashKey parallel with encryption of first 4 blocks.
* HashKey_i_k holds XORed values of the low and high parts of HashKey_i
334*/
335 paddd ONE(%rip), \XMM0 # INCR Y0
336 movdqa \XMM0, \XMM1
337 movdqa SHUF_MASK(%rip), %xmm14
338 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
339
340 paddd ONE(%rip), \XMM0 # INCR Y0
341 movdqa \XMM0, \XMM2
342 movdqa SHUF_MASK(%rip), %xmm14
343 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
344
345 paddd ONE(%rip), \XMM0 # INCR Y0
346 movdqa \XMM0, \XMM3
347 movdqa SHUF_MASK(%rip), %xmm14
348 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
349
350 paddd ONE(%rip), \XMM0 # INCR Y0
351 movdqa \XMM0, \XMM4
352 movdqa SHUF_MASK(%rip), %xmm14
353 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
354
355 pxor 16*0(%arg1), \XMM1
356 pxor 16*0(%arg1), \XMM2
357 pxor 16*0(%arg1), \XMM3
358 pxor 16*0(%arg1), \XMM4
359 movdqa \TMP3, \TMP5
360 pshufd $78, \TMP3, \TMP1
361 pxor \TMP3, \TMP1
362 movdqa \TMP1, HashKey_k(%rsp)
363 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
364# TMP5 = HashKey^2<<1 (mod poly)
365 movdqa \TMP5, HashKey_2(%rsp)
366# HashKey_2 = HashKey^2<<1 (mod poly)
367 pshufd $78, \TMP5, \TMP1
368 pxor \TMP5, \TMP1
369 movdqa \TMP1, HashKey_2_k(%rsp)
370.irpc index, 1234 # do 4 rounds
371 movaps 0x10*\index(%arg1), \TMP1
372 AESENC \TMP1, \XMM1
373 AESENC \TMP1, \XMM2
374 AESENC \TMP1, \XMM3
375 AESENC \TMP1, \XMM4
376.endr
377 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
378# TMP5 = HashKey^3<<1 (mod poly)
379 movdqa \TMP5, HashKey_3(%rsp)
380 pshufd $78, \TMP5, \TMP1
381 pxor \TMP5, \TMP1
382 movdqa \TMP1, HashKey_3_k(%rsp)
383.irpc index, 56789 # do next 5 rounds
384 movaps 0x10*\index(%arg1), \TMP1
385 AESENC \TMP1, \XMM1
386 AESENC \TMP1, \XMM2
387 AESENC \TMP1, \XMM3
388 AESENC \TMP1, \XMM4
389.endr
390 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^4<<1 (mod poly)
392 movdqa \TMP5, HashKey_4(%rsp)
393 pshufd $78, \TMP5, \TMP1
394 pxor \TMP5, \TMP1
395 movdqa \TMP1, HashKey_4_k(%rsp)
396 movaps 0xa0(%arg1), \TMP2
397 AESENCLAST \TMP2, \XMM1
398 AESENCLAST \TMP2, \XMM2
399 AESENCLAST \TMP2, \XMM3
400 AESENCLAST \TMP2, \XMM4
401 movdqu 16*0(%arg3 , %r11 , 1), \TMP1
402 pxor \TMP1, \XMM1
403 movdqu \XMM1, 16*0(%arg2 , %r11 , 1)
404 movdqa \TMP1, \XMM1
405 movdqu 16*1(%arg3 , %r11 , 1), \TMP1
406 pxor \TMP1, \XMM2
407 movdqu \XMM2, 16*1(%arg2 , %r11 , 1)
408 movdqa \TMP1, \XMM2
409 movdqu 16*2(%arg3 , %r11 , 1), \TMP1
410 pxor \TMP1, \XMM3
411 movdqu \XMM3, 16*2(%arg2 , %r11 , 1)
412 movdqa \TMP1, \XMM3
413 movdqu 16*3(%arg3 , %r11 , 1), \TMP1
414 pxor \TMP1, \XMM4
415 movdqu \XMM4, 16*3(%arg2 , %r11 , 1)
416 movdqa \TMP1, \XMM4
417 add $64, %r11
418 movdqa SHUF_MASK(%rip), %xmm14
419 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
420 pxor \XMMDst, \XMM1
421# combine GHASHed value with the corresponding ciphertext
422 movdqa SHUF_MASK(%rip), %xmm14
423 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
424 movdqa SHUF_MASK(%rip), %xmm14
425 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
426 movdqa SHUF_MASK(%rip), %xmm14
427 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
428
429_initial_blocks_done\num_initial_blocks\operation:
430
431.endm
432
433
434/*
435* if a = number of total plaintext bytes
436* b = floor(a/16)
437* num_initial_blocks = b mod 4
438* encrypt the initial num_initial_blocks blocks and apply ghash on
439* the ciphertext
440* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
441* are clobbered
* arg1, %arg2, %arg3, %r14 are used as pointers only, not modified
443*/
444
445
446.macro INITIAL_BLOCKS_ENC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
447XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
448 mov arg7, %r10 # %r10 = AAD
449 mov arg8, %r12 # %r12 = aadLen
450 mov %r12, %r11
451 pxor %xmm\i, %xmm\i
452_get_AAD_loop\num_initial_blocks\operation:
453 movd (%r10), \TMP1
454 pslldq $12, \TMP1
455 psrldq $4, %xmm\i
456 pxor \TMP1, %xmm\i
457 add $4, %r10
458 sub $4, %r12
459 jne _get_AAD_loop\num_initial_blocks\operation
460 cmp $16, %r11
461 je _get_AAD_loop2_done\num_initial_blocks\operation
462 mov $16, %r12
463_get_AAD_loop2\num_initial_blocks\operation:
464 psrldq $4, %xmm\i
465 sub $4, %r12
466 cmp %r11, %r12
467 jne _get_AAD_loop2\num_initial_blocks\operation
468_get_AAD_loop2_done\num_initial_blocks\operation:
469 movdqa SHUF_MASK(%rip), %xmm14
470 PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
471
472 xor %r11, %r11 # initialise the data pointer offset as zero
473
474 # start AES for num_initial_blocks blocks
475
476 mov %arg5, %rax # %rax = *Y0
477 movdqu (%rax), \XMM0 # XMM0 = Y0
478 movdqa SHUF_MASK(%rip), %xmm14
479 PSHUFB_XMM %xmm14, \XMM0
480
481.if (\i == 5) || (\i == 6) || (\i == 7)
482.irpc index, \i_seq
483 paddd ONE(%rip), \XMM0 # INCR Y0
484 movdqa \XMM0, %xmm\index
485 movdqa SHUF_MASK(%rip), %xmm14
486 PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap
487
488.endr
489.irpc index, \i_seq
490 pxor 16*0(%arg1), %xmm\index
491.endr
492.irpc index, \i_seq
493 movaps 0x10(%rdi), \TMP1
494 AESENC \TMP1, %xmm\index # Round 1
495.endr
496.irpc index, \i_seq
497 movaps 0x20(%arg1), \TMP1
498 AESENC \TMP1, %xmm\index # Round 2
499.endr
500.irpc index, \i_seq
501 movaps 0x30(%arg1), \TMP1
        AESENC     \TMP1, %xmm\index      # Round 3
503.endr
504.irpc index, \i_seq
505 movaps 0x40(%arg1), \TMP1
        AESENC     \TMP1, %xmm\index      # Round 4
507.endr
508.irpc index, \i_seq
509 movaps 0x50(%arg1), \TMP1
        AESENC     \TMP1, %xmm\index      # Round 5
511.endr
512.irpc index, \i_seq
513 movaps 0x60(%arg1), \TMP1
        AESENC     \TMP1, %xmm\index      # Round 6
515.endr
516.irpc index, \i_seq
517 movaps 0x70(%arg1), \TMP1
        AESENC     \TMP1, %xmm\index      # Round 7
519.endr
520.irpc index, \i_seq
521 movaps 0x80(%arg1), \TMP1
        AESENC     \TMP1, %xmm\index      # Round 8
523.endr
524.irpc index, \i_seq
525 movaps 0x90(%arg1), \TMP1
        AESENC     \TMP1, %xmm\index      # Round 9
527.endr
528.irpc index, \i_seq
529 movaps 0xa0(%arg1), \TMP1
530 AESENCLAST \TMP1, %xmm\index # Round 10
531.endr
532.irpc index, \i_seq
533 movdqu (%arg3 , %r11, 1), \TMP1
534 pxor \TMP1, %xmm\index
535 movdqu %xmm\index, (%arg2 , %r11, 1)
536 # write back plaintext/ciphertext for num_initial_blocks
537 add $16, %r11
538
539 movdqa SHUF_MASK(%rip), %xmm14
540 PSHUFB_XMM %xmm14, %xmm\index
541
542 # prepare plaintext/ciphertext for GHASH computation
543.endr
544.endif
545 GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
546 # apply GHASH on num_initial_blocks blocks
547
548.if \i == 5
549 pxor %xmm5, %xmm6
550 GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
551 pxor %xmm6, %xmm7
552 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
553 pxor %xmm7, %xmm8
554 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
555.elseif \i == 6
556 pxor %xmm6, %xmm7
557 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
558 pxor %xmm7, %xmm8
559 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
560.elseif \i == 7
561 pxor %xmm7, %xmm8
562 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
563.endif
564 cmp $64, %r13
565 jl _initial_blocks_done\num_initial_blocks\operation
566 # no need for precomputed values
567/*
568*
569* Precomputations for HashKey parallel with encryption of first 4 blocks.
* HashKey_i_k holds XORed values of the low and high parts of HashKey_i
571*/
572 paddd ONE(%rip), \XMM0 # INCR Y0
573 movdqa \XMM0, \XMM1
574 movdqa SHUF_MASK(%rip), %xmm14
575 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
576
577 paddd ONE(%rip), \XMM0 # INCR Y0
578 movdqa \XMM0, \XMM2
579 movdqa SHUF_MASK(%rip), %xmm14
580 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
581
582 paddd ONE(%rip), \XMM0 # INCR Y0
583 movdqa \XMM0, \XMM3
584 movdqa SHUF_MASK(%rip), %xmm14
585 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
586
587 paddd ONE(%rip), \XMM0 # INCR Y0
588 movdqa \XMM0, \XMM4
589 movdqa SHUF_MASK(%rip), %xmm14
590 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
591
592 pxor 16*0(%arg1), \XMM1
593 pxor 16*0(%arg1), \XMM2
594 pxor 16*0(%arg1), \XMM3
595 pxor 16*0(%arg1), \XMM4
596 movdqa \TMP3, \TMP5
597 pshufd $78, \TMP3, \TMP1
598 pxor \TMP3, \TMP1
599 movdqa \TMP1, HashKey_k(%rsp)
600 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
601# TMP5 = HashKey^2<<1 (mod poly)
602 movdqa \TMP5, HashKey_2(%rsp)
603# HashKey_2 = HashKey^2<<1 (mod poly)
604 pshufd $78, \TMP5, \TMP1
605 pxor \TMP5, \TMP1
606 movdqa \TMP1, HashKey_2_k(%rsp)
607.irpc index, 1234 # do 4 rounds
608 movaps 0x10*\index(%arg1), \TMP1
609 AESENC \TMP1, \XMM1
610 AESENC \TMP1, \XMM2
611 AESENC \TMP1, \XMM3
612 AESENC \TMP1, \XMM4
613.endr
614 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
615# TMP5 = HashKey^3<<1 (mod poly)
616 movdqa \TMP5, HashKey_3(%rsp)
617 pshufd $78, \TMP5, \TMP1
618 pxor \TMP5, \TMP1
619 movdqa \TMP1, HashKey_3_k(%rsp)
620.irpc index, 56789 # do next 5 rounds
621 movaps 0x10*\index(%arg1), \TMP1
622 AESENC \TMP1, \XMM1
623 AESENC \TMP1, \XMM2
624 AESENC \TMP1, \XMM3
625 AESENC \TMP1, \XMM4
626.endr
627 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^4<<1 (mod poly)
629 movdqa \TMP5, HashKey_4(%rsp)
630 pshufd $78, \TMP5, \TMP1
631 pxor \TMP5, \TMP1
632 movdqa \TMP1, HashKey_4_k(%rsp)
633 movaps 0xa0(%arg1), \TMP2
634 AESENCLAST \TMP2, \XMM1
635 AESENCLAST \TMP2, \XMM2
636 AESENCLAST \TMP2, \XMM3
637 AESENCLAST \TMP2, \XMM4
638 movdqu 16*0(%arg3 , %r11 , 1), \TMP1
639 pxor \TMP1, \XMM1
640 movdqu 16*1(%arg3 , %r11 , 1), \TMP1
641 pxor \TMP1, \XMM2
642 movdqu 16*2(%arg3 , %r11 , 1), \TMP1
643 pxor \TMP1, \XMM3
644 movdqu 16*3(%arg3 , %r11 , 1), \TMP1
645 pxor \TMP1, \XMM4
646 movdqu \XMM1, 16*0(%arg2 , %r11 , 1)
647 movdqu \XMM2, 16*1(%arg2 , %r11 , 1)
648 movdqu \XMM3, 16*2(%arg2 , %r11 , 1)
649 movdqu \XMM4, 16*3(%arg2 , %r11 , 1)
650
651 add $64, %r11
652 movdqa SHUF_MASK(%rip), %xmm14
653 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
654 pxor \XMMDst, \XMM1
655# combine GHASHed value with the corresponding ciphertext
656 movdqa SHUF_MASK(%rip), %xmm14
657 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
658 movdqa SHUF_MASK(%rip), %xmm14
659 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
660 movdqa SHUF_MASK(%rip), %xmm14
661 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
662
663_initial_blocks_done\num_initial_blocks\operation:
664
665.endm
666
667/*
668* encrypt 4 blocks at a time
669* ghash the 4 previously encrypted ciphertext blocks
670* arg1, %arg2, %arg3 are used as pointers only, not modified
671* %r11 is the data offset value
672*/
673.macro GHASH_4_ENCRYPT_4_PARALLEL_ENC TMP1 TMP2 TMP3 TMP4 TMP5 \
674TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
675
676 movdqa \XMM1, \XMM5
677 movdqa \XMM2, \XMM6
678 movdqa \XMM3, \XMM7
679 movdqa \XMM4, \XMM8
680
681 movdqa SHUF_MASK(%rip), %xmm15
682 # multiply TMP5 * HashKey using karatsuba
683
684 movdqa \XMM5, \TMP4
685 pshufd $78, \XMM5, \TMP6
686 pxor \XMM5, \TMP6
687 paddd ONE(%rip), \XMM0 # INCR CNT
688 movdqa HashKey_4(%rsp), \TMP5
689 PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1
690 movdqa \XMM0, \XMM1
691 paddd ONE(%rip), \XMM0 # INCR CNT
692 movdqa \XMM0, \XMM2
693 paddd ONE(%rip), \XMM0 # INCR CNT
694 movdqa \XMM0, \XMM3
695 paddd ONE(%rip), \XMM0 # INCR CNT
696 movdqa \XMM0, \XMM4
697 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
698 PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0
699 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
700 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
701 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
702
703 pxor (%arg1), \XMM1
704 pxor (%arg1), \XMM2
705 pxor (%arg1), \XMM3
706 pxor (%arg1), \XMM4
707 movdqa HashKey_4_k(%rsp), \TMP5
708 PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
709 movaps 0x10(%arg1), \TMP1
710 AESENC \TMP1, \XMM1 # Round 1
711 AESENC \TMP1, \XMM2
712 AESENC \TMP1, \XMM3
713 AESENC \TMP1, \XMM4
714 movaps 0x20(%arg1), \TMP1
715 AESENC \TMP1, \XMM1 # Round 2
716 AESENC \TMP1, \XMM2
717 AESENC \TMP1, \XMM3
718 AESENC \TMP1, \XMM4
719 movdqa \XMM6, \TMP1
720 pshufd $78, \XMM6, \TMP2
721 pxor \XMM6, \TMP2
722 movdqa HashKey_3(%rsp), \TMP5
723 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
724 movaps 0x30(%arg1), \TMP3
725 AESENC \TMP3, \XMM1 # Round 3
726 AESENC \TMP3, \XMM2
727 AESENC \TMP3, \XMM3
728 AESENC \TMP3, \XMM4
729 PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0
730 movaps 0x40(%arg1), \TMP3
731 AESENC \TMP3, \XMM1 # Round 4
732 AESENC \TMP3, \XMM2
733 AESENC \TMP3, \XMM3
734 AESENC \TMP3, \XMM4
735 movdqa HashKey_3_k(%rsp), \TMP5
736 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
737 movaps 0x50(%arg1), \TMP3
738 AESENC \TMP3, \XMM1 # Round 5
739 AESENC \TMP3, \XMM2
740 AESENC \TMP3, \XMM3
741 AESENC \TMP3, \XMM4
742 pxor \TMP1, \TMP4
743# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
744 pxor \XMM6, \XMM5
745 pxor \TMP2, \TMP6
746 movdqa \XMM7, \TMP1
747 pshufd $78, \XMM7, \TMP2
748 pxor \XMM7, \TMP2
749 movdqa HashKey_2(%rsp ), \TMP5
750
751 # Multiply TMP5 * HashKey using karatsuba
752
753 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
754 movaps 0x60(%arg1), \TMP3
755 AESENC \TMP3, \XMM1 # Round 6
756 AESENC \TMP3, \XMM2
757 AESENC \TMP3, \XMM3
758 AESENC \TMP3, \XMM4
759 PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0
760 movaps 0x70(%arg1), \TMP3
761 AESENC \TMP3, \XMM1 # Round 7
762 AESENC \TMP3, \XMM2
763 AESENC \TMP3, \XMM3
764 AESENC \TMP3, \XMM4
765 movdqa HashKey_2_k(%rsp), \TMP5
766 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
767 movaps 0x80(%arg1), \TMP3
768 AESENC \TMP3, \XMM1 # Round 8
769 AESENC \TMP3, \XMM2
770 AESENC \TMP3, \XMM3
771 AESENC \TMP3, \XMM4
772 pxor \TMP1, \TMP4
773# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
774 pxor \XMM7, \XMM5
775 pxor \TMP2, \TMP6
776
777 # Multiply XMM8 * HashKey
778 # XMM8 and TMP5 hold the values for the two operands
779
780 movdqa \XMM8, \TMP1
781 pshufd $78, \XMM8, \TMP2
782 pxor \XMM8, \TMP2
783 movdqa HashKey(%rsp), \TMP5
784 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
785 movaps 0x90(%arg1), \TMP3
786 AESENC \TMP3, \XMM1 # Round 9
787 AESENC \TMP3, \XMM2
788 AESENC \TMP3, \XMM3
789 AESENC \TMP3, \XMM4
790 PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0
791 movaps 0xa0(%arg1), \TMP3
792 AESENCLAST \TMP3, \XMM1 # Round 10
793 AESENCLAST \TMP3, \XMM2
794 AESENCLAST \TMP3, \XMM3
795 AESENCLAST \TMP3, \XMM4
796 movdqa HashKey_k(%rsp), \TMP5
797 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
798 movdqu (%arg3,%r11,1), \TMP3
799 pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
800 movdqu 16(%arg3,%r11,1), \TMP3
801 pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK
802 movdqu 32(%arg3,%r11,1), \TMP3
803 pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK
804 movdqu 48(%arg3,%r11,1), \TMP3
805 pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK
806 movdqu \XMM1, (%arg2,%r11,1) # Write to the ciphertext buffer
807 movdqu \XMM2, 16(%arg2,%r11,1) # Write to the ciphertext buffer
808 movdqu \XMM3, 32(%arg2,%r11,1) # Write to the ciphertext buffer
809 movdqu \XMM4, 48(%arg2,%r11,1) # Write to the ciphertext buffer
810 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
811 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
812 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
813 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
814
815 pxor \TMP4, \TMP1
816 pxor \XMM8, \XMM5
817 pxor \TMP6, \TMP2
818 pxor \TMP1, \TMP2
819 pxor \XMM5, \TMP2
820 movdqa \TMP2, \TMP3
821 pslldq $8, \TMP3 # left shift TMP3 2 DWs
822 psrldq $8, \TMP2 # right shift TMP2 2 DWs
823 pxor \TMP3, \XMM5
824 pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5
825
826 # first phase of reduction
827
828 movdqa \XMM5, \TMP2
829 movdqa \XMM5, \TMP3
830 movdqa \XMM5, \TMP4
831# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
        pslld      $31, \TMP2             # packed left shift << 31
        pslld      $30, \TMP3             # packed left shift << 30
        pslld      $25, \TMP4             # packed left shift << 25
835 pxor \TMP3, \TMP2 # xor the shifted versions
836 pxor \TMP4, \TMP2
837 movdqa \TMP2, \TMP5
838 psrldq $4, \TMP5 # right shift T5 1 DW
839 pslldq $12, \TMP2 # left shift T2 3 DWs
840 pxor \TMP2, \XMM5
841
842 # second phase of reduction
843
844 movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
845 movdqa \XMM5,\TMP3
846 movdqa \XMM5,\TMP4
        psrld      $1, \TMP2              # packed right shift >>1
        psrld      $2, \TMP3              # packed right shift >>2
        psrld      $7, \TMP4              # packed right shift >>7
850 pxor \TMP3,\TMP2 # xor the shifted versions
851 pxor \TMP4,\TMP2
852 pxor \TMP5, \TMP2
853 pxor \TMP2, \XMM5
        pxor       \TMP1, \XMM5           # result is in XMM5
855
856 pxor \XMM5, \XMM1
857.endm
858
859/*
860* decrypt 4 blocks at a time
861* ghash the 4 previously decrypted ciphertext blocks
862* arg1, %arg2, %arg3 are used as pointers only, not modified
863* %r11 is the data offset value
864*/
865.macro GHASH_4_ENCRYPT_4_PARALLEL_DEC TMP1 TMP2 TMP3 TMP4 TMP5 \
866TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
867
868 movdqa \XMM1, \XMM5
869 movdqa \XMM2, \XMM6
870 movdqa \XMM3, \XMM7
871 movdqa \XMM4, \XMM8
872
873 movdqa SHUF_MASK(%rip), %xmm15
874 # multiply TMP5 * HashKey using karatsuba
875
876 movdqa \XMM5, \TMP4
877 pshufd $78, \XMM5, \TMP6
878 pxor \XMM5, \TMP6
879 paddd ONE(%rip), \XMM0 # INCR CNT
880 movdqa HashKey_4(%rsp), \TMP5
881 PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1
882 movdqa \XMM0, \XMM1
883 paddd ONE(%rip), \XMM0 # INCR CNT
884 movdqa \XMM0, \XMM2
885 paddd ONE(%rip), \XMM0 # INCR CNT
886 movdqa \XMM0, \XMM3
887 paddd ONE(%rip), \XMM0 # INCR CNT
888 movdqa \XMM0, \XMM4
889 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
890 PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0
891 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
892 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
893 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
894
895 pxor (%arg1), \XMM1
896 pxor (%arg1), \XMM2
897 pxor (%arg1), \XMM3
898 pxor (%arg1), \XMM4
899 movdqa HashKey_4_k(%rsp), \TMP5
900 PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
901 movaps 0x10(%arg1), \TMP1
902 AESENC \TMP1, \XMM1 # Round 1
903 AESENC \TMP1, \XMM2
904 AESENC \TMP1, \XMM3
905 AESENC \TMP1, \XMM4
906 movaps 0x20(%arg1), \TMP1
907 AESENC \TMP1, \XMM1 # Round 2
908 AESENC \TMP1, \XMM2
909 AESENC \TMP1, \XMM3
910 AESENC \TMP1, \XMM4
911 movdqa \XMM6, \TMP1
912 pshufd $78, \XMM6, \TMP2
913 pxor \XMM6, \TMP2
914 movdqa HashKey_3(%rsp), \TMP5
915 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
916 movaps 0x30(%arg1), \TMP3
917 AESENC \TMP3, \XMM1 # Round 3
918 AESENC \TMP3, \XMM2
919 AESENC \TMP3, \XMM3
920 AESENC \TMP3, \XMM4
921 PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0
922 movaps 0x40(%arg1), \TMP3
923 AESENC \TMP3, \XMM1 # Round 4
924 AESENC \TMP3, \XMM2
925 AESENC \TMP3, \XMM3
926 AESENC \TMP3, \XMM4
927 movdqa HashKey_3_k(%rsp), \TMP5
928 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
929 movaps 0x50(%arg1), \TMP3
930 AESENC \TMP3, \XMM1 # Round 5
931 AESENC \TMP3, \XMM2
932 AESENC \TMP3, \XMM3
933 AESENC \TMP3, \XMM4
934 pxor \TMP1, \TMP4
935# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
936 pxor \XMM6, \XMM5
937 pxor \TMP2, \TMP6
938 movdqa \XMM7, \TMP1
939 pshufd $78, \XMM7, \TMP2
940 pxor \XMM7, \TMP2
941 movdqa HashKey_2(%rsp ), \TMP5
942
943 # Multiply TMP5 * HashKey using karatsuba
944
945 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
946 movaps 0x60(%arg1), \TMP3
947 AESENC \TMP3, \XMM1 # Round 6
948 AESENC \TMP3, \XMM2
949 AESENC \TMP3, \XMM3
950 AESENC \TMP3, \XMM4
951 PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0
952 movaps 0x70(%arg1), \TMP3
953 AESENC \TMP3, \XMM1 # Round 7
954 AESENC \TMP3, \XMM2
955 AESENC \TMP3, \XMM3
956 AESENC \TMP3, \XMM4
957 movdqa HashKey_2_k(%rsp), \TMP5
958 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
959 movaps 0x80(%arg1), \TMP3
960 AESENC \TMP3, \XMM1 # Round 8
961 AESENC \TMP3, \XMM2
962 AESENC \TMP3, \XMM3
963 AESENC \TMP3, \XMM4
964 pxor \TMP1, \TMP4
965# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
966 pxor \XMM7, \XMM5
967 pxor \TMP2, \TMP6
968
969 # Multiply XMM8 * HashKey
970 # XMM8 and TMP5 hold the values for the two operands
971
972 movdqa \XMM8, \TMP1
973 pshufd $78, \XMM8, \TMP2
974 pxor \XMM8, \TMP2
975 movdqa HashKey(%rsp), \TMP5
976 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
977 movaps 0x90(%arg1), \TMP3
978 AESENC \TMP3, \XMM1 # Round 9
979 AESENC \TMP3, \XMM2
980 AESENC \TMP3, \XMM3
981 AESENC \TMP3, \XMM4
982 PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0
983 movaps 0xa0(%arg1), \TMP3
984 AESENCLAST \TMP3, \XMM1 # Round 10
985 AESENCLAST \TMP3, \XMM2
986 AESENCLAST \TMP3, \XMM3
987 AESENCLAST \TMP3, \XMM4
988 movdqa HashKey_k(%rsp), \TMP5
989 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
990 movdqu (%arg3,%r11,1), \TMP3
991 pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
992 movdqu \XMM1, (%arg2,%r11,1) # Write to plaintext buffer
993 movdqa \TMP3, \XMM1
994 movdqu 16(%arg3,%r11,1), \TMP3
995 pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK
996 movdqu \XMM2, 16(%arg2,%r11,1) # Write to plaintext buffer
997 movdqa \TMP3, \XMM2
998 movdqu 32(%arg3,%r11,1), \TMP3
999 pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK
1000 movdqu \XMM3, 32(%arg2,%r11,1) # Write to plaintext buffer
1001 movdqa \TMP3, \XMM3
1002 movdqu 48(%arg3,%r11,1), \TMP3
1003 pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK
1004 movdqu \XMM4, 48(%arg2,%r11,1) # Write to plaintext buffer
1005 movdqa \TMP3, \XMM4
1006 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
1007 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
1008 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
1009 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
1010
1011 pxor \TMP4, \TMP1
1012 pxor \XMM8, \XMM5
1013 pxor \TMP6, \TMP2
1014 pxor \TMP1, \TMP2
1015 pxor \XMM5, \TMP2
1016 movdqa \TMP2, \TMP3
1017 pslldq $8, \TMP3 # left shift TMP3 2 DWs
1018 psrldq $8, \TMP2 # right shift TMP2 2 DWs
1019 pxor \TMP3, \XMM5
1020 pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5
1021
1022 # first phase of reduction
1023
1024 movdqa \XMM5, \TMP2
1025 movdqa \XMM5, \TMP3
1026 movdqa \XMM5, \TMP4
1027# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
        pslld      $31, \TMP2             # packed left shift << 31
        pslld      $30, \TMP3             # packed left shift << 30
        pslld      $25, \TMP4             # packed left shift << 25
1031 pxor \TMP3, \TMP2 # xor the shifted versions
1032 pxor \TMP4, \TMP2
1033 movdqa \TMP2, \TMP5
1034 psrldq $4, \TMP5 # right shift T5 1 DW
1035 pslldq $12, \TMP2 # left shift T2 3 DWs
1036 pxor \TMP2, \XMM5
1037
1038 # second phase of reduction
1039
1040 movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
1041 movdqa \XMM5,\TMP3
1042 movdqa \XMM5,\TMP4
        psrld      $1, \TMP2              # packed right shift >>1
        psrld      $2, \TMP3              # packed right shift >>2
        psrld      $7, \TMP4              # packed right shift >>7
1046 pxor \TMP3,\TMP2 # xor the shifted versions
1047 pxor \TMP4,\TMP2
1048 pxor \TMP5, \TMP2
1049 pxor \TMP2, \XMM5
        pxor       \TMP1, \XMM5           # result is in XMM5
1051
1052 pxor \XMM5, \XMM1
1053.endm
1054
1055/* GHASH the last 4 ciphertext blocks. */
1056.macro GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
1057TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
1058
1059 # Multiply TMP6 * HashKey (using Karatsuba)
1060
1061 movdqa \XMM1, \TMP6
1062 pshufd $78, \XMM1, \TMP2
1063 pxor \XMM1, \TMP2
1064 movdqa HashKey_4(%rsp), \TMP5
1065 PCLMULQDQ 0x11, \TMP5, \TMP6 # TMP6 = a1*b1
1066 PCLMULQDQ 0x00, \TMP5, \XMM1 # XMM1 = a0*b0
1067 movdqa HashKey_4_k(%rsp), \TMP4
1068 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1069 movdqa \XMM1, \XMMDst
1070 movdqa \TMP2, \XMM1 # result in TMP6, XMMDst, XMM1
1071
1072 # Multiply TMP1 * HashKey (using Karatsuba)
1073
1074 movdqa \XMM2, \TMP1
1075 pshufd $78, \XMM2, \TMP2
1076 pxor \XMM2, \TMP2
1077 movdqa HashKey_3(%rsp), \TMP5
1078 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1079 PCLMULQDQ 0x00, \TMP5, \XMM2 # XMM2 = a0*b0
1080 movdqa HashKey_3_k(%rsp), \TMP4
1081 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1082 pxor \TMP1, \TMP6
1083 pxor \XMM2, \XMMDst
1084 pxor \TMP2, \XMM1
1085# results accumulated in TMP6, XMMDst, XMM1
1086
1087 # Multiply TMP1 * HashKey (using Karatsuba)
1088
1089 movdqa \XMM3, \TMP1
1090 pshufd $78, \XMM3, \TMP2
1091 pxor \XMM3, \TMP2
1092 movdqa HashKey_2(%rsp), \TMP5
1093 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1094 PCLMULQDQ 0x00, \TMP5, \XMM3 # XMM3 = a0*b0
1095 movdqa HashKey_2_k(%rsp), \TMP4
1096 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1097 pxor \TMP1, \TMP6
1098 pxor \XMM3, \XMMDst
1099 pxor \TMP2, \XMM1 # results accumulated in TMP6, XMMDst, XMM1
1100
1101 # Multiply TMP1 * HashKey (using Karatsuba)
1102 movdqa \XMM4, \TMP1
1103 pshufd $78, \XMM4, \TMP2
1104 pxor \XMM4, \TMP2
1105 movdqa HashKey(%rsp), \TMP5
1106 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1107 PCLMULQDQ 0x00, \TMP5, \XMM4 # XMM4 = a0*b0
1108 movdqa HashKey_k(%rsp), \TMP4
1109 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1110 pxor \TMP1, \TMP6
1111 pxor \XMM4, \XMMDst
1112 pxor \XMM1, \TMP2
1113 pxor \TMP6, \TMP2
1114 pxor \XMMDst, \TMP2
1115 # middle section of the temp results combined as in karatsuba algorithm
1116 movdqa \TMP2, \TMP4
1117 pslldq $8, \TMP4 # left shift TMP4 2 DWs
1118 psrldq $8, \TMP2 # right shift TMP2 2 DWs
1119 pxor \TMP4, \XMMDst
1120 pxor \TMP2, \TMP6
1121# TMP6:XMMDst holds the result of the accumulated carry-less multiplications
1122 # first phase of the reduction
1123 movdqa \XMMDst, \TMP2
1124 movdqa \XMMDst, \TMP3
1125 movdqa \XMMDst, \TMP4
1126# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
        pslld      $31, \TMP2             # packed left shifting << 31
        pslld      $30, \TMP3             # packed left shifting << 30
        pslld      $25, \TMP4             # packed left shifting << 25
1130 pxor \TMP3, \TMP2 # xor the shifted versions
1131 pxor \TMP4, \TMP2
1132 movdqa \TMP2, \TMP7
1133 psrldq $4, \TMP7 # right shift TMP7 1 DW
1134 pslldq $12, \TMP2 # left shift TMP2 3 DWs
1135 pxor \TMP2, \XMMDst
1136
1137 # second phase of the reduction
1138 movdqa \XMMDst, \TMP2
1139 # make 3 copies of XMMDst for doing 3 shift operations
1140 movdqa \XMMDst, \TMP3
1141 movdqa \XMMDst, \TMP4
        psrld      $1, \TMP2              # packed right shift >> 1
        psrld      $2, \TMP3              # packed right shift >> 2
        psrld      $7, \TMP4              # packed right shift >> 7
1145 pxor \TMP3, \TMP2 # xor the shifted versions
1146 pxor \TMP4, \TMP2
1147 pxor \TMP7, \TMP2
1148 pxor \TMP2, \XMMDst
1149 pxor \TMP6, \XMMDst # reduced result is in XMMDst
1150.endm
1151
/* Encrypt a single block (AES-128: 10 rounds) */
1153.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1
1154
1155 pxor (%arg1), \XMM0
1156 movaps 16(%arg1), \TMP1
1157 AESENC \TMP1, \XMM0
1158 movaps 32(%arg1), \TMP1
1159 AESENC \TMP1, \XMM0
1160 movaps 48(%arg1), \TMP1
1161 AESENC \TMP1, \XMM0
1162 movaps 64(%arg1), \TMP1
1163 AESENC \TMP1, \XMM0
1164 movaps 80(%arg1), \TMP1
1165 AESENC \TMP1, \XMM0
1166 movaps 96(%arg1), \TMP1
1167 AESENC \TMP1, \XMM0
1168 movaps 112(%arg1), \TMP1
1169 AESENC \TMP1, \XMM0
1170 movaps 128(%arg1), \TMP1
1171 AESENC \TMP1, \XMM0
1172 movaps 144(%arg1), \TMP1
1173 AESENC \TMP1, \XMM0
1174 movaps 160(%arg1), \TMP1
1175 AESENCLAST \TMP1, \XMM0
1176.endm
1177
1178
1179/*****************************************************************************
1180* void aesni_gcm_dec(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1181* u8 *out, // Plaintext output. Encrypt in-place is allowed.
1182* const u8 *in, // Ciphertext input
1183* u64 plaintext_len, // Length of data in bytes for decryption.
1184* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
1185* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1186* // concatenated with 0x00000001. 16-byte aligned pointer.
1187* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
1188* const u8 *aad, // Additional Authentication Data (AAD)
1189* u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1190* u8 *auth_tag, // Authenticated Tag output. The driver will compare this to the
1191* // given authentication tag and only return the plaintext if they match.
1192* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
1193* // (most likely), 12 or 8.
1194*
1195* Assumptions:
1196*
1197* keys:
1198* keys are pre-expanded and aligned to 16 bytes. we are using the first
1199* set of 11 keys in the data structure void *aes_ctx
1200*
1201* iv:
1202* 0 1 2 3
1203* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1204* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1205* | Salt (From the SA) |
1206* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1207* | Initialization Vector |
1208* | (This is the sequence number from IPSec header) |
1209* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1210* | 0x1 |
1211* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1212*
1213*
1214*
1215* AAD:
1216* AAD padded to 128 bits with 0
1217* for example, assume AAD is a u32 vector
1218*
1219* if AAD is 8 bytes:
1220* AAD[3] = {A0, A1};
1221* padded AAD in xmm register = {A1 A0 0 0}
1222*
1223* 0 1 2 3
1224* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1225* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1226* | SPI (A1) |
1227* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1228* | 32-bit Sequence Number (A0) |
1229* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1230* | 0x0 |
1231* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1232*
1233* AAD Format with 32-bit Sequence Number
1234*
1235* if AAD is 12 bytes:
1236* AAD[3] = {A0, A1, A2};
1237* padded AAD in xmm register = {A2 A1 A0 0}
1238*
1239* 0 1 2 3
1240* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1241* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1244* | SPI (A2) |
1245* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1246* | 64-bit Extended Sequence Number {A1,A0} |
1247* | |
1248* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1249* | 0x0 |
1250* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1251*
1252* AAD Format with 64-bit Extended Sequence Number
1253*
1254* aadLen:
1255* from the definition of the spec, aadLen can only be 8 or 12 bytes.
1256* The code supports 16 too but for other sizes, the code will fail.
1257*
1258* TLen:
1259* from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
1260* For other sizes, the code will fail.
1261*
1262* poly = x^128 + x^127 + x^126 + x^121 + 1
1263*
1264*****************************************************************************/
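/*
* Illustrative call from C (sketch only; the local variable names below are
* hypothetical, the prototype is the one documented above):
*
*	u8 tag[16];
*	aesni_gcm_dec(aes_ctx, dst, src, len, iv, hash_subkey,
*		      aad, aad_len, tag, 16);
*
* iv points to the 16-byte pre-counter block J0 (salt || IV || 0x00000001),
* hash_subkey to H, and the computed tag is written to tag[] for the caller
* to compare against the received authentication tag.
*/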
1265
1266ENTRY(aesni_gcm_dec)
1267 push %r12
1268 push %r13
1269 push %r14
1270 mov %rsp, %r14
1271/*
1272* states of %xmm registers %xmm6:%xmm15 not saved
1273* all %xmm registers are clobbered
1274*/
1275 sub $VARIABLE_OFFSET, %rsp
1276 and $~63, %rsp # align rsp to 64 bytes
1277 mov %arg6, %r12
1278 movdqu (%r12), %xmm13 # %xmm13 = HashKey
1279 movdqa SHUF_MASK(%rip), %xmm2
1280 PSHUFB_XMM %xmm2, %xmm13
1281
1282
1283# Precompute HashKey<<1 (mod poly) from the hash key (required for GHASH)
1284
1285 movdqa %xmm13, %xmm2
1286 psllq $1, %xmm13
1287 psrlq $63, %xmm2
1288 movdqa %xmm2, %xmm1
1289 pslldq $8, %xmm2
1290 psrldq $8, %xmm1
1291 por %xmm2, %xmm13
1292
1293 # Reduction
1294
1295 pshufd $0x24, %xmm1, %xmm2
1296 pcmpeqd TWOONE(%rip), %xmm2
1297 pand POLY(%rip), %xmm2
1298 pxor %xmm2, %xmm13 # %xmm13 holds the HashKey<<1 (mod poly)
1299
1300
1301 # Decrypt first few blocks
1302
1303 movdqa %xmm13, HashKey(%rsp) # store HashKey<<1 (mod poly)
1304 mov %arg4, %r13 # save the number of bytes of plaintext/ciphertext
1305 and $-16, %r13 # %r13 = %r13 - (%r13 mod 16)
1306 mov %r13, %r12
1307 and $(3<<4), %r12
1308 jz _initial_num_blocks_is_0_decrypt
1309 cmp $(2<<4), %r12
1310 jb _initial_num_blocks_is_1_decrypt
1311 je _initial_num_blocks_is_2_decrypt
1312_initial_num_blocks_is_3_decrypt:
1313 INITIAL_BLOCKS_DEC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1314%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, dec
1315 sub $48, %r13
1316 jmp _initial_blocks_decrypted
1317_initial_num_blocks_is_2_decrypt:
1318 INITIAL_BLOCKS_DEC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1319%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, dec
1320 sub $32, %r13
1321 jmp _initial_blocks_decrypted
1322_initial_num_blocks_is_1_decrypt:
1323 INITIAL_BLOCKS_DEC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1324%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, dec
1325 sub $16, %r13
1326 jmp _initial_blocks_decrypted
1327_initial_num_blocks_is_0_decrypt:
1328 INITIAL_BLOCKS_DEC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1329%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, dec
1330_initial_blocks_decrypted:
1331 cmp $0, %r13
1332 je _zero_cipher_left_decrypt
1333 sub $64, %r13
1334 je _four_cipher_left_decrypt
1335_decrypt_by_4:
1336 GHASH_4_ENCRYPT_4_PARALLEL_DEC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
1337%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, dec
1338 add $64, %r11
1339 sub $64, %r13
1340 jne _decrypt_by_4
1341_four_cipher_left_decrypt:
1342 GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
1343%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
1344_zero_cipher_left_decrypt:
1345 mov %arg4, %r13
1346 and $15, %r13 # %r13 = arg4 (mod 16)
1347 je _multiple_of_16_bytes_decrypt
1348
1349 # Handle the last <16 byte block separately
1350
1351 paddd ONE(%rip), %xmm0 # increment CNT to get Yn
1352 movdqa SHUF_MASK(%rip), %xmm10
1353 PSHUFB_XMM %xmm10, %xmm0
1354
1355 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Yn)
1356 sub $16, %r11
1357 add %r13, %r11
1358 movdqu (%arg3,%r11,1), %xmm1 # receive the last <16 byte block
1359 lea SHIFT_MASK+16(%rip), %r12
1360 sub %r13, %r12
1361# adjust the shuffle mask pointer to be able to shift 16-%r13 bytes
1362# (%r13 is the number of bytes in plaintext mod 16)
1363 movdqu (%r12), %xmm2 # get the appropriate shuffle mask
        PSHUFB_XMM %xmm2, %xmm1           # right shift 16-%r13 bytes
1365
1366 movdqa %xmm1, %xmm2
1367 pxor %xmm1, %xmm0 # Ciphertext XOR E(K, Yn)
1368 movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
1369 # get the appropriate mask to mask out top 16-%r13 bytes of %xmm0
1370 pand %xmm1, %xmm0 # mask out top 16-%r13 bytes of %xmm0
1371 pand %xmm1, %xmm2
1372 movdqa SHUF_MASK(%rip), %xmm10
1373 PSHUFB_XMM %xmm10 ,%xmm2
1374
1375 pxor %xmm2, %xmm8
1376 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1377 # GHASH computation for the last <16 byte block
1378 sub %r13, %r11
1379 add $16, %r11
1380
1381 # output %r13 bytes
1382 MOVQ_R64_XMM %xmm0, %rax
1383 cmp $8, %r13
1384 jle _less_than_8_bytes_left_decrypt
1385 mov %rax, (%arg2 , %r11, 1)
1386 add $8, %r11
1387 psrldq $8, %xmm0
1388 MOVQ_R64_XMM %xmm0, %rax
1389 sub $8, %r13
1390_less_than_8_bytes_left_decrypt:
1391 mov %al, (%arg2, %r11, 1)
1392 add $1, %r11
1393 shr $8, %rax
1394 sub $1, %r13
1395 jne _less_than_8_bytes_left_decrypt
1396_multiple_of_16_bytes_decrypt:
        mov        arg8, %r12             # %r12 = aadLen (number of bytes)
1398 shl $3, %r12 # convert into number of bits
1399 movd %r12d, %xmm15 # len(A) in %xmm15
        shl        $3, %arg4              # len(C) in bits (*8)
1401 MOVQ_R64_XMM %arg4, %xmm1
1402 pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000
1403 pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C)
1404 pxor %xmm15, %xmm8
1405 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1406 # final GHASH computation
1407 movdqa SHUF_MASK(%rip), %xmm10
1408 PSHUFB_XMM %xmm10, %xmm8
1409
1410 mov %arg5, %rax # %rax = *Y0
1411 movdqu (%rax), %xmm0 # %xmm0 = Y0
1412 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Y0)
1413 pxor %xmm8, %xmm0
1414_return_T_decrypt:
1415 mov arg9, %r10 # %r10 = authTag
1416 mov arg10, %r11 # %r11 = auth_tag_len
1417 cmp $16, %r11
1418 je _T_16_decrypt
1419 cmp $12, %r11
1420 je _T_12_decrypt
1421_T_8_decrypt:
1422 MOVQ_R64_XMM %xmm0, %rax
1423 mov %rax, (%r10)
1424 jmp _return_T_done_decrypt
1425_T_12_decrypt:
1426 MOVQ_R64_XMM %xmm0, %rax
1427 mov %rax, (%r10)
1428 psrldq $8, %xmm0
1429 movd %xmm0, %eax
1430 mov %eax, 8(%r10)
1431 jmp _return_T_done_decrypt
1432_T_16_decrypt:
1433 movdqu %xmm0, (%r10)
1434_return_T_done_decrypt:
1435 mov %r14, %rsp
1436 pop %r14
1437 pop %r13
1438 pop %r12
1439 ret
1440
1441
1442/*****************************************************************************
1443* void aesni_gcm_enc(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1444* u8 *out, // Ciphertext output. Encrypt in-place is allowed.
1445* const u8 *in, // Plaintext input
1446* u64 plaintext_len, // Length of data in bytes for encryption.
1447* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
1448* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1449* // concatenated with 0x00000001. 16-byte aligned pointer.
1450* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
1451* const u8 *aad, // Additional Authentication Data (AAD)
1452* u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1453* u8 *auth_tag, // Authenticated Tag output.
1454* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
1455* // 12 or 8.
1456*
1457* Assumptions:
1458*
1459* keys:
1460* keys are pre-expanded and aligned to 16 bytes. we are using the
1461* first set of 11 keys in the data structure void *aes_ctx
1462*
1463*
1464* iv:
1465* 0 1 2 3
1466* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1467* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1468* | Salt (From the SA) |
1469* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1470* | Initialization Vector |
1471* | (This is the sequence number from IPSec header) |
1472* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1473* | 0x1 |
1474* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1475*
1476*
1477*
1478* AAD:
1479* AAD padded to 128 bits with 0
1480* for example, assume AAD is a u32 vector
1481*
1482* if AAD is 8 bytes:
1483* AAD[3] = {A0, A1};
1484* padded AAD in xmm register = {A1 A0 0 0}
1485*
1486* 0 1 2 3
1487* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1488* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1489* | SPI (A1) |
1490* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1491* | 32-bit Sequence Number (A0) |
1492* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1493* | 0x0 |
1494* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1495*
1496* AAD Format with 32-bit Sequence Number
1497*
1498* if AAD is 12 bytes:
1499* AAD[3] = {A0, A1, A2};
1500* padded AAD in xmm register = {A2 A1 A0 0}
1501*
1502* 0 1 2 3
1503* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1504* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1505* | SPI (A2) |
1506* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1507* | 64-bit Extended Sequence Number {A1,A0} |
1508* | |
1509* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1510* | 0x0 |
1511* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1512*
1513* AAD Format with 64-bit Extended Sequence Number
1514*
1515* aadLen:
1516* from the definition of the spec, aadLen can only be 8 or 12 bytes.
1517* The code supports 16 too but for other sizes, the code will fail.
1518*
1519* TLen:
1520* from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
1521* For other sizes, the code will fail.
1522*
1523* poly = x^128 + x^127 + x^126 + x^121 + 1
1524***************************************************************************/
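/*
* Illustrative call from C (sketch only; variable names are hypothetical):
*
*	u8 tag[16];
*	aesni_gcm_enc(aes_ctx, dst, src, len, iv, hash_subkey,
*		      aad, aad_len, tag, 16);
*/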
1525ENTRY(aesni_gcm_enc)
1526 push %r12
1527 push %r13
1528 push %r14
1529 mov %rsp, %r14
1530#
1531# states of %xmm registers %xmm6:%xmm15 not saved
1532# all %xmm registers are clobbered
1533#
1534 sub $VARIABLE_OFFSET, %rsp
1535 and $~63, %rsp
1536 mov %arg6, %r12
1537 movdqu (%r12), %xmm13
1538 movdqa SHUF_MASK(%rip), %xmm2
1539 PSHUFB_XMM %xmm2, %xmm13
1540
1541
1542# precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
1543
1544 movdqa %xmm13, %xmm2
1545 psllq $1, %xmm13
1546 psrlq $63, %xmm2
1547 movdqa %xmm2, %xmm1
1548 pslldq $8, %xmm2
1549 psrldq $8, %xmm1
1550 por %xmm2, %xmm13
1551
1552 # reduce HashKey<<1
1553
1554 pshufd $0x24, %xmm1, %xmm2
1555 pcmpeqd TWOONE(%rip), %xmm2
1556 pand POLY(%rip), %xmm2
1557 pxor %xmm2, %xmm13
        movdqa     %xmm13, HashKey(%rsp)  # store HashKey<<1 (mod poly)
        mov        %arg4, %r13            # %r13 = length of plaintext in bytes
        and        $-16, %r13             # %r13 = %r13 - (%r13 mod 16)
1561 mov %r13, %r12
1562
1563 # Encrypt first few blocks
1564
1565 and $(3<<4), %r12
1566 jz _initial_num_blocks_is_0_encrypt
1567 cmp $(2<<4), %r12
1568 jb _initial_num_blocks_is_1_encrypt
1569 je _initial_num_blocks_is_2_encrypt
1570_initial_num_blocks_is_3_encrypt:
1571 INITIAL_BLOCKS_ENC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1572%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, enc
1573 sub $48, %r13
1574 jmp _initial_blocks_encrypted
1575_initial_num_blocks_is_2_encrypt:
1576 INITIAL_BLOCKS_ENC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1577%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, enc
1578 sub $32, %r13
1579 jmp _initial_blocks_encrypted
1580_initial_num_blocks_is_1_encrypt:
1581 INITIAL_BLOCKS_ENC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1582%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, enc
1583 sub $16, %r13
1584 jmp _initial_blocks_encrypted
1585_initial_num_blocks_is_0_encrypt:
1586 INITIAL_BLOCKS_ENC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1587%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, enc
1588_initial_blocks_encrypted:
1589
1590 # Main loop - Encrypt remaining blocks
1591
1592 cmp $0, %r13
1593 je _zero_cipher_left_encrypt
1594 sub $64, %r13
1595 je _four_cipher_left_encrypt
1596_encrypt_by_4_encrypt:
1597 GHASH_4_ENCRYPT_4_PARALLEL_ENC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
1598%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, enc
1599 add $64, %r11
1600 sub $64, %r13
1601 jne _encrypt_by_4_encrypt
1602_four_cipher_left_encrypt:
1603 GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
1604%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
1605_zero_cipher_left_encrypt:
1606 mov %arg4, %r13
1607 and $15, %r13 # %r13 = arg4 (mod 16)
1608 je _multiple_of_16_bytes_encrypt
1609
1610 # Handle the last <16 Byte block separately
1611 paddd ONE(%rip), %xmm0 # INCR CNT to get Yn
1612 movdqa SHUF_MASK(%rip), %xmm10
1613 PSHUFB_XMM %xmm10, %xmm0
1614
1615
1616 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # Encrypt(K, Yn)
1617 sub $16, %r11
1618 add %r13, %r11
1619 movdqu (%arg3,%r11,1), %xmm1 # receive the last <16 byte blocks
1620 lea SHIFT_MASK+16(%rip), %r12
1621 sub %r13, %r12
1622 # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
1623 # (%r13 is the number of bytes in plaintext mod 16)
1624 movdqu (%r12), %xmm2 # get the appropriate shuffle mask
1625 PSHUFB_XMM %xmm2, %xmm1 # shift right 16-r13 byte
1626 pxor %xmm1, %xmm0 # Plaintext XOR Encrypt(K, Yn)
1627 movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
1628 # get the appropriate mask to mask out top 16-r13 bytes of xmm0
1629 pand %xmm1, %xmm0 # mask out top 16-r13 bytes of xmm0
1630 movdqa SHUF_MASK(%rip), %xmm10
1631 PSHUFB_XMM %xmm10,%xmm0
1632
1633 pxor %xmm0, %xmm8
1634 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1635 # GHASH computation for the last <16 byte block
1636 sub %r13, %r11
1637 add $16, %r11
1638
1639 movdqa SHUF_MASK(%rip), %xmm10
1640 PSHUFB_XMM %xmm10, %xmm0
1641
1642 # shuffle xmm0 back to output as ciphertext
1643
1644 # Output %r13 bytes
1645 MOVQ_R64_XMM %xmm0, %rax
1646 cmp $8, %r13
1647 jle _less_than_8_bytes_left_encrypt
1648 mov %rax, (%arg2 , %r11, 1)
1649 add $8, %r11
1650 psrldq $8, %xmm0
1651 MOVQ_R64_XMM %xmm0, %rax
1652 sub $8, %r13
1653_less_than_8_bytes_left_encrypt:
1654 mov %al, (%arg2, %r11, 1)
1655 add $1, %r11
1656 shr $8, %rax
1657 sub $1, %r13
1658 jne _less_than_8_bytes_left_encrypt
1659_multiple_of_16_bytes_encrypt:
        mov        arg8, %r12             # %r12 = aadLen (number of bytes)
1661 shl $3, %r12
1662 movd %r12d, %xmm15 # len(A) in %xmm15
        shl        $3, %arg4              # len(C) in bits (*8)
1664 MOVQ_R64_XMM %arg4, %xmm1
1665 pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000
1666 pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C)
1667 pxor %xmm15, %xmm8
1668 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1669 # final GHASH computation
1670 movdqa SHUF_MASK(%rip), %xmm10
1671 PSHUFB_XMM %xmm10, %xmm8 # perform a 16 byte swap
1672
1673 mov %arg5, %rax # %rax = *Y0
1674 movdqu (%rax), %xmm0 # %xmm0 = Y0
1675 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm15 # Encrypt(K, Y0)
1676 pxor %xmm8, %xmm0
1677_return_T_encrypt:
1678 mov arg9, %r10 # %r10 = authTag
1679 mov arg10, %r11 # %r11 = auth_tag_len
1680 cmp $16, %r11
1681 je _T_16_encrypt
1682 cmp $12, %r11
1683 je _T_12_encrypt
1684_T_8_encrypt:
1685 MOVQ_R64_XMM %xmm0, %rax
1686 mov %rax, (%r10)
1687 jmp _return_T_done_encrypt
1688_T_12_encrypt:
1689 MOVQ_R64_XMM %xmm0, %rax
1690 mov %rax, (%r10)
1691 psrldq $8, %xmm0
1692 movd %xmm0, %eax
1693 mov %eax, 8(%r10)
1694 jmp _return_T_done_encrypt
1695_T_16_encrypt:
1696 movdqu %xmm0, (%r10)
1697_return_T_done_encrypt:
1698 mov %r14, %rsp
1699 pop %r14
1700 pop %r13
1701 pop %r12
1702 ret
1703
1704#endif
1705
1706
1707_key_expansion_128:
1708_key_expansion_256a:
1709 pshufd $0b11111111, %xmm1, %xmm1
1710 shufps $0b00010000, %xmm0, %xmm4
1711 pxor %xmm4, %xmm0
1712 shufps $0b10001100, %xmm0, %xmm4
1713 pxor %xmm4, %xmm0
1714 pxor %xmm1, %xmm0
1715 movaps %xmm0, (TKEYP)
1716 add $0x10, TKEYP
1717 ret
1718
1719.align 4
1720_key_expansion_192a:
1721 pshufd $0b01010101, %xmm1, %xmm1
1722 shufps $0b00010000, %xmm0, %xmm4
1723 pxor %xmm4, %xmm0
1724 shufps $0b10001100, %xmm0, %xmm4
1725 pxor %xmm4, %xmm0
1726 pxor %xmm1, %xmm0
1727
1728 movaps %xmm2, %xmm5
1729 movaps %xmm2, %xmm6
1730 pslldq $4, %xmm5
1731 pshufd $0b11111111, %xmm0, %xmm3
1732 pxor %xmm3, %xmm2
1733 pxor %xmm5, %xmm2
1734
1735 movaps %xmm0, %xmm1
1736 shufps $0b01000100, %xmm0, %xmm6
1737 movaps %xmm6, (TKEYP)
1738 shufps $0b01001110, %xmm2, %xmm1
1739 movaps %xmm1, 0x10(TKEYP)
1740 add $0x20, TKEYP
1741 ret
1742
1743.align 4
1744_key_expansion_192b:
1745 pshufd $0b01010101, %xmm1, %xmm1
1746 shufps $0b00010000, %xmm0, %xmm4
1747 pxor %xmm4, %xmm0
1748 shufps $0b10001100, %xmm0, %xmm4
1749 pxor %xmm4, %xmm0
1750 pxor %xmm1, %xmm0
1751
1752 movaps %xmm2, %xmm5
1753 pslldq $4, %xmm5
1754 pshufd $0b11111111, %xmm0, %xmm3
1755 pxor %xmm3, %xmm2
1756 pxor %xmm5, %xmm2
1757
1758 movaps %xmm0, (TKEYP)
1759 add $0x10, TKEYP
1760 ret
1761
1762.align 4
1763_key_expansion_256b:
1764 pshufd $0b10101010, %xmm1, %xmm1
1765 shufps $0b00010000, %xmm2, %xmm4
1766 pxor %xmm4, %xmm2
1767 shufps $0b10001100, %xmm2, %xmm4
1768 pxor %xmm4, %xmm2
1769 pxor %xmm1, %xmm2
1770 movaps %xmm2, (TKEYP)
1771 add $0x10, TKEYP
1772 ret
1773
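/*
 * Illustrative only (not assembled): one AES-128 key-expansion step written
 * with C intrinsics, a sketch of what _key_expansion_128 above computes for
 * each round of aesni_set_key below.  The function name is ours, and the
 * byte-shift prefix XOR is equivalent in effect to the shufps sequence used
 * in the assembly (which relies on %xmm4 being zero).
 *
 *	#include <wmmintrin.h>		// AES-NI intrinsics
 *
 *	static __m128i aes128_expand_step(__m128i prev, __m128i assist)
 *	{
 *		// broadcast SubWord(RotWord(w3)) ^ rcon to all four words
 *		assist = _mm_shuffle_epi32(assist, 0xff);
 *		// prefix XOR: each word absorbs all lower words of prev
 *		prev = _mm_xor_si128(prev, _mm_slli_si128(prev, 4));
 *		prev = _mm_xor_si128(prev, _mm_slli_si128(prev, 8));
 *		return _mm_xor_si128(prev, assist);
 *	}
 *
 * Usage sketch:
 *	rk[1] = aes128_expand_step(rk[0],
 *				   _mm_aeskeygenassist_si128(rk[0], 0x1));
 */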
1774/*
1775 * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
1776 * unsigned int key_len)
1777 */
1778ENTRY(aesni_set_key)
1779#ifndef __x86_64__
1780 pushl KEYP
1781 movl 8(%esp), KEYP # ctx
1782 movl 12(%esp), UKEYP # in_key
1783 movl 16(%esp), %edx # key_len
1784#endif
1785 movups (UKEYP), %xmm0 # user key (first 16 bytes)
1786 movaps %xmm0, (KEYP)
1787 lea 0x10(KEYP), TKEYP # key addr
1788 movl %edx, 480(KEYP)
1789 pxor %xmm4, %xmm4 # xmm4 is assumed 0 in _key_expansion_x
1790 cmp $24, %dl
1791 jb .Lenc_key128
1792 je .Lenc_key192
1793 movups 0x10(UKEYP), %xmm2 # other user key
1794 movaps %xmm2, (TKEYP)
1795 add $0x10, TKEYP
1796 AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
1797 call _key_expansion_256a
1798 AESKEYGENASSIST 0x1 %xmm0 %xmm1
1799 call _key_expansion_256b
1800 AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2
1801 call _key_expansion_256a
1802 AESKEYGENASSIST 0x2 %xmm0 %xmm1
1803 call _key_expansion_256b
1804 AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3
1805 call _key_expansion_256a
1806 AESKEYGENASSIST 0x4 %xmm0 %xmm1
1807 call _key_expansion_256b
1808 AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4
1809 call _key_expansion_256a
1810 AESKEYGENASSIST 0x8 %xmm0 %xmm1
1811 call _key_expansion_256b
1812 AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5
1813 call _key_expansion_256a
1814 AESKEYGENASSIST 0x10 %xmm0 %xmm1
1815 call _key_expansion_256b
1816 AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6
1817 call _key_expansion_256a
1818 AESKEYGENASSIST 0x20 %xmm0 %xmm1
1819 call _key_expansion_256b
1820 AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7
1821 call _key_expansion_256a
1822 jmp .Ldec_key
1823.Lenc_key192:
1824 movq 0x10(UKEYP), %xmm2 # other user key
1825 AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
1826 call _key_expansion_192a
1827 AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2
1828 call _key_expansion_192b
1829 AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3
1830 call _key_expansion_192a
1831 AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4
1832 call _key_expansion_192b
1833 AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5
1834 call _key_expansion_192a
1835 AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6
1836 call _key_expansion_192b
1837 AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7
1838 call _key_expansion_192a
1839 AESKEYGENASSIST 0x80 %xmm2 %xmm1 # round 8
1840 call _key_expansion_192b
1841 jmp .Ldec_key
1842.Lenc_key128:
1843 AESKEYGENASSIST 0x1 %xmm0 %xmm1 # round 1
1844 call _key_expansion_128
1845 AESKEYGENASSIST 0x2 %xmm0 %xmm1 # round 2
1846 call _key_expansion_128
1847 AESKEYGENASSIST 0x4 %xmm0 %xmm1 # round 3
1848 call _key_expansion_128
1849 AESKEYGENASSIST 0x8 %xmm0 %xmm1 # round 4
1850 call _key_expansion_128
1851 AESKEYGENASSIST 0x10 %xmm0 %xmm1 # round 5
1852 call _key_expansion_128
1853 AESKEYGENASSIST 0x20 %xmm0 %xmm1 # round 6
1854 call _key_expansion_128
1855 AESKEYGENASSIST 0x40 %xmm0 %xmm1 # round 7
1856 call _key_expansion_128
1857 AESKEYGENASSIST 0x80 %xmm0 %xmm1 # round 8
1858 call _key_expansion_128
1859 AESKEYGENASSIST 0x1b %xmm0 %xmm1 # round 9
1860 call _key_expansion_128
1861 AESKEYGENASSIST 0x36 %xmm0 %xmm1 # round 10
1862 call _key_expansion_128
1863.Ldec_key:
1864 sub $0x10, TKEYP
1865 movaps (KEYP), %xmm0
1866 movaps (TKEYP), %xmm1
1867 movaps %xmm0, 240(TKEYP)
1868 movaps %xmm1, 240(KEYP)
1869 add $0x10, KEYP
1870 lea 240-16(TKEYP), UKEYP
1871.align 4
1872.Ldec_key_loop:
1873 movaps (KEYP), %xmm0
1874 AESIMC %xmm0 %xmm1
1875 movaps %xmm1, (UKEYP)
1876 add $0x10, KEYP
1877 sub $0x10, UKEYP
1878 cmp TKEYP, KEYP
1879 jb .Ldec_key_loop
1880 xor AREG, AREG
1881#ifndef __x86_64__
1882 popl KEYP
1883#endif
1884 ret
1885
1886/*
1887 * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
1888 */
1889ENTRY(aesni_enc)
1890#ifndef __x86_64__
1891 pushl KEYP
1892 pushl KLEN
1893 movl 12(%esp), KEYP
1894 movl 16(%esp), OUTP
1895 movl 20(%esp), INP
1896#endif
1897 movl 480(KEYP), KLEN # key length
1898 movups (INP), STATE # input
1899 call _aesni_enc1
1900 movups STATE, (OUTP) # output
1901#ifndef __x86_64__
1902 popl KLEN
1903 popl KEYP
1904#endif
1905 ret
1906
1907/*
1908 * _aesni_enc1: internal ABI
1909 * input:
1910 * KEYP: key struct pointer
1911 *	KLEN:		key length
1912 * STATE: initial state (input)
1913 * output:
1914 *	STATE:		final state (output)
1915 * changed:
1916 * KEY
1917 * TKEYP (T1)
1918 */
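/*
 * Illustrative only (not assembled): the same round structure as the routine
 * below, expressed with C intrinsics.  nrounds is 10/12/14 for 128/192/256-bit
 * keys (this mirrors the cmp $24, KLEN branching, which skips the extra rounds
 * for shorter keys); rk[] holds nrounds + 1 round keys.  The name is ours.
 *
 *	#include <wmmintrin.h>
 *
 *	static __m128i aesni_encrypt_block(const __m128i *rk, int nrounds,
 *					   __m128i in)
 *	{
 *		__m128i s = _mm_xor_si128(in, rk[0]);		// round 0: AddRoundKey
 *
 *		for (int i = 1; i < nrounds; i++)
 *			s = _mm_aesenc_si128(s, rk[i]);		// full rounds
 *		return _mm_aesenclast_si128(s, rk[nrounds]);	// final round
 *	}
 */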
1919.align 4
1920_aesni_enc1:
1921 movaps (KEYP), KEY # key
1922 mov KEYP, TKEYP
1923 pxor KEY, STATE # round 0
1924 add $0x30, TKEYP
1925 cmp $24, KLEN
1926 jb .Lenc128
1927 lea 0x20(TKEYP), TKEYP
1928 je .Lenc192
1929 add $0x20, TKEYP
1930 movaps -0x60(TKEYP), KEY
1931 AESENC KEY STATE
1932 movaps -0x50(TKEYP), KEY
1933 AESENC KEY STATE
1934.align 4
1935.Lenc192:
1936 movaps -0x40(TKEYP), KEY
1937 AESENC KEY STATE
1938 movaps -0x30(TKEYP), KEY
1939 AESENC KEY STATE
1940.align 4
1941.Lenc128:
1942 movaps -0x20(TKEYP), KEY
1943 AESENC KEY STATE
1944 movaps -0x10(TKEYP), KEY
1945 AESENC KEY STATE
1946 movaps (TKEYP), KEY
1947 AESENC KEY STATE
1948 movaps 0x10(TKEYP), KEY
1949 AESENC KEY STATE
1950 movaps 0x20(TKEYP), KEY
1951 AESENC KEY STATE
1952 movaps 0x30(TKEYP), KEY
1953 AESENC KEY STATE
1954 movaps 0x40(TKEYP), KEY
1955 AESENC KEY STATE
1956 movaps 0x50(TKEYP), KEY
1957 AESENC KEY STATE
1958 movaps 0x60(TKEYP), KEY
1959 AESENC KEY STATE
1960 movaps 0x70(TKEYP), KEY
1961 AESENCLAST KEY STATE
1962 ret
1963
1964/*
1965 * _aesni_enc4: internal ABI
1966 * input:
1967 * KEYP: key struct pointer
1968 *	KLEN:		key length
1969 * STATE1: initial state (input)
1970 * STATE2
1971 * STATE3
1972 * STATE4
1973 * output:
1974 *	STATE1:		final state (output)
1975 * STATE2
1976 * STATE3
1977 * STATE4
1978 * changed:
1979 * KEY
1980 * TKEYP (T1)
1981 */
1982.align 4
1983_aesni_enc4:
1984 movaps (KEYP), KEY # key
1985 mov KEYP, TKEYP
1986 pxor KEY, STATE1 # round 0
1987 pxor KEY, STATE2
1988 pxor KEY, STATE3
1989 pxor KEY, STATE4
1990 add $0x30, TKEYP
1991 cmp $24, KLEN
1992 jb .L4enc128
1993 lea 0x20(TKEYP), TKEYP
1994 je .L4enc192
1995 add $0x20, TKEYP
1996 movaps -0x60(TKEYP), KEY
1997 AESENC KEY STATE1
1998 AESENC KEY STATE2
1999 AESENC KEY STATE3
2000 AESENC KEY STATE4
2001 movaps -0x50(TKEYP), KEY
2002 AESENC KEY STATE1
2003 AESENC KEY STATE2
2004 AESENC KEY STATE3
2005 AESENC KEY STATE4
2006#.align 4
2007.L4enc192:
2008 movaps -0x40(TKEYP), KEY
2009 AESENC KEY STATE1
2010 AESENC KEY STATE2
2011 AESENC KEY STATE3
2012 AESENC KEY STATE4
2013 movaps -0x30(TKEYP), KEY
2014 AESENC KEY STATE1
2015 AESENC KEY STATE2
2016 AESENC KEY STATE3
2017 AESENC KEY STATE4
2018#.align 4
2019.L4enc128:
2020 movaps -0x20(TKEYP), KEY
2021 AESENC KEY STATE1
2022 AESENC KEY STATE2
2023 AESENC KEY STATE3
2024 AESENC KEY STATE4
2025 movaps -0x10(TKEYP), KEY
2026 AESENC KEY STATE1
2027 AESENC KEY STATE2
2028 AESENC KEY STATE3
2029 AESENC KEY STATE4
2030 movaps (TKEYP), KEY
2031 AESENC KEY STATE1
2032 AESENC KEY STATE2
2033 AESENC KEY STATE3
2034 AESENC KEY STATE4
2035 movaps 0x10(TKEYP), KEY
2036 AESENC KEY STATE1
2037 AESENC KEY STATE2
2038 AESENC KEY STATE3
2039 AESENC KEY STATE4
2040 movaps 0x20(TKEYP), KEY
2041 AESENC KEY STATE1
2042 AESENC KEY STATE2
2043 AESENC KEY STATE3
2044 AESENC KEY STATE4
2045 movaps 0x30(TKEYP), KEY
2046 AESENC KEY STATE1
2047 AESENC KEY STATE2
2048 AESENC KEY STATE3
2049 AESENC KEY STATE4
2050 movaps 0x40(TKEYP), KEY
2051 AESENC KEY STATE1
2052 AESENC KEY STATE2
2053 AESENC KEY STATE3
2054 AESENC KEY STATE4
2055 movaps 0x50(TKEYP), KEY
2056 AESENC KEY STATE1
2057 AESENC KEY STATE2
2058 AESENC KEY STATE3
2059 AESENC KEY STATE4
2060 movaps 0x60(TKEYP), KEY
2061 AESENC KEY STATE1
2062 AESENC KEY STATE2
2063 AESENC KEY STATE3
2064 AESENC KEY STATE4
2065 movaps 0x70(TKEYP), KEY
2066 AESENCLAST KEY STATE1 # last round
2067 AESENCLAST KEY STATE2
2068 AESENCLAST KEY STATE3
2069 AESENCLAST KEY STATE4
2070 ret
2071
2072/*
2073 * void aesni_dec (struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
2074 */
2075ENTRY(aesni_dec)
2076#ifndef __x86_64__
2077 pushl KEYP
2078 pushl KLEN
2079 movl 12(%esp), KEYP
2080 movl 16(%esp), OUTP
2081 movl 20(%esp), INP
2082#endif
2083 mov 480(KEYP), KLEN # key length
2084 add $240, KEYP
2085 movups (INP), STATE # input
2086 call _aesni_dec1
2087 movups STATE, (OUTP) #output
2088#ifndef __x86_64__
2089 popl KLEN
2090 popl KEYP
2091#endif
2092 ret
2093
2094/*
2095 * _aesni_dec1: internal ABI
2096 * input:
2097 * KEYP: key struct pointer
2098 * KLEN: key length
2099 * STATE: initial state (input)
2100 * output:
2101 *	STATE:		final state (output)
2102 * changed:
2103 * KEY
2104 * TKEYP (T1)
2105 */
2106.align 4
2107_aesni_dec1:
2108 movaps (KEYP), KEY # key
2109 mov KEYP, TKEYP
2110 pxor KEY, STATE # round 0
2111 add $0x30, TKEYP
2112 cmp $24, KLEN
2113 jb .Ldec128
2114 lea 0x20(TKEYP), TKEYP
2115 je .Ldec192
2116 add $0x20, TKEYP
2117 movaps -0x60(TKEYP), KEY
2118 AESDEC KEY STATE
2119 movaps -0x50(TKEYP), KEY
2120 AESDEC KEY STATE
2121.align 4
2122.Ldec192:
2123 movaps -0x40(TKEYP), KEY
2124 AESDEC KEY STATE
2125 movaps -0x30(TKEYP), KEY
2126 AESDEC KEY STATE
2127.align 4
2128.Ldec128:
2129 movaps -0x20(TKEYP), KEY
2130 AESDEC KEY STATE
2131 movaps -0x10(TKEYP), KEY
2132 AESDEC KEY STATE
2133 movaps (TKEYP), KEY
2134 AESDEC KEY STATE
2135 movaps 0x10(TKEYP), KEY
2136 AESDEC KEY STATE
2137 movaps 0x20(TKEYP), KEY
2138 AESDEC KEY STATE
2139 movaps 0x30(TKEYP), KEY
2140 AESDEC KEY STATE
2141 movaps 0x40(TKEYP), KEY
2142 AESDEC KEY STATE
2143 movaps 0x50(TKEYP), KEY
2144 AESDEC KEY STATE
2145 movaps 0x60(TKEYP), KEY
2146 AESDEC KEY STATE
2147 movaps 0x70(TKEYP), KEY
2148 AESDECLAST KEY STATE
2149 ret
2150
2151/*
2152 * _aesni_dec4: internal ABI
2153 * input:
2154 * KEYP: key struct pointer
2155 * KLEN: key length
2156 * STATE1: initial state (input)
2157 * STATE2
2158 * STATE3
2159 * STATE4
2160 * output:
2161 *	STATE1:		final state (output)
2162 * STATE2
2163 * STATE3
2164 * STATE4
2165 * changed:
2166 * KEY
2167 * TKEYP (T1)
2168 */
2169.align 4
2170_aesni_dec4:
2171 movaps (KEYP), KEY # key
2172 mov KEYP, TKEYP
2173 pxor KEY, STATE1 # round 0
2174 pxor KEY, STATE2
2175 pxor KEY, STATE3
2176 pxor KEY, STATE4
2177 add $0x30, TKEYP
2178 cmp $24, KLEN
2179 jb .L4dec128
2180 lea 0x20(TKEYP), TKEYP
2181 je .L4dec192
2182 add $0x20, TKEYP
2183 movaps -0x60(TKEYP), KEY
2184 AESDEC KEY STATE1
2185 AESDEC KEY STATE2
2186 AESDEC KEY STATE3
2187 AESDEC KEY STATE4
2188 movaps -0x50(TKEYP), KEY
2189 AESDEC KEY STATE1
2190 AESDEC KEY STATE2
2191 AESDEC KEY STATE3
2192 AESDEC KEY STATE4
2193.align 4
2194.L4dec192:
2195 movaps -0x40(TKEYP), KEY
2196 AESDEC KEY STATE1
2197 AESDEC KEY STATE2
2198 AESDEC KEY STATE3
2199 AESDEC KEY STATE4
2200 movaps -0x30(TKEYP), KEY
2201 AESDEC KEY STATE1
2202 AESDEC KEY STATE2
2203 AESDEC KEY STATE3
2204 AESDEC KEY STATE4
2205.align 4
2206.L4dec128:
2207 movaps -0x20(TKEYP), KEY
2208 AESDEC KEY STATE1
2209 AESDEC KEY STATE2
2210 AESDEC KEY STATE3
2211 AESDEC KEY STATE4
2212 movaps -0x10(TKEYP), KEY
2213 AESDEC KEY STATE1
2214 AESDEC KEY STATE2
2215 AESDEC KEY STATE3
2216 AESDEC KEY STATE4
2217 movaps (TKEYP), KEY
2218 AESDEC KEY STATE1
2219 AESDEC KEY STATE2
2220 AESDEC KEY STATE3
2221 AESDEC KEY STATE4
2222 movaps 0x10(TKEYP), KEY
2223 AESDEC KEY STATE1
2224 AESDEC KEY STATE2
2225 AESDEC KEY STATE3
2226 AESDEC KEY STATE4
2227 movaps 0x20(TKEYP), KEY
2228 AESDEC KEY STATE1
2229 AESDEC KEY STATE2
2230 AESDEC KEY STATE3
2231 AESDEC KEY STATE4
2232 movaps 0x30(TKEYP), KEY
2233 AESDEC KEY STATE1
2234 AESDEC KEY STATE2
2235 AESDEC KEY STATE3
2236 AESDEC KEY STATE4
2237 movaps 0x40(TKEYP), KEY
2238 AESDEC KEY STATE1
2239 AESDEC KEY STATE2
2240 AESDEC KEY STATE3
2241 AESDEC KEY STATE4
2242 movaps 0x50(TKEYP), KEY
2243 AESDEC KEY STATE1
2244 AESDEC KEY STATE2
2245 AESDEC KEY STATE3
2246 AESDEC KEY STATE4
2247 movaps 0x60(TKEYP), KEY
2248 AESDEC KEY STATE1
2249 AESDEC KEY STATE2
2250 AESDEC KEY STATE3
2251 AESDEC KEY STATE4
2252 movaps 0x70(TKEYP), KEY
2253 AESDECLAST KEY STATE1 # last round
2254 AESDECLAST KEY STATE2
2255 AESDECLAST KEY STATE3
2256 AESDECLAST KEY STATE4
2257 ret
2258
2259/*
2260 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2261 * size_t len)
2262 */
2263ENTRY(aesni_ecb_enc)
2264#ifndef __x86_64__
2265 pushl LEN
2266 pushl KEYP
2267 pushl KLEN
2268 movl 16(%esp), KEYP
2269 movl 20(%esp), OUTP
2270 movl 24(%esp), INP
2271 movl 28(%esp), LEN
2272#endif
2273 test LEN, LEN # check length
2274 jz .Lecb_enc_ret
2275 mov 480(KEYP), KLEN
2276 cmp $16, LEN
2277 jb .Lecb_enc_ret
2278 cmp $64, LEN
2279 jb .Lecb_enc_loop1
2280.align 4
2281.Lecb_enc_loop4:
2282 movups (INP), STATE1
2283 movups 0x10(INP), STATE2
2284 movups 0x20(INP), STATE3
2285 movups 0x30(INP), STATE4
2286 call _aesni_enc4
2287 movups STATE1, (OUTP)
2288 movups STATE2, 0x10(OUTP)
2289 movups STATE3, 0x20(OUTP)
2290 movups STATE4, 0x30(OUTP)
2291 sub $64, LEN
2292 add $64, INP
2293 add $64, OUTP
2294 cmp $64, LEN
2295 jge .Lecb_enc_loop4
2296 cmp $16, LEN
2297 jb .Lecb_enc_ret
2298.align 4
2299.Lecb_enc_loop1:
2300 movups (INP), STATE1
2301 call _aesni_enc1
2302 movups STATE1, (OUTP)
2303 sub $16, LEN
2304 add $16, INP
2305 add $16, OUTP
2306 cmp $16, LEN
2307 jge .Lecb_enc_loop1
2308.Lecb_enc_ret:
2309#ifndef __x86_64__
2310 popl KLEN
2311 popl KEYP
2312 popl LEN
2313#endif
2314 ret
2315
2316/*
2317 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2318 * size_t len);
2319 */
2320ENTRY(aesni_ecb_dec)
2321#ifndef __x86_64__
2322 pushl LEN
2323 pushl KEYP
2324 pushl KLEN
2325 movl 16(%esp), KEYP
2326 movl 20(%esp), OUTP
2327 movl 24(%esp), INP
2328 movl 28(%esp), LEN
2329#endif
2330 test LEN, LEN
2331 jz .Lecb_dec_ret
2332 mov 480(KEYP), KLEN
2333 add $240, KEYP
2334 cmp $16, LEN
2335 jb .Lecb_dec_ret
2336 cmp $64, LEN
2337 jb .Lecb_dec_loop1
2338.align 4
2339.Lecb_dec_loop4:
2340 movups (INP), STATE1
2341 movups 0x10(INP), STATE2
2342 movups 0x20(INP), STATE3
2343 movups 0x30(INP), STATE4
2344 call _aesni_dec4
2345 movups STATE1, (OUTP)
2346 movups STATE2, 0x10(OUTP)
2347 movups STATE3, 0x20(OUTP)
2348 movups STATE4, 0x30(OUTP)
2349 sub $64, LEN
2350 add $64, INP
2351 add $64, OUTP
2352 cmp $64, LEN
2353 jge .Lecb_dec_loop4
2354 cmp $16, LEN
2355 jb .Lecb_dec_ret
2356.align 4
2357.Lecb_dec_loop1:
2358 movups (INP), STATE1
2359 call _aesni_dec1
2360 movups STATE1, (OUTP)
2361 sub $16, LEN
2362 add $16, INP
2363 add $16, OUTP
2364 cmp $16, LEN
2365 jge .Lecb_dec_loop1
2366.Lecb_dec_ret:
2367#ifndef __x86_64__
2368 popl KLEN
2369 popl KEYP
2370 popl LEN
2371#endif
2372 ret
2373
2374/*
2375 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2376 * size_t len, u8 *iv)
2377 */
2378ENTRY(aesni_cbc_enc)
2379#ifndef __x86_64__
2380 pushl IVP
2381 pushl LEN
2382 pushl KEYP
2383 pushl KLEN
2384 movl 20(%esp), KEYP
2385 movl 24(%esp), OUTP
2386 movl 28(%esp), INP
2387 movl 32(%esp), LEN
2388 movl 36(%esp), IVP
2389#endif
2390 cmp $16, LEN
2391 jb .Lcbc_enc_ret
2392 mov 480(KEYP), KLEN
2393 movups (IVP), STATE # load iv as initial state
2394.align 4
2395.Lcbc_enc_loop:
2396 movups (INP), IN # load input
2397 pxor IN, STATE
2398 call _aesni_enc1
2399 movups STATE, (OUTP) # store output
2400 sub $16, LEN
2401 add $16, INP
2402 add $16, OUTP
2403 cmp $16, LEN
2404 jge .Lcbc_enc_loop
2405 movups STATE, (IVP)
2406.Lcbc_enc_ret:
2407#ifndef __x86_64__
2408 popl KLEN
2409 popl KEYP
2410 popl LEN
2411 popl IVP
2412#endif
2413 ret
2414
2415/*
2416 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2417 * size_t len, u8 *iv)
2418 */
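/*
 * Illustrative only (not assembled): the CBC decryption recurrence implemented
 * below, P_i = D(K, C_i) XOR C_{i-1} with C_0 = IV.  Because each plaintext
 * block depends only on ciphertext, the block decryptions themselves can run
 * four at a time (the .Lcbc_dec_loop4 path); only the final XOR needs the
 * previous ciphertext block.  aes_decrypt_block() is a stand-in for
 * _aesni_dec1, and the function name is ours.
 *
 *	#include <wmmintrin.h>
 *	#include <stddef.h>
 *
 *	__m128i aes_decrypt_block(const __m128i *rk, int nrounds, __m128i ct);
 *
 *	static void cbc_decrypt(const __m128i *rk, int nrounds,
 *				unsigned char *out, const unsigned char *in,
 *				size_t nblocks, __m128i iv)
 *	{
 *		for (size_t i = 0; i < nblocks; i++) {
 *			__m128i ct = _mm_loadu_si128((const __m128i *)(in + 16 * i));
 *			__m128i pt = _mm_xor_si128(aes_decrypt_block(rk, nrounds, ct), iv);
 *
 *			_mm_storeu_si128((__m128i *)(out + 16 * i), pt);
 *			iv = ct;	// C_{i-1} chains into the next block
 *		}
 *	}
 */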
2419ENTRY(aesni_cbc_dec)
2420#ifndef __x86_64__
2421 pushl IVP
2422 pushl LEN
2423 pushl KEYP
2424 pushl KLEN
2425 movl 20(%esp), KEYP
2426 movl 24(%esp), OUTP
2427 movl 28(%esp), INP
2428 movl 32(%esp), LEN
2429 movl 36(%esp), IVP
2430#endif
2431 cmp $16, LEN
2432 jb .Lcbc_dec_just_ret
2433 mov 480(KEYP), KLEN
2434 add $240, KEYP
2435 movups (IVP), IV
2436 cmp $64, LEN
2437 jb .Lcbc_dec_loop1
2438.align 4
2439.Lcbc_dec_loop4:
2440 movups (INP), IN1
2441 movaps IN1, STATE1
2442 movups 0x10(INP), IN2
2443 movaps IN2, STATE2
2444#ifdef __x86_64__
2445 movups 0x20(INP), IN3
2446 movaps IN3, STATE3
2447 movups 0x30(INP), IN4
2448 movaps IN4, STATE4
2449#else
2450 movups 0x20(INP), IN1
2451 movaps IN1, STATE3
2452 movups 0x30(INP), IN2
2453 movaps IN2, STATE4
2454#endif
2455 call _aesni_dec4
2456 pxor IV, STATE1
2457#ifdef __x86_64__
2458 pxor IN1, STATE2
2459 pxor IN2, STATE3
2460 pxor IN3, STATE4
2461 movaps IN4, IV
2462#else
2463 pxor (INP), STATE2
2464 pxor 0x10(INP), STATE3
2465 pxor IN1, STATE4
2466 movaps IN2, IV
2467#endif
2468 movups STATE1, (OUTP)
2469 movups STATE2, 0x10(OUTP)
2470 movups STATE3, 0x20(OUTP)
2471 movups STATE4, 0x30(OUTP)
2472 sub $64, LEN
2473 add $64, INP
2474 add $64, OUTP
2475 cmp $64, LEN
2476 jge .Lcbc_dec_loop4
2477 cmp $16, LEN
2478 jb .Lcbc_dec_ret
2479.align 4
2480.Lcbc_dec_loop1:
2481 movups (INP), IN
2482 movaps IN, STATE
2483 call _aesni_dec1
2484 pxor IV, STATE
2485 movups STATE, (OUTP)
2486 movaps IN, IV
2487 sub $16, LEN
2488 add $16, INP
2489 add $16, OUTP
2490 cmp $16, LEN
2491 jge .Lcbc_dec_loop1
2492.Lcbc_dec_ret:
2493 movups IV, (IVP)
2494.Lcbc_dec_just_ret:
2495#ifndef __x86_64__
2496 popl KLEN
2497 popl KEYP
2498 popl LEN
2499 popl IVP
2500#endif
2501 ret
2502
2503#ifdef __x86_64__
2504.align 16
2505.Lbswap_mask:
2506 .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
2507
2508/*
2509 * _aesni_inc_init: internal ABI
2510 * setup registers used by _aesni_inc
2511 * input:
2512 * IV
2513 * output:
2514 * CTR: == IV, in little endian
2515 * TCTR_LOW: == lower qword of CTR
2516 * INC: == 1, in little endian
2517 * BSWAP_MASK == endian swapping mask
2518 */
2519.align 4
2520_aesni_inc_init:
2521 movaps .Lbswap_mask, BSWAP_MASK
2522 movaps IV, CTR
2523 PSHUFB_XMM BSWAP_MASK CTR
2524 mov $1, TCTR_LOW
2525 MOVQ_R64_XMM TCTR_LOW INC
2526 MOVQ_R64_XMM CTR TCTR_LOW
2527 ret
2528
2529/*
2530 * _aesni_inc: internal ABI
2531 * Increase IV by 1, IV is in big endian
2532 * input:
2533 * IV
2534 * CTR: == IV, in little endian
2535 * TCTR_LOW: == lower qword of CTR
2536 * INC: == 1, in little endian
2537 * BSWAP_MASK == endian swapping mask
2538 * output:
2539 *	IV:	incremented by 1
2540 * changed:
2541 * CTR: == output IV, in little endian
2542 * TCTR_LOW: == lower qword of CTR
2543 */
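/*
 * Illustrative only (not assembled): what _aesni_inc does, written as plain C
 * on a counter kept as two little-endian qwords (hi:lo, i.e. the CTR/TCTR_LOW
 * view); the assembly then byte-swaps the result back to the big-endian IV
 * with BSWAP_MASK.  The function name is ours.
 *
 *	#include <stdint.h>
 *
 *	static void ctr128_inc(uint64_t *hi, uint64_t *lo)
 *	{
 *		if (++(*lo) == 0)	// carry out of the low qword
 *			++(*hi);
 *	}
 */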
2544.align 4
2545_aesni_inc:
2546 paddq INC, CTR
2547 add $1, TCTR_LOW
2548 jnc .Linc_low
2549 pslldq $8, INC
2550 paddq INC, CTR
2551 psrldq $8, INC
2552.Linc_low:
2553 movaps CTR, IV
2554 PSHUFB_XMM BSWAP_MASK IV
2555 ret
2556
2557/*
2558 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2559 * size_t len, u8 *iv)
2560 */
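/*
 * Illustrative only (not assembled): the CTR construction implemented below.
 * Each keystream block is E(K, counter), the counter is bumped once per block,
 * and the keystream is XORed into the data.  aes_encrypt_block() and
 * ctr_next() are stand-ins for _aesni_enc1 and _aesni_inc; the names are ours.
 *
 *	#include <wmmintrin.h>
 *	#include <stddef.h>
 *
 *	__m128i aes_encrypt_block(const __m128i *rk, int nrounds, __m128i in);
 *	__m128i ctr_next(void);		// returns the next big-endian counter block
 *
 *	static void ctr_crypt(const __m128i *rk, int nrounds, unsigned char *out,
 *			      const unsigned char *in, size_t nblocks)
 *	{
 *		for (size_t i = 0; i < nblocks; i++) {
 *			__m128i ks = aes_encrypt_block(rk, nrounds, ctr_next());
 *			__m128i pt = _mm_loadu_si128((const __m128i *)(in + 16 * i));
 *
 *			_mm_storeu_si128((__m128i *)(out + 16 * i),
 *					 _mm_xor_si128(pt, ks));
 *		}
 *	}
 */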
2561ENTRY(aesni_ctr_enc)
2562 cmp $16, LEN
2563 jb .Lctr_enc_just_ret
2564 mov 480(KEYP), KLEN
2565 movups (IVP), IV
2566 call _aesni_inc_init
2567 cmp $64, LEN
2568 jb .Lctr_enc_loop1
2569.align 4
2570.Lctr_enc_loop4:
2571 movaps IV, STATE1
2572 call _aesni_inc
2573 movups (INP), IN1
2574 movaps IV, STATE2
2575 call _aesni_inc
2576 movups 0x10(INP), IN2
2577 movaps IV, STATE3
2578 call _aesni_inc
2579 movups 0x20(INP), IN3
2580 movaps IV, STATE4
2581 call _aesni_inc
2582 movups 0x30(INP), IN4
2583 call _aesni_enc4
2584 pxor IN1, STATE1
2585 movups STATE1, (OUTP)
2586 pxor IN2, STATE2
2587 movups STATE2, 0x10(OUTP)
2588 pxor IN3, STATE3
2589 movups STATE3, 0x20(OUTP)
2590 pxor IN4, STATE4
2591 movups STATE4, 0x30(OUTP)
2592 sub $64, LEN
2593 add $64, INP
2594 add $64, OUTP
2595 cmp $64, LEN
2596 jge .Lctr_enc_loop4
2597 cmp $16, LEN
2598 jb .Lctr_enc_ret
2599.align 4
2600.Lctr_enc_loop1:
2601 movaps IV, STATE
2602 call _aesni_inc
2603 movups (INP), IN
2604 call _aesni_enc1
2605 pxor IN, STATE
2606 movups STATE, (OUTP)
2607 sub $16, LEN
2608 add $16, INP
2609 add $16, OUTP
2610 cmp $16, LEN
2611 jge .Lctr_enc_loop1
2612.Lctr_enc_ret:
2613 movups IV, (IVP)
2614.Lctr_enc_just_ret:
2615 ret
2616#endif
1/* SPDX-License-Identifier: GPL-2.0-or-later */
2/*
3 * Implement AES algorithm in Intel AES-NI instructions.
4 *
5 * The white paper of AES-NI instructions can be downloaded from:
6 * http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
7 *
8 * Copyright (C) 2008, Intel Corp.
9 * Author: Huang Ying <ying.huang@intel.com>
10 * Vinodh Gopal <vinodh.gopal@intel.com>
11 * Kahraman Akdemir
12 *
13 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
14 * interface for 64-bit kernels.
15 * Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
16 * Aidan O'Mahony (aidan.o.mahony@intel.com)
17 * Adrian Hoban <adrian.hoban@intel.com>
18 * James Guilford (james.guilford@intel.com)
19 * Gabriele Paoloni <gabriele.paoloni@intel.com>
20 * Tadeusz Struk (tadeusz.struk@intel.com)
21 * Wajdi Feghali (wajdi.k.feghali@intel.com)
22 * Copyright (c) 2010, Intel Corporation.
23 *
24 * Ported x86_64 version to x86:
25 * Author: Mathias Krause <minipli@googlemail.com>
26 */
27
28#include <linux/linkage.h>
29#include <asm/frame.h>
30#include <asm/nospec-branch.h>
31
32/*
33 * The following macros are used to move an (un)aligned 16 byte value to/from
34 * an XMM register. This can be done for either FP or integer values: for FP
35 * use movaps (move aligned packed single), for integer use movdqa (move double
36 * quad aligned). It makes no performance difference which instruction is used
37 * on Nehalem (the original Core i7) and later. However, movaps is a byte
38 * shorter, so that is the one we use for now (same for the unaligned forms).
39 */
40#define MOVADQ movaps
41#define MOVUDQ movups
42
43#ifdef __x86_64__
44
45# constants in mergeable sections, linker can reorder and merge
46.section .rodata.cst16.POLY, "aM", @progbits, 16
47.align 16
48POLY: .octa 0xC2000000000000000000000000000001
49.section .rodata.cst16.TWOONE, "aM", @progbits, 16
50.align 16
51TWOONE: .octa 0x00000001000000000000000000000001
52
53.section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16
54.align 16
55SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F
56.section .rodata.cst16.MASK1, "aM", @progbits, 16
57.align 16
58MASK1: .octa 0x0000000000000000ffffffffffffffff
59.section .rodata.cst16.MASK2, "aM", @progbits, 16
60.align 16
61MASK2: .octa 0xffffffffffffffff0000000000000000
62.section .rodata.cst16.ONE, "aM", @progbits, 16
63.align 16
64ONE: .octa 0x00000000000000000000000000000001
65.section .rodata.cst16.F_MIN_MASK, "aM", @progbits, 16
66.align 16
67F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
68.section .rodata.cst16.dec, "aM", @progbits, 16
69.align 16
70dec: .octa 0x1
71.section .rodata.cst16.enc, "aM", @progbits, 16
72.align 16
73enc: .octa 0x2
74
75# order of these constants should not change.
76# more specifically, ALL_F should follow SHIFT_MASK,
77# and zero should follow ALL_F
78.section .rodata, "a", @progbits
79.align 16
80SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
81ALL_F: .octa 0xffffffffffffffffffffffffffffffff
82 .octa 0x00000000000000000000000000000000
83
84.text
85
86
87#define STACK_OFFSET 8*3
88
89#define AadHash 16*0
90#define AadLen 16*1
91#define InLen (16*1)+8
92#define PBlockEncKey 16*2
93#define OrigIV 16*3
94#define CurCount 16*4
95#define PBlockLen 16*5
96#define HashKey 16*6 // store HashKey <<1 mod poly here
97#define HashKey_2 16*7 // store HashKey^2 <<1 mod poly here
98#define HashKey_3 16*8 // store HashKey^3 <<1 mod poly here
99#define HashKey_4 16*9 // store HashKey^4 <<1 mod poly here
100#define HashKey_k 16*10 // store XOR of High 64 bits and Low 64
101 // bits of HashKey <<1 mod poly here
102 //(for Karatsuba purposes)
103#define HashKey_2_k 16*11 // store XOR of High 64 bits and Low 64
104 // bits of HashKey^2 <<1 mod poly here
105 // (for Karatsuba purposes)
106#define HashKey_3_k 16*12 // store XOR of High 64 bits and Low 64
107 // bits of HashKey^3 <<1 mod poly here
108 // (for Karatsuba purposes)
109#define HashKey_4_k 16*13 // store XOR of High 64 bits and Low 64
110 // bits of HashKey^4 <<1 mod poly here
111 // (for Karatsuba purposes)
112
113#define arg1 rdi
114#define arg2 rsi
115#define arg3 rdx
116#define arg4 rcx
117#define arg5 r8
118#define arg6 r9
119#define arg7 STACK_OFFSET+8(%rsp)
120#define arg8 STACK_OFFSET+16(%rsp)
121#define arg9 STACK_OFFSET+24(%rsp)
122#define arg10 STACK_OFFSET+32(%rsp)
123#define arg11 STACK_OFFSET+40(%rsp)
124#define keysize 2*15*16(%arg1)
125#endif
126
127
128#define STATE1 %xmm0
129#define STATE2 %xmm4
130#define STATE3 %xmm5
131#define STATE4 %xmm6
132#define STATE STATE1
133#define IN1 %xmm1
134#define IN2 %xmm7
135#define IN3 %xmm8
136#define IN4 %xmm9
137#define IN IN1
138#define KEY %xmm2
139#define IV %xmm3
140
141#define BSWAP_MASK %xmm10
142#define CTR %xmm11
143#define INC %xmm12
144
145#define GF128MUL_MASK %xmm7
146
147#ifdef __x86_64__
148#define AREG %rax
149#define KEYP %rdi
150#define OUTP %rsi
151#define UKEYP OUTP
152#define INP %rdx
153#define LEN %rcx
154#define IVP %r8
155#define KLEN %r9d
156#define T1 %r10
157#define TKEYP T1
158#define T2 %r11
159#define TCTR_LOW T2
160#else
161#define AREG %eax
162#define KEYP %edi
163#define OUTP AREG
164#define UKEYP OUTP
165#define INP %edx
166#define LEN %esi
167#define IVP %ebp
168#define KLEN %ebx
169#define T1 %ecx
170#define TKEYP T1
171#endif
172
173.macro FUNC_SAVE
174 push %r12
175 push %r13
176 push %r14
177#
178# states of %xmm registers %xmm6:%xmm15 not saved
179# all %xmm registers are clobbered
180#
181.endm
182
183
184.macro FUNC_RESTORE
185 pop %r14
186 pop %r13
187 pop %r12
188.endm
189
190# Precompute hashkeys.
191# Input: Hash subkey.
192# Output: HashKeys stored in gcm_context_data. Only needs to be called
193# once per key.
194# clobbers r12, and tmp xmm registers.
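/*
 * Illustrative only (not assembled): the "HashKey<<1 mod poly" step performed
 * at the top of PRECOMPUTE, written as plain C on the byte-reflected subkey
 * held as two qwords hi:lo.  The constants are the two halves of POLY above;
 * the function name is ours.
 *
 *	#include <stdint.h>
 *
 *	static void gf128_shl1_modp(uint64_t *hi, uint64_t *lo)
 *	{
 *		uint64_t carry = *hi >> 63;		// bit shifted out of the top
 *
 *		*hi = (*hi << 1) | (*lo >> 63);
 *		*lo <<= 1;
 *		if (carry) {				// reduce by the GHASH poly
 *			*hi ^= 0xC200000000000000ULL;
 *			*lo ^= 1;
 *		}
 *	}
 */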
195.macro PRECOMPUTE SUBKEY TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 TMP7
196 mov \SUBKEY, %r12
197 movdqu (%r12), \TMP3
198 movdqa SHUF_MASK(%rip), \TMP2
199 pshufb \TMP2, \TMP3
200
201 # precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
202
203 movdqa \TMP3, \TMP2
204 psllq $1, \TMP3
205 psrlq $63, \TMP2
206 movdqa \TMP2, \TMP1
207 pslldq $8, \TMP2
208 psrldq $8, \TMP1
209 por \TMP2, \TMP3
210
211 # reduce HashKey<<1
212
213 pshufd $0x24, \TMP1, \TMP2
214 pcmpeqd TWOONE(%rip), \TMP2
215 pand POLY(%rip), \TMP2
216 pxor \TMP2, \TMP3
217 movdqu \TMP3, HashKey(%arg2)
218
219 movdqa \TMP3, \TMP5
220 pshufd $78, \TMP3, \TMP1
221 pxor \TMP3, \TMP1
222 movdqu \TMP1, HashKey_k(%arg2)
223
224 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
225# TMP5 = HashKey^2<<1 (mod poly)
226 movdqu \TMP5, HashKey_2(%arg2)
227# HashKey_2 = HashKey^2<<1 (mod poly)
228 pshufd $78, \TMP5, \TMP1
229 pxor \TMP5, \TMP1
230 movdqu \TMP1, HashKey_2_k(%arg2)
231
232 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
233# TMP5 = HashKey^3<<1 (mod poly)
234 movdqu \TMP5, HashKey_3(%arg2)
235 pshufd $78, \TMP5, \TMP1
236 pxor \TMP5, \TMP1
237 movdqu \TMP1, HashKey_3_k(%arg2)
238
239 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
240# TMP5 = HashKey^4<<1 (mod poly)
241 movdqu \TMP5, HashKey_4(%arg2)
242 pshufd $78, \TMP5, \TMP1
243 pxor \TMP5, \TMP1
244 movdqu \TMP1, HashKey_4_k(%arg2)
245.endm
246
247# GCM_INIT initializes a gcm_context struct to prepare for encoding/decoding.
248# Clobbers rax, r10-r13 and xmm0-xmm6, %xmm13
249.macro GCM_INIT Iv SUBKEY AAD AADLEN
250 mov \AADLEN, %r11
251 mov %r11, AadLen(%arg2) # ctx_data.aad_length = aad_length
252 xor %r11d, %r11d
253 mov %r11, InLen(%arg2) # ctx_data.in_length = 0
254 mov %r11, PBlockLen(%arg2) # ctx_data.partial_block_length = 0
255 mov %r11, PBlockEncKey(%arg2) # ctx_data.partial_block_enc_key = 0
256 mov \Iv, %rax
257 movdqu (%rax), %xmm0
258 movdqu %xmm0, OrigIV(%arg2) # ctx_data.orig_IV = iv
259
260 movdqa SHUF_MASK(%rip), %xmm2
261 pshufb %xmm2, %xmm0
262 movdqu %xmm0, CurCount(%arg2) # ctx_data.current_counter = iv
263
264 PRECOMPUTE \SUBKEY, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7
265 movdqu HashKey(%arg2), %xmm13
266
267 CALC_AAD_HASH %xmm13, \AAD, \AADLEN, %xmm0, %xmm1, %xmm2, %xmm3, \
268 %xmm4, %xmm5, %xmm6
269.endm
270
271# GCM_ENC_DEC Encrypts/decrypts the given data. Assumes that the passed gcm_context
272# struct has been initialized by GCM_INIT.
273# Requires the input data to be at least 1 byte long because of READ_PARTIAL_BLOCK
274# Clobbers rax, r10-r13, and xmm0-xmm15
275.macro GCM_ENC_DEC operation
276 movdqu AadHash(%arg2), %xmm8
277 movdqu HashKey(%arg2), %xmm13
278 add %arg5, InLen(%arg2)
279
280 xor %r11d, %r11d # initialise the data pointer offset as zero
281 PARTIAL_BLOCK %arg3 %arg4 %arg5 %r11 %xmm8 \operation
282
283 sub %r11, %arg5 # sub partial block data used
284 mov %arg5, %r13 # save the number of bytes
285
286 and $-16, %r13 # %r13 = %r13 - (%r13 mod 16)
287 mov %r13, %r12
288 # Encrypt/Decrypt first few blocks
289
290 and $(3<<4), %r12
291 jz .L_initial_num_blocks_is_0_\@
292 cmp $(2<<4), %r12
293 jb .L_initial_num_blocks_is_1_\@
294 je .L_initial_num_blocks_is_2_\@
295.L_initial_num_blocks_is_3_\@:
296 INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
297%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, \operation
298 sub $48, %r13
299 jmp .L_initial_blocks_\@
300.L_initial_num_blocks_is_2_\@:
301 INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
302%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, \operation
303 sub $32, %r13
304 jmp .L_initial_blocks_\@
305.L_initial_num_blocks_is_1_\@:
306 INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
307%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, \operation
308 sub $16, %r13
309 jmp .L_initial_blocks_\@
310.L_initial_num_blocks_is_0_\@:
311 INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
312%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, \operation
313.L_initial_blocks_\@:
314
315 # Main loop - Encrypt/Decrypt remaining blocks
316
317 test %r13, %r13
318 je .L_zero_cipher_left_\@
319 sub $64, %r13
320 je .L_four_cipher_left_\@
321.L_crypt_by_4_\@:
322 GHASH_4_ENCRYPT_4_PARALLEL_\operation %xmm9, %xmm10, %xmm11, %xmm12, \
323 %xmm13, %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, \
324 %xmm7, %xmm8, enc
325 add $64, %r11
326 sub $64, %r13
327 jne .L_crypt_by_4_\@
328.L_four_cipher_left_\@:
329 GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
330%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
331.L_zero_cipher_left_\@:
332 movdqu %xmm8, AadHash(%arg2)
333 movdqu %xmm0, CurCount(%arg2)
334
335 mov %arg5, %r13
336 and $15, %r13 # %r13 = arg5 (mod 16)
337 je .L_multiple_of_16_bytes_\@
338
339 mov %r13, PBlockLen(%arg2)
340
341 # Handle the last <16 Byte block separately
342 paddd ONE(%rip), %xmm0 # INCR CNT to get Yn
343 movdqu %xmm0, CurCount(%arg2)
344 movdqa SHUF_MASK(%rip), %xmm10
345 pshufb %xmm10, %xmm0
346
347 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # Encrypt(K, Yn)
348 movdqu %xmm0, PBlockEncKey(%arg2)
349
350 cmp $16, %arg5
351 jge .L_large_enough_update_\@
352
353 lea (%arg4,%r11,1), %r10
354 mov %r13, %r12
355 READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1
356 jmp .L_data_read_\@
357
358.L_large_enough_update_\@:
359 sub $16, %r11
360 add %r13, %r11
361
362 # receive the last <16 Byte block
363 movdqu (%arg4, %r11, 1), %xmm1
364
365 sub %r13, %r11
366 add $16, %r11
367
368 lea SHIFT_MASK+16(%rip), %r12
369 # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
370 # (r13 is the number of bytes in plaintext mod 16)
371 sub %r13, %r12
372 # get the appropriate shuffle mask
373 movdqu (%r12), %xmm2
374 # shift right 16-r13 bytes
375 pshufb %xmm2, %xmm1
376
377.L_data_read_\@:
378 lea ALL_F+16(%rip), %r12
379 sub %r13, %r12
380
381.ifc \operation, dec
382 movdqa %xmm1, %xmm2
383.endif
384 pxor %xmm1, %xmm0 # XOR Encrypt(K, Yn)
385 movdqu (%r12), %xmm1
386 # get the appropriate mask to mask out top 16-r13 bytes of xmm0
387 pand %xmm1, %xmm0 # mask out top 16-r13 bytes of xmm0
388.ifc \operation, dec
389 pand %xmm1, %xmm2
390 movdqa SHUF_MASK(%rip), %xmm10
391	pshufb %xmm10, %xmm2
392
393 pxor %xmm2, %xmm8
394.else
395 movdqa SHUF_MASK(%rip), %xmm10
396 pshufb %xmm10,%xmm0
397
398 pxor %xmm0, %xmm8
399.endif
400
401 movdqu %xmm8, AadHash(%arg2)
402.ifc \operation, enc
403 # GHASH computation for the last <16 byte block
404 movdqa SHUF_MASK(%rip), %xmm10
405 # shuffle xmm0 back to output as ciphertext
406 pshufb %xmm10, %xmm0
407.endif
408
409 # Output %r13 bytes
410 movq %xmm0, %rax
411 cmp $8, %r13
412 jle .L_less_than_8_bytes_left_\@
413 mov %rax, (%arg3 , %r11, 1)
414 add $8, %r11
415 psrldq $8, %xmm0
416 movq %xmm0, %rax
417 sub $8, %r13
418.L_less_than_8_bytes_left_\@:
419 mov %al, (%arg3, %r11, 1)
420 add $1, %r11
421 shr $8, %rax
422 sub $1, %r13
423 jne .L_less_than_8_bytes_left_\@
424.L_multiple_of_16_bytes_\@:
425.endm
426
427# GCM_COMPLETE Finishes the tag update for the last partial block
428# Output: Authentication Tag (AUTH_TAG)
429# Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
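/*
 * Illustrative only (not assembled): building the final GHASH length block
 * that this macro folds into the hash, len(A) || len(C) with both lengths in
 * bits.  The layout matches the register view used here (len(C) in the low
 * qword, len(A) in the high qword) before the final SHUF_MASK byte swap; the
 * function name is ours and a little-endian host is assumed.
 *
 *	#include <stdint.h>
 *	#include <string.h>
 *
 *	static void build_len_block(uint64_t aad_bytes, uint64_t ct_bytes,
 *				    uint8_t blk[16])
 *	{
 *		uint64_t alen = aad_bytes * 8, clen = ct_bytes * 8;
 *
 *		memcpy(blk, &clen, 8);		// low qword  = len(C)
 *		memcpy(blk + 8, &alen, 8);	// high qword = len(A)
 *	}
 */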
430.macro GCM_COMPLETE AUTHTAG AUTHTAGLEN
431 movdqu AadHash(%arg2), %xmm8
432 movdqu HashKey(%arg2), %xmm13
433
434 mov PBlockLen(%arg2), %r12
435
436 test %r12, %r12
437 je .L_partial_done\@
438
439 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
440
441.L_partial_done\@:
442	mov	AadLen(%arg2), %r12	  # %r12 = aadLen (number of bytes)
443 shl $3, %r12 # convert into number of bits
444 movd %r12d, %xmm15 # len(A) in %xmm15
445 mov InLen(%arg2), %r12
446	shl	$3, %r12		  # len(C) in bits (*8)
447 movq %r12, %xmm1
448
449 pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000
450 pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C)
451 pxor %xmm15, %xmm8
452 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
453 # final GHASH computation
454 movdqa SHUF_MASK(%rip), %xmm10
455 pshufb %xmm10, %xmm8
456
457 movdqu OrigIV(%arg2), %xmm0 # %xmm0 = Y0
458 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Y0)
459 pxor %xmm8, %xmm0
460.L_return_T_\@:
461 mov \AUTHTAG, %r10 # %r10 = authTag
462 mov \AUTHTAGLEN, %r11 # %r11 = auth_tag_len
463 cmp $16, %r11
464 je .L_T_16_\@
465 cmp $8, %r11
466 jl .L_T_4_\@
467.L_T_8_\@:
468 movq %xmm0, %rax
469 mov %rax, (%r10)
470 add $8, %r10
471 sub $8, %r11
472 psrldq $8, %xmm0
473 test %r11, %r11
474 je .L_return_T_done_\@
475.L_T_4_\@:
476 movd %xmm0, %eax
477 mov %eax, (%r10)
478 add $4, %r10
479 sub $4, %r11
480 psrldq $4, %xmm0
481 test %r11, %r11
482 je .L_return_T_done_\@
483.L_T_123_\@:
484 movd %xmm0, %eax
485 cmp $2, %r11
486 jl .L_T_1_\@
487 mov %ax, (%r10)
488 cmp $2, %r11
489 je .L_return_T_done_\@
490 add $2, %r10
491 sar $16, %eax
492.L_T_1_\@:
493 mov %al, (%r10)
494 jmp .L_return_T_done_\@
495.L_T_16_\@:
496 movdqu %xmm0, (%r10)
497.L_return_T_done_\@:
498.endm
499
500#ifdef __x86_64__
501/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
502*
503*
504* Input: A and B (128-bits each, bit-reflected)
505* Output: C = A*B*x mod poly (i.e. >>1)
506* To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
507* GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
508*
509*/
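/*
 * Illustrative only (not assembled): the Karatsuba carry-less multiply at the
 * heart of GHASH_MUL, written with PCLMULQDQ intrinsics; the polynomial
 * reduction that follows in the macro is omitted.  The function name is ours.
 *
 *	#include <wmmintrin.h>		// SSE2 + PCLMULQDQ intrinsics
 *
 *	static void clmul_karatsuba(__m128i a, __m128i b, __m128i *hi, __m128i *lo)
 *	{
 *		__m128i t1 = _mm_clmulepi64_si128(a, b, 0x11);		 // a1*b1
 *		__m128i t0 = _mm_clmulepi64_si128(a, b, 0x00);		 // a0*b0
 *		__m128i am = _mm_xor_si128(a, _mm_shuffle_epi32(a, 78)); // a1^a0
 *		__m128i bm = _mm_xor_si128(b, _mm_shuffle_epi32(b, 78)); // b1^b0
 *		__m128i tm = _mm_clmulepi64_si128(am, bm, 0x00);
 *
 *		tm  = _mm_xor_si128(tm, _mm_xor_si128(t0, t1));		 // middle term
 *		*lo = _mm_xor_si128(t0, _mm_slli_si128(tm, 8));
 *		*hi = _mm_xor_si128(t1, _mm_srli_si128(tm, 8));
 *	}
 */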
510.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
511 movdqa \GH, \TMP1
512 pshufd $78, \GH, \TMP2
513 pshufd $78, \HK, \TMP3
514 pxor \GH, \TMP2 # TMP2 = a1+a0
515 pxor \HK, \TMP3 # TMP3 = b1+b0
516 pclmulqdq $0x11, \HK, \TMP1 # TMP1 = a1*b1
517 pclmulqdq $0x00, \HK, \GH # GH = a0*b0
518 pclmulqdq $0x00, \TMP3, \TMP2 # TMP2 = (a0+a1)*(b1+b0)
519 pxor \GH, \TMP2
520 pxor \TMP1, \TMP2 # TMP2 = (a0*b0)+(a1*b0)
521 movdqa \TMP2, \TMP3
522 pslldq $8, \TMP3 # left shift TMP3 2 DWs
523 psrldq $8, \TMP2 # right shift TMP2 2 DWs
524 pxor \TMP3, \GH
525	pxor	  \TMP2, \TMP1		# TMP1:GH holds the result of GH*HK
526
527 # first phase of the reduction
528
529 movdqa \GH, \TMP2
530 movdqa \GH, \TMP3
531 movdqa \GH, \TMP4 # copy GH into TMP2,TMP3 and TMP4
532					# in order to perform
533 # independent shifts
534	pslld $31, \TMP2		# packed left shift <<31
535	pslld $30, \TMP3		# packed left shift <<30
536	pslld $25, \TMP4		# packed left shift <<25
537 pxor \TMP3, \TMP2 # xor the shifted versions
538 pxor \TMP4, \TMP2
539 movdqa \TMP2, \TMP5
540 psrldq $4, \TMP5 # right shift TMP5 1 DW
541 pslldq $12, \TMP2 # left shift TMP2 3 DWs
542 pxor \TMP2, \GH
543
544 # second phase of the reduction
545
546 movdqa \GH,\TMP2 # copy GH into TMP2,TMP3 and TMP4
547					# in order to perform
548 # independent shifts
549 movdqa \GH,\TMP3
550 movdqa \GH,\TMP4
551	psrld $1,\TMP2			# packed right shift >>1
552	psrld $2,\TMP3			# packed right shift >>2
553	psrld $7,\TMP4			# packed right shift >>7
554 pxor \TMP3,\TMP2 # xor the shifted versions
555 pxor \TMP4,\TMP2
556 pxor \TMP5, \TMP2
557 pxor \TMP2, \GH
558	pxor \TMP1, \GH			# result is in GH
559.endm
560
561# Reads DLEN bytes starting at DPTR and stores in XMMDst
562# where 0 < DLEN < 16
563# Clobbers %rax, DLEN and XMM1
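/*
 * Illustrative only (not assembled): the intent of READ_PARTIAL_BLOCK in
 * plain C, namely loading 1..15 bytes into a zero-padded 16-byte block
 * without touching any byte past src + len.  The function name is ours.
 *
 *	#include <stdint.h>
 *	#include <string.h>
 *
 *	static void read_partial_block(const uint8_t *src, size_t len,
 *				       uint8_t dst[16])
 *	{
 *		memset(dst, 0, 16);
 *		memcpy(dst, src, len);	// len is 1..15, so no over-read
 *	}
 */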
564.macro READ_PARTIAL_BLOCK DPTR DLEN XMM1 XMMDst
565 cmp $8, \DLEN
566 jl .L_read_lt8_\@
567 mov (\DPTR), %rax
568 movq %rax, \XMMDst
569 sub $8, \DLEN
570 jz .L_done_read_partial_block_\@
571 xor %eax, %eax
572.L_read_next_byte_\@:
573 shl $8, %rax
574 mov 7(\DPTR, \DLEN, 1), %al
575 dec \DLEN
576 jnz .L_read_next_byte_\@
577 movq %rax, \XMM1
578 pslldq $8, \XMM1
579 por \XMM1, \XMMDst
580 jmp .L_done_read_partial_block_\@
581.L_read_lt8_\@:
582 xor %eax, %eax
583.L_read_next_byte_lt8_\@:
584 shl $8, %rax
585 mov -1(\DPTR, \DLEN, 1), %al
586 dec \DLEN
587 jnz .L_read_next_byte_lt8_\@
588 movq %rax, \XMMDst
589.L_done_read_partial_block_\@:
590.endm
591
592# CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
593# clobbers r10-11, xmm14
594.macro CALC_AAD_HASH HASHKEY AAD AADLEN TMP1 TMP2 TMP3 TMP4 TMP5 \
595 TMP6 TMP7
596 MOVADQ SHUF_MASK(%rip), %xmm14
597 mov \AAD, %r10 # %r10 = AAD
598 mov \AADLEN, %r11 # %r11 = aadLen
599 pxor \TMP7, \TMP7
600 pxor \TMP6, \TMP6
601
602 cmp $16, %r11
603 jl .L_get_AAD_rest\@
604.L_get_AAD_blocks\@:
605 movdqu (%r10), \TMP7
606 pshufb %xmm14, \TMP7 # byte-reflect the AAD data
607 pxor \TMP7, \TMP6
608 GHASH_MUL \TMP6, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
609 add $16, %r10
610 sub $16, %r11
611 cmp $16, %r11
612 jge .L_get_AAD_blocks\@
613
614 movdqu \TMP6, \TMP7
615
616 /* read the last <16B of AAD */
617.L_get_AAD_rest\@:
618 test %r11, %r11
619 je .L_get_AAD_done\@
620
621 READ_PARTIAL_BLOCK %r10, %r11, \TMP1, \TMP7
622 pshufb %xmm14, \TMP7 # byte-reflect the AAD data
623 pxor \TMP6, \TMP7
624 GHASH_MUL \TMP7, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
625 movdqu \TMP7, \TMP6
626
627.L_get_AAD_done\@:
628 movdqu \TMP6, AadHash(%arg2)
629.endm
630
631# PARTIAL_BLOCK: Handles encryption/decryption and the tag for partial blocks
632# carried between update calls.
633# Requires the input data to be at least 1 byte long due to READ_PARTIAL_BLOCK
634# Outputs encrypted bytes, and updates hash and partial info in gcm_context_data
635# Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13
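/*
 * Illustrative only (not assembled): the bookkeeping this macro implements,
 * with hypothetical names.  ks holds E(K, Yn) saved by the previous update
 * call (PBlockEncKey) and *pblock_len is how much of that keystream block is
 * already used (PBlockLen).  The real macro additionally folds the resulting
 * ciphertext bytes into the running GHASH.
 *
 *	#include <stddef.h>
 *	#include <stdint.h>
 *
 *	static size_t partial_block(uint8_t *out, const uint8_t *in, size_t len,
 *				    const uint8_t ks[16], size_t *pblock_len)
 *	{
 *		size_t fill, use;
 *
 *		if (*pblock_len == 0)			// nothing carried over
 *			return 0;
 *		fill = 16 - *pblock_len;
 *		use  = len < fill ? len : fill;
 *		for (size_t i = 0; i < use; i++)
 *			out[i] = in[i] ^ ks[*pblock_len + i];
 *		*pblock_len = (*pblock_len + use) & 15;	// wraps to 0 when full
 *		return use;				// bytes consumed here
 *	}
 */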
636.macro PARTIAL_BLOCK CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \
637 AAD_HASH operation
638 mov PBlockLen(%arg2), %r13
639 test %r13, %r13
640 je .L_partial_block_done_\@ # Leave Macro if no partial blocks
641 # Read in input data without over reading
642 cmp $16, \PLAIN_CYPH_LEN
643 jl .L_fewer_than_16_bytes_\@
644 movups (\PLAIN_CYPH_IN), %xmm1 # If more than 16 bytes, just fill xmm
645 jmp .L_data_read_\@
646
647.L_fewer_than_16_bytes_\@:
648 lea (\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10
649 mov \PLAIN_CYPH_LEN, %r12
650 READ_PARTIAL_BLOCK %r10 %r12 %xmm0 %xmm1
651
652 mov PBlockLen(%arg2), %r13
653
654.L_data_read_\@: # Finished reading in data
655
656 movdqu PBlockEncKey(%arg2), %xmm9
657 movdqu HashKey(%arg2), %xmm13
658
659 lea SHIFT_MASK(%rip), %r12
660
661 # adjust the shuffle mask pointer to be able to shift r13 bytes
662	# (r13 is the number of bytes already in the partial block)
663 add %r13, %r12
664 movdqu (%r12), %xmm2 # get the appropriate shuffle mask
665 pshufb %xmm2, %xmm9 # shift right r13 bytes
666
667.ifc \operation, dec
668 movdqa %xmm1, %xmm3
669 pxor %xmm1, %xmm9 # Ciphertext XOR E(K, Yn)
670
671 mov \PLAIN_CYPH_LEN, %r10
672 add %r13, %r10
673 # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
674 sub $16, %r10
675 # Determine if partial block is not being filled and
676 # shift mask accordingly
677 jge .L_no_extra_mask_1_\@
678 sub %r10, %r12
679.L_no_extra_mask_1_\@:
680
681 movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
682 # get the appropriate mask to mask out bottom r13 bytes of xmm9
683 pand %xmm1, %xmm9 # mask out bottom r13 bytes of xmm9
684
685 pand %xmm1, %xmm3
686 movdqa SHUF_MASK(%rip), %xmm10
687 pshufb %xmm10, %xmm3
688 pshufb %xmm2, %xmm3
689 pxor %xmm3, \AAD_HASH
690
691 test %r10, %r10
692 jl .L_partial_incomplete_1_\@
693
694 # GHASH computation for the last <16 Byte block
695 GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
696 xor %eax, %eax
697
698 mov %rax, PBlockLen(%arg2)
699 jmp .L_dec_done_\@
700.L_partial_incomplete_1_\@:
701 add \PLAIN_CYPH_LEN, PBlockLen(%arg2)
702.L_dec_done_\@:
703 movdqu \AAD_HASH, AadHash(%arg2)
704.else
705 pxor %xmm1, %xmm9 # Plaintext XOR E(K, Yn)
706
707 mov \PLAIN_CYPH_LEN, %r10
708 add %r13, %r10
709 # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
710 sub $16, %r10
711 # Determine if partial block is not being filled and
712 # shift mask accordingly
713 jge .L_no_extra_mask_2_\@
714 sub %r10, %r12
715.L_no_extra_mask_2_\@:
716
717 movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
718 # get the appropriate mask to mask out bottom r13 bytes of xmm9
719 pand %xmm1, %xmm9
720
721 movdqa SHUF_MASK(%rip), %xmm1
722 pshufb %xmm1, %xmm9
723 pshufb %xmm2, %xmm9
724 pxor %xmm9, \AAD_HASH
725
726 test %r10, %r10
727 jl .L_partial_incomplete_2_\@
728
729 # GHASH computation for the last <16 Byte block
730 GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
731 xor %eax, %eax
732
733 mov %rax, PBlockLen(%arg2)
734 jmp .L_encode_done_\@
735.L_partial_incomplete_2_\@:
736 add \PLAIN_CYPH_LEN, PBlockLen(%arg2)
737.L_encode_done_\@:
738 movdqu \AAD_HASH, AadHash(%arg2)
739
740 movdqa SHUF_MASK(%rip), %xmm10
741 # shuffle xmm9 back to output as ciphertext
742 pshufb %xmm10, %xmm9
743 pshufb %xmm2, %xmm9
744.endif
745 # output encrypted Bytes
746 test %r10, %r10
747 jl .L_partial_fill_\@
748 mov %r13, %r12
749 mov $16, %r13
750 # Set r13 to be the number of bytes to write out
751 sub %r12, %r13
752 jmp .L_count_set_\@
753.L_partial_fill_\@:
754 mov \PLAIN_CYPH_LEN, %r13
755.L_count_set_\@:
756 movdqa %xmm9, %xmm0
757 movq %xmm0, %rax
758 cmp $8, %r13
759 jle .L_less_than_8_bytes_left_\@
760
761 mov %rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
762 add $8, \DATA_OFFSET
763 psrldq $8, %xmm0
764 movq %xmm0, %rax
765 sub $8, %r13
766.L_less_than_8_bytes_left_\@:
767 movb %al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
768 add $1, \DATA_OFFSET
769 shr $8, %rax
770 sub $1, %r13
771 jne .L_less_than_8_bytes_left_\@
772.L_partial_block_done_\@:
773.endm # PARTIAL_BLOCK
774
775/*
776* if a = number of total plaintext bytes
777* b = floor(a/16)
778* num_initial_blocks = b mod 4
779* encrypt the initial num_initial_blocks blocks and apply ghash on
780* the ciphertext
781* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
782* are clobbered
783* arg1, %arg2, %arg3 are used as pointers only, not modified
784*/
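/*
 * Illustrative only (not assembled): the split described above, as computed
 * by the GCM_ENC_DEC driver before invoking this macro.  The name is ours.
 *
 *	#include <stddef.h>
 *
 *	static size_t initial_block_count(size_t plaintext_len)
 *	{
 *		size_t full_blocks = plaintext_len / 16;	// b = floor(a/16)
 *
 *		return full_blocks % 4;		// num_initial_blocks, 0..3
 *	}
 *
 * The remaining full blocks then go through the 4-wide main loop, and any
 * final plaintext_len mod 16 bytes are handled as a partial block.
 */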
785
786
787.macro INITIAL_BLOCKS_ENC_DEC TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
788 XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
789 MOVADQ SHUF_MASK(%rip), %xmm14
790
791 movdqu AadHash(%arg2), %xmm\i # XMM0 = Y0
792
793 # start AES for num_initial_blocks blocks
794
795 movdqu CurCount(%arg2), \XMM0 # XMM0 = Y0
796
797.if (\i == 5) || (\i == 6) || (\i == 7)
798
799 MOVADQ ONE(%RIP),\TMP1
800 MOVADQ 0(%arg1),\TMP2
801.irpc index, \i_seq
802 paddd \TMP1, \XMM0 # INCR Y0
803.ifc \operation, dec
804 movdqa \XMM0, %xmm\index
805.else
806 MOVADQ \XMM0, %xmm\index
807.endif
808 pshufb %xmm14, %xmm\index # perform a 16 byte swap
809 pxor \TMP2, %xmm\index
810.endr
811 lea 0x10(%arg1),%r10
812 mov keysize,%eax
813 shr $2,%eax # 128->4, 192->6, 256->8
814 add $5,%eax # 128->9, 192->11, 256->13
815
816.Laes_loop_initial_\@:
817 MOVADQ (%r10),\TMP1
818.irpc index, \i_seq
819 aesenc \TMP1, %xmm\index
820.endr
821 add $16,%r10
822 sub $1,%eax
823 jnz .Laes_loop_initial_\@
824
825 MOVADQ (%r10), \TMP1
826.irpc index, \i_seq
827 aesenclast \TMP1, %xmm\index # Last Round
828.endr
829.irpc index, \i_seq
830 movdqu (%arg4 , %r11, 1), \TMP1
831 pxor \TMP1, %xmm\index
832 movdqu %xmm\index, (%arg3 , %r11, 1)
833 # write back plaintext/ciphertext for num_initial_blocks
834 add $16, %r11
835
836.ifc \operation, dec
837 movdqa \TMP1, %xmm\index
838.endif
839 pshufb %xmm14, %xmm\index
840
841 # prepare plaintext/ciphertext for GHASH computation
842.endr
843.endif
844
845 # apply GHASH on num_initial_blocks blocks
846
847.if \i == 5
848 pxor %xmm5, %xmm6
849 GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
850 pxor %xmm6, %xmm7
851 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
852 pxor %xmm7, %xmm8
853 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
854.elseif \i == 6
855 pxor %xmm6, %xmm7
856 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
857 pxor %xmm7, %xmm8
858 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
859.elseif \i == 7
860 pxor %xmm7, %xmm8
861 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
862.endif
863 cmp $64, %r13
864 jl .L_initial_blocks_done\@
865 # no need for precomputed values
866/*
867*
868* Precomputations for HashKey parallel with encryption of first 4 blocks.
869* HashKey_i_k holds the XOR of the low and high parts of HashKey_i
870*/
871 MOVADQ ONE(%RIP),\TMP1
872 paddd \TMP1, \XMM0 # INCR Y0
873 MOVADQ \XMM0, \XMM1
874 pshufb %xmm14, \XMM1 # perform a 16 byte swap
875
876 paddd \TMP1, \XMM0 # INCR Y0
877 MOVADQ \XMM0, \XMM2
878 pshufb %xmm14, \XMM2 # perform a 16 byte swap
879
880 paddd \TMP1, \XMM0 # INCR Y0
881 MOVADQ \XMM0, \XMM3
882 pshufb %xmm14, \XMM3 # perform a 16 byte swap
883
884 paddd \TMP1, \XMM0 # INCR Y0
885 MOVADQ \XMM0, \XMM4
886 pshufb %xmm14, \XMM4 # perform a 16 byte swap
887
888 MOVADQ 0(%arg1),\TMP1
889 pxor \TMP1, \XMM1
890 pxor \TMP1, \XMM2
891 pxor \TMP1, \XMM3
892 pxor \TMP1, \XMM4
893.irpc index, 1234 # do 4 rounds
894 movaps 0x10*\index(%arg1), \TMP1
895 aesenc \TMP1, \XMM1
896 aesenc \TMP1, \XMM2
897 aesenc \TMP1, \XMM3
898 aesenc \TMP1, \XMM4
899.endr
900.irpc index, 56789 # do next 5 rounds
901 movaps 0x10*\index(%arg1), \TMP1
902 aesenc \TMP1, \XMM1
903 aesenc \TMP1, \XMM2
904 aesenc \TMP1, \XMM3
905 aesenc \TMP1, \XMM4
906.endr
907 lea 0xa0(%arg1),%r10
908 mov keysize,%eax
909 shr $2,%eax # 128->4, 192->6, 256->8
910 sub $4,%eax # 128->0, 192->2, 256->4
911 jz .Laes_loop_pre_done\@
912
913.Laes_loop_pre_\@:
914 MOVADQ (%r10),\TMP2
915.irpc index, 1234
916 aesenc \TMP2, %xmm\index
917.endr
918 add $16,%r10
919 sub $1,%eax
920 jnz .Laes_loop_pre_\@
921
922.Laes_loop_pre_done\@:
923 MOVADQ (%r10), \TMP2
924 aesenclast \TMP2, \XMM1
925 aesenclast \TMP2, \XMM2
926 aesenclast \TMP2, \XMM3
927 aesenclast \TMP2, \XMM4
928 movdqu 16*0(%arg4 , %r11 , 1), \TMP1
929 pxor \TMP1, \XMM1
930.ifc \operation, dec
931 movdqu \XMM1, 16*0(%arg3 , %r11 , 1)
932 movdqa \TMP1, \XMM1
933.endif
934 movdqu 16*1(%arg4 , %r11 , 1), \TMP1
935 pxor \TMP1, \XMM2
936.ifc \operation, dec
937 movdqu \XMM2, 16*1(%arg3 , %r11 , 1)
938 movdqa \TMP1, \XMM2
939.endif
940 movdqu 16*2(%arg4 , %r11 , 1), \TMP1
941 pxor \TMP1, \XMM3
942.ifc \operation, dec
943 movdqu \XMM3, 16*2(%arg3 , %r11 , 1)
944 movdqa \TMP1, \XMM3
945.endif
946 movdqu 16*3(%arg4 , %r11 , 1), \TMP1
947 pxor \TMP1, \XMM4
948.ifc \operation, dec
949 movdqu \XMM4, 16*3(%arg3 , %r11 , 1)
950 movdqa \TMP1, \XMM4
951.else
952 movdqu \XMM1, 16*0(%arg3 , %r11 , 1)
953 movdqu \XMM2, 16*1(%arg3 , %r11 , 1)
954 movdqu \XMM3, 16*2(%arg3 , %r11 , 1)
955 movdqu \XMM4, 16*3(%arg3 , %r11 , 1)
956.endif
957
958 add $64, %r11
959 pshufb %xmm14, \XMM1 # perform a 16 byte swap
960 pxor \XMMDst, \XMM1
961# combine GHASHed value with the corresponding ciphertext
962 pshufb %xmm14, \XMM2 # perform a 16 byte swap
963 pshufb %xmm14, \XMM3 # perform a 16 byte swap
964 pshufb %xmm14, \XMM4 # perform a 16 byte swap
965
966.L_initial_blocks_done\@:
967
968.endm
969
970/*
971* encrypt 4 blocks at a time
972* ghash the 4 previously encrypted ciphertext blocks
973* arg1, %arg3, %arg4 are used as pointers only, not modified
974* %r11 is the data offset value
975*/
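/*
 * Illustrative only (not assembled): a rough sketch of the software pipeline
 * this macro implements, with hypothetical helpers.  While four fresh counter
 * blocks go through the AES rounds, the four ciphertext blocks produced by
 * the previous iteration are folded into the GHASH state, hiding the
 * PCLMULQDQ latency behind the AESENC chain.
 *
 *	// ct/pt are the ciphertext/plaintext buffers, nblocks a multiple of 4
 *	for (size_t i = 4; i + 4 <= nblocks; i += 4) {
 *		ghash_fold_4(&tag_state, ct + 16 * (i - 4));	// previous 4 blocks
 *		ctr_encrypt_4(rk, nrounds, ctr, pt + 16 * i, ct + 16 * i);
 *	}
 *	ghash_fold_4(&tag_state, ct + 16 * (nblocks - 4));	// cf. GHASH_LAST_4
 */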
976.macro GHASH_4_ENCRYPT_4_PARALLEL_enc TMP1 TMP2 TMP3 TMP4 TMP5 \
977TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
978
979 movdqa \XMM1, \XMM5
980 movdqa \XMM2, \XMM6
981 movdqa \XMM3, \XMM7
982 movdqa \XMM4, \XMM8
983
984 movdqa SHUF_MASK(%rip), %xmm15
985 # multiply TMP5 * HashKey using karatsuba
986
987 movdqa \XMM5, \TMP4
988 pshufd $78, \XMM5, \TMP6
989 pxor \XMM5, \TMP6
990 paddd ONE(%rip), \XMM0 # INCR CNT
991 movdqu HashKey_4(%arg2), \TMP5
992 pclmulqdq $0x11, \TMP5, \TMP4 # TMP4 = a1*b1
993 movdqa \XMM0, \XMM1
994 paddd ONE(%rip), \XMM0 # INCR CNT
995 movdqa \XMM0, \XMM2
996 paddd ONE(%rip), \XMM0 # INCR CNT
997 movdqa \XMM0, \XMM3
998 paddd ONE(%rip), \XMM0 # INCR CNT
999 movdqa \XMM0, \XMM4
1000 pshufb %xmm15, \XMM1 # perform a 16 byte swap
1001 pclmulqdq $0x00, \TMP5, \XMM5 # XMM5 = a0*b0
1002 pshufb %xmm15, \XMM2 # perform a 16 byte swap
1003 pshufb %xmm15, \XMM3 # perform a 16 byte swap
1004 pshufb %xmm15, \XMM4 # perform a 16 byte swap
1005
1006 pxor (%arg1), \XMM1
1007 pxor (%arg1), \XMM2
1008 pxor (%arg1), \XMM3
1009 pxor (%arg1), \XMM4
1010 movdqu HashKey_4_k(%arg2), \TMP5
1011 pclmulqdq $0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
1012 movaps 0x10(%arg1), \TMP1
1013 aesenc \TMP1, \XMM1 # Round 1
1014 aesenc \TMP1, \XMM2
1015 aesenc \TMP1, \XMM3
1016 aesenc \TMP1, \XMM4
1017 movaps 0x20(%arg1), \TMP1
1018 aesenc \TMP1, \XMM1 # Round 2
1019 aesenc \TMP1, \XMM2
1020 aesenc \TMP1, \XMM3
1021 aesenc \TMP1, \XMM4
1022 movdqa \XMM6, \TMP1
1023 pshufd $78, \XMM6, \TMP2
1024 pxor \XMM6, \TMP2
1025 movdqu HashKey_3(%arg2), \TMP5
1026 pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
1027 movaps 0x30(%arg1), \TMP3
1028 aesenc \TMP3, \XMM1 # Round 3
1029 aesenc \TMP3, \XMM2
1030 aesenc \TMP3, \XMM3
1031 aesenc \TMP3, \XMM4
1032 pclmulqdq $0x00, \TMP5, \XMM6 # XMM6 = a0*b0
1033 movaps 0x40(%arg1), \TMP3
1034 aesenc \TMP3, \XMM1 # Round 4
1035 aesenc \TMP3, \XMM2
1036 aesenc \TMP3, \XMM3
1037 aesenc \TMP3, \XMM4
1038 movdqu HashKey_3_k(%arg2), \TMP5
1039 pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1040 movaps 0x50(%arg1), \TMP3
1041 aesenc \TMP3, \XMM1 # Round 5
1042 aesenc \TMP3, \XMM2
1043 aesenc \TMP3, \XMM3
1044 aesenc \TMP3, \XMM4
1045 pxor \TMP1, \TMP4
1046# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1047 pxor \XMM6, \XMM5
1048 pxor \TMP2, \TMP6
1049 movdqa \XMM7, \TMP1
1050 pshufd $78, \XMM7, \TMP2
1051 pxor \XMM7, \TMP2
1052 movdqu HashKey_2(%arg2), \TMP5
1053
1054 # Multiply TMP5 * HashKey using karatsuba
1055
1056 pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1057 movaps 0x60(%arg1), \TMP3
1058 aesenc \TMP3, \XMM1 # Round 6
1059 aesenc \TMP3, \XMM2
1060 aesenc \TMP3, \XMM3
1061 aesenc \TMP3, \XMM4
1062 pclmulqdq $0x00, \TMP5, \XMM7 # XMM7 = a0*b0
1063 movaps 0x70(%arg1), \TMP3
1064 aesenc \TMP3, \XMM1 # Round 7
1065 aesenc \TMP3, \XMM2
1066 aesenc \TMP3, \XMM3
1067 aesenc \TMP3, \XMM4
1068 movdqu HashKey_2_k(%arg2), \TMP5
1069 pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1070 movaps 0x80(%arg1), \TMP3
1071 aesenc \TMP3, \XMM1 # Round 8
1072 aesenc \TMP3, \XMM2
1073 aesenc \TMP3, \XMM3
1074 aesenc \TMP3, \XMM4
1075 pxor \TMP1, \TMP4
1076# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1077 pxor \XMM7, \XMM5
1078 pxor \TMP2, \TMP6
1079
1080 # Multiply XMM8 * HashKey
1081 # XMM8 and TMP5 hold the values for the two operands
1082
1083 movdqa \XMM8, \TMP1
1084 pshufd $78, \XMM8, \TMP2
1085 pxor \XMM8, \TMP2
1086 movdqu HashKey(%arg2), \TMP5
1087 pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1088 movaps 0x90(%arg1), \TMP3
1089 aesenc \TMP3, \XMM1 # Round 9
1090 aesenc \TMP3, \XMM2
1091 aesenc \TMP3, \XMM3
1092 aesenc \TMP3, \XMM4
1093 pclmulqdq $0x00, \TMP5, \XMM8 # XMM8 = a0*b0
1094 lea 0xa0(%arg1),%r10
1095 mov keysize,%eax
1096 shr $2,%eax # 128->4, 192->6, 256->8
1097 sub $4,%eax # 128->0, 192->2, 256->4
1098 jz .Laes_loop_par_enc_done\@
1099
1100.Laes_loop_par_enc\@:
1101 MOVADQ (%r10),\TMP3
1102.irpc index, 1234
1103 aesenc \TMP3, %xmm\index
1104.endr
1105 add $16,%r10
1106 sub $1,%eax
1107 jnz .Laes_loop_par_enc\@
1108
1109.Laes_loop_par_enc_done\@:
1110 MOVADQ (%r10), \TMP3
1111 aesenclast \TMP3, \XMM1 # Round 10
1112 aesenclast \TMP3, \XMM2
1113 aesenclast \TMP3, \XMM3
1114 aesenclast \TMP3, \XMM4
1115 movdqu HashKey_k(%arg2), \TMP5
1116 pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1117 movdqu (%arg4,%r11,1), \TMP3
1118 pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
1119 movdqu 16(%arg4,%r11,1), \TMP3
1120 pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK
1121 movdqu 32(%arg4,%r11,1), \TMP3
1122 pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK
1123 movdqu 48(%arg4,%r11,1), \TMP3
1124 pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK
1125 movdqu \XMM1, (%arg3,%r11,1) # Write to the ciphertext buffer
1126 movdqu \XMM2, 16(%arg3,%r11,1) # Write to the ciphertext buffer
1127 movdqu \XMM3, 32(%arg3,%r11,1) # Write to the ciphertext buffer
1128 movdqu \XMM4, 48(%arg3,%r11,1) # Write to the ciphertext buffer
1129 pshufb %xmm15, \XMM1 # perform a 16 byte swap
1130 pshufb %xmm15, \XMM2 # perform a 16 byte swap
1131 pshufb %xmm15, \XMM3 # perform a 16 byte swap
1132 pshufb %xmm15, \XMM4 # perform a 16 byte swap
1133
1134 pxor \TMP4, \TMP1
1135 pxor \XMM8, \XMM5
1136 pxor \TMP6, \TMP2
1137 pxor \TMP1, \TMP2
1138 pxor \XMM5, \TMP2
1139 movdqa \TMP2, \TMP3
1140 pslldq $8, \TMP3 # left shift TMP3 2 DWs
1141 psrldq $8, \TMP2 # right shift TMP2 2 DWs
1142 pxor \TMP3, \XMM5
1143 pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5
1144
1145 # first phase of reduction
1146
1147 movdqa \XMM5, \TMP2
1148 movdqa \XMM5, \TMP3
1149 movdqa \XMM5, \TMP4
1150# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
1151	pslld $31, \TMP2		# packed left shift << 31
1152	pslld $30, \TMP3		# packed left shift << 30
1153	pslld $25, \TMP4		# packed left shift << 25
1154 pxor \TMP3, \TMP2 # xor the shifted versions
1155 pxor \TMP4, \TMP2
1156 movdqa \TMP2, \TMP5
1157 psrldq $4, \TMP5 # right shift T5 1 DW
1158 pslldq $12, \TMP2 # left shift T2 3 DWs
1159 pxor \TMP2, \XMM5
1160
1161 # second phase of reduction
1162
1163 movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
1164 movdqa \XMM5,\TMP3
1165 movdqa \XMM5,\TMP4
1166	psrld $1, \TMP2			# packed right shift >>1
1167	psrld $2, \TMP3			# packed right shift >>2
1168	psrld $7, \TMP4			# packed right shift >>7
1169 pxor \TMP3,\TMP2 # xor the shifted versions
1170 pxor \TMP4,\TMP2
1171 pxor \TMP5, \TMP2
1172 pxor \TMP2, \XMM5
1173	pxor	\TMP1, \XMM5		# result is in XMM5
1174
1175 pxor \XMM5, \XMM1
1176.endm
1177
1178/*
1179* decrypt 4 blocks at a time
1180* ghash the 4 previously decrypted ciphertext blocks
1181* arg1, %arg3, %arg4 are used as pointers only, not modified
1182* %r11 is the data offset value
1183*/
1184.macro GHASH_4_ENCRYPT_4_PARALLEL_dec TMP1 TMP2 TMP3 TMP4 TMP5 \
1185TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
1186
1187 movdqa \XMM1, \XMM5
1188 movdqa \XMM2, \XMM6
1189 movdqa \XMM3, \XMM7
1190 movdqa \XMM4, \XMM8
1191
1192 movdqa SHUF_MASK(%rip), %xmm15
1193 # multiply TMP5 * HashKey using karatsuba
1194
1195 movdqa \XMM5, \TMP4
1196 pshufd $78, \XMM5, \TMP6
1197 pxor \XMM5, \TMP6
1198 paddd ONE(%rip), \XMM0 # INCR CNT
1199 movdqu HashKey_4(%arg2), \TMP5
1200 pclmulqdq $0x11, \TMP5, \TMP4 # TMP4 = a1*b1
1201 movdqa \XMM0, \XMM1
1202 paddd ONE(%rip), \XMM0 # INCR CNT
1203 movdqa \XMM0, \XMM2
1204 paddd ONE(%rip), \XMM0 # INCR CNT
1205 movdqa \XMM0, \XMM3
1206 paddd ONE(%rip), \XMM0 # INCR CNT
1207 movdqa \XMM0, \XMM4
1208 pshufb %xmm15, \XMM1 # perform a 16 byte swap
1209 pclmulqdq $0x00, \TMP5, \XMM5 # XMM5 = a0*b0
1210 pshufb %xmm15, \XMM2 # perform a 16 byte swap
1211 pshufb %xmm15, \XMM3 # perform a 16 byte swap
1212 pshufb %xmm15, \XMM4 # perform a 16 byte swap
1213
1214 pxor (%arg1), \XMM1
1215 pxor (%arg1), \XMM2
1216 pxor (%arg1), \XMM3
1217 pxor (%arg1), \XMM4
1218 movdqu HashKey_4_k(%arg2), \TMP5
1219 pclmulqdq $0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
1220 movaps 0x10(%arg1), \TMP1
1221 aesenc \TMP1, \XMM1 # Round 1
1222 aesenc \TMP1, \XMM2
1223 aesenc \TMP1, \XMM3
1224 aesenc \TMP1, \XMM4
1225 movaps 0x20(%arg1), \TMP1
1226 aesenc \TMP1, \XMM1 # Round 2
1227 aesenc \TMP1, \XMM2
1228 aesenc \TMP1, \XMM3
1229 aesenc \TMP1, \XMM4
1230 movdqa \XMM6, \TMP1
1231 pshufd $78, \XMM6, \TMP2
1232 pxor \XMM6, \TMP2
1233 movdqu HashKey_3(%arg2), \TMP5
1234 pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
1235 movaps 0x30(%arg1), \TMP3
1236 aesenc \TMP3, \XMM1 # Round 3
1237 aesenc \TMP3, \XMM2
1238 aesenc \TMP3, \XMM3
1239 aesenc \TMP3, \XMM4
1240 pclmulqdq $0x00, \TMP5, \XMM6 # XMM6 = a0*b0
1241 movaps 0x40(%arg1), \TMP3
1242 aesenc \TMP3, \XMM1 # Round 4
1243 aesenc \TMP3, \XMM2
1244 aesenc \TMP3, \XMM3
1245 aesenc \TMP3, \XMM4
1246 movdqu HashKey_3_k(%arg2), \TMP5
1247 pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1248 movaps 0x50(%arg1), \TMP3
1249 aesenc \TMP3, \XMM1 # Round 5
1250 aesenc \TMP3, \XMM2
1251 aesenc \TMP3, \XMM3
1252 aesenc \TMP3, \XMM4
1253 pxor \TMP1, \TMP4
1254# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1255 pxor \XMM6, \XMM5
1256 pxor \TMP2, \TMP6
1257 movdqa \XMM7, \TMP1
1258 pshufd $78, \XMM7, \TMP2
1259 pxor \XMM7, \TMP2
1260 movdqu HashKey_2(%arg2), \TMP5
1261
1262	# Multiply XMM7 by HashKey^2 using Karatsuba
1263
1264 pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1265 movaps 0x60(%arg1), \TMP3
1266 aesenc \TMP3, \XMM1 # Round 6
1267 aesenc \TMP3, \XMM2
1268 aesenc \TMP3, \XMM3
1269 aesenc \TMP3, \XMM4
1270 pclmulqdq $0x00, \TMP5, \XMM7 # XMM7 = a0*b0
1271 movaps 0x70(%arg1), \TMP3
1272 aesenc \TMP3, \XMM1 # Round 7
1273 aesenc \TMP3, \XMM2
1274 aesenc \TMP3, \XMM3
1275 aesenc \TMP3, \XMM4
1276 movdqu HashKey_2_k(%arg2), \TMP5
1277 pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1278 movaps 0x80(%arg1), \TMP3
1279 aesenc \TMP3, \XMM1 # Round 8
1280 aesenc \TMP3, \XMM2
1281 aesenc \TMP3, \XMM3
1282 aesenc \TMP3, \XMM4
1283 pxor \TMP1, \TMP4
1284# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1285 pxor \XMM7, \XMM5
1286 pxor \TMP2, \TMP6
1287
1288 # Multiply XMM8 * HashKey
1289 # XMM8 and TMP5 hold the values for the two operands
1290
1291 movdqa \XMM8, \TMP1
1292 pshufd $78, \XMM8, \TMP2
1293 pxor \XMM8, \TMP2
1294 movdqu HashKey(%arg2), \TMP5
1295 pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1296 movaps 0x90(%arg1), \TMP3
1297 aesenc \TMP3, \XMM1 # Round 9
1298 aesenc \TMP3, \XMM2
1299 aesenc \TMP3, \XMM3
1300 aesenc \TMP3, \XMM4
1301 pclmulqdq $0x00, \TMP5, \XMM8 # XMM8 = a0*b0
1302 lea 0xa0(%arg1),%r10
1303 mov keysize,%eax
1304 shr $2,%eax # 128->4, 192->6, 256->8
1305 sub $4,%eax # 128->0, 192->2, 256->4
1306 jz .Laes_loop_par_dec_done\@
1307
1308.Laes_loop_par_dec\@:
1309 MOVADQ (%r10),\TMP3
1310.irpc index, 1234
1311 aesenc \TMP3, %xmm\index
1312.endr
1313 add $16,%r10
1314 sub $1,%eax
1315 jnz .Laes_loop_par_dec\@
1316
1317.Laes_loop_par_dec_done\@:
1318 MOVADQ (%r10), \TMP3
1319 aesenclast \TMP3, \XMM1 # last round
1320 aesenclast \TMP3, \XMM2
1321 aesenclast \TMP3, \XMM3
1322 aesenclast \TMP3, \XMM4
1323 movdqu HashKey_k(%arg2), \TMP5
1324 pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1325 movdqu (%arg4,%r11,1), \TMP3
1326 pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
1327 movdqu \XMM1, (%arg3,%r11,1) # Write to plaintext buffer
1328 movdqa \TMP3, \XMM1
1329 movdqu 16(%arg4,%r11,1), \TMP3
1330 pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK
1331 movdqu \XMM2, 16(%arg3,%r11,1) # Write to plaintext buffer
1332 movdqa \TMP3, \XMM2
1333 movdqu 32(%arg4,%r11,1), \TMP3
1334 pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK
1335 movdqu \XMM3, 32(%arg3,%r11,1) # Write to plaintext buffer
1336 movdqa \TMP3, \XMM3
1337 movdqu 48(%arg4,%r11,1), \TMP3
1338 pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK
1339 movdqu \XMM4, 48(%arg3,%r11,1) # Write to plaintext buffer
1340 movdqa \TMP3, \XMM4
1341 pshufb %xmm15, \XMM1 # perform a 16 byte swap
1342 pshufb %xmm15, \XMM2 # perform a 16 byte swap
1343 pshufb %xmm15, \XMM3 # perform a 16 byte swap
1344 pshufb %xmm15, \XMM4 # perform a 16 byte swap
1345
1346 pxor \TMP4, \TMP1
1347 pxor \XMM8, \XMM5
1348 pxor \TMP6, \TMP2
1349 pxor \TMP1, \TMP2
1350 pxor \XMM5, \TMP2
1351 movdqa \TMP2, \TMP3
1352 pslldq $8, \TMP3 # left shift TMP3 2 DWs
1353 psrldq $8, \TMP2 # right shift TMP2 2 DWs
1354 pxor \TMP3, \XMM5
1355 pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5
1356
1357 # first phase of reduction
1358
1359 movdqa \XMM5, \TMP2
1360 movdqa \XMM5, \TMP3
1361 movdqa \XMM5, \TMP4
1362# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
1363	pslld $31, \TMP2	# packed left shift << 31
1364	pslld $30, \TMP3	# packed left shift << 30
1365	pslld $25, \TMP4	# packed left shift << 25
1366 pxor \TMP3, \TMP2 # xor the shifted versions
1367 pxor \TMP4, \TMP2
1368 movdqa \TMP2, \TMP5
1369 psrldq $4, \TMP5 # right shift T5 1 DW
1370 pslldq $12, \TMP2 # left shift T2 3 DWs
1371 pxor \TMP2, \XMM5
1372
1373 # second phase of reduction
1374
1375 movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
1376 movdqa \XMM5,\TMP3
1377 movdqa \XMM5,\TMP4
1378	psrld $1, \TMP2		# packed right shift >> 1
1379	psrld $2, \TMP3		# packed right shift >> 2
1380	psrld $7, \TMP4		# packed right shift >> 7
1381 pxor \TMP3,\TMP2 # xor the shifted versions
1382 pxor \TMP4,\TMP2
1383 pxor \TMP5, \TMP2
1384 pxor \TMP2, \XMM5
1385	pxor	\TMP1, \XMM5	# result is in XMM5
1386
1387 pxor \XMM5, \XMM1
1388.endm
1389
1390/* GHASH the last 4 ciphertext blocks. */
1391.macro GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
1392TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
1393
1394	# Multiply XMM1 by HashKey^4 (using Karatsuba)
1395
1396 movdqa \XMM1, \TMP6
1397 pshufd $78, \XMM1, \TMP2
1398 pxor \XMM1, \TMP2
1399 movdqu HashKey_4(%arg2), \TMP5
1400 pclmulqdq $0x11, \TMP5, \TMP6 # TMP6 = a1*b1
1401 pclmulqdq $0x00, \TMP5, \XMM1 # XMM1 = a0*b0
1402 movdqu HashKey_4_k(%arg2), \TMP4
1403 pclmulqdq $0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1404 movdqa \XMM1, \XMMDst
1405 movdqa \TMP2, \XMM1 # result in TMP6, XMMDst, XMM1
1406
1407	# Multiply XMM2 by HashKey^3 (using Karatsuba)
1408
1409 movdqa \XMM2, \TMP1
1410 pshufd $78, \XMM2, \TMP2
1411 pxor \XMM2, \TMP2
1412 movdqu HashKey_3(%arg2), \TMP5
1413 pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1414 pclmulqdq $0x00, \TMP5, \XMM2 # XMM2 = a0*b0
1415 movdqu HashKey_3_k(%arg2), \TMP4
1416 pclmulqdq $0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1417 pxor \TMP1, \TMP6
1418 pxor \XMM2, \XMMDst
1419 pxor \TMP2, \XMM1
1420# results accumulated in TMP6, XMMDst, XMM1
1421
1422	# Multiply XMM3 by HashKey^2 (using Karatsuba)
1423
1424 movdqa \XMM3, \TMP1
1425 pshufd $78, \XMM3, \TMP2
1426 pxor \XMM3, \TMP2
1427 movdqu HashKey_2(%arg2), \TMP5
1428 pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1429 pclmulqdq $0x00, \TMP5, \XMM3 # XMM3 = a0*b0
1430 movdqu HashKey_2_k(%arg2), \TMP4
1431 pclmulqdq $0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1432 pxor \TMP1, \TMP6
1433 pxor \XMM3, \XMMDst
1434 pxor \TMP2, \XMM1 # results accumulated in TMP6, XMMDst, XMM1
1435
1436	# Multiply XMM4 by HashKey (using Karatsuba)
1437 movdqa \XMM4, \TMP1
1438 pshufd $78, \XMM4, \TMP2
1439 pxor \XMM4, \TMP2
1440 movdqu HashKey(%arg2), \TMP5
1441 pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1442 pclmulqdq $0x00, \TMP5, \XMM4 # XMM4 = a0*b0
1443 movdqu HashKey_k(%arg2), \TMP4
1444 pclmulqdq $0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1445 pxor \TMP1, \TMP6
1446 pxor \XMM4, \XMMDst
1447 pxor \XMM1, \TMP2
1448 pxor \TMP6, \TMP2
1449 pxor \XMMDst, \TMP2
1450 # middle section of the temp results combined as in karatsuba algorithm
1451 movdqa \TMP2, \TMP4
1452 pslldq $8, \TMP4 # left shift TMP4 2 DWs
1453 psrldq $8, \TMP2 # right shift TMP2 2 DWs
1454 pxor \TMP4, \XMMDst
1455 pxor \TMP2, \TMP6
1456# TMP6:XMMDst holds the result of the accumulated carry-less multiplications
1457 # first phase of the reduction
1458 movdqa \XMMDst, \TMP2
1459 movdqa \XMMDst, \TMP3
1460 movdqa \XMMDst, \TMP4
1461# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
1462	pslld $31, \TMP2	# packed left shift << 31
1463	pslld $30, \TMP3	# packed left shift << 30
1464	pslld $25, \TMP4	# packed left shift << 25
1465 pxor \TMP3, \TMP2 # xor the shifted versions
1466 pxor \TMP4, \TMP2
1467 movdqa \TMP2, \TMP7
1468 psrldq $4, \TMP7 # right shift TMP7 1 DW
1469 pslldq $12, \TMP2 # left shift TMP2 3 DWs
1470 pxor \TMP2, \XMMDst
1471
1472 # second phase of the reduction
1473 movdqa \XMMDst, \TMP2
1474 # make 3 copies of XMMDst for doing 3 shift operations
1475 movdqa \XMMDst, \TMP3
1476 movdqa \XMMDst, \TMP4
1477	psrld $1, \TMP2		# packed right shift >> 1
1478	psrld $2, \TMP3		# packed right shift >> 2
1479	psrld $7, \TMP4		# packed right shift >> 7
1480 pxor \TMP3, \TMP2 # xor the shifted versions
1481 pxor \TMP4, \TMP2
1482 pxor \TMP7, \TMP2
1483 pxor \TMP2, \XMMDst
1484 pxor \TMP6, \XMMDst # reduced result is in XMMDst
1485.endm
1486
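/*
 * For reference, the multi-block GHASH path above rests on the aggregation
 * identity
 *
 *	Y' = (Y ^ X1)*H^4  ^  X2*H^3  ^  X3*H^2  ^  X4*H
 *
 * which is why HashKey..HashKey_4 are precomputed.  An illustrative C model
 * (not built) follows; gf128_mul() is the straightforward bit-serial
 * GF(2^128) multiply in GCM bit order, where the assembly instead uses
 * Karatsuba plus the two-phase shift/xor reduction seen above.
 *
 *	#include <stdint.h>
 *
 *	typedef struct { uint8_t b[16]; } be128;
 *
 *	static be128 xor128(be128 x, be128 y)
 *	{
 *		for (int i = 0; i < 16; i++)
 *			x.b[i] ^= y.b[i];
 *		return x;
 *	}
 *
 *	// bit-serial GF(2^128) multiply, GCM bit order (bit 0 = MSB of byte 0)
 *	static be128 gf128_mul(be128 x, be128 h)
 *	{
 *		be128 z = { { 0 } }, v = x;
 *
 *		for (int i = 0; i < 128; i++) {
 *			if (h.b[i / 8] & (0x80 >> (i % 8)))
 *				z = xor128(z, v);
 *			int lsb = v.b[15] & 1;
 *			for (int j = 15; j > 0; j--)
 *				v.b[j] = (uint8_t)((v.b[j] >> 1) | (v.b[j - 1] << 7));
 *			v.b[0] >>= 1;
 *			if (lsb)
 *				v.b[0] ^= 0xe1;	// GCM reduction constant
 *		}
 *		return z;
 *	}
 *
 *	// absorb four blocks at once; hpow[] = { H, H^2, H^3, H^4 }
 *	static be128 ghash_4(be128 y, const be128 x[4], const be128 hpow[4])
 *	{
 *		be128 acc = gf128_mul(xor128(y, x[0]), hpow[3]);
 *
 *		acc = xor128(acc, gf128_mul(x[1], hpow[2]));
 *		acc = xor128(acc, gf128_mul(x[2], hpow[1]));
 *		acc = xor128(acc, gf128_mul(x[3], hpow[0]));
 *		return acc;
 *	}
 */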
1487
1488/* Encrypt a single block.
1489* Clobbers %eax and %r10.
1490*/
1491
1492.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1
1493
1494 pxor (%arg1), \XMM0
1495 mov keysize,%eax
1496 shr $2,%eax # 128->4, 192->6, 256->8
1497 add $5,%eax # 128->9, 192->11, 256->13
1498 lea 16(%arg1), %r10 # get first expanded key address
1499
1500_esb_loop_\@:
1501 MOVADQ (%r10),\TMP1
1502 aesenc \TMP1,\XMM0
1503 add $16,%r10
1504 sub $1,%eax
1505 jnz _esb_loop_\@
1506
1507 MOVADQ (%r10),\TMP1
1508 aesenclast \TMP1,\XMM0
1509.endm
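
/*
 * The "shr $2 / add $5" arithmetic above maps the key length in bytes (read
 * through the keysize macro: 16, 24 or 32) to the number of full aesenc
 * rounds before the final aesenclast.  Roughly, in C (illustrative only):
 *
 *	// 16 -> 9, 24 -> 11, 32 -> 13; total AES rounds are 10/12/14
 *	static int aes_middle_rounds(unsigned int key_len_bytes)
 *	{
 *		return key_len_bytes / 4 + 5;
 *	}
 */
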
1510/*****************************************************************************
1511* void aesni_gcm_dec(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1512* struct gcm_context_data *data
1513* // Context data
1514* u8 *out, // Plaintext output. Decrypt in-place is allowed.
1515* const u8 *in, // Ciphertext input
1516* u64 plaintext_len, // Length of data in bytes for decryption.
1517* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
1518* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1519* // concatenated with 0x00000001. 16-byte aligned pointer.
1520* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
1521* const u8 *aad, // Additional Authentication Data (AAD)
1522* u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1523* u8 *auth_tag, // Authenticated Tag output. The driver will compare this to the
1524* // given authentication tag and only return the plaintext if they match.
1525* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
1526* // (most likely), 12 or 8.
1527*
1528* Assumptions:
1529*
1530* keys:
1531* keys are pre-expanded and aligned to 16 bytes. we are using the first
1532* set of 11 keys in the data structure void *aes_ctx
1533*
1534* iv:
1535* 0 1 2 3
1536* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1537* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1538* | Salt (From the SA) |
1539* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1540* | Initialization Vector |
1541* | (This is the sequence number from IPSec header) |
1542* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1543* | 0x1 |
1544* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1545*
1546*
1547*
1548* AAD:
1549* AAD padded to 128 bits with 0
1550* for example, assume AAD is a u32 vector
1551*
1552* if AAD is 8 bytes:
1553* AAD[3] = {A0, A1};
1554* padded AAD in xmm register = {A1 A0 0 0}
1555*
1556* 0 1 2 3
1557* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1558* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1559* | SPI (A1) |
1560* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1561* | 32-bit Sequence Number (A0) |
1562* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1563* | 0x0 |
1564* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1565*
1566* AAD Format with 32-bit Sequence Number
1567*
1568* if AAD is 12 bytes:
1569* AAD[3] = {A0, A1, A2};
1570* padded AAD in xmm register = {A2 A1 A0 0}
1571*
1572* 0 1 2 3
1573* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1574* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1577* | SPI (A2) |
1578* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1579* | 64-bit Extended Sequence Number {A1,A0} |
1580* | |
1581* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1582* | 0x0 |
1583* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1584*
1585* AAD Format with 64-bit Extended Sequence Number
1586*
1587* poly = x^128 + x^127 + x^126 + x^121 + 1
1588*
1589*****************************************************************************/
1590SYM_FUNC_START(aesni_gcm_dec)
1591 FUNC_SAVE
1592
1593 GCM_INIT %arg6, arg7, arg8, arg9
1594 GCM_ENC_DEC dec
1595 GCM_COMPLETE arg10, arg11
1596 FUNC_RESTORE
1597 RET
1598SYM_FUNC_END(aesni_gcm_dec)
1599
1600
1601/*****************************************************************************
1602* void aesni_gcm_enc(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1603* struct gcm_context_data *data
1604* // Context data
1605* u8 *out, // Ciphertext output. Encrypt in-place is allowed.
1606* const u8 *in, // Plaintext input
1607* u64 plaintext_len, // Length of data in bytes for encryption.
1608* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
1609* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1610* // concatenated with 0x00000001. 16-byte aligned pointer.
1611* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
1612* const u8 *aad, // Additional Authentication Data (AAD)
1613* u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1614* u8 *auth_tag, // Authenticated Tag output.
1615* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
1616* // 12 or 8.
1617*
1618* Assumptions:
1619*
1620* keys:
1621* keys are pre-expanded and aligned to 16 bytes. we are using the
1622* first set of 11 keys in the data structure void *aes_ctx
1623*
1624*
1625* iv:
1626* 0 1 2 3
1627* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1628* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1629* | Salt (From the SA) |
1630* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1631* | Initialization Vector |
1632* | (This is the sequence number from IPSec header) |
1633* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1634* | 0x1 |
1635* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1636*
1637*
1638*
1639* AAD:
1640* AAD padded to 128 bits with 0
1641* for example, assume AAD is a u32 vector
1642*
1643* if AAD is 8 bytes:
1644* AAD[3] = {A0, A1};
1645* padded AAD in xmm register = {A1 A0 0 0}
1646*
1647* 0 1 2 3
1648* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1649* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1650* | SPI (A1) |
1651* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1652* | 32-bit Sequence Number (A0) |
1653* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1654* | 0x0 |
1655* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1656*
1657* AAD Format with 32-bit Sequence Number
1658*
1659* if AAD is 12 bytes:
1660* AAD[3] = {A0, A1, A2};
1661* padded AAD in xmm register = {A2 A1 A0 0}
1662*
1663* 0 1 2 3
1664* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1665* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1666* | SPI (A2) |
1667* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1668* | 64-bit Extended Sequence Number {A1,A0} |
1669* | |
1670* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1671* | 0x0 |
1672* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1673*
1674* AAD Format with 64-bit Extended Sequence Number
1675*
1676* poly = x^128 + x^127 + x^126 + x^121 + 1
1677***************************************************************************/
1678SYM_FUNC_START(aesni_gcm_enc)
1679 FUNC_SAVE
1680
1681 GCM_INIT %arg6, arg7, arg8, arg9
1682 GCM_ENC_DEC enc
1683
1684 GCM_COMPLETE arg10, arg11
1685 FUNC_RESTORE
1686 RET
1687SYM_FUNC_END(aesni_gcm_enc)
1688
1689/*****************************************************************************
1690* void aesni_gcm_init(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1691* struct gcm_context_data *data,
1692* // context data
1693* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
1694* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1695* // concatenated with 0x00000001. 16-byte aligned pointer.
1696* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
1697* const u8 *aad, // Additional Authentication Data (AAD)
1698* u64 aad_len) // Length of AAD in bytes.
1699*/
1700SYM_FUNC_START(aesni_gcm_init)
1701 FUNC_SAVE
1702 GCM_INIT %arg3, %arg4,%arg5, %arg6
1703 FUNC_RESTORE
1704 RET
1705SYM_FUNC_END(aesni_gcm_init)
1706
1707/*****************************************************************************
1708* void aesni_gcm_enc_update(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1709* struct gcm_context_data *data,
1710* // context data
1711* u8 *out, // Ciphertext output. Encrypt in-place is allowed.
1712* const u8 *in, // Plaintext input
1713* u64 plaintext_len); // Length of data in bytes for encryption.
1714*/
1715SYM_FUNC_START(aesni_gcm_enc_update)
1716 FUNC_SAVE
1717 GCM_ENC_DEC enc
1718 FUNC_RESTORE
1719 RET
1720SYM_FUNC_END(aesni_gcm_enc_update)
1721
1722/*****************************************************************************
1723* void aesni_gcm_dec_update(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1724* struct gcm_context_data *data,
1725* // context data
1726* u8 *out, // Plaintext output. Decrypt in-place is allowed.
1727* const u8 *in, // Ciphertext input
1728* u64 plaintext_len); // Length of data in bytes for decryption.
1729*/
1730SYM_FUNC_START(aesni_gcm_dec_update)
1731 FUNC_SAVE
1732 GCM_ENC_DEC dec
1733 FUNC_RESTORE
1734 RET
1735SYM_FUNC_END(aesni_gcm_dec_update)
1736
1737/*****************************************************************************
1738* void aesni_gcm_finalize(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1739* struct gcm_context_data *data,
1740* // context data
1741* u8 *auth_tag, // Authenticated Tag output.
1742* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
1743* // 12 or 8.
1744*/
1745SYM_FUNC_START(aesni_gcm_finalize)
1746 FUNC_SAVE
1747 GCM_COMPLETE %arg3 %arg4
1748 FUNC_RESTORE
1749 RET
1750SYM_FUNC_END(aesni_gcm_finalize)
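
/*
 * Taken together, aesni_gcm_init() + N x aesni_gcm_enc_update() (or
 * _dec_update) + aesni_gcm_finalize() perform the same GCM_INIT/GCM_ENC_DEC/
 * GCM_COMPLETE sequence as the monolithic aesni_gcm_enc()/aesni_gcm_dec()
 * entry points above.  An illustrative C calling sketch follows (not built);
 * gcm_context_data is treated as opaque, iv/hash_subkey are assumed to be
 * prepared by the caller as the headers describe, and chunk sizes are kept
 * at multiples of 16 bytes to stay on block boundaries.
 *
 *	#include <stdint.h>
 *	#include <stddef.h>
 *
 *	typedef uint8_t u8;
 *	typedef uint64_t u64;
 *
 *	struct gcm_context_data;	// opaque per-request state
 *
 *	void aesni_gcm_init(void *aes_ctx, struct gcm_context_data *data,
 *			    u8 *iv, u8 *hash_subkey, const u8 *aad, u64 aad_len);
 *	void aesni_gcm_enc_update(void *aes_ctx, struct gcm_context_data *data,
 *				  u8 *out, const u8 *in, u64 plaintext_len);
 *	void aesni_gcm_finalize(void *aes_ctx, struct gcm_context_data *data,
 *				u8 *auth_tag, u64 auth_tag_len);
 *
 *	// encrypt a message delivered in chunks; equivalent to one aesni_gcm_enc()
 *	static void gcm_encrypt_chunked(void *aes_ctx, struct gcm_context_data *data,
 *					u8 *iv, u8 *hash_subkey,
 *					const u8 *aad, u64 aad_len,
 *					u8 *out, const u8 *in, u64 len,
 *					u8 *tag, u64 tag_len, u64 chunk)
 *	{
 *		aesni_gcm_init(aes_ctx, data, iv, hash_subkey, aad, aad_len);
 *		for (u64 off = 0; off < len; off += chunk) {
 *			u64 n = (len - off < chunk) ? len - off : chunk;
 *
 *			aesni_gcm_enc_update(aes_ctx, data, out + off, in + off, n);
 *		}
 *		aesni_gcm_finalize(aes_ctx, data, tag, tag_len);
 *	}
 */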
1751
1752#endif
1753
1754SYM_FUNC_START_LOCAL(_key_expansion_256a)
1755 pshufd $0b11111111, %xmm1, %xmm1
1756 shufps $0b00010000, %xmm0, %xmm4
1757 pxor %xmm4, %xmm0
1758 shufps $0b10001100, %xmm0, %xmm4
1759 pxor %xmm4, %xmm0
1760 pxor %xmm1, %xmm0
1761 movaps %xmm0, (TKEYP)
1762 add $0x10, TKEYP
1763 RET
1764SYM_FUNC_END(_key_expansion_256a)
1765SYM_FUNC_ALIAS_LOCAL(_key_expansion_128, _key_expansion_256a)
1766
1767SYM_FUNC_START_LOCAL(_key_expansion_192a)
1768 pshufd $0b01010101, %xmm1, %xmm1
1769 shufps $0b00010000, %xmm0, %xmm4
1770 pxor %xmm4, %xmm0
1771 shufps $0b10001100, %xmm0, %xmm4
1772 pxor %xmm4, %xmm0
1773 pxor %xmm1, %xmm0
1774
1775 movaps %xmm2, %xmm5
1776 movaps %xmm2, %xmm6
1777 pslldq $4, %xmm5
1778 pshufd $0b11111111, %xmm0, %xmm3
1779 pxor %xmm3, %xmm2
1780 pxor %xmm5, %xmm2
1781
1782 movaps %xmm0, %xmm1
1783 shufps $0b01000100, %xmm0, %xmm6
1784 movaps %xmm6, (TKEYP)
1785 shufps $0b01001110, %xmm2, %xmm1
1786 movaps %xmm1, 0x10(TKEYP)
1787 add $0x20, TKEYP
1788 RET
1789SYM_FUNC_END(_key_expansion_192a)
1790
1791SYM_FUNC_START_LOCAL(_key_expansion_192b)
1792 pshufd $0b01010101, %xmm1, %xmm1
1793 shufps $0b00010000, %xmm0, %xmm4
1794 pxor %xmm4, %xmm0
1795 shufps $0b10001100, %xmm0, %xmm4
1796 pxor %xmm4, %xmm0
1797 pxor %xmm1, %xmm0
1798
1799 movaps %xmm2, %xmm5
1800 pslldq $4, %xmm5
1801 pshufd $0b11111111, %xmm0, %xmm3
1802 pxor %xmm3, %xmm2
1803 pxor %xmm5, %xmm2
1804
1805 movaps %xmm0, (TKEYP)
1806 add $0x10, TKEYP
1807 RET
1808SYM_FUNC_END(_key_expansion_192b)
1809
1810SYM_FUNC_START_LOCAL(_key_expansion_256b)
1811 pshufd $0b10101010, %xmm1, %xmm1
1812 shufps $0b00010000, %xmm2, %xmm4
1813 pxor %xmm4, %xmm2
1814 shufps $0b10001100, %xmm2, %xmm4
1815 pxor %xmm4, %xmm2
1816 pxor %xmm1, %xmm2
1817 movaps %xmm2, (TKEYP)
1818 add $0x10, TKEYP
1819 RET
1820SYM_FUNC_END(_key_expansion_256b)
1821
1822/*
1823 * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
1824 * unsigned int key_len)
1825 */
1826SYM_FUNC_START(aesni_set_key)
1827 FRAME_BEGIN
1828#ifndef __x86_64__
1829 pushl KEYP
1830 movl (FRAME_OFFSET+8)(%esp), KEYP # ctx
1831 movl (FRAME_OFFSET+12)(%esp), UKEYP # in_key
1832 movl (FRAME_OFFSET+16)(%esp), %edx # key_len
1833#endif
1834 movups (UKEYP), %xmm0 # user key (first 16 bytes)
1835 movaps %xmm0, (KEYP)
1836 lea 0x10(KEYP), TKEYP # key addr
1837 movl %edx, 480(KEYP)
1838 pxor %xmm4, %xmm4 # xmm4 is assumed 0 in _key_expansion_x
1839 cmp $24, %dl
1840 jb .Lenc_key128
1841 je .Lenc_key192
1842 movups 0x10(UKEYP), %xmm2 # other user key
1843 movaps %xmm2, (TKEYP)
1844 add $0x10, TKEYP
1845 aeskeygenassist $0x1, %xmm2, %xmm1 # round 1
1846 call _key_expansion_256a
1847 aeskeygenassist $0x1, %xmm0, %xmm1
1848 call _key_expansion_256b
1849 aeskeygenassist $0x2, %xmm2, %xmm1 # round 2
1850 call _key_expansion_256a
1851 aeskeygenassist $0x2, %xmm0, %xmm1
1852 call _key_expansion_256b
1853 aeskeygenassist $0x4, %xmm2, %xmm1 # round 3
1854 call _key_expansion_256a
1855 aeskeygenassist $0x4, %xmm0, %xmm1
1856 call _key_expansion_256b
1857 aeskeygenassist $0x8, %xmm2, %xmm1 # round 4
1858 call _key_expansion_256a
1859 aeskeygenassist $0x8, %xmm0, %xmm1
1860 call _key_expansion_256b
1861 aeskeygenassist $0x10, %xmm2, %xmm1 # round 5
1862 call _key_expansion_256a
1863 aeskeygenassist $0x10, %xmm0, %xmm1
1864 call _key_expansion_256b
1865 aeskeygenassist $0x20, %xmm2, %xmm1 # round 6
1866 call _key_expansion_256a
1867 aeskeygenassist $0x20, %xmm0, %xmm1
1868 call _key_expansion_256b
1869 aeskeygenassist $0x40, %xmm2, %xmm1 # round 7
1870 call _key_expansion_256a
1871 jmp .Ldec_key
1872.Lenc_key192:
1873 movq 0x10(UKEYP), %xmm2 # other user key
1874 aeskeygenassist $0x1, %xmm2, %xmm1 # round 1
1875 call _key_expansion_192a
1876 aeskeygenassist $0x2, %xmm2, %xmm1 # round 2
1877 call _key_expansion_192b
1878 aeskeygenassist $0x4, %xmm2, %xmm1 # round 3
1879 call _key_expansion_192a
1880 aeskeygenassist $0x8, %xmm2, %xmm1 # round 4
1881 call _key_expansion_192b
1882 aeskeygenassist $0x10, %xmm2, %xmm1 # round 5
1883 call _key_expansion_192a
1884 aeskeygenassist $0x20, %xmm2, %xmm1 # round 6
1885 call _key_expansion_192b
1886 aeskeygenassist $0x40, %xmm2, %xmm1 # round 7
1887 call _key_expansion_192a
1888 aeskeygenassist $0x80, %xmm2, %xmm1 # round 8
1889 call _key_expansion_192b
1890 jmp .Ldec_key
1891.Lenc_key128:
1892 aeskeygenassist $0x1, %xmm0, %xmm1 # round 1
1893 call _key_expansion_128
1894 aeskeygenassist $0x2, %xmm0, %xmm1 # round 2
1895 call _key_expansion_128
1896 aeskeygenassist $0x4, %xmm0, %xmm1 # round 3
1897 call _key_expansion_128
1898 aeskeygenassist $0x8, %xmm0, %xmm1 # round 4
1899 call _key_expansion_128
1900 aeskeygenassist $0x10, %xmm0, %xmm1 # round 5
1901 call _key_expansion_128
1902 aeskeygenassist $0x20, %xmm0, %xmm1 # round 6
1903 call _key_expansion_128
1904 aeskeygenassist $0x40, %xmm0, %xmm1 # round 7
1905 call _key_expansion_128
1906 aeskeygenassist $0x80, %xmm0, %xmm1 # round 8
1907 call _key_expansion_128
1908 aeskeygenassist $0x1b, %xmm0, %xmm1 # round 9
1909 call _key_expansion_128
1910 aeskeygenassist $0x36, %xmm0, %xmm1 # round 10
1911 call _key_expansion_128
1912.Ldec_key:
1913 sub $0x10, TKEYP
1914 movaps (KEYP), %xmm0
1915 movaps (TKEYP), %xmm1
1916 movaps %xmm0, 240(TKEYP)
1917 movaps %xmm1, 240(KEYP)
1918 add $0x10, KEYP
1919 lea 240-16(TKEYP), UKEYP
1920.align 4
1921.Ldec_key_loop:
1922 movaps (KEYP), %xmm0
1923 aesimc %xmm0, %xmm1
1924 movaps %xmm1, (UKEYP)
1925 add $0x10, KEYP
1926 sub $0x10, UKEYP
1927 cmp TKEYP, KEYP
1928 jb .Ldec_key_loop
1929 xor AREG, AREG
1930#ifndef __x86_64__
1931 popl KEYP
1932#endif
1933 FRAME_END
1934 RET
1935SYM_FUNC_END(aesni_set_key)
1936
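/*
 * Each _key_expansion_* helper combines the aeskeygenassist output with the
 * previous round key; the two shufps/pxor pairs compute a running (prefix)
 * XOR of the four key words, and .Ldec_key then builds the decryption
 * schedule by reversing the round keys and applying aesimc to the inner ones.
 * The textbook-equivalent AES-128 step, written with compiler intrinsics, is
 * sketched below for reference (illustrative only, not a line-for-line
 * transcription of the helpers):
 *
 *	#include <wmmintrin.h>		// _mm_aeskeygenassist_si128
 *	#include <emmintrin.h>
 *
 *	static __m128i aes128_key_step(__m128i key, __m128i keygened)
 *	{
 *		keygened = _mm_shuffle_epi32(keygened, 0xff); // broadcast RotWord(SubWord(w3)) ^ rcon
 *		key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
 *		key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
 *		key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
 *		return _mm_xor_si128(key, keygened);
 *	}
 *
 *	// mirrors .Lenc_key128: rcon runs 0x01,0x02,...,0x80,0x1b,0x36
 *	static void aes128_expand(__m128i rk[11], __m128i user_key)
 *	{
 *		rk[0]  = user_key;
 *		rk[1]  = aes128_key_step(rk[0], _mm_aeskeygenassist_si128(rk[0], 0x01));
 *		rk[2]  = aes128_key_step(rk[1], _mm_aeskeygenassist_si128(rk[1], 0x02));
 *		rk[3]  = aes128_key_step(rk[2], _mm_aeskeygenassist_si128(rk[2], 0x04));
 *		rk[4]  = aes128_key_step(rk[3], _mm_aeskeygenassist_si128(rk[3], 0x08));
 *		rk[5]  = aes128_key_step(rk[4], _mm_aeskeygenassist_si128(rk[4], 0x10));
 *		rk[6]  = aes128_key_step(rk[5], _mm_aeskeygenassist_si128(rk[5], 0x20));
 *		rk[7]  = aes128_key_step(rk[6], _mm_aeskeygenassist_si128(rk[6], 0x40));
 *		rk[8]  = aes128_key_step(rk[7], _mm_aeskeygenassist_si128(rk[7], 0x80));
 *		rk[9]  = aes128_key_step(rk[8], _mm_aeskeygenassist_si128(rk[8], 0x1b));
 *		rk[10] = aes128_key_step(rk[9], _mm_aeskeygenassist_si128(rk[9], 0x36));
 *	}
 *
 * (The immediate of _mm_aeskeygenassist_si128 must be a compile-time
 * constant, which is why the rounds are written out rather than looped.)
 */
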
1937/*
1938 * void aesni_enc(const void *ctx, u8 *dst, const u8 *src)
1939 */
1940SYM_FUNC_START(aesni_enc)
1941 FRAME_BEGIN
1942#ifndef __x86_64__
1943 pushl KEYP
1944 pushl KLEN
1945 movl (FRAME_OFFSET+12)(%esp), KEYP # ctx
1946 movl (FRAME_OFFSET+16)(%esp), OUTP # dst
1947 movl (FRAME_OFFSET+20)(%esp), INP # src
1948#endif
1949 movl 480(KEYP), KLEN # key length
1950 movups (INP), STATE # input
1951 call _aesni_enc1
1952 movups STATE, (OUTP) # output
1953#ifndef __x86_64__
1954 popl KLEN
1955 popl KEYP
1956#endif
1957 FRAME_END
1958 RET
1959SYM_FUNC_END(aesni_enc)
1960
1961/*
1962 * _aesni_enc1: internal ABI
1963 * input:
1964 * KEYP: key struct pointer
1965 * KLEN: key length
1966 * STATE: initial state (input)
1967 * output:
1968 * STATE: final state (output)
1969 * changed:
1970 * KEY
1971 * TKEYP (T1)
1972 */
1973SYM_FUNC_START_LOCAL(_aesni_enc1)
1974 movaps (KEYP), KEY # key
1975 mov KEYP, TKEYP
1976 pxor KEY, STATE # round 0
1977 add $0x30, TKEYP
1978 cmp $24, KLEN
1979 jb .Lenc128
1980 lea 0x20(TKEYP), TKEYP
1981 je .Lenc192
1982 add $0x20, TKEYP
1983 movaps -0x60(TKEYP), KEY
1984 aesenc KEY, STATE
1985 movaps -0x50(TKEYP), KEY
1986 aesenc KEY, STATE
1987.align 4
1988.Lenc192:
1989 movaps -0x40(TKEYP), KEY
1990 aesenc KEY, STATE
1991 movaps -0x30(TKEYP), KEY
1992 aesenc KEY, STATE
1993.align 4
1994.Lenc128:
1995 movaps -0x20(TKEYP), KEY
1996 aesenc KEY, STATE
1997 movaps -0x10(TKEYP), KEY
1998 aesenc KEY, STATE
1999 movaps (TKEYP), KEY
2000 aesenc KEY, STATE
2001 movaps 0x10(TKEYP), KEY
2002 aesenc KEY, STATE
2003 movaps 0x20(TKEYP), KEY
2004 aesenc KEY, STATE
2005 movaps 0x30(TKEYP), KEY
2006 aesenc KEY, STATE
2007 movaps 0x40(TKEYP), KEY
2008 aesenc KEY, STATE
2009 movaps 0x50(TKEYP), KEY
2010 aesenc KEY, STATE
2011 movaps 0x60(TKEYP), KEY
2012 aesenc KEY, STATE
2013 movaps 0x70(TKEYP), KEY
2014 aesenclast KEY, STATE
2015 RET
2016SYM_FUNC_END(_aesni_enc1)
2017
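/*
 * _aesni_enc1 above is the standard single-block flow: XOR with the round-0
 * (whitening) key, a key-length-dependent number of aesenc rounds, then one
 * aesenclast; the negative TKEYP offsets let the 256/192/128 paths share the
 * common tail.  The same flow in intrinsics, for reference only:
 *
 *	#include <wmmintrin.h>
 *
 *	// nrounds is 10, 12 or 14; rk[] holds nrounds + 1 round keys
 *	static __m128i aes_enc_block(__m128i block, const __m128i *rk, int nrounds)
 *	{
 *		block = _mm_xor_si128(block, rk[0]);		// round 0
 *		for (int i = 1; i < nrounds; i++)
 *			block = _mm_aesenc_si128(block, rk[i]);
 *		return _mm_aesenclast_si128(block, rk[nrounds]);
 *	}
 */
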
2018/*
2019 * _aesni_enc4: internal ABI
2020 * input:
2021 * KEYP: key struct pointer
2022 * KLEN: key length
2023 * STATE1: initial state (input)
2024 * STATE2
2025 * STATE3
2026 * STATE4
2027 * output:
2028 * STATE1: final state (output)
2029 * STATE2
2030 * STATE3
2031 * STATE4
2032 * changed:
2033 * KEY
2034 * TKEYP (T1)
2035 */
2036SYM_FUNC_START_LOCAL(_aesni_enc4)
2037 movaps (KEYP), KEY # key
2038 mov KEYP, TKEYP
2039 pxor KEY, STATE1 # round 0
2040 pxor KEY, STATE2
2041 pxor KEY, STATE3
2042 pxor KEY, STATE4
2043 add $0x30, TKEYP
2044 cmp $24, KLEN
2045 jb .L4enc128
2046 lea 0x20(TKEYP), TKEYP
2047 je .L4enc192
2048 add $0x20, TKEYP
2049 movaps -0x60(TKEYP), KEY
2050 aesenc KEY, STATE1
2051 aesenc KEY, STATE2
2052 aesenc KEY, STATE3
2053 aesenc KEY, STATE4
2054 movaps -0x50(TKEYP), KEY
2055 aesenc KEY, STATE1
2056 aesenc KEY, STATE2
2057 aesenc KEY, STATE3
2058 aesenc KEY, STATE4
2059#.align 4
2060.L4enc192:
2061 movaps -0x40(TKEYP), KEY
2062 aesenc KEY, STATE1
2063 aesenc KEY, STATE2
2064 aesenc KEY, STATE3
2065 aesenc KEY, STATE4
2066 movaps -0x30(TKEYP), KEY
2067 aesenc KEY, STATE1
2068 aesenc KEY, STATE2
2069 aesenc KEY, STATE3
2070 aesenc KEY, STATE4
2071#.align 4
2072.L4enc128:
2073 movaps -0x20(TKEYP), KEY
2074 aesenc KEY, STATE1
2075 aesenc KEY, STATE2
2076 aesenc KEY, STATE3
2077 aesenc KEY, STATE4
2078 movaps -0x10(TKEYP), KEY
2079 aesenc KEY, STATE1
2080 aesenc KEY, STATE2
2081 aesenc KEY, STATE3
2082 aesenc KEY, STATE4
2083 movaps (TKEYP), KEY
2084 aesenc KEY, STATE1
2085 aesenc KEY, STATE2
2086 aesenc KEY, STATE3
2087 aesenc KEY, STATE4
2088 movaps 0x10(TKEYP), KEY
2089 aesenc KEY, STATE1
2090 aesenc KEY, STATE2
2091 aesenc KEY, STATE3
2092 aesenc KEY, STATE4
2093 movaps 0x20(TKEYP), KEY
2094 aesenc KEY, STATE1
2095 aesenc KEY, STATE2
2096 aesenc KEY, STATE3
2097 aesenc KEY, STATE4
2098 movaps 0x30(TKEYP), KEY
2099 aesenc KEY, STATE1
2100 aesenc KEY, STATE2
2101 aesenc KEY, STATE3
2102 aesenc KEY, STATE4
2103 movaps 0x40(TKEYP), KEY
2104 aesenc KEY, STATE1
2105 aesenc KEY, STATE2
2106 aesenc KEY, STATE3
2107 aesenc KEY, STATE4
2108 movaps 0x50(TKEYP), KEY
2109 aesenc KEY, STATE1
2110 aesenc KEY, STATE2
2111 aesenc KEY, STATE3
2112 aesenc KEY, STATE4
2113 movaps 0x60(TKEYP), KEY
2114 aesenc KEY, STATE1
2115 aesenc KEY, STATE2
2116 aesenc KEY, STATE3
2117 aesenc KEY, STATE4
2118 movaps 0x70(TKEYP), KEY
2119 aesenclast KEY, STATE1 # last round
2120 aesenclast KEY, STATE2
2121 aesenclast KEY, STATE3
2122 aesenclast KEY, STATE4
2123 RET
2124SYM_FUNC_END(_aesni_enc4)
2125
2126/*
2127 * void aesni_dec (const void *ctx, u8 *dst, const u8 *src)
2128 */
2129SYM_FUNC_START(aesni_dec)
2130 FRAME_BEGIN
2131#ifndef __x86_64__
2132 pushl KEYP
2133 pushl KLEN
2134 movl (FRAME_OFFSET+12)(%esp), KEYP # ctx
2135 movl (FRAME_OFFSET+16)(%esp), OUTP # dst
2136 movl (FRAME_OFFSET+20)(%esp), INP # src
2137#endif
2138 mov 480(KEYP), KLEN # key length
2139 add $240, KEYP
2140 movups (INP), STATE # input
2141 call _aesni_dec1
2142 movups STATE, (OUTP) #output
2143#ifndef __x86_64__
2144 popl KLEN
2145 popl KEYP
2146#endif
2147 FRAME_END
2148 RET
2149SYM_FUNC_END(aesni_dec)
2150
2151/*
2152 * _aesni_dec1: internal ABI
2153 * input:
2154 * KEYP: key struct pointer
2155 * KLEN: key length
2156 * STATE: initial state (input)
2157 * output:
2158 * STATE: final state (output)
2159 * changed:
2160 * KEY
2161 * TKEYP (T1)
2162 */
2163SYM_FUNC_START_LOCAL(_aesni_dec1)
2164 movaps (KEYP), KEY # key
2165 mov KEYP, TKEYP
2166 pxor KEY, STATE # round 0
2167 add $0x30, TKEYP
2168 cmp $24, KLEN
2169 jb .Ldec128
2170 lea 0x20(TKEYP), TKEYP
2171 je .Ldec192
2172 add $0x20, TKEYP
2173 movaps -0x60(TKEYP), KEY
2174 aesdec KEY, STATE
2175 movaps -0x50(TKEYP), KEY
2176 aesdec KEY, STATE
2177.align 4
2178.Ldec192:
2179 movaps -0x40(TKEYP), KEY
2180 aesdec KEY, STATE
2181 movaps -0x30(TKEYP), KEY
2182 aesdec KEY, STATE
2183.align 4
2184.Ldec128:
2185 movaps -0x20(TKEYP), KEY
2186 aesdec KEY, STATE
2187 movaps -0x10(TKEYP), KEY
2188 aesdec KEY, STATE
2189 movaps (TKEYP), KEY
2190 aesdec KEY, STATE
2191 movaps 0x10(TKEYP), KEY
2192 aesdec KEY, STATE
2193 movaps 0x20(TKEYP), KEY
2194 aesdec KEY, STATE
2195 movaps 0x30(TKEYP), KEY
2196 aesdec KEY, STATE
2197 movaps 0x40(TKEYP), KEY
2198 aesdec KEY, STATE
2199 movaps 0x50(TKEYP), KEY
2200 aesdec KEY, STATE
2201 movaps 0x60(TKEYP), KEY
2202 aesdec KEY, STATE
2203 movaps 0x70(TKEYP), KEY
2204 aesdeclast KEY, STATE
2205 RET
2206SYM_FUNC_END(_aesni_dec1)
2207
2208/*
2209 * _aesni_dec4: internal ABI
2210 * input:
2211 * KEYP: key struct pointer
2212 * KLEN: key length
2213 * STATE1: initial state (input)
2214 * STATE2
2215 * STATE3
2216 * STATE4
2217 * output:
2218 * STATE1: final state (output)
2219 * STATE2
2220 * STATE3
2221 * STATE4
2222 * changed:
2223 * KEY
2224 * TKEYP (T1)
2225 */
2226SYM_FUNC_START_LOCAL(_aesni_dec4)
2227 movaps (KEYP), KEY # key
2228 mov KEYP, TKEYP
2229 pxor KEY, STATE1 # round 0
2230 pxor KEY, STATE2
2231 pxor KEY, STATE3
2232 pxor KEY, STATE4
2233 add $0x30, TKEYP
2234 cmp $24, KLEN
2235 jb .L4dec128
2236 lea 0x20(TKEYP), TKEYP
2237 je .L4dec192
2238 add $0x20, TKEYP
2239 movaps -0x60(TKEYP), KEY
2240 aesdec KEY, STATE1
2241 aesdec KEY, STATE2
2242 aesdec KEY, STATE3
2243 aesdec KEY, STATE4
2244 movaps -0x50(TKEYP), KEY
2245 aesdec KEY, STATE1
2246 aesdec KEY, STATE2
2247 aesdec KEY, STATE3
2248 aesdec KEY, STATE4
2249.align 4
2250.L4dec192:
2251 movaps -0x40(TKEYP), KEY
2252 aesdec KEY, STATE1
2253 aesdec KEY, STATE2
2254 aesdec KEY, STATE3
2255 aesdec KEY, STATE4
2256 movaps -0x30(TKEYP), KEY
2257 aesdec KEY, STATE1
2258 aesdec KEY, STATE2
2259 aesdec KEY, STATE3
2260 aesdec KEY, STATE4
2261.align 4
2262.L4dec128:
2263 movaps -0x20(TKEYP), KEY
2264 aesdec KEY, STATE1
2265 aesdec KEY, STATE2
2266 aesdec KEY, STATE3
2267 aesdec KEY, STATE4
2268 movaps -0x10(TKEYP), KEY
2269 aesdec KEY, STATE1
2270 aesdec KEY, STATE2
2271 aesdec KEY, STATE3
2272 aesdec KEY, STATE4
2273 movaps (TKEYP), KEY
2274 aesdec KEY, STATE1
2275 aesdec KEY, STATE2
2276 aesdec KEY, STATE3
2277 aesdec KEY, STATE4
2278 movaps 0x10(TKEYP), KEY
2279 aesdec KEY, STATE1
2280 aesdec KEY, STATE2
2281 aesdec KEY, STATE3
2282 aesdec KEY, STATE4
2283 movaps 0x20(TKEYP), KEY
2284 aesdec KEY, STATE1
2285 aesdec KEY, STATE2
2286 aesdec KEY, STATE3
2287 aesdec KEY, STATE4
2288 movaps 0x30(TKEYP), KEY
2289 aesdec KEY, STATE1
2290 aesdec KEY, STATE2
2291 aesdec KEY, STATE3
2292 aesdec KEY, STATE4
2293 movaps 0x40(TKEYP), KEY
2294 aesdec KEY, STATE1
2295 aesdec KEY, STATE2
2296 aesdec KEY, STATE3
2297 aesdec KEY, STATE4
2298 movaps 0x50(TKEYP), KEY
2299 aesdec KEY, STATE1
2300 aesdec KEY, STATE2
2301 aesdec KEY, STATE3
2302 aesdec KEY, STATE4
2303 movaps 0x60(TKEYP), KEY
2304 aesdec KEY, STATE1
2305 aesdec KEY, STATE2
2306 aesdec KEY, STATE3
2307 aesdec KEY, STATE4
2308 movaps 0x70(TKEYP), KEY
2309 aesdeclast KEY, STATE1 # last round
2310 aesdeclast KEY, STATE2
2311 aesdeclast KEY, STATE3
2312 aesdeclast KEY, STATE4
2313 RET
2314SYM_FUNC_END(_aesni_dec4)
2315
2316/*
2317 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2318 * size_t len)
2319 */
2320SYM_FUNC_START(aesni_ecb_enc)
2321 FRAME_BEGIN
2322#ifndef __x86_64__
2323 pushl LEN
2324 pushl KEYP
2325 pushl KLEN
2326 movl (FRAME_OFFSET+16)(%esp), KEYP # ctx
2327 movl (FRAME_OFFSET+20)(%esp), OUTP # dst
2328 movl (FRAME_OFFSET+24)(%esp), INP # src
2329 movl (FRAME_OFFSET+28)(%esp), LEN # len
2330#endif
2331 test LEN, LEN # check length
2332 jz .Lecb_enc_ret
2333 mov 480(KEYP), KLEN
2334 cmp $16, LEN
2335 jb .Lecb_enc_ret
2336 cmp $64, LEN
2337 jb .Lecb_enc_loop1
2338.align 4
2339.Lecb_enc_loop4:
2340 movups (INP), STATE1
2341 movups 0x10(INP), STATE2
2342 movups 0x20(INP), STATE3
2343 movups 0x30(INP), STATE4
2344 call _aesni_enc4
2345 movups STATE1, (OUTP)
2346 movups STATE2, 0x10(OUTP)
2347 movups STATE3, 0x20(OUTP)
2348 movups STATE4, 0x30(OUTP)
2349 sub $64, LEN
2350 add $64, INP
2351 add $64, OUTP
2352 cmp $64, LEN
2353 jge .Lecb_enc_loop4
2354 cmp $16, LEN
2355 jb .Lecb_enc_ret
2356.align 4
2357.Lecb_enc_loop1:
2358 movups (INP), STATE1
2359 call _aesni_enc1
2360 movups STATE1, (OUTP)
2361 sub $16, LEN
2362 add $16, INP
2363 add $16, OUTP
2364 cmp $16, LEN
2365 jge .Lecb_enc_loop1
2366.Lecb_enc_ret:
2367#ifndef __x86_64__
2368 popl KLEN
2369 popl KEYP
2370 popl LEN
2371#endif
2372 FRAME_END
2373 RET
2374SYM_FUNC_END(aesni_ecb_enc)
2375
2376/*
2377 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2378 * size_t len);
2379 */
2380SYM_FUNC_START(aesni_ecb_dec)
2381 FRAME_BEGIN
2382#ifndef __x86_64__
2383 pushl LEN
2384 pushl KEYP
2385 pushl KLEN
2386 movl (FRAME_OFFSET+16)(%esp), KEYP # ctx
2387 movl (FRAME_OFFSET+20)(%esp), OUTP # dst
2388 movl (FRAME_OFFSET+24)(%esp), INP # src
2389 movl (FRAME_OFFSET+28)(%esp), LEN # len
2390#endif
2391 test LEN, LEN
2392 jz .Lecb_dec_ret
2393 mov 480(KEYP), KLEN
2394 add $240, KEYP
2395 cmp $16, LEN
2396 jb .Lecb_dec_ret
2397 cmp $64, LEN
2398 jb .Lecb_dec_loop1
2399.align 4
2400.Lecb_dec_loop4:
2401 movups (INP), STATE1
2402 movups 0x10(INP), STATE2
2403 movups 0x20(INP), STATE3
2404 movups 0x30(INP), STATE4
2405 call _aesni_dec4
2406 movups STATE1, (OUTP)
2407 movups STATE2, 0x10(OUTP)
2408 movups STATE3, 0x20(OUTP)
2409 movups STATE4, 0x30(OUTP)
2410 sub $64, LEN
2411 add $64, INP
2412 add $64, OUTP
2413 cmp $64, LEN
2414 jge .Lecb_dec_loop4
2415 cmp $16, LEN
2416 jb .Lecb_dec_ret
2417.align 4
2418.Lecb_dec_loop1:
2419 movups (INP), STATE1
2420 call _aesni_dec1
2421 movups STATE1, (OUTP)
2422 sub $16, LEN
2423 add $16, INP
2424 add $16, OUTP
2425 cmp $16, LEN
2426 jge .Lecb_dec_loop1
2427.Lecb_dec_ret:
2428#ifndef __x86_64__
2429 popl KLEN
2430 popl KEYP
2431 popl LEN
2432#endif
2433 FRAME_END
2434 RET
2435SYM_FUNC_END(aesni_ecb_dec)
2436
2437/*
2438 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2439 * size_t len, u8 *iv)
2440 */
2441SYM_FUNC_START(aesni_cbc_enc)
2442 FRAME_BEGIN
2443#ifndef __x86_64__
2444 pushl IVP
2445 pushl LEN
2446 pushl KEYP
2447 pushl KLEN
2448 movl (FRAME_OFFSET+20)(%esp), KEYP # ctx
2449 movl (FRAME_OFFSET+24)(%esp), OUTP # dst
2450 movl (FRAME_OFFSET+28)(%esp), INP # src
2451 movl (FRAME_OFFSET+32)(%esp), LEN # len
2452 movl (FRAME_OFFSET+36)(%esp), IVP # iv
2453#endif
2454 cmp $16, LEN
2455 jb .Lcbc_enc_ret
2456 mov 480(KEYP), KLEN
2457 movups (IVP), STATE # load iv as initial state
2458.align 4
2459.Lcbc_enc_loop:
2460 movups (INP), IN # load input
2461 pxor IN, STATE
2462 call _aesni_enc1
2463 movups STATE, (OUTP) # store output
2464 sub $16, LEN
2465 add $16, INP
2466 add $16, OUTP
2467 cmp $16, LEN
2468 jge .Lcbc_enc_loop
2469 movups STATE, (IVP)
2470.Lcbc_enc_ret:
2471#ifndef __x86_64__
2472 popl KLEN
2473 popl KEYP
2474 popl LEN
2475 popl IVP
2476#endif
2477 FRAME_END
2478 RET
2479SYM_FUNC_END(aesni_cbc_enc)
2480
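/*
 * aesni_cbc_enc is the classic serial CBC chain: the running state starts as
 * the IV, each plaintext block is XORed in and encrypted, and the final state
 * is written back through IVP so a later call can continue the chain.  A
 * hedged C sketch against the aesni_enc() prototype documented above
 * (illustrative only; len is assumed to be a multiple of 16 here):
 *
 *	#include <stdint.h>
 *	#include <string.h>
 *
 *	typedef uint8_t u8;
 *
 *	void aesni_enc(const void *ctx, u8 *dst, const u8 *src);
 *
 *	static void cbc_encrypt(const void *ctx, u8 *dst, const u8 *src,
 *				size_t len, u8 iv[16])
 *	{
 *		u8 state[16];
 *
 *		memcpy(state, iv, 16);
 *		for (size_t off = 0; off + 16 <= len; off += 16) {
 *			for (int i = 0; i < 16; i++)
 *				state[i] ^= src[off + i];	// pxor IN, STATE
 *			aesni_enc(ctx, state, state);		// STATE = E_k(STATE)
 *			memcpy(dst + off, state, 16);
 *		}
 *		memcpy(iv, state, 16);				// chain into the next call
 *	}
 */
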
2481/*
2482 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2483 * size_t len, u8 *iv)
2484 */
2485SYM_FUNC_START(aesni_cbc_dec)
2486 FRAME_BEGIN
2487#ifndef __x86_64__
2488 pushl IVP
2489 pushl LEN
2490 pushl KEYP
2491 pushl KLEN
2492 movl (FRAME_OFFSET+20)(%esp), KEYP # ctx
2493 movl (FRAME_OFFSET+24)(%esp), OUTP # dst
2494 movl (FRAME_OFFSET+28)(%esp), INP # src
2495 movl (FRAME_OFFSET+32)(%esp), LEN # len
2496 movl (FRAME_OFFSET+36)(%esp), IVP # iv
2497#endif
2498 cmp $16, LEN
2499 jb .Lcbc_dec_just_ret
2500 mov 480(KEYP), KLEN
2501 add $240, KEYP
2502 movups (IVP), IV
2503 cmp $64, LEN
2504 jb .Lcbc_dec_loop1
2505.align 4
2506.Lcbc_dec_loop4:
2507 movups (INP), IN1
2508 movaps IN1, STATE1
2509 movups 0x10(INP), IN2
2510 movaps IN2, STATE2
2511#ifdef __x86_64__
2512 movups 0x20(INP), IN3
2513 movaps IN3, STATE3
2514 movups 0x30(INP), IN4
2515 movaps IN4, STATE4
2516#else
2517 movups 0x20(INP), IN1
2518 movaps IN1, STATE3
2519 movups 0x30(INP), IN2
2520 movaps IN2, STATE4
2521#endif
2522 call _aesni_dec4
2523 pxor IV, STATE1
2524#ifdef __x86_64__
2525 pxor IN1, STATE2
2526 pxor IN2, STATE3
2527 pxor IN3, STATE4
2528 movaps IN4, IV
2529#else
2530 pxor IN1, STATE4
2531 movaps IN2, IV
2532 movups (INP), IN1
2533 pxor IN1, STATE2
2534 movups 0x10(INP), IN2
2535 pxor IN2, STATE3
2536#endif
2537 movups STATE1, (OUTP)
2538 movups STATE2, 0x10(OUTP)
2539 movups STATE3, 0x20(OUTP)
2540 movups STATE4, 0x30(OUTP)
2541 sub $64, LEN
2542 add $64, INP
2543 add $64, OUTP
2544 cmp $64, LEN
2545 jge .Lcbc_dec_loop4
2546 cmp $16, LEN
2547 jb .Lcbc_dec_ret
2548.align 4
2549.Lcbc_dec_loop1:
2550 movups (INP), IN
2551 movaps IN, STATE
2552 call _aesni_dec1
2553 pxor IV, STATE
2554 movups STATE, (OUTP)
2555 movaps IN, IV
2556 sub $16, LEN
2557 add $16, INP
2558 add $16, OUTP
2559 cmp $16, LEN
2560 jge .Lcbc_dec_loop1
2561.Lcbc_dec_ret:
2562 movups IV, (IVP)
2563.Lcbc_dec_just_ret:
2564#ifndef __x86_64__
2565 popl KLEN
2566 popl KEYP
2567 popl LEN
2568 popl IVP
2569#endif
2570 FRAME_END
2571 RET
2572SYM_FUNC_END(aesni_cbc_dec)
2573
2574/*
2575 * void aesni_cts_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2576 * size_t len, u8 *iv)
2577 */
2578SYM_FUNC_START(aesni_cts_cbc_enc)
2579 FRAME_BEGIN
2580#ifndef __x86_64__
2581 pushl IVP
2582 pushl LEN
2583 pushl KEYP
2584 pushl KLEN
2585 movl (FRAME_OFFSET+20)(%esp), KEYP # ctx
2586 movl (FRAME_OFFSET+24)(%esp), OUTP # dst
2587 movl (FRAME_OFFSET+28)(%esp), INP # src
2588 movl (FRAME_OFFSET+32)(%esp), LEN # len
2589 movl (FRAME_OFFSET+36)(%esp), IVP # iv
2590 lea .Lcts_permute_table, T1
2591#else
2592 lea .Lcts_permute_table(%rip), T1
2593#endif
2594 mov 480(KEYP), KLEN
2595 movups (IVP), STATE
2596 sub $16, LEN
2597 mov T1, IVP
2598 add $32, IVP
2599 add LEN, T1
2600 sub LEN, IVP
2601 movups (T1), %xmm4
2602 movups (IVP), %xmm5
2603
2604 movups (INP), IN1
2605 add LEN, INP
2606 movups (INP), IN2
2607
2608 pxor IN1, STATE
2609 call _aesni_enc1
2610
2611 pshufb %xmm5, IN2
2612 pxor STATE, IN2
2613 pshufb %xmm4, STATE
2614 add OUTP, LEN
2615 movups STATE, (LEN)
2616
2617 movaps IN2, STATE
2618 call _aesni_enc1
2619 movups STATE, (OUTP)
2620
2621#ifndef __x86_64__
2622 popl KLEN
2623 popl KEYP
2624 popl LEN
2625 popl IVP
2626#endif
2627 FRAME_END
2628 RET
2629SYM_FUNC_END(aesni_cts_cbc_enc)
2630
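/*
 * aesni_cts_cbc_enc handles the final two (possibly ragged) blocks of a
 * CBC ciphertext-stealing message: the last full block is encrypted first,
 * the short tail is padded with that ciphertext and encrypted, and the two
 * results are written in swapped order.  The .Lcts_permute_table below
 * supplies the byte-rotation and selection masks the assembly uses for this.
 * One common way to express the same computation in C (illustrative only;
 * assumes 16 < len <= 32 and the CBC chaining value in iv):
 *
 *	#include <stdint.h>
 *	#include <string.h>
 *
 *	typedef uint8_t u8;
 *
 *	void aesni_enc(const void *ctx, u8 *dst, const u8 *src);
 *
 *	static void cts_cbc_enc_final(const void *ctx, u8 *dst, const u8 *src,
 *				      size_t len, const u8 iv[16])
 *	{
 *		size_t tail = len - 16;		// bytes in the ragged last block
 *		u8 e1[16], last[16];
 *
 *		// last full plaintext block, CBC-chained with iv
 *		for (int i = 0; i < 16; i++)
 *			e1[i] = src[i] ^ iv[i];
 *		aesni_enc(ctx, e1, e1);
 *
 *		// pad the tail with e1 and encrypt: data bytes are XORed with e1,
 *		// the padding positions pass e1 through unchanged
 *		memcpy(last, e1, 16);
 *		for (size_t i = 0; i < tail; i++)
 *			last[i] = src[16 + i] ^ e1[i];
 *		aesni_enc(ctx, last, last);
 *
 *		// swapped output order: full block first, then tail bytes of e1
 *		memcpy(dst, last, 16);
 *		memcpy(dst + 16, e1, tail);
 *	}
 */
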
2631/*
2632 * void aesni_cts_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2633 * size_t len, u8 *iv)
2634 */
2635SYM_FUNC_START(aesni_cts_cbc_dec)
2636 FRAME_BEGIN
2637#ifndef __x86_64__
2638 pushl IVP
2639 pushl LEN
2640 pushl KEYP
2641 pushl KLEN
2642 movl (FRAME_OFFSET+20)(%esp), KEYP # ctx
2643 movl (FRAME_OFFSET+24)(%esp), OUTP # dst
2644 movl (FRAME_OFFSET+28)(%esp), INP # src
2645 movl (FRAME_OFFSET+32)(%esp), LEN # len
2646 movl (FRAME_OFFSET+36)(%esp), IVP # iv
2647 lea .Lcts_permute_table, T1
2648#else
2649 lea .Lcts_permute_table(%rip), T1
2650#endif
2651 mov 480(KEYP), KLEN
2652 add $240, KEYP
2653 movups (IVP), IV
2654 sub $16, LEN
2655 mov T1, IVP
2656 add $32, IVP
2657 add LEN, T1
2658 sub LEN, IVP
2659 movups (T1), %xmm4
2660
2661 movups (INP), STATE
2662 add LEN, INP
2663 movups (INP), IN1
2664
2665 call _aesni_dec1
2666 movaps STATE, IN2
2667 pshufb %xmm4, STATE
2668 pxor IN1, STATE
2669
2670 add OUTP, LEN
2671 movups STATE, (LEN)
2672
2673 movups (IVP), %xmm0
2674 pshufb %xmm0, IN1
2675 pblendvb IN2, IN1
2676 movaps IN1, STATE
2677 call _aesni_dec1
2678
2679 pxor IV, STATE
2680 movups STATE, (OUTP)
2681
2682#ifndef __x86_64__
2683 popl KLEN
2684 popl KEYP
2685 popl LEN
2686 popl IVP
2687#endif
2688 FRAME_END
2689 RET
2690SYM_FUNC_END(aesni_cts_cbc_dec)
2691
2692.pushsection .rodata
2693.align 16
2694.Lcts_permute_table:
2695 .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
2696 .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
2697 .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
2698 .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
2699 .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
2700 .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
2701#ifdef __x86_64__
2702.Lbswap_mask:
2703 .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
2704#endif
2705.popsection
2706
2707#ifdef __x86_64__
2708/*
2709 * _aesni_inc_init: internal ABI
2710 * setup registers used by _aesni_inc
2711 * input:
2712 * IV
2713 * output:
2714 * CTR: == IV, in little endian
2715 * TCTR_LOW: == lower qword of CTR
2716 * INC: == 1, in little endian
2717 * BSWAP_MASK == endian swapping mask
2718 */
2719SYM_FUNC_START_LOCAL(_aesni_inc_init)
2720 movaps .Lbswap_mask(%rip), BSWAP_MASK
2721 movaps IV, CTR
2722 pshufb BSWAP_MASK, CTR
2723 mov $1, TCTR_LOW
2724 movq TCTR_LOW, INC
2725 movq CTR, TCTR_LOW
2726 RET
2727SYM_FUNC_END(_aesni_inc_init)
2728
2729/*
2730 * _aesni_inc: internal ABI
2731 * Increase IV by 1, IV is in big endian
2732 * input:
2733 * IV
2734 * CTR: == IV, in little endian
2735 * TCTR_LOW: == lower qword of CTR
2736 * INC: == 1, in little endian
2737 * BSWAP_MASK == endian swapping mask
2738 * output:
2739 * IV: Increased by 1
2740 * changed:
2741 * CTR: == output IV, in little endian
2742 * TCTR_LOW: == lower qword of CTR
2743 */
2744SYM_FUNC_START_LOCAL(_aesni_inc)
2745 paddq INC, CTR
2746 add $1, TCTR_LOW
2747 jnc .Linc_low
2748 pslldq $8, INC
2749 paddq INC, CTR
2750 psrldq $8, INC
2751.Linc_low:
2752 movaps CTR, IV
2753 pshufb BSWAP_MASK, IV
2754 RET
2755SYM_FUNC_END(_aesni_inc)
2756
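/*
 * _aesni_inc keeps the counter byte-reflected so it can use paddq, and
 * mirrors the low qword in TCTR_LOW to detect a carry without touching the
 * XMM register; only on carry is the high qword bumped.  In C terms
 * (illustrative only):
 *
 *	#include <stdint.h>
 *
 *	// 128-bit big-endian counter held as two little-endian qwords after
 *	// the byte swap: ctr[0] = low half, ctr[1] = high half
 *	static void ctr128_inc(uint64_t ctr[2])
 *	{
 *		if (++ctr[0] == 0)	// carry out of the low qword (the jnc path)
 *			ctr[1]++;
 *	}
 */
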
2757/*
2758 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2759 * size_t len, u8 *iv)
2760 */
2761SYM_FUNC_START(aesni_ctr_enc)
2762 FRAME_BEGIN
2763 cmp $16, LEN
2764 jb .Lctr_enc_just_ret
2765 mov 480(KEYP), KLEN
2766 movups (IVP), IV
2767 call _aesni_inc_init
2768 cmp $64, LEN
2769 jb .Lctr_enc_loop1
2770.align 4
2771.Lctr_enc_loop4:
2772 movaps IV, STATE1
2773 call _aesni_inc
2774 movups (INP), IN1
2775 movaps IV, STATE2
2776 call _aesni_inc
2777 movups 0x10(INP), IN2
2778 movaps IV, STATE3
2779 call _aesni_inc
2780 movups 0x20(INP), IN3
2781 movaps IV, STATE4
2782 call _aesni_inc
2783 movups 0x30(INP), IN4
2784 call _aesni_enc4
2785 pxor IN1, STATE1
2786 movups STATE1, (OUTP)
2787 pxor IN2, STATE2
2788 movups STATE2, 0x10(OUTP)
2789 pxor IN3, STATE3
2790 movups STATE3, 0x20(OUTP)
2791 pxor IN4, STATE4
2792 movups STATE4, 0x30(OUTP)
2793 sub $64, LEN
2794 add $64, INP
2795 add $64, OUTP
2796 cmp $64, LEN
2797 jge .Lctr_enc_loop4
2798 cmp $16, LEN
2799 jb .Lctr_enc_ret
2800.align 4
2801.Lctr_enc_loop1:
2802 movaps IV, STATE
2803 call _aesni_inc
2804 movups (INP), IN
2805 call _aesni_enc1
2806 pxor IN, STATE
2807 movups STATE, (OUTP)
2808 sub $16, LEN
2809 add $16, INP
2810 add $16, OUTP
2811 cmp $16, LEN
2812 jge .Lctr_enc_loop1
2813.Lctr_enc_ret:
2814 movups IV, (IVP)
2815.Lctr_enc_just_ret:
2816 FRAME_END
2817 RET
2818SYM_FUNC_END(aesni_ctr_enc)
2819
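/*
 * aesni_ctr_enc generates the keystream by encrypting successive counter
 * blocks and XORing them with the input; the 4-way loop above only changes
 * the scheduling, not the math.  A sketch in C against the documented
 * aesni_enc() prototype (illustrative only; like the assembly, it works in
 * whole 16-byte blocks and leaves any partial tail to the caller):
 *
 *	#include <stdint.h>
 *	#include <stddef.h>
 *
 *	typedef uint8_t u8;
 *
 *	void aesni_enc(const void *ctx, u8 *dst, const u8 *src);
 *
 *	static void ctr_inc_be(u8 ctr[16])	// big-endian 128-bit increment
 *	{
 *		for (int i = 15; i >= 0 && ++ctr[i] == 0; i--)
 *			;
 *	}
 *
 *	static void ctr_crypt(const void *ctx, u8 *dst, const u8 *src,
 *			      size_t len, u8 iv[16])
 *	{
 *		u8 ks[16];
 *
 *		for (size_t off = 0; off + 16 <= len; off += 16) {
 *			aesni_enc(ctx, ks, iv);		// keystream block = E_k(counter)
 *			ctr_inc_be(iv);			// advance for the next block
 *			for (int i = 0; i < 16; i++)
 *				dst[off + i] = src[off + i] ^ ks[i];
 *		}
 *	}
 */
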
2820#endif
2821
2822.section .rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16
2823.align 16
2824.Lgf128mul_x_ble_mask:
2825 .octa 0x00000000000000010000000000000087
2826.previous
2827
2828/*
2829 * _aesni_gf128mul_x_ble: internal ABI
2830 * Multiply in GF(2^128) for XTS IVs
2831 * input:
2832 * IV: current IV
2833 * GF128MUL_MASK == mask with 0x87 and 0x01
2834 * output:
2835 * IV: next IV
2836 * changed:
2837 * CTR: == temporary value
2838 */
2839#define _aesni_gf128mul_x_ble() \
2840 pshufd $0x13, IV, KEY; \
2841 paddq IV, IV; \
2842 psrad $31, KEY; \
2843 pand GF128MUL_MASK, KEY; \
2844 pxor KEY, IV;
2845
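/*
 * _aesni_gf128mul_x_ble() advances the XTS tweak by multiplying it by x in
 * GF(2^128) with the 0x87 reduction; the pshufd/psrad pair turns the top bit
 * into an all-ones or all-zero lane mask so the constant is applied without
 * a branch.  Equivalent C, with the tweak held as two little-endian 64-bit
 * halves (illustrative only):
 *
 *	#include <stdint.h>
 *
 *	// t[0] = low 64 bits, t[1] = high 64 bits of the tweak
 *	static void gf128mul_x_ble(uint64_t t[2])
 *	{
 *		uint64_t carry = t[1] >> 63;		// bit shifted out of the top
 *
 *		t[1] = (t[1] << 1) | (t[0] >> 63);
 *		t[0] = (t[0] << 1) ^ (carry ? 0x87 : 0);
 *	}
 */
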
2846/*
2847 * void aesni_xts_encrypt(const struct crypto_aes_ctx *ctx, u8 *dst,
2848 * const u8 *src, unsigned int len, le128 *iv)
2849 */
2850SYM_FUNC_START(aesni_xts_encrypt)
2851 FRAME_BEGIN
2852#ifndef __x86_64__
2853 pushl IVP
2854 pushl LEN
2855 pushl KEYP
2856 pushl KLEN
2857 movl (FRAME_OFFSET+20)(%esp), KEYP # ctx
2858 movl (FRAME_OFFSET+24)(%esp), OUTP # dst
2859 movl (FRAME_OFFSET+28)(%esp), INP # src
2860 movl (FRAME_OFFSET+32)(%esp), LEN # len
2861 movl (FRAME_OFFSET+36)(%esp), IVP # iv
2862 movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
2863#else
2864 movdqa .Lgf128mul_x_ble_mask(%rip), GF128MUL_MASK
2865#endif
2866 movups (IVP), IV
2867
2868 mov 480(KEYP), KLEN
2869
2870.Lxts_enc_loop4:
2871 sub $64, LEN
2872 jl .Lxts_enc_1x
2873
2874 movdqa IV, STATE1
2875 movdqu 0x00(INP), IN
2876 pxor IN, STATE1
2877 movdqu IV, 0x00(OUTP)
2878
2879 _aesni_gf128mul_x_ble()
2880 movdqa IV, STATE2
2881 movdqu 0x10(INP), IN
2882 pxor IN, STATE2
2883 movdqu IV, 0x10(OUTP)
2884
2885 _aesni_gf128mul_x_ble()
2886 movdqa IV, STATE3
2887 movdqu 0x20(INP), IN
2888 pxor IN, STATE3
2889 movdqu IV, 0x20(OUTP)
2890
2891 _aesni_gf128mul_x_ble()
2892 movdqa IV, STATE4
2893 movdqu 0x30(INP), IN
2894 pxor IN, STATE4
2895 movdqu IV, 0x30(OUTP)
2896
2897 call _aesni_enc4
2898
2899 movdqu 0x00(OUTP), IN
2900 pxor IN, STATE1
2901 movdqu STATE1, 0x00(OUTP)
2902
2903 movdqu 0x10(OUTP), IN
2904 pxor IN, STATE2
2905 movdqu STATE2, 0x10(OUTP)
2906
2907 movdqu 0x20(OUTP), IN
2908 pxor IN, STATE3
2909 movdqu STATE3, 0x20(OUTP)
2910
2911 movdqu 0x30(OUTP), IN
2912 pxor IN, STATE4
2913 movdqu STATE4, 0x30(OUTP)
2914
2915 _aesni_gf128mul_x_ble()
2916
2917 add $64, INP
2918 add $64, OUTP
2919 test LEN, LEN
2920 jnz .Lxts_enc_loop4
2921
2922.Lxts_enc_ret_iv:
2923 movups IV, (IVP)
2924
2925.Lxts_enc_ret:
2926#ifndef __x86_64__
2927 popl KLEN
2928 popl KEYP
2929 popl LEN
2930 popl IVP
2931#endif
2932 FRAME_END
2933 RET
2934
2935.Lxts_enc_1x:
2936 add $64, LEN
2937 jz .Lxts_enc_ret_iv
2938 sub $16, LEN
2939 jl .Lxts_enc_cts4
2940
2941.Lxts_enc_loop1:
2942 movdqu (INP), STATE
2943 pxor IV, STATE
2944 call _aesni_enc1
2945 pxor IV, STATE
2946 _aesni_gf128mul_x_ble()
2947
2948 test LEN, LEN
2949 jz .Lxts_enc_out
2950
2951 add $16, INP
2952 sub $16, LEN
2953 jl .Lxts_enc_cts1
2954
2955 movdqu STATE, (OUTP)
2956 add $16, OUTP
2957 jmp .Lxts_enc_loop1
2958
2959.Lxts_enc_out:
2960 movdqu STATE, (OUTP)
2961 jmp .Lxts_enc_ret_iv
2962
2963.Lxts_enc_cts4:
2964 movdqa STATE4, STATE
2965 sub $16, OUTP
2966
2967.Lxts_enc_cts1:
2968#ifndef __x86_64__
2969 lea .Lcts_permute_table, T1
2970#else
2971 lea .Lcts_permute_table(%rip), T1
2972#endif
2973 add LEN, INP /* rewind input pointer */
2974 add $16, LEN /* # bytes in final block */
2975 movups (INP), IN1
2976
2977 mov T1, IVP
2978 add $32, IVP
2979 add LEN, T1
2980 sub LEN, IVP
2981 add OUTP, LEN
2982
2983 movups (T1), %xmm4
2984 movaps STATE, IN2
2985 pshufb %xmm4, STATE
2986 movups STATE, (LEN)
2987
2988 movups (IVP), %xmm0
2989 pshufb %xmm0, IN1
2990 pblendvb IN2, IN1
2991 movaps IN1, STATE
2992
2993 pxor IV, STATE
2994 call _aesni_enc1
2995 pxor IV, STATE
2996
2997 movups STATE, (OUTP)
2998 jmp .Lxts_enc_ret
2999SYM_FUNC_END(aesni_xts_encrypt)
3000
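/*
 * Per 16-byte block, aesni_xts_encrypt computes C = E_k(P ^ T) ^ T and then
 * steps the tweak T with _aesni_gf128mul_x_ble(); the 4-way loop parks the
 * four tweak values in the output buffer so it needs no extra registers.
 * A sketch of the per-block math in C against the documented aesni_enc()
 * prototype (illustrative only, full blocks only; whitening the initial
 * tweak with the second XTS key happens outside this routine):
 *
 *	#include <stdint.h>
 *
 *	typedef uint8_t u8;
 *
 *	void aesni_enc(const void *ctx, u8 *dst, const u8 *src);
 *
 *	static void gf128mul_x_ble_bytes(u8 t[16])	// multiply the tweak by x
 *	{
 *		int carry = t[15] >> 7;
 *
 *		for (int i = 15; i > 0; i--)
 *			t[i] = (u8)((t[i] << 1) | (t[i - 1] >> 7));
 *		t[0] = (u8)((t[0] << 1) ^ (carry ? 0x87 : 0));
 *	}
 *
 *	// one full XTS block: C = E_k(P ^ T) ^ T, then T = T*x
 *	static void xts_enc_block(const void *ctx, u8 *dst, const u8 *src, u8 tweak[16])
 *	{
 *		u8 buf[16];
 *
 *		for (int i = 0; i < 16; i++)
 *			buf[i] = src[i] ^ tweak[i];
 *		aesni_enc(ctx, buf, buf);
 *		for (int i = 0; i < 16; i++)
 *			dst[i] = buf[i] ^ tweak[i];
 *		gf128mul_x_ble_bytes(tweak);
 *	}
 */
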
3001/*
3002 * void aesni_xts_decrypt(const struct crypto_aes_ctx *ctx, u8 *dst,
3003 * const u8 *src, unsigned int len, le128 *iv)
3004 */
3005SYM_FUNC_START(aesni_xts_decrypt)
3006 FRAME_BEGIN
3007#ifndef __x86_64__
3008 pushl IVP
3009 pushl LEN
3010 pushl KEYP
3011 pushl KLEN
3012 movl (FRAME_OFFSET+20)(%esp), KEYP # ctx
3013 movl (FRAME_OFFSET+24)(%esp), OUTP # dst
3014 movl (FRAME_OFFSET+28)(%esp), INP # src
3015 movl (FRAME_OFFSET+32)(%esp), LEN # len
3016 movl (FRAME_OFFSET+36)(%esp), IVP # iv
3017 movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
3018#else
3019 movdqa .Lgf128mul_x_ble_mask(%rip), GF128MUL_MASK
3020#endif
3021 movups (IVP), IV
3022
3023 mov 480(KEYP), KLEN
3024 add $240, KEYP
3025
3026 test $15, LEN
3027 jz .Lxts_dec_loop4
3028 sub $16, LEN
3029
3030.Lxts_dec_loop4:
3031 sub $64, LEN
3032 jl .Lxts_dec_1x
3033
3034 movdqa IV, STATE1
3035 movdqu 0x00(INP), IN
3036 pxor IN, STATE1
3037 movdqu IV, 0x00(OUTP)
3038
3039 _aesni_gf128mul_x_ble()
3040 movdqa IV, STATE2
3041 movdqu 0x10(INP), IN
3042 pxor IN, STATE2
3043 movdqu IV, 0x10(OUTP)
3044
3045 _aesni_gf128mul_x_ble()
3046 movdqa IV, STATE3
3047 movdqu 0x20(INP), IN
3048 pxor IN, STATE3
3049 movdqu IV, 0x20(OUTP)
3050
3051 _aesni_gf128mul_x_ble()
3052 movdqa IV, STATE4
3053 movdqu 0x30(INP), IN
3054 pxor IN, STATE4
3055 movdqu IV, 0x30(OUTP)
3056
3057 call _aesni_dec4
3058
3059 movdqu 0x00(OUTP), IN
3060 pxor IN, STATE1
3061 movdqu STATE1, 0x00(OUTP)
3062
3063 movdqu 0x10(OUTP), IN
3064 pxor IN, STATE2
3065 movdqu STATE2, 0x10(OUTP)
3066
3067 movdqu 0x20(OUTP), IN
3068 pxor IN, STATE3
3069 movdqu STATE3, 0x20(OUTP)
3070
3071 movdqu 0x30(OUTP), IN
3072 pxor IN, STATE4
3073 movdqu STATE4, 0x30(OUTP)
3074
3075 _aesni_gf128mul_x_ble()
3076
3077 add $64, INP
3078 add $64, OUTP
3079 test LEN, LEN
3080 jnz .Lxts_dec_loop4
3081
3082.Lxts_dec_ret_iv:
3083 movups IV, (IVP)
3084
3085.Lxts_dec_ret:
3086#ifndef __x86_64__
3087 popl KLEN
3088 popl KEYP
3089 popl LEN
3090 popl IVP
3091#endif
3092 FRAME_END
3093 RET
3094
3095.Lxts_dec_1x:
3096 add $64, LEN
3097 jz .Lxts_dec_ret_iv
3098
3099.Lxts_dec_loop1:
3100 movdqu (INP), STATE
3101
3102 add $16, INP
3103 sub $16, LEN
3104 jl .Lxts_dec_cts1
3105
3106 pxor IV, STATE
3107 call _aesni_dec1
3108 pxor IV, STATE
3109 _aesni_gf128mul_x_ble()
3110
3111 test LEN, LEN
3112 jz .Lxts_dec_out
3113
3114 movdqu STATE, (OUTP)
3115 add $16, OUTP
3116 jmp .Lxts_dec_loop1
3117
3118.Lxts_dec_out:
3119 movdqu STATE, (OUTP)
3120 jmp .Lxts_dec_ret_iv
3121
3122.Lxts_dec_cts1:
3123 movdqa IV, STATE4
3124 _aesni_gf128mul_x_ble()
3125
3126 pxor IV, STATE
3127 call _aesni_dec1
3128 pxor IV, STATE
3129
3130#ifndef __x86_64__
3131 lea .Lcts_permute_table, T1
3132#else
3133 lea .Lcts_permute_table(%rip), T1
3134#endif
3135 add LEN, INP /* rewind input pointer */
3136 add $16, LEN /* # bytes in final block */
3137 movups (INP), IN1
3138
3139 mov T1, IVP
3140 add $32, IVP
3141 add LEN, T1
3142 sub LEN, IVP
3143 add OUTP, LEN
3144
3145 movups (T1), %xmm4
3146 movaps STATE, IN2
3147 pshufb %xmm4, STATE
3148 movups STATE, (LEN)
3149
3150 movups (IVP), %xmm0
3151 pshufb %xmm0, IN1
3152 pblendvb IN2, IN1
3153 movaps IN1, STATE
3154
3155 pxor STATE4, STATE
3156 call _aesni_dec1
3157 pxor STATE4, STATE
3158
3159 movups STATE, (OUTP)
3160 jmp .Lxts_dec_ret
3161SYM_FUNC_END(aesni_xts_decrypt)