Loading...
Note: File does not exist in v6.13.7.
1########################################################################
2# Copyright (c) 2013, Intel Corporation
3#
4# This software is available to you under a choice of one of two
5# licenses. You may choose to be licensed under the terms of the GNU
6# General Public License (GPL) Version 2, available from the file
7# COPYING in the main directory of this source tree, or the
8# OpenIB.org BSD license below:
9#
10# Redistribution and use in source and binary forms, with or without
11# modification, are permitted provided that the following conditions are
12# met:
13#
14# * Redistributions of source code must retain the above copyright
15# notice, this list of conditions and the following disclaimer.
16#
17# * Redistributions in binary form must reproduce the above copyright
18# notice, this list of conditions and the following disclaimer in the
19# documentation and/or other materials provided with the
20# distribution.
21#
22# * Neither the name of the Intel Corporation nor the names of its
23# contributors may be used to endorse or promote products derived from
24# this software without specific prior written permission.
25#
26#
27# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY
28# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
30# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
31# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
32# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
33# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES# LOSS OF USE, DATA, OR
34# PROFITS# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
35# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
36# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
37# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38########################################################################
39##
40## Authors:
41## Erdinc Ozturk <erdinc.ozturk@intel.com>
42## Vinodh Gopal <vinodh.gopal@intel.com>
43## James Guilford <james.guilford@intel.com>
44## Tim Chen <tim.c.chen@linux.intel.com>
45##
46## References:
47## This code was derived and highly optimized from the code described in paper:
48## Vinodh Gopal et. al. Optimized Galois-Counter-Mode Implementation
49## on Intel Architecture Processors. August, 2010
50## The details of the implementation is explained in:
51## Erdinc Ozturk et. al. Enabling High-Performance Galois-Counter-Mode
52## on Intel Architecture Processors. October, 2012.
53##
54## Assumptions:
55##
56##
57##
58## iv:
59## 0 1 2 3
60## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
61## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
62## | Salt (From the SA) |
63## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
64## | Initialization Vector |
65## | (This is the sequence number from IPSec header) |
66## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
67## | 0x1 |
68## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
69##
70##
71##
72## AAD:
73## AAD padded to 128 bits with 0
74## for example, assume AAD is a u32 vector
75##
76## if AAD is 8 bytes:
77## AAD[3] = {A0, A1}#
78## padded AAD in xmm register = {A1 A0 0 0}
79##
80## 0 1 2 3
81## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
82## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
83## | SPI (A1) |
84## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
85## | 32-bit Sequence Number (A0) |
86## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
87## | 0x0 |
88## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
89##
90## AAD Format with 32-bit Sequence Number
91##
92## if AAD is 12 bytes:
93## AAD[3] = {A0, A1, A2}#
94## padded AAD in xmm register = {A2 A1 A0 0}
95##
96## 0 1 2 3
97## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
98## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
99## | SPI (A2) |
100## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
101## | 64-bit Extended Sequence Number {A1,A0} |
102## | |
103## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
104## | 0x0 |
105## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
106##
107## AAD Format with 64-bit Extended Sequence Number
108##
109##
110## aadLen:
111## from the definition of the spec, aadLen can only be 8 or 12 bytes.
112## The code additionally supports aadLen of length 16 bytes.
113##
114## TLen:
115## from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
116##
117## poly = x^128 + x^127 + x^126 + x^121 + 1
118## throughout the code, one tab and two tab indentations are used. one tab is
119## for GHASH part, two tabs is for AES part.
120##
121
122#include <linux/linkage.h>
123
124# constants in mergeable sections, linker can reorder and merge
125.section .rodata.cst16.POLY, "aM", @progbits, 16
126.align 16
127POLY: .octa 0xC2000000000000000000000000000001
128
129.section .rodata.cst16.POLY2, "aM", @progbits, 16
130.align 16
131POLY2: .octa 0xC20000000000000000000001C2000000
132
133.section .rodata.cst16.TWOONE, "aM", @progbits, 16
134.align 16
135TWOONE: .octa 0x00000001000000000000000000000001
136
137.section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16
138.align 16
139SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F
140
141.section .rodata.cst16.ONE, "aM", @progbits, 16
142.align 16
143ONE: .octa 0x00000000000000000000000000000001
144
145.section .rodata.cst16.ONEf, "aM", @progbits, 16
146.align 16
147ONEf: .octa 0x01000000000000000000000000000000
148
149# order of these constants should not change.
150# more specifically, ALL_F should follow SHIFT_MASK, and zero should follow ALL_F
151.section .rodata, "a", @progbits
152.align 16
153SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
154ALL_F: .octa 0xffffffffffffffffffffffffffffffff
155 .octa 0x00000000000000000000000000000000
156
157.section .rodata
158.align 16
159.type aad_shift_arr, @object
160.size aad_shift_arr, 272
161aad_shift_arr:
162 .octa 0xffffffffffffffffffffffffffffffff
163 .octa 0xffffffffffffffffffffffffffffff0C
164 .octa 0xffffffffffffffffffffffffffff0D0C
165 .octa 0xffffffffffffffffffffffffff0E0D0C
166 .octa 0xffffffffffffffffffffffff0F0E0D0C
167 .octa 0xffffffffffffffffffffff0C0B0A0908
168 .octa 0xffffffffffffffffffff0D0C0B0A0908
169 .octa 0xffffffffffffffffff0E0D0C0B0A0908
170 .octa 0xffffffffffffffff0F0E0D0C0B0A0908
171 .octa 0xffffffffffffff0C0B0A090807060504
172 .octa 0xffffffffffff0D0C0B0A090807060504
173 .octa 0xffffffffff0E0D0C0B0A090807060504
174 .octa 0xffffffff0F0E0D0C0B0A090807060504
175 .octa 0xffffff0C0B0A09080706050403020100
176 .octa 0xffff0D0C0B0A09080706050403020100
177 .octa 0xff0E0D0C0B0A09080706050403020100
178 .octa 0x0F0E0D0C0B0A09080706050403020100
179
180
181.text
182
183
184#define AadHash 16*0
185#define AadLen 16*1
186#define InLen (16*1)+8
187#define PBlockEncKey 16*2
188#define OrigIV 16*3
189#define CurCount 16*4
190#define PBlockLen 16*5
191
192HashKey = 16*6 # store HashKey <<1 mod poly here
193HashKey_2 = 16*7 # store HashKey^2 <<1 mod poly here
194HashKey_3 = 16*8 # store HashKey^3 <<1 mod poly here
195HashKey_4 = 16*9 # store HashKey^4 <<1 mod poly here
196HashKey_5 = 16*10 # store HashKey^5 <<1 mod poly here
197HashKey_6 = 16*11 # store HashKey^6 <<1 mod poly here
198HashKey_7 = 16*12 # store HashKey^7 <<1 mod poly here
199HashKey_8 = 16*13 # store HashKey^8 <<1 mod poly here
200HashKey_k = 16*14 # store XOR of HashKey <<1 mod poly here (for Karatsuba purposes)
201HashKey_2_k = 16*15 # store XOR of HashKey^2 <<1 mod poly here (for Karatsuba purposes)
202HashKey_3_k = 16*16 # store XOR of HashKey^3 <<1 mod poly here (for Karatsuba purposes)
203HashKey_4_k = 16*17 # store XOR of HashKey^4 <<1 mod poly here (for Karatsuba purposes)
204HashKey_5_k = 16*18 # store XOR of HashKey^5 <<1 mod poly here (for Karatsuba purposes)
205HashKey_6_k = 16*19 # store XOR of HashKey^6 <<1 mod poly here (for Karatsuba purposes)
206HashKey_7_k = 16*20 # store XOR of HashKey^7 <<1 mod poly here (for Karatsuba purposes)
207HashKey_8_k = 16*21 # store XOR of HashKey^8 <<1 mod poly here (for Karatsuba purposes)
208
209#define arg1 %rdi
210#define arg2 %rsi
211#define arg3 %rdx
212#define arg4 %rcx
213#define arg5 %r8
214#define arg6 %r9
215#define arg7 STACK_OFFSET+8*1(%r14)
216#define arg8 STACK_OFFSET+8*2(%r14)
217#define arg9 STACK_OFFSET+8*3(%r14)
218#define arg10 STACK_OFFSET+8*4(%r14)
219#define keysize 2*15*16(arg1)
220
221i = 0
222j = 0
223
224out_order = 0
225in_order = 1
226DEC = 0
227ENC = 1
228
229.macro define_reg r n
230reg_\r = %xmm\n
231.endm
232
233.macro setreg
234.altmacro
235define_reg i %i
236define_reg j %j
237.noaltmacro
238.endm
239
240# need to push 4 registers into stack to maintain
241STACK_OFFSET = 8*4
242
243TMP1 = 16*0 # Temporary storage for AAD
244TMP2 = 16*1 # Temporary storage for AES State 2 (State 1 is stored in an XMM register)
245TMP3 = 16*2 # Temporary storage for AES State 3
246TMP4 = 16*3 # Temporary storage for AES State 4
247TMP5 = 16*4 # Temporary storage for AES State 5
248TMP6 = 16*5 # Temporary storage for AES State 6
249TMP7 = 16*6 # Temporary storage for AES State 7
250TMP8 = 16*7 # Temporary storage for AES State 8
251
252VARIABLE_OFFSET = 16*8
253
254################################
255# Utility Macros
256################################
257
258.macro FUNC_SAVE
259 #the number of pushes must equal STACK_OFFSET
260 push %r12
261 push %r13
262 push %r14
263 push %r15
264
265 mov %rsp, %r14
266
267
268
269 sub $VARIABLE_OFFSET, %rsp
270 and $~63, %rsp # align rsp to 64 bytes
271.endm
272
273.macro FUNC_RESTORE
274 mov %r14, %rsp
275
276 pop %r15
277 pop %r14
278 pop %r13
279 pop %r12
280.endm
281
282# Encryption of a single block
283.macro ENCRYPT_SINGLE_BLOCK REP XMM0
284 vpxor (arg1), \XMM0, \XMM0
285 i = 1
286 setreg
287.rep \REP
288 vaesenc 16*i(arg1), \XMM0, \XMM0
289 i = (i+1)
290 setreg
291.endr
292 vaesenclast 16*i(arg1), \XMM0, \XMM0
293.endm
294
295# combined for GCM encrypt and decrypt functions
296# clobbering all xmm registers
297# clobbering r10, r11, r12, r13, r14, r15
298.macro GCM_ENC_DEC INITIAL_BLOCKS GHASH_8_ENCRYPT_8_PARALLEL GHASH_LAST_8 GHASH_MUL ENC_DEC REP
299 vmovdqu AadHash(arg2), %xmm8
300 vmovdqu HashKey(arg2), %xmm13 # xmm13 = HashKey
301 add arg5, InLen(arg2)
302
303 # initialize the data pointer offset as zero
304 xor %r11d, %r11d
305
306 PARTIAL_BLOCK \GHASH_MUL, arg3, arg4, arg5, %r11, %xmm8, \ENC_DEC
307 sub %r11, arg5
308
309 mov arg5, %r13 # save the number of bytes of plaintext/ciphertext
310 and $-16, %r13 # r13 = r13 - (r13 mod 16)
311
312 mov %r13, %r12
313 shr $4, %r12
314 and $7, %r12
315 jz _initial_num_blocks_is_0\@
316
317 cmp $7, %r12
318 je _initial_num_blocks_is_7\@
319 cmp $6, %r12
320 je _initial_num_blocks_is_6\@
321 cmp $5, %r12
322 je _initial_num_blocks_is_5\@
323 cmp $4, %r12
324 je _initial_num_blocks_is_4\@
325 cmp $3, %r12
326 je _initial_num_blocks_is_3\@
327 cmp $2, %r12
328 je _initial_num_blocks_is_2\@
329
330 jmp _initial_num_blocks_is_1\@
331
332_initial_num_blocks_is_7\@:
333 \INITIAL_BLOCKS \REP, 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
334 sub $16*7, %r13
335 jmp _initial_blocks_encrypted\@
336
337_initial_num_blocks_is_6\@:
338 \INITIAL_BLOCKS \REP, 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
339 sub $16*6, %r13
340 jmp _initial_blocks_encrypted\@
341
342_initial_num_blocks_is_5\@:
343 \INITIAL_BLOCKS \REP, 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
344 sub $16*5, %r13
345 jmp _initial_blocks_encrypted\@
346
347_initial_num_blocks_is_4\@:
348 \INITIAL_BLOCKS \REP, 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
349 sub $16*4, %r13
350 jmp _initial_blocks_encrypted\@
351
352_initial_num_blocks_is_3\@:
353 \INITIAL_BLOCKS \REP, 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
354 sub $16*3, %r13
355 jmp _initial_blocks_encrypted\@
356
357_initial_num_blocks_is_2\@:
358 \INITIAL_BLOCKS \REP, 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
359 sub $16*2, %r13
360 jmp _initial_blocks_encrypted\@
361
362_initial_num_blocks_is_1\@:
363 \INITIAL_BLOCKS \REP, 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
364 sub $16*1, %r13
365 jmp _initial_blocks_encrypted\@
366
367_initial_num_blocks_is_0\@:
368 \INITIAL_BLOCKS \REP, 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
369
370
371_initial_blocks_encrypted\@:
372 cmp $0, %r13
373 je _zero_cipher_left\@
374
375 sub $128, %r13
376 je _eight_cipher_left\@
377
378
379
380
381 vmovd %xmm9, %r15d
382 and $255, %r15d
383 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
384
385
386_encrypt_by_8_new\@:
387 cmp $(255-8), %r15d
388 jg _encrypt_by_8\@
389
390
391
392 add $8, %r15b
393 \GHASH_8_ENCRYPT_8_PARALLEL \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC
394 add $128, %r11
395 sub $128, %r13
396 jne _encrypt_by_8_new\@
397
398 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
399 jmp _eight_cipher_left\@
400
401_encrypt_by_8\@:
402 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
403 add $8, %r15b
404 \GHASH_8_ENCRYPT_8_PARALLEL \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC
405 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
406 add $128, %r11
407 sub $128, %r13
408 jne _encrypt_by_8_new\@
409
410 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
411
412
413
414
415_eight_cipher_left\@:
416 \GHASH_LAST_8 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8
417
418
419_zero_cipher_left\@:
420 vmovdqu %xmm14, AadHash(arg2)
421 vmovdqu %xmm9, CurCount(arg2)
422
423 # check for 0 length
424 mov arg5, %r13
425 and $15, %r13 # r13 = (arg5 mod 16)
426
427 je _multiple_of_16_bytes\@
428
429 # handle the last <16 Byte block separately
430
431 mov %r13, PBlockLen(arg2)
432
433 vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn
434 vmovdqu %xmm9, CurCount(arg2)
435 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
436
437 ENCRYPT_SINGLE_BLOCK \REP, %xmm9 # E(K, Yn)
438 vmovdqu %xmm9, PBlockEncKey(arg2)
439
440 cmp $16, arg5
441 jge _large_enough_update\@
442
443 lea (arg4,%r11,1), %r10
444 mov %r13, %r12
445
446 READ_PARTIAL_BLOCK %r10 %r12 %xmm1
447
448 lea SHIFT_MASK+16(%rip), %r12
449 sub %r13, %r12 # adjust the shuffle mask pointer to be
450 # able to shift 16-r13 bytes (r13 is the
451 # number of bytes in plaintext mod 16)
452
453 jmp _final_ghash_mul\@
454
455_large_enough_update\@:
456 sub $16, %r11
457 add %r13, %r11
458
459 # receive the last <16 Byte block
460 vmovdqu (arg4, %r11, 1), %xmm1
461
462 sub %r13, %r11
463 add $16, %r11
464
465 lea SHIFT_MASK+16(%rip), %r12
466 # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
467 # (r13 is the number of bytes in plaintext mod 16)
468 sub %r13, %r12
469 # get the appropriate shuffle mask
470 vmovdqu (%r12), %xmm2
471 # shift right 16-r13 bytes
472 vpshufb %xmm2, %xmm1, %xmm1
473
474_final_ghash_mul\@:
475 .if \ENC_DEC == DEC
476 vmovdqa %xmm1, %xmm2
477 vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
478 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to
479 # mask out top 16-r13 bytes of xmm9
480 vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9
481 vpand %xmm1, %xmm2, %xmm2
482 vpshufb SHUF_MASK(%rip), %xmm2, %xmm2
483 vpxor %xmm2, %xmm14, %xmm14
484
485 vmovdqu %xmm14, AadHash(arg2)
486 .else
487 vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
488 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to
489 # mask out top 16-r13 bytes of xmm9
490 vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9
491 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
492 vpxor %xmm9, %xmm14, %xmm14
493
494 vmovdqu %xmm14, AadHash(arg2)
495 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 # shuffle xmm9 back to output as ciphertext
496 .endif
497
498
499 #############################
500 # output r13 Bytes
501 vmovq %xmm9, %rax
502 cmp $8, %r13
503 jle _less_than_8_bytes_left\@
504
505 mov %rax, (arg3 , %r11)
506 add $8, %r11
507 vpsrldq $8, %xmm9, %xmm9
508 vmovq %xmm9, %rax
509 sub $8, %r13
510
511_less_than_8_bytes_left\@:
512 movb %al, (arg3 , %r11)
513 add $1, %r11
514 shr $8, %rax
515 sub $1, %r13
516 jne _less_than_8_bytes_left\@
517 #############################
518
519_multiple_of_16_bytes\@:
520.endm
521
522
523# GCM_COMPLETE Finishes update of tag of last partial block
524# Output: Authorization Tag (AUTH_TAG)
525# Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
526.macro GCM_COMPLETE GHASH_MUL REP AUTH_TAG AUTH_TAG_LEN
527 vmovdqu AadHash(arg2), %xmm14
528 vmovdqu HashKey(arg2), %xmm13
529
530 mov PBlockLen(arg2), %r12
531 cmp $0, %r12
532 je _partial_done\@
533
534 #GHASH computation for the last <16 Byte block
535 \GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
536
537_partial_done\@:
538 mov AadLen(arg2), %r12 # r12 = aadLen (number of bytes)
539 shl $3, %r12 # convert into number of bits
540 vmovd %r12d, %xmm15 # len(A) in xmm15
541
542 mov InLen(arg2), %r12
543 shl $3, %r12 # len(C) in bits (*128)
544 vmovq %r12, %xmm1
545 vpslldq $8, %xmm15, %xmm15 # xmm15 = len(A)|| 0x0000000000000000
546 vpxor %xmm1, %xmm15, %xmm15 # xmm15 = len(A)||len(C)
547
548 vpxor %xmm15, %xmm14, %xmm14
549 \GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 # final GHASH computation
550 vpshufb SHUF_MASK(%rip), %xmm14, %xmm14 # perform a 16Byte swap
551
552 vmovdqu OrigIV(arg2), %xmm9
553
554 ENCRYPT_SINGLE_BLOCK \REP, %xmm9 # E(K, Y0)
555
556 vpxor %xmm14, %xmm9, %xmm9
557
558
559
560_return_T\@:
561 mov \AUTH_TAG, %r10 # r10 = authTag
562 mov \AUTH_TAG_LEN, %r11 # r11 = auth_tag_len
563
564 cmp $16, %r11
565 je _T_16\@
566
567 cmp $8, %r11
568 jl _T_4\@
569
570_T_8\@:
571 vmovq %xmm9, %rax
572 mov %rax, (%r10)
573 add $8, %r10
574 sub $8, %r11
575 vpsrldq $8, %xmm9, %xmm9
576 cmp $0, %r11
577 je _return_T_done\@
578_T_4\@:
579 vmovd %xmm9, %eax
580 mov %eax, (%r10)
581 add $4, %r10
582 sub $4, %r11
583 vpsrldq $4, %xmm9, %xmm9
584 cmp $0, %r11
585 je _return_T_done\@
586_T_123\@:
587 vmovd %xmm9, %eax
588 cmp $2, %r11
589 jl _T_1\@
590 mov %ax, (%r10)
591 cmp $2, %r11
592 je _return_T_done\@
593 add $2, %r10
594 sar $16, %eax
595_T_1\@:
596 mov %al, (%r10)
597 jmp _return_T_done\@
598
599_T_16\@:
600 vmovdqu %xmm9, (%r10)
601
602_return_T_done\@:
603.endm
604
605.macro CALC_AAD_HASH GHASH_MUL AAD AADLEN T1 T2 T3 T4 T5 T6 T7 T8
606
607 mov \AAD, %r10 # r10 = AAD
608 mov \AADLEN, %r12 # r12 = aadLen
609
610
611 mov %r12, %r11
612
613 vpxor \T8, \T8, \T8
614 vpxor \T7, \T7, \T7
615 cmp $16, %r11
616 jl _get_AAD_rest8\@
617_get_AAD_blocks\@:
618 vmovdqu (%r10), \T7
619 vpshufb SHUF_MASK(%rip), \T7, \T7
620 vpxor \T7, \T8, \T8
621 \GHASH_MUL \T8, \T2, \T1, \T3, \T4, \T5, \T6
622 add $16, %r10
623 sub $16, %r12
624 sub $16, %r11
625 cmp $16, %r11
626 jge _get_AAD_blocks\@
627 vmovdqu \T8, \T7
628 cmp $0, %r11
629 je _get_AAD_done\@
630
631 vpxor \T7, \T7, \T7
632
633 /* read the last <16B of AAD. since we have at least 4B of
634 data right after the AAD (the ICV, and maybe some CT), we can
635 read 4B/8B blocks safely, and then get rid of the extra stuff */
636_get_AAD_rest8\@:
637 cmp $4, %r11
638 jle _get_AAD_rest4\@
639 movq (%r10), \T1
640 add $8, %r10
641 sub $8, %r11
642 vpslldq $8, \T1, \T1
643 vpsrldq $8, \T7, \T7
644 vpxor \T1, \T7, \T7
645 jmp _get_AAD_rest8\@
646_get_AAD_rest4\@:
647 cmp $0, %r11
648 jle _get_AAD_rest0\@
649 mov (%r10), %eax
650 movq %rax, \T1
651 add $4, %r10
652 sub $4, %r11
653 vpslldq $12, \T1, \T1
654 vpsrldq $4, \T7, \T7
655 vpxor \T1, \T7, \T7
656_get_AAD_rest0\@:
657 /* finalize: shift out the extra bytes we read, and align
658 left. since pslldq can only shift by an immediate, we use
659 vpshufb and an array of shuffle masks */
660 movq %r12, %r11
661 salq $4, %r11
662 vmovdqu aad_shift_arr(%r11), \T1
663 vpshufb \T1, \T7, \T7
664_get_AAD_rest_final\@:
665 vpshufb SHUF_MASK(%rip), \T7, \T7
666 vpxor \T8, \T7, \T7
667 \GHASH_MUL \T7, \T2, \T1, \T3, \T4, \T5, \T6
668
669_get_AAD_done\@:
670 vmovdqu \T7, AadHash(arg2)
671.endm
672
673.macro INIT GHASH_MUL PRECOMPUTE
674 mov arg6, %r11
675 mov %r11, AadLen(arg2) # ctx_data.aad_length = aad_length
676 xor %r11d, %r11d
677 mov %r11, InLen(arg2) # ctx_data.in_length = 0
678
679 mov %r11, PBlockLen(arg2) # ctx_data.partial_block_length = 0
680 mov %r11, PBlockEncKey(arg2) # ctx_data.partial_block_enc_key = 0
681 mov arg3, %rax
682 movdqu (%rax), %xmm0
683 movdqu %xmm0, OrigIV(arg2) # ctx_data.orig_IV = iv
684
685 vpshufb SHUF_MASK(%rip), %xmm0, %xmm0
686 movdqu %xmm0, CurCount(arg2) # ctx_data.current_counter = iv
687
688 vmovdqu (arg4), %xmm6 # xmm6 = HashKey
689
690 vpshufb SHUF_MASK(%rip), %xmm6, %xmm6
691 ############### PRECOMPUTATION of HashKey<<1 mod poly from the HashKey
692 vmovdqa %xmm6, %xmm2
693 vpsllq $1, %xmm6, %xmm6
694 vpsrlq $63, %xmm2, %xmm2
695 vmovdqa %xmm2, %xmm1
696 vpslldq $8, %xmm2, %xmm2
697 vpsrldq $8, %xmm1, %xmm1
698 vpor %xmm2, %xmm6, %xmm6
699 #reduction
700 vpshufd $0b00100100, %xmm1, %xmm2
701 vpcmpeqd TWOONE(%rip), %xmm2, %xmm2
702 vpand POLY(%rip), %xmm2, %xmm2
703 vpxor %xmm2, %xmm6, %xmm6 # xmm6 holds the HashKey<<1 mod poly
704 #######################################################################
705 vmovdqu %xmm6, HashKey(arg2) # store HashKey<<1 mod poly
706
707 CALC_AAD_HASH \GHASH_MUL, arg5, arg6, %xmm2, %xmm6, %xmm3, %xmm4, %xmm5, %xmm7, %xmm1, %xmm0
708
709 \PRECOMPUTE %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5
710.endm
711
712
713# Reads DLEN bytes starting at DPTR and stores in XMMDst
714# where 0 < DLEN < 16
715# Clobbers %rax, DLEN
716.macro READ_PARTIAL_BLOCK DPTR DLEN XMMDst
717 vpxor \XMMDst, \XMMDst, \XMMDst
718
719 cmp $8, \DLEN
720 jl _read_lt8_\@
721 mov (\DPTR), %rax
722 vpinsrq $0, %rax, \XMMDst, \XMMDst
723 sub $8, \DLEN
724 jz _done_read_partial_block_\@
725 xor %eax, %eax
726_read_next_byte_\@:
727 shl $8, %rax
728 mov 7(\DPTR, \DLEN, 1), %al
729 dec \DLEN
730 jnz _read_next_byte_\@
731 vpinsrq $1, %rax, \XMMDst, \XMMDst
732 jmp _done_read_partial_block_\@
733_read_lt8_\@:
734 xor %eax, %eax
735_read_next_byte_lt8_\@:
736 shl $8, %rax
737 mov -1(\DPTR, \DLEN, 1), %al
738 dec \DLEN
739 jnz _read_next_byte_lt8_\@
740 vpinsrq $0, %rax, \XMMDst, \XMMDst
741_done_read_partial_block_\@:
742.endm
743
744# PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks
745# between update calls.
746# Requires the input data be at least 1 byte long due to READ_PARTIAL_BLOCK
747# Outputs encrypted bytes, and updates hash and partial info in gcm_data_context
748# Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13
749.macro PARTIAL_BLOCK GHASH_MUL CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \
750 AAD_HASH ENC_DEC
751 mov PBlockLen(arg2), %r13
752 cmp $0, %r13
753 je _partial_block_done_\@ # Leave Macro if no partial blocks
754 # Read in input data without over reading
755 cmp $16, \PLAIN_CYPH_LEN
756 jl _fewer_than_16_bytes_\@
757 vmovdqu (\PLAIN_CYPH_IN), %xmm1 # If more than 16 bytes, just fill xmm
758 jmp _data_read_\@
759
760_fewer_than_16_bytes_\@:
761 lea (\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10
762 mov \PLAIN_CYPH_LEN, %r12
763 READ_PARTIAL_BLOCK %r10 %r12 %xmm1
764
765 mov PBlockLen(arg2), %r13
766
767_data_read_\@: # Finished reading in data
768
769 vmovdqu PBlockEncKey(arg2), %xmm9
770 vmovdqu HashKey(arg2), %xmm13
771
772 lea SHIFT_MASK(%rip), %r12
773
774 # adjust the shuffle mask pointer to be able to shift r13 bytes
775 # r16-r13 is the number of bytes in plaintext mod 16)
776 add %r13, %r12
777 vmovdqu (%r12), %xmm2 # get the appropriate shuffle mask
778 vpshufb %xmm2, %xmm9, %xmm9 # shift right r13 bytes
779
780.if \ENC_DEC == DEC
781 vmovdqa %xmm1, %xmm3
782 pxor %xmm1, %xmm9 # Cyphertext XOR E(K, Yn)
783
784 mov \PLAIN_CYPH_LEN, %r10
785 add %r13, %r10
786 # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
787 sub $16, %r10
788 # Determine if if partial block is not being filled and
789 # shift mask accordingly
790 jge _no_extra_mask_1_\@
791 sub %r10, %r12
792_no_extra_mask_1_\@:
793
794 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1
795 # get the appropriate mask to mask out bottom r13 bytes of xmm9
796 vpand %xmm1, %xmm9, %xmm9 # mask out bottom r13 bytes of xmm9
797
798 vpand %xmm1, %xmm3, %xmm3
799 vmovdqa SHUF_MASK(%rip), %xmm10
800 vpshufb %xmm10, %xmm3, %xmm3
801 vpshufb %xmm2, %xmm3, %xmm3
802 vpxor %xmm3, \AAD_HASH, \AAD_HASH
803
804 cmp $0, %r10
805 jl _partial_incomplete_1_\@
806
807 # GHASH computation for the last <16 Byte block
808 \GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
809 xor %eax,%eax
810
811 mov %rax, PBlockLen(arg2)
812 jmp _dec_done_\@
813_partial_incomplete_1_\@:
814 add \PLAIN_CYPH_LEN, PBlockLen(arg2)
815_dec_done_\@:
816 vmovdqu \AAD_HASH, AadHash(arg2)
817.else
818 vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
819
820 mov \PLAIN_CYPH_LEN, %r10
821 add %r13, %r10
822 # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
823 sub $16, %r10
824 # Determine if if partial block is not being filled and
825 # shift mask accordingly
826 jge _no_extra_mask_2_\@
827 sub %r10, %r12
828_no_extra_mask_2_\@:
829
830 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1
831 # get the appropriate mask to mask out bottom r13 bytes of xmm9
832 vpand %xmm1, %xmm9, %xmm9
833
834 vmovdqa SHUF_MASK(%rip), %xmm1
835 vpshufb %xmm1, %xmm9, %xmm9
836 vpshufb %xmm2, %xmm9, %xmm9
837 vpxor %xmm9, \AAD_HASH, \AAD_HASH
838
839 cmp $0, %r10
840 jl _partial_incomplete_2_\@
841
842 # GHASH computation for the last <16 Byte block
843 \GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
844 xor %eax,%eax
845
846 mov %rax, PBlockLen(arg2)
847 jmp _encode_done_\@
848_partial_incomplete_2_\@:
849 add \PLAIN_CYPH_LEN, PBlockLen(arg2)
850_encode_done_\@:
851 vmovdqu \AAD_HASH, AadHash(arg2)
852
853 vmovdqa SHUF_MASK(%rip), %xmm10
854 # shuffle xmm9 back to output as ciphertext
855 vpshufb %xmm10, %xmm9, %xmm9
856 vpshufb %xmm2, %xmm9, %xmm9
857.endif
858 # output encrypted Bytes
859 cmp $0, %r10
860 jl _partial_fill_\@
861 mov %r13, %r12
862 mov $16, %r13
863 # Set r13 to be the number of bytes to write out
864 sub %r12, %r13
865 jmp _count_set_\@
866_partial_fill_\@:
867 mov \PLAIN_CYPH_LEN, %r13
868_count_set_\@:
869 vmovdqa %xmm9, %xmm0
870 vmovq %xmm0, %rax
871 cmp $8, %r13
872 jle _less_than_8_bytes_left_\@
873
874 mov %rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
875 add $8, \DATA_OFFSET
876 psrldq $8, %xmm0
877 vmovq %xmm0, %rax
878 sub $8, %r13
879_less_than_8_bytes_left_\@:
880 movb %al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
881 add $1, \DATA_OFFSET
882 shr $8, %rax
883 sub $1, %r13
884 jne _less_than_8_bytes_left_\@
885_partial_block_done_\@:
886.endm # PARTIAL_BLOCK
887
888###############################################################################
889# GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
890# Input: A and B (128-bits each, bit-reflected)
891# Output: C = A*B*x mod poly, (i.e. >>1 )
892# To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
893# GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
894###############################################################################
895.macro GHASH_MUL_AVX GH HK T1 T2 T3 T4 T5
896
897 vpshufd $0b01001110, \GH, \T2
898 vpshufd $0b01001110, \HK, \T3
899 vpxor \GH , \T2, \T2 # T2 = (a1+a0)
900 vpxor \HK , \T3, \T3 # T3 = (b1+b0)
901
902 vpclmulqdq $0x11, \HK, \GH, \T1 # T1 = a1*b1
903 vpclmulqdq $0x00, \HK, \GH, \GH # GH = a0*b0
904 vpclmulqdq $0x00, \T3, \T2, \T2 # T2 = (a1+a0)*(b1+b0)
905 vpxor \GH, \T2,\T2
906 vpxor \T1, \T2,\T2 # T2 = a0*b1+a1*b0
907
908 vpslldq $8, \T2,\T3 # shift-L T3 2 DWs
909 vpsrldq $8, \T2,\T2 # shift-R T2 2 DWs
910 vpxor \T3, \GH, \GH
911 vpxor \T2, \T1, \T1 # <T1:GH> = GH x HK
912
913 #first phase of the reduction
914 vpslld $31, \GH, \T2 # packed right shifting << 31
915 vpslld $30, \GH, \T3 # packed right shifting shift << 30
916 vpslld $25, \GH, \T4 # packed right shifting shift << 25
917
918 vpxor \T3, \T2, \T2 # xor the shifted versions
919 vpxor \T4, \T2, \T2
920
921 vpsrldq $4, \T2, \T5 # shift-R T5 1 DW
922
923 vpslldq $12, \T2, \T2 # shift-L T2 3 DWs
924 vpxor \T2, \GH, \GH # first phase of the reduction complete
925
926 #second phase of the reduction
927
928 vpsrld $1,\GH, \T2 # packed left shifting >> 1
929 vpsrld $2,\GH, \T3 # packed left shifting >> 2
930 vpsrld $7,\GH, \T4 # packed left shifting >> 7
931 vpxor \T3, \T2, \T2 # xor the shifted versions
932 vpxor \T4, \T2, \T2
933
934 vpxor \T5, \T2, \T2
935 vpxor \T2, \GH, \GH
936 vpxor \T1, \GH, \GH # the result is in GH
937
938
939.endm
940
941.macro PRECOMPUTE_AVX HK T1 T2 T3 T4 T5 T6
942
943 # Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
944 vmovdqa \HK, \T5
945
946 vpshufd $0b01001110, \T5, \T1
947 vpxor \T5, \T1, \T1
948 vmovdqu \T1, HashKey_k(arg2)
949
950 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly
951 vmovdqu \T5, HashKey_2(arg2) # [HashKey_2] = HashKey^2<<1 mod poly
952 vpshufd $0b01001110, \T5, \T1
953 vpxor \T5, \T1, \T1
954 vmovdqu \T1, HashKey_2_k(arg2)
955
956 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly
957 vmovdqu \T5, HashKey_3(arg2)
958 vpshufd $0b01001110, \T5, \T1
959 vpxor \T5, \T1, \T1
960 vmovdqu \T1, HashKey_3_k(arg2)
961
962 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly
963 vmovdqu \T5, HashKey_4(arg2)
964 vpshufd $0b01001110, \T5, \T1
965 vpxor \T5, \T1, \T1
966 vmovdqu \T1, HashKey_4_k(arg2)
967
968 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly
969 vmovdqu \T5, HashKey_5(arg2)
970 vpshufd $0b01001110, \T5, \T1
971 vpxor \T5, \T1, \T1
972 vmovdqu \T1, HashKey_5_k(arg2)
973
974 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly
975 vmovdqu \T5, HashKey_6(arg2)
976 vpshufd $0b01001110, \T5, \T1
977 vpxor \T5, \T1, \T1
978 vmovdqu \T1, HashKey_6_k(arg2)
979
980 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly
981 vmovdqu \T5, HashKey_7(arg2)
982 vpshufd $0b01001110, \T5, \T1
983 vpxor \T5, \T1, \T1
984 vmovdqu \T1, HashKey_7_k(arg2)
985
986 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly
987 vmovdqu \T5, HashKey_8(arg2)
988 vpshufd $0b01001110, \T5, \T1
989 vpxor \T5, \T1, \T1
990 vmovdqu \T1, HashKey_8_k(arg2)
991
992.endm
993
994## if a = number of total plaintext bytes
995## b = floor(a/16)
996## num_initial_blocks = b mod 4#
997## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
998## r10, r11, r12, rax are clobbered
999## arg1, arg3, arg4, r14 are used as a pointer only, not modified
1000
1001.macro INITIAL_BLOCKS_AVX REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC
1002 i = (8-\num_initial_blocks)
1003 setreg
1004 vmovdqu AadHash(arg2), reg_i
1005
1006 # start AES for num_initial_blocks blocks
1007 vmovdqu CurCount(arg2), \CTR
1008
1009 i = (9-\num_initial_blocks)
1010 setreg
1011.rep \num_initial_blocks
1012 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1013 vmovdqa \CTR, reg_i
1014 vpshufb SHUF_MASK(%rip), reg_i, reg_i # perform a 16Byte swap
1015 i = (i+1)
1016 setreg
1017.endr
1018
1019 vmovdqa (arg1), \T_key
1020 i = (9-\num_initial_blocks)
1021 setreg
1022.rep \num_initial_blocks
1023 vpxor \T_key, reg_i, reg_i
1024 i = (i+1)
1025 setreg
1026.endr
1027
1028 j = 1
1029 setreg
1030.rep \REP
1031 vmovdqa 16*j(arg1), \T_key
1032 i = (9-\num_initial_blocks)
1033 setreg
1034.rep \num_initial_blocks
1035 vaesenc \T_key, reg_i, reg_i
1036 i = (i+1)
1037 setreg
1038.endr
1039
1040 j = (j+1)
1041 setreg
1042.endr
1043
1044 vmovdqa 16*j(arg1), \T_key
1045 i = (9-\num_initial_blocks)
1046 setreg
1047.rep \num_initial_blocks
1048 vaesenclast \T_key, reg_i, reg_i
1049 i = (i+1)
1050 setreg
1051.endr
1052
1053 i = (9-\num_initial_blocks)
1054 setreg
1055.rep \num_initial_blocks
1056 vmovdqu (arg4, %r11), \T1
1057 vpxor \T1, reg_i, reg_i
1058 vmovdqu reg_i, (arg3 , %r11) # write back ciphertext for num_initial_blocks blocks
1059 add $16, %r11
1060.if \ENC_DEC == DEC
1061 vmovdqa \T1, reg_i
1062.endif
1063 vpshufb SHUF_MASK(%rip), reg_i, reg_i # prepare ciphertext for GHASH computations
1064 i = (i+1)
1065 setreg
1066.endr
1067
1068
1069 i = (8-\num_initial_blocks)
1070 j = (9-\num_initial_blocks)
1071 setreg
1072
1073.rep \num_initial_blocks
1074 vpxor reg_i, reg_j, reg_j
1075 GHASH_MUL_AVX reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks
1076 i = (i+1)
1077 j = (j+1)
1078 setreg
1079.endr
1080 # XMM8 has the combined result here
1081
1082 vmovdqa \XMM8, TMP1(%rsp)
1083 vmovdqa \XMM8, \T3
1084
1085 cmp $128, %r13
1086 jl _initial_blocks_done\@ # no need for precomputed constants
1087
1088###############################################################################
1089# Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
1090 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1091 vmovdqa \CTR, \XMM1
1092 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
1093
1094 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1095 vmovdqa \CTR, \XMM2
1096 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
1097
1098 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1099 vmovdqa \CTR, \XMM3
1100 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
1101
1102 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1103 vmovdqa \CTR, \XMM4
1104 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
1105
1106 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1107 vmovdqa \CTR, \XMM5
1108 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
1109
1110 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1111 vmovdqa \CTR, \XMM6
1112 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
1113
1114 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1115 vmovdqa \CTR, \XMM7
1116 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
1117
1118 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1119 vmovdqa \CTR, \XMM8
1120 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
1121
1122 vmovdqa (arg1), \T_key
1123 vpxor \T_key, \XMM1, \XMM1
1124 vpxor \T_key, \XMM2, \XMM2
1125 vpxor \T_key, \XMM3, \XMM3
1126 vpxor \T_key, \XMM4, \XMM4
1127 vpxor \T_key, \XMM5, \XMM5
1128 vpxor \T_key, \XMM6, \XMM6
1129 vpxor \T_key, \XMM7, \XMM7
1130 vpxor \T_key, \XMM8, \XMM8
1131
1132 i = 1
1133 setreg
1134.rep \REP # do REP rounds
1135 vmovdqa 16*i(arg1), \T_key
1136 vaesenc \T_key, \XMM1, \XMM1
1137 vaesenc \T_key, \XMM2, \XMM2
1138 vaesenc \T_key, \XMM3, \XMM3
1139 vaesenc \T_key, \XMM4, \XMM4
1140 vaesenc \T_key, \XMM5, \XMM5
1141 vaesenc \T_key, \XMM6, \XMM6
1142 vaesenc \T_key, \XMM7, \XMM7
1143 vaesenc \T_key, \XMM8, \XMM8
1144 i = (i+1)
1145 setreg
1146.endr
1147
1148 vmovdqa 16*i(arg1), \T_key
1149 vaesenclast \T_key, \XMM1, \XMM1
1150 vaesenclast \T_key, \XMM2, \XMM2
1151 vaesenclast \T_key, \XMM3, \XMM3
1152 vaesenclast \T_key, \XMM4, \XMM4
1153 vaesenclast \T_key, \XMM5, \XMM5
1154 vaesenclast \T_key, \XMM6, \XMM6
1155 vaesenclast \T_key, \XMM7, \XMM7
1156 vaesenclast \T_key, \XMM8, \XMM8
1157
1158 vmovdqu (arg4, %r11), \T1
1159 vpxor \T1, \XMM1, \XMM1
1160 vmovdqu \XMM1, (arg3 , %r11)
1161 .if \ENC_DEC == DEC
1162 vmovdqa \T1, \XMM1
1163 .endif
1164
1165 vmovdqu 16*1(arg4, %r11), \T1
1166 vpxor \T1, \XMM2, \XMM2
1167 vmovdqu \XMM2, 16*1(arg3 , %r11)
1168 .if \ENC_DEC == DEC
1169 vmovdqa \T1, \XMM2
1170 .endif
1171
1172 vmovdqu 16*2(arg4, %r11), \T1
1173 vpxor \T1, \XMM3, \XMM3
1174 vmovdqu \XMM3, 16*2(arg3 , %r11)
1175 .if \ENC_DEC == DEC
1176 vmovdqa \T1, \XMM3
1177 .endif
1178
1179 vmovdqu 16*3(arg4, %r11), \T1
1180 vpxor \T1, \XMM4, \XMM4
1181 vmovdqu \XMM4, 16*3(arg3 , %r11)
1182 .if \ENC_DEC == DEC
1183 vmovdqa \T1, \XMM4
1184 .endif
1185
1186 vmovdqu 16*4(arg4, %r11), \T1
1187 vpxor \T1, \XMM5, \XMM5
1188 vmovdqu \XMM5, 16*4(arg3 , %r11)
1189 .if \ENC_DEC == DEC
1190 vmovdqa \T1, \XMM5
1191 .endif
1192
1193 vmovdqu 16*5(arg4, %r11), \T1
1194 vpxor \T1, \XMM6, \XMM6
1195 vmovdqu \XMM6, 16*5(arg3 , %r11)
1196 .if \ENC_DEC == DEC
1197 vmovdqa \T1, \XMM6
1198 .endif
1199
1200 vmovdqu 16*6(arg4, %r11), \T1
1201 vpxor \T1, \XMM7, \XMM7
1202 vmovdqu \XMM7, 16*6(arg3 , %r11)
1203 .if \ENC_DEC == DEC
1204 vmovdqa \T1, \XMM7
1205 .endif
1206
1207 vmovdqu 16*7(arg4, %r11), \T1
1208 vpxor \T1, \XMM8, \XMM8
1209 vmovdqu \XMM8, 16*7(arg3 , %r11)
1210 .if \ENC_DEC == DEC
1211 vmovdqa \T1, \XMM8
1212 .endif
1213
1214 add $128, %r11
1215
1216 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
1217 vpxor TMP1(%rsp), \XMM1, \XMM1 # combine GHASHed value with the corresponding ciphertext
1218 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
1219 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
1220 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
1221 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
1222 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
1223 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
1224 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
1225
1226###############################################################################
1227
1228_initial_blocks_done\@:
1229
1230.endm
1231
1232# encrypt 8 blocks at a time
1233# ghash the 8 previously encrypted ciphertext blocks
1234# arg1, arg3, arg4 are used as pointers only, not modified
1235# r11 is the data offset value
1236.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
1237
1238 vmovdqa \XMM1, \T2
1239 vmovdqa \XMM2, TMP2(%rsp)
1240 vmovdqa \XMM3, TMP3(%rsp)
1241 vmovdqa \XMM4, TMP4(%rsp)
1242 vmovdqa \XMM5, TMP5(%rsp)
1243 vmovdqa \XMM6, TMP6(%rsp)
1244 vmovdqa \XMM7, TMP7(%rsp)
1245 vmovdqa \XMM8, TMP8(%rsp)
1246
1247.if \loop_idx == in_order
1248 vpaddd ONE(%rip), \CTR, \XMM1 # INCR CNT
1249 vpaddd ONE(%rip), \XMM1, \XMM2
1250 vpaddd ONE(%rip), \XMM2, \XMM3
1251 vpaddd ONE(%rip), \XMM3, \XMM4
1252 vpaddd ONE(%rip), \XMM4, \XMM5
1253 vpaddd ONE(%rip), \XMM5, \XMM6
1254 vpaddd ONE(%rip), \XMM6, \XMM7
1255 vpaddd ONE(%rip), \XMM7, \XMM8
1256 vmovdqa \XMM8, \CTR
1257
1258 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
1259 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
1260 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
1261 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
1262 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
1263 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
1264 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
1265 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
1266.else
1267 vpaddd ONEf(%rip), \CTR, \XMM1 # INCR CNT
1268 vpaddd ONEf(%rip), \XMM1, \XMM2
1269 vpaddd ONEf(%rip), \XMM2, \XMM3
1270 vpaddd ONEf(%rip), \XMM3, \XMM4
1271 vpaddd ONEf(%rip), \XMM4, \XMM5
1272 vpaddd ONEf(%rip), \XMM5, \XMM6
1273 vpaddd ONEf(%rip), \XMM6, \XMM7
1274 vpaddd ONEf(%rip), \XMM7, \XMM8
1275 vmovdqa \XMM8, \CTR
1276.endif
1277
1278
1279 #######################################################################
1280
1281 vmovdqu (arg1), \T1
1282 vpxor \T1, \XMM1, \XMM1
1283 vpxor \T1, \XMM2, \XMM2
1284 vpxor \T1, \XMM3, \XMM3
1285 vpxor \T1, \XMM4, \XMM4
1286 vpxor \T1, \XMM5, \XMM5
1287 vpxor \T1, \XMM6, \XMM6
1288 vpxor \T1, \XMM7, \XMM7
1289 vpxor \T1, \XMM8, \XMM8
1290
1291 #######################################################################
1292
1293
1294
1295
1296
1297 vmovdqu 16*1(arg1), \T1
1298 vaesenc \T1, \XMM1, \XMM1
1299 vaesenc \T1, \XMM2, \XMM2
1300 vaesenc \T1, \XMM3, \XMM3
1301 vaesenc \T1, \XMM4, \XMM4
1302 vaesenc \T1, \XMM5, \XMM5
1303 vaesenc \T1, \XMM6, \XMM6
1304 vaesenc \T1, \XMM7, \XMM7
1305 vaesenc \T1, \XMM8, \XMM8
1306
1307 vmovdqu 16*2(arg1), \T1
1308 vaesenc \T1, \XMM1, \XMM1
1309 vaesenc \T1, \XMM2, \XMM2
1310 vaesenc \T1, \XMM3, \XMM3
1311 vaesenc \T1, \XMM4, \XMM4
1312 vaesenc \T1, \XMM5, \XMM5
1313 vaesenc \T1, \XMM6, \XMM6
1314 vaesenc \T1, \XMM7, \XMM7
1315 vaesenc \T1, \XMM8, \XMM8
1316
1317
1318 #######################################################################
1319
1320 vmovdqu HashKey_8(arg2), \T5
1321 vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1
1322 vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0
1323
1324 vpshufd $0b01001110, \T2, \T6
1325 vpxor \T2, \T6, \T6
1326
1327 vmovdqu HashKey_8_k(arg2), \T5
1328 vpclmulqdq $0x00, \T5, \T6, \T6
1329
1330 vmovdqu 16*3(arg1), \T1
1331 vaesenc \T1, \XMM1, \XMM1
1332 vaesenc \T1, \XMM2, \XMM2
1333 vaesenc \T1, \XMM3, \XMM3
1334 vaesenc \T1, \XMM4, \XMM4
1335 vaesenc \T1, \XMM5, \XMM5
1336 vaesenc \T1, \XMM6, \XMM6
1337 vaesenc \T1, \XMM7, \XMM7
1338 vaesenc \T1, \XMM8, \XMM8
1339
1340 vmovdqa TMP2(%rsp), \T1
1341 vmovdqu HashKey_7(arg2), \T5
1342 vpclmulqdq $0x11, \T5, \T1, \T3
1343 vpxor \T3, \T4, \T4
1344 vpclmulqdq $0x00, \T5, \T1, \T3
1345 vpxor \T3, \T7, \T7
1346
1347 vpshufd $0b01001110, \T1, \T3
1348 vpxor \T1, \T3, \T3
1349 vmovdqu HashKey_7_k(arg2), \T5
1350 vpclmulqdq $0x10, \T5, \T3, \T3
1351 vpxor \T3, \T6, \T6
1352
1353 vmovdqu 16*4(arg1), \T1
1354 vaesenc \T1, \XMM1, \XMM1
1355 vaesenc \T1, \XMM2, \XMM2
1356 vaesenc \T1, \XMM3, \XMM3
1357 vaesenc \T1, \XMM4, \XMM4
1358 vaesenc \T1, \XMM5, \XMM5
1359 vaesenc \T1, \XMM6, \XMM6
1360 vaesenc \T1, \XMM7, \XMM7
1361 vaesenc \T1, \XMM8, \XMM8
1362
1363 #######################################################################
1364
1365 vmovdqa TMP3(%rsp), \T1
1366 vmovdqu HashKey_6(arg2), \T5
1367 vpclmulqdq $0x11, \T5, \T1, \T3
1368 vpxor \T3, \T4, \T4
1369 vpclmulqdq $0x00, \T5, \T1, \T3
1370 vpxor \T3, \T7, \T7
1371
1372 vpshufd $0b01001110, \T1, \T3
1373 vpxor \T1, \T3, \T3
1374 vmovdqu HashKey_6_k(arg2), \T5
1375 vpclmulqdq $0x10, \T5, \T3, \T3
1376 vpxor \T3, \T6, \T6
1377
1378 vmovdqu 16*5(arg1), \T1
1379 vaesenc \T1, \XMM1, \XMM1
1380 vaesenc \T1, \XMM2, \XMM2
1381 vaesenc \T1, \XMM3, \XMM3
1382 vaesenc \T1, \XMM4, \XMM4
1383 vaesenc \T1, \XMM5, \XMM5
1384 vaesenc \T1, \XMM6, \XMM6
1385 vaesenc \T1, \XMM7, \XMM7
1386 vaesenc \T1, \XMM8, \XMM8
1387
1388 vmovdqa TMP4(%rsp), \T1
1389 vmovdqu HashKey_5(arg2), \T5
1390 vpclmulqdq $0x11, \T5, \T1, \T3
1391 vpxor \T3, \T4, \T4
1392 vpclmulqdq $0x00, \T5, \T1, \T3
1393 vpxor \T3, \T7, \T7
1394
1395 vpshufd $0b01001110, \T1, \T3
1396 vpxor \T1, \T3, \T3
1397 vmovdqu HashKey_5_k(arg2), \T5
1398 vpclmulqdq $0x10, \T5, \T3, \T3
1399 vpxor \T3, \T6, \T6
1400
1401 vmovdqu 16*6(arg1), \T1
1402 vaesenc \T1, \XMM1, \XMM1
1403 vaesenc \T1, \XMM2, \XMM2
1404 vaesenc \T1, \XMM3, \XMM3
1405 vaesenc \T1, \XMM4, \XMM4
1406 vaesenc \T1, \XMM5, \XMM5
1407 vaesenc \T1, \XMM6, \XMM6
1408 vaesenc \T1, \XMM7, \XMM7
1409 vaesenc \T1, \XMM8, \XMM8
1410
1411
1412 vmovdqa TMP5(%rsp), \T1
1413 vmovdqu HashKey_4(arg2), \T5
1414 vpclmulqdq $0x11, \T5, \T1, \T3
1415 vpxor \T3, \T4, \T4
1416 vpclmulqdq $0x00, \T5, \T1, \T3
1417 vpxor \T3, \T7, \T7
1418
1419 vpshufd $0b01001110, \T1, \T3
1420 vpxor \T1, \T3, \T3
1421 vmovdqu HashKey_4_k(arg2), \T5
1422 vpclmulqdq $0x10, \T5, \T3, \T3
1423 vpxor \T3, \T6, \T6
1424
1425 vmovdqu 16*7(arg1), \T1
1426 vaesenc \T1, \XMM1, \XMM1
1427 vaesenc \T1, \XMM2, \XMM2
1428 vaesenc \T1, \XMM3, \XMM3
1429 vaesenc \T1, \XMM4, \XMM4
1430 vaesenc \T1, \XMM5, \XMM5
1431 vaesenc \T1, \XMM6, \XMM6
1432 vaesenc \T1, \XMM7, \XMM7
1433 vaesenc \T1, \XMM8, \XMM8
1434
1435 vmovdqa TMP6(%rsp), \T1
1436 vmovdqu HashKey_3(arg2), \T5
1437 vpclmulqdq $0x11, \T5, \T1, \T3
1438 vpxor \T3, \T4, \T4
1439 vpclmulqdq $0x00, \T5, \T1, \T3
1440 vpxor \T3, \T7, \T7
1441
1442 vpshufd $0b01001110, \T1, \T3
1443 vpxor \T1, \T3, \T3
1444 vmovdqu HashKey_3_k(arg2), \T5
1445 vpclmulqdq $0x10, \T5, \T3, \T3
1446 vpxor \T3, \T6, \T6
1447
1448
1449 vmovdqu 16*8(arg1), \T1
1450 vaesenc \T1, \XMM1, \XMM1
1451 vaesenc \T1, \XMM2, \XMM2
1452 vaesenc \T1, \XMM3, \XMM3
1453 vaesenc \T1, \XMM4, \XMM4
1454 vaesenc \T1, \XMM5, \XMM5
1455 vaesenc \T1, \XMM6, \XMM6
1456 vaesenc \T1, \XMM7, \XMM7
1457 vaesenc \T1, \XMM8, \XMM8
1458
1459 vmovdqa TMP7(%rsp), \T1
1460 vmovdqu HashKey_2(arg2), \T5
1461 vpclmulqdq $0x11, \T5, \T1, \T3
1462 vpxor \T3, \T4, \T4
1463 vpclmulqdq $0x00, \T5, \T1, \T3
1464 vpxor \T3, \T7, \T7
1465
1466 vpshufd $0b01001110, \T1, \T3
1467 vpxor \T1, \T3, \T3
1468 vmovdqu HashKey_2_k(arg2), \T5
1469 vpclmulqdq $0x10, \T5, \T3, \T3
1470 vpxor \T3, \T6, \T6
1471
1472 #######################################################################
1473
1474 vmovdqu 16*9(arg1), \T5
1475 vaesenc \T5, \XMM1, \XMM1
1476 vaesenc \T5, \XMM2, \XMM2
1477 vaesenc \T5, \XMM3, \XMM3
1478 vaesenc \T5, \XMM4, \XMM4
1479 vaesenc \T5, \XMM5, \XMM5
1480 vaesenc \T5, \XMM6, \XMM6
1481 vaesenc \T5, \XMM7, \XMM7
1482 vaesenc \T5, \XMM8, \XMM8
1483
1484 vmovdqa TMP8(%rsp), \T1
1485 vmovdqu HashKey(arg2), \T5
1486 vpclmulqdq $0x11, \T5, \T1, \T3
1487 vpxor \T3, \T4, \T4
1488 vpclmulqdq $0x00, \T5, \T1, \T3
1489 vpxor \T3, \T7, \T7
1490
1491 vpshufd $0b01001110, \T1, \T3
1492 vpxor \T1, \T3, \T3
1493 vmovdqu HashKey_k(arg2), \T5
1494 vpclmulqdq $0x10, \T5, \T3, \T3
1495 vpxor \T3, \T6, \T6
1496
1497 vpxor \T4, \T6, \T6
1498 vpxor \T7, \T6, \T6
1499
1500 vmovdqu 16*10(arg1), \T5
1501
1502 i = 11
1503 setreg
1504.rep (\REP-9)
1505
1506 vaesenc \T5, \XMM1, \XMM1
1507 vaesenc \T5, \XMM2, \XMM2
1508 vaesenc \T5, \XMM3, \XMM3
1509 vaesenc \T5, \XMM4, \XMM4
1510 vaesenc \T5, \XMM5, \XMM5
1511 vaesenc \T5, \XMM6, \XMM6
1512 vaesenc \T5, \XMM7, \XMM7
1513 vaesenc \T5, \XMM8, \XMM8
1514
1515 vmovdqu 16*i(arg1), \T5
1516 i = i + 1
1517 setreg
1518.endr
1519
1520 i = 0
1521 j = 1
1522 setreg
1523.rep 8
1524 vpxor 16*i(arg4, %r11), \T5, \T2
1525 .if \ENC_DEC == ENC
1526 vaesenclast \T2, reg_j, reg_j
1527 .else
1528 vaesenclast \T2, reg_j, \T3
1529 vmovdqu 16*i(arg4, %r11), reg_j
1530 vmovdqu \T3, 16*i(arg3, %r11)
1531 .endif
1532 i = (i+1)
1533 j = (j+1)
1534 setreg
1535.endr
1536 #######################################################################
1537
1538
1539 vpslldq $8, \T6, \T3 # shift-L T3 2 DWs
1540 vpsrldq $8, \T6, \T6 # shift-R T2 2 DWs
1541 vpxor \T3, \T7, \T7
1542 vpxor \T4, \T6, \T6 # accumulate the results in T6:T7
1543
1544
1545
1546 #######################################################################
1547 #first phase of the reduction
1548 #######################################################################
1549 vpslld $31, \T7, \T2 # packed right shifting << 31
1550 vpslld $30, \T7, \T3 # packed right shifting shift << 30
1551 vpslld $25, \T7, \T4 # packed right shifting shift << 25
1552
1553 vpxor \T3, \T2, \T2 # xor the shifted versions
1554 vpxor \T4, \T2, \T2
1555
1556 vpsrldq $4, \T2, \T1 # shift-R T1 1 DW
1557
1558 vpslldq $12, \T2, \T2 # shift-L T2 3 DWs
1559 vpxor \T2, \T7, \T7 # first phase of the reduction complete
1560 #######################################################################
1561 .if \ENC_DEC == ENC
1562 vmovdqu \XMM1, 16*0(arg3,%r11) # Write to the Ciphertext buffer
1563 vmovdqu \XMM2, 16*1(arg3,%r11) # Write to the Ciphertext buffer
1564 vmovdqu \XMM3, 16*2(arg3,%r11) # Write to the Ciphertext buffer
1565 vmovdqu \XMM4, 16*3(arg3,%r11) # Write to the Ciphertext buffer
1566 vmovdqu \XMM5, 16*4(arg3,%r11) # Write to the Ciphertext buffer
1567 vmovdqu \XMM6, 16*5(arg3,%r11) # Write to the Ciphertext buffer
1568 vmovdqu \XMM7, 16*6(arg3,%r11) # Write to the Ciphertext buffer
1569 vmovdqu \XMM8, 16*7(arg3,%r11) # Write to the Ciphertext buffer
1570 .endif
1571
1572 #######################################################################
1573 #second phase of the reduction
1574 vpsrld $1, \T7, \T2 # packed left shifting >> 1
1575 vpsrld $2, \T7, \T3 # packed left shifting >> 2
1576 vpsrld $7, \T7, \T4 # packed left shifting >> 7
1577 vpxor \T3, \T2, \T2 # xor the shifted versions
1578 vpxor \T4, \T2, \T2
1579
1580 vpxor \T1, \T2, \T2
1581 vpxor \T2, \T7, \T7
1582 vpxor \T7, \T6, \T6 # the result is in T6
1583 #######################################################################
1584
1585 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
1586 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
1587 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
1588 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
1589 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
1590 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
1591 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
1592 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
1593
1594
1595 vpxor \T6, \XMM1, \XMM1
1596
1597
1598
1599.endm
1600
1601
1602# GHASH the last 4 ciphertext blocks.
1603.macro GHASH_LAST_8_AVX T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8
1604
1605 ## Karatsuba Method
1606
1607
1608 vpshufd $0b01001110, \XMM1, \T2
1609 vpxor \XMM1, \T2, \T2
1610 vmovdqu HashKey_8(arg2), \T5
1611 vpclmulqdq $0x11, \T5, \XMM1, \T6
1612 vpclmulqdq $0x00, \T5, \XMM1, \T7
1613
1614 vmovdqu HashKey_8_k(arg2), \T3
1615 vpclmulqdq $0x00, \T3, \T2, \XMM1
1616
1617 ######################
1618
1619 vpshufd $0b01001110, \XMM2, \T2
1620 vpxor \XMM2, \T2, \T2
1621 vmovdqu HashKey_7(arg2), \T5
1622 vpclmulqdq $0x11, \T5, \XMM2, \T4
1623 vpxor \T4, \T6, \T6
1624
1625 vpclmulqdq $0x00, \T5, \XMM2, \T4
1626 vpxor \T4, \T7, \T7
1627
1628 vmovdqu HashKey_7_k(arg2), \T3
1629 vpclmulqdq $0x00, \T3, \T2, \T2
1630 vpxor \T2, \XMM1, \XMM1
1631
1632 ######################
1633
1634 vpshufd $0b01001110, \XMM3, \T2
1635 vpxor \XMM3, \T2, \T2
1636 vmovdqu HashKey_6(arg2), \T5
1637 vpclmulqdq $0x11, \T5, \XMM3, \T4
1638 vpxor \T4, \T6, \T6
1639
1640 vpclmulqdq $0x00, \T5, \XMM3, \T4
1641 vpxor \T4, \T7, \T7
1642
1643 vmovdqu HashKey_6_k(arg2), \T3
1644 vpclmulqdq $0x00, \T3, \T2, \T2
1645 vpxor \T2, \XMM1, \XMM1
1646
1647 ######################
1648
1649 vpshufd $0b01001110, \XMM4, \T2
1650 vpxor \XMM4, \T2, \T2
1651 vmovdqu HashKey_5(arg2), \T5
1652 vpclmulqdq $0x11, \T5, \XMM4, \T4
1653 vpxor \T4, \T6, \T6
1654
1655 vpclmulqdq $0x00, \T5, \XMM4, \T4
1656 vpxor \T4, \T7, \T7
1657
1658 vmovdqu HashKey_5_k(arg2), \T3
1659 vpclmulqdq $0x00, \T3, \T2, \T2
1660 vpxor \T2, \XMM1, \XMM1
1661
1662 ######################
1663
1664 vpshufd $0b01001110, \XMM5, \T2
1665 vpxor \XMM5, \T2, \T2
1666 vmovdqu HashKey_4(arg2), \T5
1667 vpclmulqdq $0x11, \T5, \XMM5, \T4
1668 vpxor \T4, \T6, \T6
1669
1670 vpclmulqdq $0x00, \T5, \XMM5, \T4
1671 vpxor \T4, \T7, \T7
1672
1673 vmovdqu HashKey_4_k(arg2), \T3
1674 vpclmulqdq $0x00, \T3, \T2, \T2
1675 vpxor \T2, \XMM1, \XMM1
1676
1677 ######################
1678
1679 vpshufd $0b01001110, \XMM6, \T2
1680 vpxor \XMM6, \T2, \T2
1681 vmovdqu HashKey_3(arg2), \T5
1682 vpclmulqdq $0x11, \T5, \XMM6, \T4
1683 vpxor \T4, \T6, \T6
1684
1685 vpclmulqdq $0x00, \T5, \XMM6, \T4
1686 vpxor \T4, \T7, \T7
1687
1688 vmovdqu HashKey_3_k(arg2), \T3
1689 vpclmulqdq $0x00, \T3, \T2, \T2
1690 vpxor \T2, \XMM1, \XMM1
1691
1692 ######################
1693
1694 vpshufd $0b01001110, \XMM7, \T2
1695 vpxor \XMM7, \T2, \T2
1696 vmovdqu HashKey_2(arg2), \T5
1697 vpclmulqdq $0x11, \T5, \XMM7, \T4
1698 vpxor \T4, \T6, \T6
1699
1700 vpclmulqdq $0x00, \T5, \XMM7, \T4
1701 vpxor \T4, \T7, \T7
1702
1703 vmovdqu HashKey_2_k(arg2), \T3
1704 vpclmulqdq $0x00, \T3, \T2, \T2
1705 vpxor \T2, \XMM1, \XMM1
1706
1707 ######################
1708
1709 vpshufd $0b01001110, \XMM8, \T2
1710 vpxor \XMM8, \T2, \T2
1711 vmovdqu HashKey(arg2), \T5
1712 vpclmulqdq $0x11, \T5, \XMM8, \T4
1713 vpxor \T4, \T6, \T6
1714
1715 vpclmulqdq $0x00, \T5, \XMM8, \T4
1716 vpxor \T4, \T7, \T7
1717
1718 vmovdqu HashKey_k(arg2), \T3
1719 vpclmulqdq $0x00, \T3, \T2, \T2
1720
1721 vpxor \T2, \XMM1, \XMM1
1722 vpxor \T6, \XMM1, \XMM1
1723 vpxor \T7, \XMM1, \T2
1724
1725
1726
1727
1728 vpslldq $8, \T2, \T4
1729 vpsrldq $8, \T2, \T2
1730
1731 vpxor \T4, \T7, \T7
1732 vpxor \T2, \T6, \T6 # <T6:T7> holds the result of
1733 # the accumulated carry-less multiplications
1734
1735 #######################################################################
1736 #first phase of the reduction
1737 vpslld $31, \T7, \T2 # packed right shifting << 31
1738 vpslld $30, \T7, \T3 # packed right shifting shift << 30
1739 vpslld $25, \T7, \T4 # packed right shifting shift << 25
1740
1741 vpxor \T3, \T2, \T2 # xor the shifted versions
1742 vpxor \T4, \T2, \T2
1743
1744 vpsrldq $4, \T2, \T1 # shift-R T1 1 DW
1745
1746 vpslldq $12, \T2, \T2 # shift-L T2 3 DWs
1747 vpxor \T2, \T7, \T7 # first phase of the reduction complete
1748 #######################################################################
1749
1750
1751 #second phase of the reduction
1752 vpsrld $1, \T7, \T2 # packed left shifting >> 1
1753 vpsrld $2, \T7, \T3 # packed left shifting >> 2
1754 vpsrld $7, \T7, \T4 # packed left shifting >> 7
1755 vpxor \T3, \T2, \T2 # xor the shifted versions
1756 vpxor \T4, \T2, \T2
1757
1758 vpxor \T1, \T2, \T2
1759 vpxor \T2, \T7, \T7
1760 vpxor \T7, \T6, \T6 # the result is in T6
1761
1762.endm
1763
1764#############################################################
1765#void aesni_gcm_precomp_avx_gen2
1766# (gcm_data *my_ctx_data,
1767# gcm_context_data *data,
1768# u8 *hash_subkey# /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
1769# u8 *iv, /* Pre-counter block j0: 4 byte salt
1770# (from Security Association) concatenated with 8 byte
1771# Initialisation Vector (from IPSec ESP Payload)
1772# concatenated with 0x00000001. 16-byte aligned pointer. */
1773# const u8 *aad, /* Additional Authentication Data (AAD)*/
1774# u64 aad_len) /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
1775#############################################################
1776SYM_FUNC_START(aesni_gcm_init_avx_gen2)
1777 FUNC_SAVE
1778 INIT GHASH_MUL_AVX, PRECOMPUTE_AVX
1779 FUNC_RESTORE
1780 ret
1781SYM_FUNC_END(aesni_gcm_init_avx_gen2)
1782
1783###############################################################################
1784#void aesni_gcm_enc_update_avx_gen2(
1785# gcm_data *my_ctx_data, /* aligned to 16 Bytes */
1786# gcm_context_data *data,
1787# u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */
1788# const u8 *in, /* Plaintext input */
1789# u64 plaintext_len) /* Length of data in Bytes for encryption. */
1790###############################################################################
1791SYM_FUNC_START(aesni_gcm_enc_update_avx_gen2)
1792 FUNC_SAVE
1793 mov keysize, %eax
1794 cmp $32, %eax
1795 je key_256_enc_update
1796 cmp $16, %eax
1797 je key_128_enc_update
1798 # must be 192
1799 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 11
1800 FUNC_RESTORE
1801 ret
1802key_128_enc_update:
1803 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 9
1804 FUNC_RESTORE
1805 ret
1806key_256_enc_update:
1807 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 13
1808 FUNC_RESTORE
1809 ret
1810SYM_FUNC_END(aesni_gcm_enc_update_avx_gen2)
1811
1812###############################################################################
1813#void aesni_gcm_dec_update_avx_gen2(
1814# gcm_data *my_ctx_data, /* aligned to 16 Bytes */
1815# gcm_context_data *data,
1816# u8 *out, /* Plaintext output. Decrypt in-place is allowed. */
1817# const u8 *in, /* Ciphertext input */
1818# u64 plaintext_len) /* Length of data in Bytes for encryption. */
1819###############################################################################
1820SYM_FUNC_START(aesni_gcm_dec_update_avx_gen2)
1821 FUNC_SAVE
1822 mov keysize,%eax
1823 cmp $32, %eax
1824 je key_256_dec_update
1825 cmp $16, %eax
1826 je key_128_dec_update
1827 # must be 192
1828 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 11
1829 FUNC_RESTORE
1830 ret
1831key_128_dec_update:
1832 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 9
1833 FUNC_RESTORE
1834 ret
1835key_256_dec_update:
1836 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 13
1837 FUNC_RESTORE
1838 ret
1839SYM_FUNC_END(aesni_gcm_dec_update_avx_gen2)
1840
1841###############################################################################
1842#void aesni_gcm_finalize_avx_gen2(
1843# gcm_data *my_ctx_data, /* aligned to 16 Bytes */
1844# gcm_context_data *data,
1845# u8 *auth_tag, /* Authenticated Tag output. */
1846# u64 auth_tag_len) /* Authenticated Tag Length in bytes.
1847# Valid values are 16 (most likely), 12 or 8. */
1848###############################################################################
1849SYM_FUNC_START(aesni_gcm_finalize_avx_gen2)
1850 FUNC_SAVE
1851 mov keysize,%eax
1852 cmp $32, %eax
1853 je key_256_finalize
1854 cmp $16, %eax
1855 je key_128_finalize
1856 # must be 192
1857 GCM_COMPLETE GHASH_MUL_AVX, 11, arg3, arg4
1858 FUNC_RESTORE
1859 ret
1860key_128_finalize:
1861 GCM_COMPLETE GHASH_MUL_AVX, 9, arg3, arg4
1862 FUNC_RESTORE
1863 ret
1864key_256_finalize:
1865 GCM_COMPLETE GHASH_MUL_AVX, 13, arg3, arg4
1866 FUNC_RESTORE
1867 ret
1868SYM_FUNC_END(aesni_gcm_finalize_avx_gen2)
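###############################################################################
# Expected call sequence from the C glue code, shown as a sketch. The
# declarations below are inferred from the prototype comments above (u8/u64
# are the kernel typedefs); they are illustrative, not copied from a header:
#
#        struct gcm_context_data;        /* opaque here; defined in the C glue */
#
#        extern void aesni_gcm_init_avx_gen2(void *ctx,
#                        struct gcm_context_data *data, u8 *iv, u8 *hash_subkey,
#                        const u8 *aad, u64 aad_len);
#        extern void aesni_gcm_enc_update_avx_gen2(void *ctx,
#                        struct gcm_context_data *data, u8 *out, const u8 *in,
#                        u64 plaintext_len);
#        extern void aesni_gcm_finalize_avx_gen2(void *ctx,
#                        struct gcm_context_data *data, u8 *auth_tag,
#                        u64 auth_tag_len);
#
#        /* init once, update over the (possibly chunked) plaintext, finalize */
#        aesni_gcm_init_avx_gen2(ctx, &gdata, iv, hash_subkey, aad, aad_len);
#        aesni_gcm_enc_update_avx_gen2(ctx, &gdata, out, in, plaintext_len);
#        aesni_gcm_finalize_avx_gen2(ctx, &gdata, tag, 16);
###############################################################################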
1869
1870###############################################################################
1871# GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
1872# Input: A and B (128-bits each, bit-reflected)
1873# Output: C = A*B*x mod poly, (i.e. >>1 )
1874# To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
1875# GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
1876###############################################################################
1877.macro GHASH_MUL_AVX2 GH HK T1 T2 T3 T4 T5
1878
1879 vpclmulqdq $0x11,\HK,\GH,\T1 # T1 = a1*b1
1880 vpclmulqdq $0x00,\HK,\GH,\T2 # T2 = a0*b0
1881 vpclmulqdq $0x01,\HK,\GH,\T3 # T3 = a1*b0
1882 vpclmulqdq $0x10,\HK,\GH,\GH # GH = a0*b1
1883 vpxor \T3, \GH, \GH
1884
1885
1886 vpsrldq $8 , \GH, \T3 # shift-R GH 2 DWs
1887 vpslldq $8 , \GH, \GH # shift-L GH 2 DWs
1888
1889 vpxor \T3, \T1, \T1
1890 vpxor \T2, \GH, \GH
1891
1892 #######################################################################
1893 #first phase of the reduction
1894 vmovdqa POLY2(%rip), \T3
1895
1896 vpclmulqdq $0x01, \GH, \T3, \T2
1897 vpslldq $8, \T2, \T2 # shift-L T2 2 DWs
1898
1899 vpxor \T2, \GH, \GH # first phase of the reduction complete
1900 #######################################################################
1901 #second phase of the reduction
1902 vpclmulqdq $0x00, \GH, \T3, \T2
1903 vpsrldq $4, \T2, \T2 # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
1904
1905 vpclmulqdq $0x10, \GH, \T3, \GH
1906 vpslldq $4, \GH, \GH # shift-L GH 1 DW (Shift-L 1-DW to obtain result with no shifts)
1907
1908 vpxor \T2, \GH, \GH # second phase of the reduction complete
1909 #######################################################################
1910 vpxor \T1, \GH, \GH # the result is in GH
1911
1912
1913.endm
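###############################################################################
# For reference, the multiplication GHASH_MUL_AVX2 accelerates is the GF(2^128)
# product from NIST SP 800-38D. A bit-by-bit C model is sketched below (spec
# bit/byte order, i.e. without the bit-reflection and HashKey<<1 pre-processing
# used by this file; the name gf128_mul is illustrative):
#
#        #include <stdint.h>
#        #include <string.h>
#
#        void gf128_mul(uint8_t y[16], const uint8_t x[16], const uint8_t h[16])
#        {
#                uint8_t z[16] = {0};
#                uint8_t v[16];
#                int i, j;
#
#                memcpy(v, h, 16);
#                for (i = 0; i < 128; i++) {
#                        /* if bit i of x (MSB first) is set, Z ^= V */
#                        if (x[i / 8] & (0x80 >> (i % 8)))
#                                for (j = 0; j < 16; j++)
#                                        z[j] ^= v[j];
#                        /* V = V >> 1; reduce with R = 0xE1 || 0^120 on carry-out */
#                        int lsb = v[15] & 1;
#                        for (j = 15; j > 0; j--)
#                                v[j] = (v[j] >> 1) | (v[j - 1] << 7);
#                        v[0] >>= 1;
#                        if (lsb)
#                                v[0] ^= 0xE1;
#                }
#                memcpy(y, z, 16);
#        }
###############################################################################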
1914
1915.macro PRECOMPUTE_AVX2 HK T1 T2 T3 T4 T5 T6
1916
1917 # HashKey_i_k holds XORed values of the low and high parts of the HashKey_i
1918 vmovdqa \HK, \T5
1919 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly
1920 vmovdqu \T5, HashKey_2(arg2) # [HashKey_2] = HashKey^2<<1 mod poly
1921
1922 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly
1923 vmovdqu \T5, HashKey_3(arg2)
1924
1925 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly
1926 vmovdqu \T5, HashKey_4(arg2)
1927
1928 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly
1929 vmovdqu \T5, HashKey_5(arg2)
1930
1931 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly
1932 vmovdqu \T5, HashKey_6(arg2)
1933
1934 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly
1935 vmovdqu \T5, HashKey_7(arg2)
1936
1937 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly
1938 vmovdqu \T5, HashKey_8(arg2)
1939
1940.endm
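###############################################################################
# PRECOMPUTE_AVX2 stores HashKey^2 ... HashKey^8 (each <<1 mod poly) so that
# eight GHASH multiplications can be aggregated per pass. The equivalent
# spec-order computation, reusing the gf128_mul() sketch above (the names and
# the htbl[] layout are illustrative):
#
#        #include <stdint.h>
#        #include <string.h>
#
#        void gf128_mul(uint8_t y[16], const uint8_t x[16], const uint8_t h[16]);
#
#        /* htbl[i] ends up holding H^(i+1) */
#        void precompute_hashkey_powers(uint8_t htbl[8][16], const uint8_t h[16])
#        {
#                memcpy(htbl[0], h, 16);                      /* H^1 */
#                for (int i = 1; i < 8; i++)
#                        gf128_mul(htbl[i], htbl[i - 1], h);  /* H^(i+1) = H^i * H */
#        }
###############################################################################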
1941
1942## if a = number of total plaintext bytes
1943## b = floor(a/16)
1944## num_initial_blocks = b mod 8
1945## encrypt the initial num_initial_blocks blocks and apply GHASH on the ciphertext
1946## r10, r11, r12, rax are clobbered
1947## arg1, arg3, arg4, r14 are used as pointers only, not modified
1948
1949.macro INITIAL_BLOCKS_AVX2 REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER
1950 i = (8-\num_initial_blocks)
1951 setreg
1952 vmovdqu AadHash(arg2), reg_i
1953
1954 # start AES for num_initial_blocks blocks
1955 vmovdqu CurCount(arg2), \CTR
1956
1957 i = (9-\num_initial_blocks)
1958 setreg
1959.rep \num_initial_blocks
1960 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1961 vmovdqa \CTR, reg_i
1962 vpshufb SHUF_MASK(%rip), reg_i, reg_i # perform a 16Byte swap
1963 i = (i+1)
1964 setreg
1965.endr
1966
1967 vmovdqa (arg1), \T_key
1968 i = (9-\num_initial_blocks)
1969 setreg
1970.rep \num_initial_blocks
1971 vpxor \T_key, reg_i, reg_i
1972 i = (i+1)
1973 setreg
1974.endr
1975
1976 j = 1
1977 setreg
1978.rep \REP
1979 vmovdqa 16*j(arg1), \T_key
1980 i = (9-\num_initial_blocks)
1981 setreg
1982.rep \num_initial_blocks
1983 vaesenc \T_key, reg_i, reg_i
1984 i = (i+1)
1985 setreg
1986.endr
1987
1988 j = (j+1)
1989 setreg
1990.endr
1991
1992
1993 vmovdqa 16*j(arg1), \T_key
1994 i = (9-\num_initial_blocks)
1995 setreg
1996.rep \num_initial_blocks
1997 vaesenclast \T_key, reg_i, reg_i
1998 i = (i+1)
1999 setreg
2000.endr
2001
2002 i = (9-\num_initial_blocks)
2003 setreg
2004.rep \num_initial_blocks
2005 vmovdqu (arg4, %r11), \T1
2006 vpxor \T1, reg_i, reg_i
2007 vmovdqu reg_i, (arg3 , %r11) # write back ciphertext for
2008 # num_initial_blocks blocks
2009 add $16, %r11
2010.if \ENC_DEC == DEC
2011 vmovdqa \T1, reg_i
2012.endif
2013 vpshufb SHUF_MASK(%rip), reg_i, reg_i # prepare ciphertext for GHASH computations
2014 i = (i+1)
2015 setreg
2016.endr
2017
2018
2019 i = (8-\num_initial_blocks)
2020 j = (9-\num_initial_blocks)
2021 setreg
2022
2023.rep \num_initial_blocks
2024 vpxor reg_i, reg_j, reg_j
2025 GHASH_MUL_AVX2 reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks
2026 i = (i+1)
2027 j = (j+1)
2028 setreg
2029.endr
2030 # XMM8 has the combined result here
2031
2032 vmovdqa \XMM8, TMP1(%rsp)
2033 vmovdqa \XMM8, \T3
2034
2035 cmp $128, %r13
2036 jl _initial_blocks_done\@ # no need for precomputed constants
2037
2038###############################################################################
2039# Prepare (and byte-swap) 8 counter blocks for the next 8 AES blocks
2040 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
2041 vmovdqa \CTR, \XMM1
2042 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
2043
2044 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
2045 vmovdqa \CTR, \XMM2
2046 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
2047
2048 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
2049 vmovdqa \CTR, \XMM3
2050 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
2051
2052 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
2053 vmovdqa \CTR, \XMM4
2054 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
2055
2056 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
2057 vmovdqa \CTR, \XMM5
2058 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
2059
2060 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
2061 vmovdqa \CTR, \XMM6
2062 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
2063
2064 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
2065 vmovdqa \CTR, \XMM7
2066 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
2067
2068 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
2069 vmovdqa \CTR, \XMM8
2070 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
2071
2072 vmovdqa (arg1), \T_key
2073 vpxor \T_key, \XMM1, \XMM1
2074 vpxor \T_key, \XMM2, \XMM2
2075 vpxor \T_key, \XMM3, \XMM3
2076 vpxor \T_key, \XMM4, \XMM4
2077 vpxor \T_key, \XMM5, \XMM5
2078 vpxor \T_key, \XMM6, \XMM6
2079 vpxor \T_key, \XMM7, \XMM7
2080 vpxor \T_key, \XMM8, \XMM8
2081
2082 i = 1
2083 setreg
2084.rep \REP # do REP rounds
2085 vmovdqa 16*i(arg1), \T_key
2086 vaesenc \T_key, \XMM1, \XMM1
2087 vaesenc \T_key, \XMM2, \XMM2
2088 vaesenc \T_key, \XMM3, \XMM3
2089 vaesenc \T_key, \XMM4, \XMM4
2090 vaesenc \T_key, \XMM5, \XMM5
2091 vaesenc \T_key, \XMM6, \XMM6
2092 vaesenc \T_key, \XMM7, \XMM7
2093 vaesenc \T_key, \XMM8, \XMM8
2094 i = (i+1)
2095 setreg
2096.endr
2097
2098
2099 vmovdqa 16*i(arg1), \T_key
2100 vaesenclast \T_key, \XMM1, \XMM1
2101 vaesenclast \T_key, \XMM2, \XMM2
2102 vaesenclast \T_key, \XMM3, \XMM3
2103 vaesenclast \T_key, \XMM4, \XMM4
2104 vaesenclast \T_key, \XMM5, \XMM5
2105 vaesenclast \T_key, \XMM6, \XMM6
2106 vaesenclast \T_key, \XMM7, \XMM7
2107 vaesenclast \T_key, \XMM8, \XMM8
2108
2109 vmovdqu (arg4, %r11), \T1
2110 vpxor \T1, \XMM1, \XMM1
2111 vmovdqu \XMM1, (arg3 , %r11)
2112 .if \ENC_DEC == DEC
2113 vmovdqa \T1, \XMM1
2114 .endif
2115
2116 vmovdqu 16*1(arg4, %r11), \T1
2117 vpxor \T1, \XMM2, \XMM2
2118 vmovdqu \XMM2, 16*1(arg3 , %r11)
2119 .if \ENC_DEC == DEC
2120 vmovdqa \T1, \XMM2
2121 .endif
2122
2123 vmovdqu 16*2(arg4, %r11), \T1
2124 vpxor \T1, \XMM3, \XMM3
2125 vmovdqu \XMM3, 16*2(arg3 , %r11)
2126 .if \ENC_DEC == DEC
2127 vmovdqa \T1, \XMM3
2128 .endif
2129
2130 vmovdqu 16*3(arg4, %r11), \T1
2131 vpxor \T1, \XMM4, \XMM4
2132 vmovdqu \XMM4, 16*3(arg3 , %r11)
2133 .if \ENC_DEC == DEC
2134 vmovdqa \T1, \XMM4
2135 .endif
2136
2137 vmovdqu 16*4(arg4, %r11), \T1
2138 vpxor \T1, \XMM5, \XMM5
2139 vmovdqu \XMM5, 16*4(arg3 , %r11)
2140 .if \ENC_DEC == DEC
2141 vmovdqa \T1, \XMM5
2142 .endif
2143
2144 vmovdqu 16*5(arg4, %r11), \T1
2145 vpxor \T1, \XMM6, \XMM6
2146 vmovdqu \XMM6, 16*5(arg3 , %r11)
2147 .if \ENC_DEC == DEC
2148 vmovdqa \T1, \XMM6
2149 .endif
2150
2151 vmovdqu 16*6(arg4, %r11), \T1
2152 vpxor \T1, \XMM7, \XMM7
2153 vmovdqu \XMM7, 16*6(arg3 , %r11)
2154 .if \ENC_DEC == DEC
2155 vmovdqa \T1, \XMM7
2156 .endif
2157
2158 vmovdqu 16*7(arg4, %r11), \T1
2159 vpxor \T1, \XMM8, \XMM8
2160 vmovdqu \XMM8, 16*7(arg3 , %r11)
2161 .if \ENC_DEC == DEC
2162 vmovdqa \T1, \XMM8
2163 .endif
2164
2165 add $128, %r11
2166
2167 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
2168 vpxor TMP1(%rsp), \XMM1, \XMM1 # combine GHASHed value with
2169 # the corresponding ciphertext
2170 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
2171 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
2172 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
2173 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
2174 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
2175 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
2176 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
2177
2178###############################################################################
2179
2180_initial_blocks_done\@:
2181
2182
2183.endm
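###############################################################################
# The repeated "vpaddd ONE(%rip)" / "vpshufb SHUF_MASK(%rip)" pairs above
# implement the GCM inc32() operation: the low 32 bits of the 16-byte counter
# block are a big-endian counter, incremented once per block. A plain C model
# (a sketch; the function name is illustrative):
#
#        #include <stdint.h>
#
#        void ctr32_inc(uint8_t counter[16])
#        {
#                uint32_t c = ((uint32_t)counter[12] << 24) |
#                             ((uint32_t)counter[13] << 16) |
#                             ((uint32_t)counter[14] << 8)  |
#                              (uint32_t)counter[15];
#
#                c++;    /* wraps modulo 2^32, as required by inc32() */
#                counter[12] = c >> 24;
#                counter[13] = c >> 16;
#                counter[14] = c >> 8;
#                counter[15] = c;
#        }
###############################################################################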
2184
2185
2186
2187# encrypt 8 blocks at a time
2188# ghash the 8 previously encrypted ciphertext blocks
2189# arg1, arg3, arg4 are used as pointers only, not modified
2190# r11 is the data offset value
2191.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX2 REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
2192
2193 vmovdqa \XMM1, \T2
2194 vmovdqa \XMM2, TMP2(%rsp)
2195 vmovdqa \XMM3, TMP3(%rsp)
2196 vmovdqa \XMM4, TMP4(%rsp)
2197 vmovdqa \XMM5, TMP5(%rsp)
2198 vmovdqa \XMM6, TMP6(%rsp)
2199 vmovdqa \XMM7, TMP7(%rsp)
2200 vmovdqa \XMM8, TMP8(%rsp)
2201
2202.if \loop_idx == in_order
2203 vpaddd ONE(%rip), \CTR, \XMM1 # INCR CNT
2204 vpaddd ONE(%rip), \XMM1, \XMM2
2205 vpaddd ONE(%rip), \XMM2, \XMM3
2206 vpaddd ONE(%rip), \XMM3, \XMM4
2207 vpaddd ONE(%rip), \XMM4, \XMM5
2208 vpaddd ONE(%rip), \XMM5, \XMM6
2209 vpaddd ONE(%rip), \XMM6, \XMM7
2210 vpaddd ONE(%rip), \XMM7, \XMM8
2211 vmovdqa \XMM8, \CTR
2212
2213 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
2214 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
2215 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
2216 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
2217 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
2218 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
2219 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
2220 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
2221.else
2222 vpaddd ONEf(%rip), \CTR, \XMM1 # INCR CNT
2223 vpaddd ONEf(%rip), \XMM1, \XMM2
2224 vpaddd ONEf(%rip), \XMM2, \XMM3
2225 vpaddd ONEf(%rip), \XMM3, \XMM4
2226 vpaddd ONEf(%rip), \XMM4, \XMM5
2227 vpaddd ONEf(%rip), \XMM5, \XMM6
2228 vpaddd ONEf(%rip), \XMM6, \XMM7
2229 vpaddd ONEf(%rip), \XMM7, \XMM8
2230 vmovdqa \XMM8, \CTR
2231.endif
2232
2233
2234 #######################################################################
2235
2236 vmovdqu (arg1), \T1
2237 vpxor \T1, \XMM1, \XMM1
2238 vpxor \T1, \XMM2, \XMM2
2239 vpxor \T1, \XMM3, \XMM3
2240 vpxor \T1, \XMM4, \XMM4
2241 vpxor \T1, \XMM5, \XMM5
2242 vpxor \T1, \XMM6, \XMM6
2243 vpxor \T1, \XMM7, \XMM7
2244 vpxor \T1, \XMM8, \XMM8
2245
2246 #######################################################################
2247
2248
2249
2250
2251
2252 vmovdqu 16*1(arg1), \T1
2253 vaesenc \T1, \XMM1, \XMM1
2254 vaesenc \T1, \XMM2, \XMM2
2255 vaesenc \T1, \XMM3, \XMM3
2256 vaesenc \T1, \XMM4, \XMM4
2257 vaesenc \T1, \XMM5, \XMM5
2258 vaesenc \T1, \XMM6, \XMM6
2259 vaesenc \T1, \XMM7, \XMM7
2260 vaesenc \T1, \XMM8, \XMM8
2261
2262 vmovdqu 16*2(arg1), \T1
2263 vaesenc \T1, \XMM1, \XMM1
2264 vaesenc \T1, \XMM2, \XMM2
2265 vaesenc \T1, \XMM3, \XMM3
2266 vaesenc \T1, \XMM4, \XMM4
2267 vaesenc \T1, \XMM5, \XMM5
2268 vaesenc \T1, \XMM6, \XMM6
2269 vaesenc \T1, \XMM7, \XMM7
2270 vaesenc \T1, \XMM8, \XMM8
2271
2272
2273 #######################################################################
2274
2275 vmovdqu HashKey_8(arg2), \T5
2276 vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1
2277 vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0
2278 vpclmulqdq $0x01, \T5, \T2, \T6 # T6 = a1*b0
2279 vpclmulqdq $0x10, \T5, \T2, \T5 # T5 = a0*b1
2280 vpxor \T5, \T6, \T6
2281
2282 vmovdqu 16*3(arg1), \T1
2283 vaesenc \T1, \XMM1, \XMM1
2284 vaesenc \T1, \XMM2, \XMM2
2285 vaesenc \T1, \XMM3, \XMM3
2286 vaesenc \T1, \XMM4, \XMM4
2287 vaesenc \T1, \XMM5, \XMM5
2288 vaesenc \T1, \XMM6, \XMM6
2289 vaesenc \T1, \XMM7, \XMM7
2290 vaesenc \T1, \XMM8, \XMM8
2291
2292 vmovdqa TMP2(%rsp), \T1
2293 vmovdqu HashKey_7(arg2), \T5
2294 vpclmulqdq $0x11, \T5, \T1, \T3
2295 vpxor \T3, \T4, \T4
2296
2297 vpclmulqdq $0x00, \T5, \T1, \T3
2298 vpxor \T3, \T7, \T7
2299
2300 vpclmulqdq $0x01, \T5, \T1, \T3
2301 vpxor \T3, \T6, \T6
2302
2303 vpclmulqdq $0x10, \T5, \T1, \T3
2304 vpxor \T3, \T6, \T6
2305
2306 vmovdqu 16*4(arg1), \T1
2307 vaesenc \T1, \XMM1, \XMM1
2308 vaesenc \T1, \XMM2, \XMM2
2309 vaesenc \T1, \XMM3, \XMM3
2310 vaesenc \T1, \XMM4, \XMM4
2311 vaesenc \T1, \XMM5, \XMM5
2312 vaesenc \T1, \XMM6, \XMM6
2313 vaesenc \T1, \XMM7, \XMM7
2314 vaesenc \T1, \XMM8, \XMM8
2315
2316 #######################################################################
2317
2318 vmovdqa TMP3(%rsp), \T1
2319 vmovdqu HashKey_6(arg2), \T5
2320 vpclmulqdq $0x11, \T5, \T1, \T3
2321 vpxor \T3, \T4, \T4
2322
2323 vpclmulqdq $0x00, \T5, \T1, \T3
2324 vpxor \T3, \T7, \T7
2325
2326 vpclmulqdq $0x01, \T5, \T1, \T3
2327 vpxor \T3, \T6, \T6
2328
2329 vpclmulqdq $0x10, \T5, \T1, \T3
2330 vpxor \T3, \T6, \T6
2331
2332 vmovdqu 16*5(arg1), \T1
2333 vaesenc \T1, \XMM1, \XMM1
2334 vaesenc \T1, \XMM2, \XMM2
2335 vaesenc \T1, \XMM3, \XMM3
2336 vaesenc \T1, \XMM4, \XMM4
2337 vaesenc \T1, \XMM5, \XMM5
2338 vaesenc \T1, \XMM6, \XMM6
2339 vaesenc \T1, \XMM7, \XMM7
2340 vaesenc \T1, \XMM8, \XMM8
2341
2342 vmovdqa TMP4(%rsp), \T1
2343 vmovdqu HashKey_5(arg2), \T5
2344 vpclmulqdq $0x11, \T5, \T1, \T3
2345 vpxor \T3, \T4, \T4
2346
2347 vpclmulqdq $0x00, \T5, \T1, \T3
2348 vpxor \T3, \T7, \T7
2349
2350 vpclmulqdq $0x01, \T5, \T1, \T3
2351 vpxor \T3, \T6, \T6
2352
2353 vpclmulqdq $0x10, \T5, \T1, \T3
2354 vpxor \T3, \T6, \T6
2355
2356 vmovdqu 16*6(arg1), \T1
2357 vaesenc \T1, \XMM1, \XMM1
2358 vaesenc \T1, \XMM2, \XMM2
2359 vaesenc \T1, \XMM3, \XMM3
2360 vaesenc \T1, \XMM4, \XMM4
2361 vaesenc \T1, \XMM5, \XMM5
2362 vaesenc \T1, \XMM6, \XMM6
2363 vaesenc \T1, \XMM7, \XMM7
2364 vaesenc \T1, \XMM8, \XMM8
2365
2366
2367 vmovdqa TMP5(%rsp), \T1
2368 vmovdqu HashKey_4(arg2), \T5
2369 vpclmulqdq $0x11, \T5, \T1, \T3
2370 vpxor \T3, \T4, \T4
2371
2372 vpclmulqdq $0x00, \T5, \T1, \T3
2373 vpxor \T3, \T7, \T7
2374
2375 vpclmulqdq $0x01, \T5, \T1, \T3
2376 vpxor \T3, \T6, \T6
2377
2378 vpclmulqdq $0x10, \T5, \T1, \T3
2379 vpxor \T3, \T6, \T6
2380
2381 vmovdqu 16*7(arg1), \T1
2382 vaesenc \T1, \XMM1, \XMM1
2383 vaesenc \T1, \XMM2, \XMM2
2384 vaesenc \T1, \XMM3, \XMM3
2385 vaesenc \T1, \XMM4, \XMM4
2386 vaesenc \T1, \XMM5, \XMM5
2387 vaesenc \T1, \XMM6, \XMM6
2388 vaesenc \T1, \XMM7, \XMM7
2389 vaesenc \T1, \XMM8, \XMM8
2390
2391 vmovdqa TMP6(%rsp), \T1
2392 vmovdqu HashKey_3(arg2), \T5
2393 vpclmulqdq $0x11, \T5, \T1, \T3
2394 vpxor \T3, \T4, \T4
2395
2396 vpclmulqdq $0x00, \T5, \T1, \T3
2397 vpxor \T3, \T7, \T7
2398
2399 vpclmulqdq $0x01, \T5, \T1, \T3
2400 vpxor \T3, \T6, \T6
2401
2402 vpclmulqdq $0x10, \T5, \T1, \T3
2403 vpxor \T3, \T6, \T6
2404
2405 vmovdqu 16*8(arg1), \T1
2406 vaesenc \T1, \XMM1, \XMM1
2407 vaesenc \T1, \XMM2, \XMM2
2408 vaesenc \T1, \XMM3, \XMM3
2409 vaesenc \T1, \XMM4, \XMM4
2410 vaesenc \T1, \XMM5, \XMM5
2411 vaesenc \T1, \XMM6, \XMM6
2412 vaesenc \T1, \XMM7, \XMM7
2413 vaesenc \T1, \XMM8, \XMM8
2414
2415 vmovdqa TMP7(%rsp), \T1
2416 vmovdqu HashKey_2(arg2), \T5
2417 vpclmulqdq $0x11, \T5, \T1, \T3
2418 vpxor \T3, \T4, \T4
2419
2420 vpclmulqdq $0x00, \T5, \T1, \T3
2421 vpxor \T3, \T7, \T7
2422
2423 vpclmulqdq $0x01, \T5, \T1, \T3
2424 vpxor \T3, \T6, \T6
2425
2426 vpclmulqdq $0x10, \T5, \T1, \T3
2427 vpxor \T3, \T6, \T6
2428
2429
2430 #######################################################################
2431
2432 vmovdqu 16*9(arg1), \T5
2433 vaesenc \T5, \XMM1, \XMM1
2434 vaesenc \T5, \XMM2, \XMM2
2435 vaesenc \T5, \XMM3, \XMM3
2436 vaesenc \T5, \XMM4, \XMM4
2437 vaesenc \T5, \XMM5, \XMM5
2438 vaesenc \T5, \XMM6, \XMM6
2439 vaesenc \T5, \XMM7, \XMM7
2440 vaesenc \T5, \XMM8, \XMM8
2441
2442 vmovdqa TMP8(%rsp), \T1
2443 vmovdqu HashKey(arg2), \T5
2444
2445 vpclmulqdq $0x00, \T5, \T1, \T3
2446 vpxor \T3, \T7, \T7
2447
2448 vpclmulqdq $0x01, \T5, \T1, \T3
2449 vpxor \T3, \T6, \T6
2450
2451 vpclmulqdq $0x10, \T5, \T1, \T3
2452 vpxor \T3, \T6, \T6
2453
2454 vpclmulqdq $0x11, \T5, \T1, \T3
2455 vpxor \T3, \T4, \T1
2456
2457
2458 vmovdqu 16*10(arg1), \T5
2459
2460 i = 11
2461 setreg
2462.rep (\REP-9)
2463 vaesenc \T5, \XMM1, \XMM1
2464 vaesenc \T5, \XMM2, \XMM2
2465 vaesenc \T5, \XMM3, \XMM3
2466 vaesenc \T5, \XMM4, \XMM4
2467 vaesenc \T5, \XMM5, \XMM5
2468 vaesenc \T5, \XMM6, \XMM6
2469 vaesenc \T5, \XMM7, \XMM7
2470 vaesenc \T5, \XMM8, \XMM8
2471
2472 vmovdqu 16*i(arg1), \T5
2473 i = i + 1
2474 setreg
2475.endr
2476
2477 i = 0
2478 j = 1
2479 setreg
2480.rep 8
2481 vpxor 16*i(arg4, %r11), \T5, \T2
2482 .if \ENC_DEC == ENC
2483 vaesenclast \T2, reg_j, reg_j
2484 .else
2485 vaesenclast \T2, reg_j, \T3
2486 vmovdqu 16*i(arg4, %r11), reg_j
2487 vmovdqu \T3, 16*i(arg3, %r11)
2488 .endif
2489 i = (i+1)
2490 j = (j+1)
2491 setreg
2492.endr
2493 #######################################################################
2494
2495
2496 vpslldq $8, \T6, \T3 # shift-L T6 2 DWs (into T3)
2497 vpsrldq $8, \T6, \T6 # shift-R T6 2 DWs
2498 vpxor \T3, \T7, \T7
2499 vpxor \T6, \T1, \T1 # accumulate the results in T1:T7
2500
2501
2502
2503 #######################################################################
2504 #first phase of the reduction
2505 vmovdqa POLY2(%rip), \T3
2506
2507 vpclmulqdq $0x01, \T7, \T3, \T2
2508 vpslldq $8, \T2, \T2 # shift-L xmm2 2 DWs
2509
2510 vpxor \T2, \T7, \T7 # first phase of the reduction complete
2511 #######################################################################
2512 .if \ENC_DEC == ENC
2513 vmovdqu \XMM1, 16*0(arg3,%r11) # Write to the Ciphertext buffer
2514 vmovdqu \XMM2, 16*1(arg3,%r11) # Write to the Ciphertext buffer
2515 vmovdqu \XMM3, 16*2(arg3,%r11) # Write to the Ciphertext buffer
2516 vmovdqu \XMM4, 16*3(arg3,%r11) # Write to the Ciphertext buffer
2517 vmovdqu \XMM5, 16*4(arg3,%r11) # Write to the Ciphertext buffer
2518 vmovdqu \XMM6, 16*5(arg3,%r11) # Write to the Ciphertext buffer
2519 vmovdqu \XMM7, 16*6(arg3,%r11) # Write to the Ciphertext buffer
2520 vmovdqu \XMM8, 16*7(arg3,%r11) # Write to the Ciphertext buffer
2521 .endif
2522
2523 #######################################################################
2524 #second phase of the reduction
2525 vpclmulqdq $0x00, \T7, \T3, \T2
2526 vpsrldq $4, \T2, \T2 # shift-R xmm2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
2527
2528 vpclmulqdq $0x10, \T7, \T3, \T4
2529 vpslldq $4, \T4, \T4 # shift-L xmm0 1 DW (Shift-L 1-DW to obtain result with no shifts)
2530
2531 vpxor \T2, \T4, \T4 # second phase of the reduction complete
2532 #######################################################################
2533 vpxor \T4, \T1, \T1 # the result is in T1
2534
2535 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
2536 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
2537 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
2538 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
2539 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
2540 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
2541 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
2542 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
2543
2544
2545 vpxor \T1, \XMM1, \XMM1
2546
2547
2548
2549.endm
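###############################################################################
# What the 8-parallel macro computes, GHASH-wise: with Y the running digest and
# C1..C8 the previous eight ciphertext blocks,
#        Y' = (Y^C1)*H^8 ^ C2*H^7 ^ ... ^ C8*H^1
# which equals eight iterations of Y = (Y^Ci)*H. A plain C model using the
# gf128_mul() and htbl[] sketches above (illustrative names, spec bit order):
#
#        #include <stdint.h>
#        #include <string.h>
#
#        void gf128_mul(uint8_t y[16], const uint8_t x[16], const uint8_t h[16]);
#
#        void ghash_8_blocks(uint8_t y[16], const uint8_t htbl[8][16],
#                            const uint8_t blocks[8][16])
#        {
#                uint8_t acc[16] = {0}, t[16], x[16];
#
#                for (int i = 0; i < 8; i++) {
#                        memcpy(x, blocks[i], 16);
#                        if (i == 0)             /* fold previous digest into C1 */
#                                for (int j = 0; j < 16; j++)
#                                        x[j] ^= y[j];
#                        gf128_mul(t, x, htbl[7 - i]);   /* block i gets H^(8-i) */
#                        for (int j = 0; j < 16; j++)
#                                acc[j] ^= t[j];
#                }
#                memcpy(y, acc, 16);
#        }
###############################################################################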
2550
2551
2552# GHASH the last 8 ciphertext blocks.
2553.macro GHASH_LAST_8_AVX2 T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8
2554
2555 ## Karatsuba Method
2556
2557 vmovdqu HashKey_8(arg2), \T5
2558
2559 vpshufd $0b01001110, \XMM1, \T2
2560 vpshufd $0b01001110, \T5, \T3
2561 vpxor \XMM1, \T2, \T2
2562 vpxor \T5, \T3, \T3
2563
2564 vpclmulqdq $0x11, \T5, \XMM1, \T6
2565 vpclmulqdq $0x00, \T5, \XMM1, \T7
2566
2567 vpclmulqdq $0x00, \T3, \T2, \XMM1
2568
2569 ######################
2570
2571 vmovdqu HashKey_7(arg2), \T5
2572 vpshufd $0b01001110, \XMM2, \T2
2573 vpshufd $0b01001110, \T5, \T3
2574 vpxor \XMM2, \T2, \T2
2575 vpxor \T5, \T3, \T3
2576
2577 vpclmulqdq $0x11, \T5, \XMM2, \T4
2578 vpxor \T4, \T6, \T6
2579
2580 vpclmulqdq $0x00, \T5, \XMM2, \T4
2581 vpxor \T4, \T7, \T7
2582
2583 vpclmulqdq $0x00, \T3, \T2, \T2
2584
2585 vpxor \T2, \XMM1, \XMM1
2586
2587 ######################
2588
2589 vmovdqu HashKey_6(arg2), \T5
2590 vpshufd $0b01001110, \XMM3, \T2
2591 vpshufd $0b01001110, \T5, \T3
2592 vpxor \XMM3, \T2, \T2
2593 vpxor \T5, \T3, \T3
2594
2595 vpclmulqdq $0x11, \T5, \XMM3, \T4
2596 vpxor \T4, \T6, \T6
2597
2598 vpclmulqdq $0x00, \T5, \XMM3, \T4
2599 vpxor \T4, \T7, \T7
2600
2601 vpclmulqdq $0x00, \T3, \T2, \T2
2602
2603 vpxor \T2, \XMM1, \XMM1
2604
2605 ######################
2606
2607 vmovdqu HashKey_5(arg2), \T5
2608 vpshufd $0b01001110, \XMM4, \T2
2609 vpshufd $0b01001110, \T5, \T3
2610 vpxor \XMM4, \T2, \T2
2611 vpxor \T5, \T3, \T3
2612
2613 vpclmulqdq $0x11, \T5, \XMM4, \T4
2614 vpxor \T4, \T6, \T6
2615
2616 vpclmulqdq $0x00, \T5, \XMM4, \T4
2617 vpxor \T4, \T7, \T7
2618
2619 vpclmulqdq $0x00, \T3, \T2, \T2
2620
2621 vpxor \T2, \XMM1, \XMM1
2622
2623 ######################
2624
2625 vmovdqu HashKey_4(arg2), \T5
2626 vpshufd $0b01001110, \XMM5, \T2
2627 vpshufd $0b01001110, \T5, \T3
2628 vpxor \XMM5, \T2, \T2
2629 vpxor \T5, \T3, \T3
2630
2631 vpclmulqdq $0x11, \T5, \XMM5, \T4
2632 vpxor \T4, \T6, \T6
2633
2634 vpclmulqdq $0x00, \T5, \XMM5, \T4
2635 vpxor \T4, \T7, \T7
2636
2637 vpclmulqdq $0x00, \T3, \T2, \T2
2638
2639 vpxor \T2, \XMM1, \XMM1
2640
2641 ######################
2642
2643 vmovdqu HashKey_3(arg2), \T5
2644 vpshufd $0b01001110, \XMM6, \T2
2645 vpshufd $0b01001110, \T5, \T3
2646 vpxor \XMM6, \T2, \T2
2647 vpxor \T5, \T3, \T3
2648
2649 vpclmulqdq $0x11, \T5, \XMM6, \T4
2650 vpxor \T4, \T6, \T6
2651
2652 vpclmulqdq $0x00, \T5, \XMM6, \T4
2653 vpxor \T4, \T7, \T7
2654
2655 vpclmulqdq $0x00, \T3, \T2, \T2
2656
2657 vpxor \T2, \XMM1, \XMM1
2658
2659 ######################
2660
2661 vmovdqu HashKey_2(arg2), \T5
2662 vpshufd $0b01001110, \XMM7, \T2
2663 vpshufd $0b01001110, \T5, \T3
2664 vpxor \XMM7, \T2, \T2
2665 vpxor \T5, \T3, \T3
2666
2667 vpclmulqdq $0x11, \T5, \XMM7, \T4
2668 vpxor \T4, \T6, \T6
2669
2670 vpclmulqdq $0x00, \T5, \XMM7, \T4
2671 vpxor \T4, \T7, \T7
2672
2673 vpclmulqdq $0x00, \T3, \T2, \T2
2674
2675 vpxor \T2, \XMM1, \XMM1
2676
2677 ######################
2678
2679 vmovdqu HashKey(arg2), \T5
2680 vpshufd $0b01001110, \XMM8, \T2
2681 vpshufd $0b01001110, \T5, \T3
2682 vpxor \XMM8, \T2, \T2
2683 vpxor \T5, \T3, \T3
2684
2685 vpclmulqdq $0x11, \T5, \XMM8, \T4
2686 vpxor \T4, \T6, \T6
2687
2688 vpclmulqdq $0x00, \T5, \XMM8, \T4
2689 vpxor \T4, \T7, \T7
2690
2691 vpclmulqdq $0x00, \T3, \T2, \T2
2692
2693 vpxor \T2, \XMM1, \XMM1
2694 vpxor \T6, \XMM1, \XMM1
2695 vpxor \T7, \XMM1, \T2
2696
2697
2698
2699
2700 vpslldq $8, \T2, \T4
2701 vpsrldq $8, \T2, \T2
2702
2703 vpxor \T4, \T7, \T7
2704 vpxor \T2, \T6, \T6 # <T6:T7> holds the result of the
2705 # accumulated carry-less multiplications
2706
2707 #######################################################################
2708 #first phase of the reduction
2709 vmovdqa POLY2(%rip), \T3
2710
2711 vpclmulqdq $0x01, \T7, \T3, \T2
2712 vpslldq $8, \T2, \T2 # shift-L xmm2 2 DWs
2713
2714 vpxor \T2, \T7, \T7 # first phase of the reduction complete
2715 #######################################################################
2716
2717
2718 #second phase of the reduction
2719 vpclmulqdq $0x00, \T7, \T3, \T2
2720 vpsrldq $4, \T2, \T2 # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
2721
2722 vpclmulqdq $0x10, \T7, \T3, \T4
2723 vpslldq $4, \T4, \T4 # shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts)
2724
2725 vpxor \T2, \T4, \T4 # second phase of the reduction complete
2726 #######################################################################
2727 vpxor \T4, \T6, \T6 # the result is in T6
2728.endm
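###############################################################################
# The "Karatsuba Method" used above forms each 128x128 carry-less product from
# three PCLMULQDQs instead of four: a1*b1, a0*b0 and (a1^a0)*(b1^b0). A C
# intrinsics sketch of that arrangement (requires a PCLMUL-capable target,
# e.g. -mpclmul; the function name is illustrative):
#
#        #include <immintrin.h>
#
#        void clmul128_karatsuba(__m128i a, __m128i b, __m128i *hi, __m128i *lo)
#        {
#                __m128i a0b0 = _mm_clmulepi64_si128(a, b, 0x00);
#                __m128i a1b1 = _mm_clmulepi64_si128(a, b, 0x11);
#                __m128i as   = _mm_xor_si128(a, _mm_srli_si128(a, 8)); /* a1^a0 */
#                __m128i bs   = _mm_xor_si128(b, _mm_srli_si128(b, 8)); /* b1^b0 */
#                __m128i mid  = _mm_clmulepi64_si128(as, bs, 0x00);
#
#                /* middle term = (a1^a0)(b1^b0) ^ a1b1 ^ a0b0 = a1b0 ^ a0b1 */
#                mid = _mm_xor_si128(mid, _mm_xor_si128(a0b0, a1b1));
#                *lo = _mm_xor_si128(a0b0, _mm_slli_si128(mid, 8));
#                *hi = _mm_xor_si128(a1b1, _mm_srli_si128(mid, 8));
#        }
###############################################################################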
2729
2730
2731
2732#############################################################
2733#void aesni_gcm_init_avx_gen4
2734# (gcm_data *my_ctx_data,
2735# gcm_context_data *data,
2736# u8 *iv, /* Pre-counter block j0: 4 byte salt
2737# (from Security Association) concatenated with 8 byte
2738# Initialisation Vector (from IPSec ESP Payload)
2739# concatenated with 0x00000001. 16-byte aligned pointer. */
2740# u8 *hash_subkey, /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
2741# const u8 *aad, /* Additional Authentication Data (AAD)*/
2742# u64 aad_len) /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
2743#############################################################
2744SYM_FUNC_START(aesni_gcm_init_avx_gen4)
2745 FUNC_SAVE
2746 INIT GHASH_MUL_AVX2, PRECOMPUTE_AVX2
2747 FUNC_RESTORE
2748 ret
2749SYM_FUNC_END(aesni_gcm_init_avx_gen4)
2750
2751###############################################################################
2752#void aesni_gcm_enc_update_avx_gen4(
2753# gcm_data *my_ctx_data, /* aligned to 16 Bytes */
2754# gcm_context_data *data,
2755# u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */
2756# const u8 *in, /* Plaintext input */
2757# u64 plaintext_len) /* Length of data in Bytes for encryption. */
2758###############################################################################
2759SYM_FUNC_START(aesni_gcm_enc_update_avx_gen4)
2760 FUNC_SAVE
2761 mov keysize,%eax
2762 cmp $32, %eax
2763 je key_256_enc_update4
2764 cmp $16, %eax
2765 je key_128_enc_update4
2766 # must be 192
2767 GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 11
2768 FUNC_RESTORE
2769 ret
2770key_128_enc_update4:
2771 GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 9
2772 FUNC_RESTORE
2773 ret
2774key_256_enc_update4:
2775 GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 13
2776 FUNC_RESTORE
2777 ret
2778SYM_FUNC_END(aesni_gcm_enc_update_avx_gen4)
2779
2780###############################################################################
2781#void aesni_gcm_dec_update_avx_gen4(
2782# gcm_data *my_ctx_data, /* aligned to 16 Bytes */
2783# gcm_context_data *data,
2784# u8 *out, /* Plaintext output. Decrypt in-place is allowed. */
2785# const u8 *in, /* Ciphertext input */
2786# u64 plaintext_len) /* Length of data in Bytes for decryption. */
2787###############################################################################
2788SYM_FUNC_START(aesni_gcm_dec_update_avx_gen4)
2789 FUNC_SAVE
2790 mov keysize,%eax
2791 cmp $32, %eax
2792 je key_256_dec_update4
2793 cmp $16, %eax
2794 je key_128_dec_update4
2795 # must be 192
2796 GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 11
2797 FUNC_RESTORE
2798 ret
2799key_128_dec_update4:
2800 GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 9
2801 FUNC_RESTORE
2802 ret
2803key_256_dec_update4:
2804 GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 13
2805 FUNC_RESTORE
2806 ret
2807SYM_FUNC_END(aesni_gcm_dec_update_avx_gen4)
2808
2809###############################################################################
2810#void aesni_gcm_finalize_avx_gen4(
2811# gcm_data *my_ctx_data, /* aligned to 16 Bytes */
2812# gcm_context_data *data,
2813# u8 *auth_tag, /* Authenticated Tag output. */
2814# u64 auth_tag_len) /* Authenticated Tag Length in bytes.
2815# Valid values are 16 (most likely), 12 or 8. */
2816###############################################################################
2817SYM_FUNC_START(aesni_gcm_finalize_avx_gen4)
2818 FUNC_SAVE
2819 mov keysize,%eax
2820 cmp $32, %eax
2821 je key_256_finalize4
2822 cmp $16, %eax
2823 je key_128_finalize4
2824 # must be 192
2825 GCM_COMPLETE GHASH_MUL_AVX2, 11, arg3, arg4
2826 FUNC_RESTORE
2827 ret
2828key_128_finalize4:
2829 GCM_COMPLETE GHASH_MUL_AVX2, 9, arg3, arg4
2830 FUNC_RESTORE
2831 ret
2832key_256_finalize4:
2833 GCM_COMPLETE GHASH_MUL_AVX2, 13, arg3, arg4
2834 FUNC_RESTORE
2835 ret
2836SYM_FUNC_END(aesni_gcm_finalize_avx_gen4)