1########################################################################
2# Copyright (c) 2013, Intel Corporation
3#
4# This software is available to you under a choice of one of two
5# licenses. You may choose to be licensed under the terms of the GNU
6# General Public License (GPL) Version 2, available from the file
7# COPYING in the main directory of this source tree, or the
8# OpenIB.org BSD license below:
9#
10# Redistribution and use in source and binary forms, with or without
11# modification, are permitted provided that the following conditions are
12# met:
13#
14# * Redistributions of source code must retain the above copyright
15# notice, this list of conditions and the following disclaimer.
16#
17# * Redistributions in binary form must reproduce the above copyright
18# notice, this list of conditions and the following disclaimer in the
19# documentation and/or other materials provided with the
20# distribution.
21#
22# * Neither the name of the Intel Corporation nor the names of its
23# contributors may be used to endorse or promote products derived from
24# this software without specific prior written permission.
25#
26#
27# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
28# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
30# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
31# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
32# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
33# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
34# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
35# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
36# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
37# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38########################################################################
39##
40## Authors:
41## Erdinc Ozturk <erdinc.ozturk@intel.com>
42## Vinodh Gopal <vinodh.gopal@intel.com>
43## James Guilford <james.guilford@intel.com>
44## Tim Chen <tim.c.chen@linux.intel.com>
45##
46## References:
47## This code was derived from, and is highly optimized relative to, the code described in the paper:
48## Vinodh Gopal et al. Optimized Galois-Counter-Mode Implementation
49## on Intel Architecture Processors. August, 2010
50## The details of the implementation are explained in:
51## Erdinc Ozturk et al. Enabling High-Performance Galois-Counter-Mode
52## on Intel Architecture Processors. October, 2012.
53##
54## Assumptions:
55##
56##
57##
58## iv:
59## 0 1 2 3
60## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
61## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
62## | Salt (From the SA) |
63## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
64## | Initialization Vector |
65## | (This is the sequence number from IPSec header) |
66## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
67## | 0x1 |
68## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
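##
## As an illustration (not part of this file's interface), the 16-byte
## pre-counter block described above could be assembled in C roughly as
## follows; the buffer and variable names here are hypothetical:
##
##     #include <string.h>
##     /* j0 = salt (4 bytes) || IV (8 bytes) || 0x00000001 (big-endian) */
##     unsigned char j0[16];
##     memcpy(j0, salt, 4);           /* 4-byte salt from the SA         */
##     memcpy(j0 + 4, iv, 8);         /* 8-byte IV / sequence number     */
##     j0[12] = 0; j0[13] = 0;        /* trailing 32-bit block counter   */
##     j0[14] = 0; j0[15] = 1;        /* initialized to 0x00000001       */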
69##
70##
71##
72## AAD:
73## AAD padded to 128 bits with 0
74## for example, assume AAD is a u32 vector
75##
76## if AAD is 8 bytes:
77## AAD[3] = {A0, A1};
78## padded AAD in xmm register = {A1 A0 0 0}
79##
80## 0 1 2 3
81## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
82## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
83## | SPI (A1) |
84## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
85## | 32-bit Sequence Number (A0) |
86## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
87## | 0x0 |
88## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
89##
90## AAD Format with 32-bit Sequence Number
91##
92## if AAD is 12 bytes:
93## AAD[3] = {A0, A1, A2};
94## padded AAD in xmm register = {A2 A1 A0 0}
95##
96## 0 1 2 3
97## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
98## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
99## | SPI (A2) |
100## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
101## | 64-bit Extended Sequence Number {A1,A0} |
102## | |
103## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
104## | 0x0 |
105## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
106##
107## AAD Format with 64-bit Extended Sequence Number
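##
## A hedged C sketch of the zero-padding described above (illustrative only;
## the names are hypothetical): the 8 or 12 AAD bytes are copied into a
## zeroed 16-byte buffer before being loaded into an xmm register:
##
##     #include <string.h>
##     unsigned char aad_block[16] = {0};   /* pad to 128 bits with 0 */
##     memcpy(aad_block, aad, aad_len);     /* aad_len is 8 or 12     */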
108##
109##
110## aadLen:
111## per the definition in the spec, aadLen can only be 8 or 12 bytes.
112## The code additionally supports an aadLen of 16 bytes.
113##
114## TLen:
115## per the definition in the spec, TLen can only be 8, 12 or 16 bytes.
116##
117## poly = x^128 + x^127 + x^126 + x^121 + 1
118## Throughout the code, one-tab and two-tab indentations are used: one tab is
119## for the GHASH part, two tabs are for the AES part.
120##
121
122#include <linux/linkage.h>
123
124# constants in mergeable sections, linker can reorder and merge
125.section .rodata.cst16.POLY, "aM", @progbits, 16
126.align 16
127POLY: .octa 0xC2000000000000000000000000000001
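# POLY holds the low 128 bits of the bit-reflected GCM polynomial given in the
# header (x^128 + x^127 + x^126 + x^121 + 1): 0xC2... sets bits 127, 126 and
# 121, the trailing 1 sets bit 0, and the x^128 term is implicit in the
# reduction.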
128
129.section .rodata.cst16.POLY2, "aM", @progbits, 16
130.align 16
131POLY2: .octa 0xC20000000000000000000001C2000000
132
133.section .rodata.cst16.TWOONE, "aM", @progbits, 16
134.align 16
135TWOONE: .octa 0x00000001000000000000000000000001
136
137.section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16
138.align 16
139SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F
140
141.section .rodata.cst16.ONE, "aM", @progbits, 16
142.align 16
143ONE: .octa 0x00000000000000000000000000000001
144
145.section .rodata.cst16.ONEf, "aM", @progbits, 16
146.align 16
147ONEf: .octa 0x01000000000000000000000000000000
148
149# order of these constants should not change.
150# more specifically, ALL_F should follow SHIFT_MASK, and zero should follow ALL_F
151.section .rodata, "a", @progbits
152.align 16
153SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
154ALL_F: .octa 0xffffffffffffffffffffffffffffffff
155 .octa 0x00000000000000000000000000000000
156
157.section .rodata
158.align 16
159.type aad_shift_arr, @object
160.size aad_shift_arr, 272
161aad_shift_arr:
162 .octa 0xffffffffffffffffffffffffffffffff
163 .octa 0xffffffffffffffffffffffffffffff0C
164 .octa 0xffffffffffffffffffffffffffff0D0C
165 .octa 0xffffffffffffffffffffffffff0E0D0C
166 .octa 0xffffffffffffffffffffffff0F0E0D0C
167 .octa 0xffffffffffffffffffffff0C0B0A0908
168 .octa 0xffffffffffffffffffff0D0C0B0A0908
169 .octa 0xffffffffffffffffff0E0D0C0B0A0908
170 .octa 0xffffffffffffffff0F0E0D0C0B0A0908
171 .octa 0xffffffffffffff0C0B0A090807060504
172 .octa 0xffffffffffff0D0C0B0A090807060504
173 .octa 0xffffffffff0E0D0C0B0A090807060504
174 .octa 0xffffffff0F0E0D0C0B0A090807060504
175 .octa 0xffffff0C0B0A09080706050403020100
176 .octa 0xffff0D0C0B0A09080706050403020100
177 .octa 0xff0E0D0C0B0A09080706050403020100
178 .octa 0x0F0E0D0C0B0A09080706050403020100
179
180
181.text
182
183
184#define AadHash 16*0
185#define AadLen 16*1
186#define InLen (16*1)+8
187#define PBlockEncKey 16*2
188#define OrigIV 16*3
189#define CurCount 16*4
190#define PBlockLen 16*5
191
192HashKey = 16*6 # store HashKey <<1 mod poly here
193HashKey_2 = 16*7 # store HashKey^2 <<1 mod poly here
194HashKey_3 = 16*8 # store HashKey^3 <<1 mod poly here
195HashKey_4 = 16*9 # store HashKey^4 <<1 mod poly here
196HashKey_5 = 16*10 # store HashKey^5 <<1 mod poly here
197HashKey_6 = 16*11 # store HashKey^6 <<1 mod poly here
198HashKey_7 = 16*12 # store HashKey^7 <<1 mod poly here
199HashKey_8 = 16*13 # store HashKey^8 <<1 mod poly here
200HashKey_k = 16*14 # store XOR of HashKey <<1 mod poly here (for Karatsuba purposes)
201HashKey_2_k = 16*15 # store XOR of HashKey^2 <<1 mod poly here (for Karatsuba purposes)
202HashKey_3_k = 16*16 # store XOR of HashKey^3 <<1 mod poly here (for Karatsuba purposes)
203HashKey_4_k = 16*17 # store XOR of HashKey^4 <<1 mod poly here (for Karatsuba purposes)
204HashKey_5_k = 16*18 # store XOR of HashKey^5 <<1 mod poly here (for Karatsuba purposes)
205HashKey_6_k = 16*19 # store XOR of HashKey^6 <<1 mod poly here (for Karatsuba purposes)
206HashKey_7_k = 16*20 # store XOR of HashKey^7 <<1 mod poly here (for Karatsuba purposes)
207HashKey_8_k = 16*21 # store XOR of HashKey^8 <<1 mod poly here (for Karatsuba purposes)
208
209#define arg1 %rdi
210#define arg2 %rsi
211#define arg3 %rdx
212#define arg4 %rcx
213#define arg5 %r8
214#define arg6 %r9
215#define keysize 2*15*16(arg1)
216
217i = 0
218j = 0
219
220out_order = 0
221in_order = 1
222DEC = 0
223ENC = 1
224
225.macro define_reg r n
226reg_\r = %xmm\n
227.endm
228
229.macro setreg
230.altmacro
231define_reg i %i
232define_reg j %j
233.noaltmacro
234.endm
235
236TMP1 = 16*0 # Temporary storage for AAD
237TMP2 = 16*1 # Temporary storage for AES State 2 (State 1 is stored in an XMM register)
238TMP3 = 16*2 # Temporary storage for AES State 3
239TMP4 = 16*3 # Temporary storage for AES State 4
240TMP5 = 16*4 # Temporary storage for AES State 5
241TMP6 = 16*5 # Temporary storage for AES State 6
242TMP7 = 16*6 # Temporary storage for AES State 7
243TMP8 = 16*7 # Temporary storage for AES State 8
244
245VARIABLE_OFFSET = 16*8
246
247################################
248# Utility Macros
249################################
250
251.macro FUNC_SAVE
252 push %r12
253 push %r13
254 push %r15
255
256 push %rbp
257 mov %rsp, %rbp
258
259 sub $VARIABLE_OFFSET, %rsp
260 and $~63, %rsp # align rsp to 64 bytes
261.endm
262
263.macro FUNC_RESTORE
264 mov %rbp, %rsp
265 pop %rbp
266
267 pop %r15
268 pop %r13
269 pop %r12
270.endm
271
272# Encryption of a single block
273.macro ENCRYPT_SINGLE_BLOCK REP XMM0
274 vpxor (arg1), \XMM0, \XMM0
275 i = 1
276 setreg
277.rep \REP
278 vaesenc 16*i(arg1), \XMM0, \XMM0
279 i = (i+1)
280 setreg
281.endr
282 vaesenclast 16*i(arg1), \XMM0, \XMM0
283.endm
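# For reference, a hedged C sketch of what ENCRYPT_SINGLE_BLOCK performs,
# written with AES-NI intrinsics; the key-schedule pointer and round count
# are stand-ins for arg1 and \REP:
#
#     #include <wmmintrin.h>
#     static __m128i aes_encrypt_block(const __m128i *round_keys, int nrounds,
#                                      __m128i block)
#     {
#         block = _mm_xor_si128(block, round_keys[0]);        /* whitening   */
#         for (int i = 1; i <= nrounds; i++)                  /* \REP rounds */
#             block = _mm_aesenc_si128(block, round_keys[i]);
#         return _mm_aesenclast_si128(block, round_keys[nrounds + 1]);
#     }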
284
285# combined for GCM encrypt and decrypt functions
286# clobbering all xmm registers
287# clobbering r10, r11, r12, r13, r15, rax
288.macro GCM_ENC_DEC INITIAL_BLOCKS GHASH_8_ENCRYPT_8_PARALLEL GHASH_LAST_8 GHASH_MUL ENC_DEC REP
289 vmovdqu AadHash(arg2), %xmm8
290 vmovdqu HashKey(arg2), %xmm13 # xmm13 = HashKey
291 add arg5, InLen(arg2)
292
293 # initialize the data pointer offset as zero
294 xor %r11d, %r11d
295
296 PARTIAL_BLOCK \GHASH_MUL, arg3, arg4, arg5, %r11, %xmm8, \ENC_DEC
297 sub %r11, arg5
298
299 mov arg5, %r13 # save the number of bytes of plaintext/ciphertext
300 and $-16, %r13 # r13 = r13 - (r13 mod 16)
301
302 mov %r13, %r12
303 shr $4, %r12
304 and $7, %r12
305 jz _initial_num_blocks_is_0\@
306
307 cmp $7, %r12
308 je _initial_num_blocks_is_7\@
309 cmp $6, %r12
310 je _initial_num_blocks_is_6\@
311 cmp $5, %r12
312 je _initial_num_blocks_is_5\@
313 cmp $4, %r12
314 je _initial_num_blocks_is_4\@
315 cmp $3, %r12
316 je _initial_num_blocks_is_3\@
317 cmp $2, %r12
318 je _initial_num_blocks_is_2\@
319
320 jmp _initial_num_blocks_is_1\@
321
322_initial_num_blocks_is_7\@:
323 \INITIAL_BLOCKS \REP, 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
324 sub $16*7, %r13
325 jmp _initial_blocks_encrypted\@
326
327_initial_num_blocks_is_6\@:
328 \INITIAL_BLOCKS \REP, 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
329 sub $16*6, %r13
330 jmp _initial_blocks_encrypted\@
331
332_initial_num_blocks_is_5\@:
333 \INITIAL_BLOCKS \REP, 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
334 sub $16*5, %r13
335 jmp _initial_blocks_encrypted\@
336
337_initial_num_blocks_is_4\@:
338 \INITIAL_BLOCKS \REP, 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
339 sub $16*4, %r13
340 jmp _initial_blocks_encrypted\@
341
342_initial_num_blocks_is_3\@:
343 \INITIAL_BLOCKS \REP, 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
344 sub $16*3, %r13
345 jmp _initial_blocks_encrypted\@
346
347_initial_num_blocks_is_2\@:
348 \INITIAL_BLOCKS \REP, 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
349 sub $16*2, %r13
350 jmp _initial_blocks_encrypted\@
351
352_initial_num_blocks_is_1\@:
353 \INITIAL_BLOCKS \REP, 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
354 sub $16*1, %r13
355 jmp _initial_blocks_encrypted\@
356
357_initial_num_blocks_is_0\@:
358 \INITIAL_BLOCKS \REP, 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
359
360
361_initial_blocks_encrypted\@:
362 test %r13, %r13
363 je _zero_cipher_left\@
364
365 sub $128, %r13
366 je _eight_cipher_left\@
367
368
369
370
371 vmovd %xmm9, %r15d
372 and $255, %r15d
373 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
374
375
376_encrypt_by_8_new\@:
377 cmp $(255-8), %r15d
378 jg _encrypt_by_8\@
379
380
381
382 add $8, %r15b
383 \GHASH_8_ENCRYPT_8_PARALLEL \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC
384 add $128, %r11
385 sub $128, %r13
386 jne _encrypt_by_8_new\@
387
388 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
389 jmp _eight_cipher_left\@
390
391_encrypt_by_8\@:
392 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
393 add $8, %r15b
394 \GHASH_8_ENCRYPT_8_PARALLEL \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC
395 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
396 add $128, %r11
397 sub $128, %r13
398 jne _encrypt_by_8_new\@
399
400 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
401
402
403
404
405_eight_cipher_left\@:
406 \GHASH_LAST_8 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8
407
408
409_zero_cipher_left\@:
410 vmovdqu %xmm14, AadHash(arg2)
411 vmovdqu %xmm9, CurCount(arg2)
412
413 # check for 0 length
414 mov arg5, %r13
415 and $15, %r13 # r13 = (arg5 mod 16)
416
417 je _multiple_of_16_bytes\@
418
419 # handle the last <16 Byte block separately
420
421 mov %r13, PBlockLen(arg2)
422
423 vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn
424 vmovdqu %xmm9, CurCount(arg2)
425 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
426
427 ENCRYPT_SINGLE_BLOCK \REP, %xmm9 # E(K, Yn)
428 vmovdqu %xmm9, PBlockEncKey(arg2)
429
430 cmp $16, arg5
431 jge _large_enough_update\@
432
433 lea (arg4,%r11,1), %r10
434 mov %r13, %r12
435
436 READ_PARTIAL_BLOCK %r10 %r12 %xmm1
437
438 lea SHIFT_MASK+16(%rip), %r12
439 sub %r13, %r12 # adjust the shuffle mask pointer to be
440 # able to shift 16-r13 bytes (r13 is the
441 # number of bytes in plaintext mod 16)
442
443 jmp _final_ghash_mul\@
444
445_large_enough_update\@:
446 sub $16, %r11
447 add %r13, %r11
448
449 # receive the last <16 Byte block
450 vmovdqu (arg4, %r11, 1), %xmm1
451
452 sub %r13, %r11
453 add $16, %r11
454
455 lea SHIFT_MASK+16(%rip), %r12
456 # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
457 # (r13 is the number of bytes in plaintext mod 16)
458 sub %r13, %r12
459 # get the appropriate shuffle mask
460 vmovdqu (%r12), %xmm2
461 # shift right 16-r13 bytes
462 vpshufb %xmm2, %xmm1, %xmm1
463
464_final_ghash_mul\@:
465 .if \ENC_DEC == DEC
466 vmovdqa %xmm1, %xmm2
467 vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
468 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to
469 # mask out top 16-r13 bytes of xmm9
470 vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9
471 vpand %xmm1, %xmm2, %xmm2
472 vpshufb SHUF_MASK(%rip), %xmm2, %xmm2
473 vpxor %xmm2, %xmm14, %xmm14
474
475 vmovdqu %xmm14, AadHash(arg2)
476 .else
477 vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
478 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to
479 # mask out top 16-r13 bytes of xmm9
480 vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9
481 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
482 vpxor %xmm9, %xmm14, %xmm14
483
484 vmovdqu %xmm14, AadHash(arg2)
485 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 # shuffle xmm9 back to output as ciphertext
486 .endif
487
488
489 #############################
490 # output r13 Bytes
491 vmovq %xmm9, %rax
492 cmp $8, %r13
493 jle _less_than_8_bytes_left\@
494
495 mov %rax, (arg3 , %r11)
496 add $8, %r11
497 vpsrldq $8, %xmm9, %xmm9
498 vmovq %xmm9, %rax
499 sub $8, %r13
500
501_less_than_8_bytes_left\@:
502 movb %al, (arg3 , %r11)
503 add $1, %r11
504 shr $8, %rax
505 sub $1, %r13
506 jne _less_than_8_bytes_left\@
507 #############################
508
509_multiple_of_16_bytes\@:
510.endm
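# A hedged summary of one update call as implemented above (the step labels
# below are descriptive only, they do not name real helpers):
#
#     /* 1. absorb any partial block left over from the previous call     */
#     /* 2. for the 16-byte-aligned part: encrypt 0..7 "initial" blocks,  */
#     /*    then run the 8-blocks-at-a-time loop, GHASHing as it goes     */
#     /* 3. for a <16-byte tail: compute E(K, Yn), XOR it with the tail,  */
#     /*    and save PBlockLen/PBlockEncKey for the next update call      */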
511
512
513# GCM_COMPLETE finishes the tag computation, including any last partial block
514# Output: Authentication Tag (AUTH_TAG)
515# Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
516.macro GCM_COMPLETE GHASH_MUL REP AUTH_TAG AUTH_TAG_LEN
517 vmovdqu AadHash(arg2), %xmm14
518 vmovdqu HashKey(arg2), %xmm13
519
520 mov PBlockLen(arg2), %r12
521 test %r12, %r12
522 je _partial_done\@
523
524 #GHASH computation for the last <16 Byte block
525 \GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
526
527_partial_done\@:
528 mov AadLen(arg2), %r12 # r12 = aadLen (number of bytes)
529 shl $3, %r12 # convert into number of bits
530 vmovd %r12d, %xmm15 # len(A) in xmm15
531
532 mov InLen(arg2), %r12
533	shl	$3, %r12			# convert byte count into bits: len(C) in bits (*8)
534 vmovq %r12, %xmm1
535 vpslldq $8, %xmm15, %xmm15 # xmm15 = len(A)|| 0x0000000000000000
536 vpxor %xmm1, %xmm15, %xmm15 # xmm15 = len(A)||len(C)
537
538 vpxor %xmm15, %xmm14, %xmm14
539 \GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 # final GHASH computation
540 vpshufb SHUF_MASK(%rip), %xmm14, %xmm14 # perform a 16Byte swap
541
542 vmovdqu OrigIV(arg2), %xmm9
543
544 ENCRYPT_SINGLE_BLOCK \REP, %xmm9 # E(K, Y0)
545
546 vpxor %xmm14, %xmm9, %xmm9
547
548
549
550_return_T\@:
551 mov \AUTH_TAG, %r10 # r10 = authTag
552 mov \AUTH_TAG_LEN, %r11 # r11 = auth_tag_len
553
554 cmp $16, %r11
555 je _T_16\@
556
557 cmp $8, %r11
558 jl _T_4\@
559
560_T_8\@:
561 vmovq %xmm9, %rax
562 mov %rax, (%r10)
563 add $8, %r10
564 sub $8, %r11
565 vpsrldq $8, %xmm9, %xmm9
566 test %r11, %r11
567 je _return_T_done\@
568_T_4\@:
569 vmovd %xmm9, %eax
570 mov %eax, (%r10)
571 add $4, %r10
572 sub $4, %r11
573 vpsrldq $4, %xmm9, %xmm9
574 test %r11, %r11
575 je _return_T_done\@
576_T_123\@:
577 vmovd %xmm9, %eax
578 cmp $2, %r11
579 jl _T_1\@
580 mov %ax, (%r10)
581 cmp $2, %r11
582 je _return_T_done\@
583 add $2, %r10
584 sar $16, %eax
585_T_1\@:
586 mov %al, (%r10)
587 jmp _return_T_done\@
588
589_T_16\@:
590 vmovdqu %xmm9, (%r10)
591
592_return_T_done\@:
593.endm
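# A hedged C-style summary of the tag math above (GHASH details and the
# byte-reversals are omitted; the names are illustrative):
#
#     /* S   = GHASH(H, AAD, C)                                    */
#     /* len = len(AAD) in bits || len(C) in bits                  */
#     /* S   = GHASH_MUL(S ^ len, H)        final length block     */
#     /* T   = AES_Encrypt(K, J0) ^ S       full 16-byte tag       */
#     /* T is then truncated to auth_tag_len (16, 12 or 8 bytes).  */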
594
595.macro CALC_AAD_HASH GHASH_MUL AAD AADLEN T1 T2 T3 T4 T5 T6 T7 T8
596
597 mov \AAD, %r10 # r10 = AAD
598 mov \AADLEN, %r12 # r12 = aadLen
599
600
601 mov %r12, %r11
602
603 vpxor \T8, \T8, \T8
604 vpxor \T7, \T7, \T7
605 cmp $16, %r11
606 jl _get_AAD_rest8\@
607_get_AAD_blocks\@:
608 vmovdqu (%r10), \T7
609 vpshufb SHUF_MASK(%rip), \T7, \T7
610 vpxor \T7, \T8, \T8
611 \GHASH_MUL \T8, \T2, \T1, \T3, \T4, \T5, \T6
612 add $16, %r10
613 sub $16, %r12
614 sub $16, %r11
615 cmp $16, %r11
616 jge _get_AAD_blocks\@
617 vmovdqu \T8, \T7
618 test %r11, %r11
619 je _get_AAD_done\@
620
621 vpxor \T7, \T7, \T7
622
623 /* read the last <16B of AAD. since we have at least 4B of
624 data right after the AAD (the ICV, and maybe some CT), we can
625 read 4B/8B blocks safely, and then get rid of the extra stuff */
626_get_AAD_rest8\@:
627 cmp $4, %r11
628 jle _get_AAD_rest4\@
629 movq (%r10), \T1
630 add $8, %r10
631 sub $8, %r11
632 vpslldq $8, \T1, \T1
633 vpsrldq $8, \T7, \T7
634 vpxor \T1, \T7, \T7
635 jmp _get_AAD_rest8\@
636_get_AAD_rest4\@:
637 test %r11, %r11
638 jle _get_AAD_rest0\@
639 mov (%r10), %eax
640 movq %rax, \T1
641 add $4, %r10
642 sub $4, %r11
643 vpslldq $12, \T1, \T1
644 vpsrldq $4, \T7, \T7
645 vpxor \T1, \T7, \T7
646_get_AAD_rest0\@:
647 /* finalize: shift out the extra bytes we read, and align
648 left. since pslldq can only shift by an immediate, we use
649 vpshufb and an array of shuffle masks */
650 movq %r12, %r11
651 salq $4, %r11
652 vmovdqu aad_shift_arr(%r11), \T1
653 vpshufb \T1, \T7, \T7
654_get_AAD_rest_final\@:
655 vpshufb SHUF_MASK(%rip), \T7, \T7
656 vpxor \T8, \T7, \T7
657 \GHASH_MUL \T7, \T2, \T1, \T3, \T4, \T5, \T6
658
659_get_AAD_done\@:
660 vmovdqu \T7, AadHash(arg2)
661.endm
662
663.macro INIT GHASH_MUL PRECOMPUTE
664 mov arg6, %r11
665 mov %r11, AadLen(arg2) # ctx_data.aad_length = aad_length
666 xor %r11d, %r11d
667 mov %r11, InLen(arg2) # ctx_data.in_length = 0
668
669 mov %r11, PBlockLen(arg2) # ctx_data.partial_block_length = 0
670 mov %r11, PBlockEncKey(arg2) # ctx_data.partial_block_enc_key = 0
671 mov arg3, %rax
672 movdqu (%rax), %xmm0
673 movdqu %xmm0, OrigIV(arg2) # ctx_data.orig_IV = iv
674
675 vpshufb SHUF_MASK(%rip), %xmm0, %xmm0
676 movdqu %xmm0, CurCount(arg2) # ctx_data.current_counter = iv
677
678 vmovdqu (arg4), %xmm6 # xmm6 = HashKey
679
680 vpshufb SHUF_MASK(%rip), %xmm6, %xmm6
681 ############### PRECOMPUTATION of HashKey<<1 mod poly from the HashKey
682 vmovdqa %xmm6, %xmm2
683 vpsllq $1, %xmm6, %xmm6
684 vpsrlq $63, %xmm2, %xmm2
685 vmovdqa %xmm2, %xmm1
686 vpslldq $8, %xmm2, %xmm2
687 vpsrldq $8, %xmm1, %xmm1
688 vpor %xmm2, %xmm6, %xmm6
689 #reduction
690 vpshufd $0b00100100, %xmm1, %xmm2
691 vpcmpeqd TWOONE(%rip), %xmm2, %xmm2
692 vpand POLY(%rip), %xmm2, %xmm2
693 vpxor %xmm2, %xmm6, %xmm6 # xmm6 holds the HashKey<<1 mod poly
694 #######################################################################
695 vmovdqu %xmm6, HashKey(arg2) # store HashKey<<1 mod poly
696
697 CALC_AAD_HASH \GHASH_MUL, arg5, arg6, %xmm2, %xmm6, %xmm3, %xmm4, %xmm5, %xmm7, %xmm1, %xmm0
698
699 \PRECOMPUTE %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5
700.endm
701
702
703# Reads DLEN bytes starting at DPTR and stores in XMMDst
704# where 0 < DLEN < 16
705# Clobbers %rax, DLEN
706.macro READ_PARTIAL_BLOCK DPTR DLEN XMMDst
707 vpxor \XMMDst, \XMMDst, \XMMDst
708
709 cmp $8, \DLEN
710 jl _read_lt8_\@
711 mov (\DPTR), %rax
712 vpinsrq $0, %rax, \XMMDst, \XMMDst
713 sub $8, \DLEN
714 jz _done_read_partial_block_\@
715 xor %eax, %eax
716_read_next_byte_\@:
717 shl $8, %rax
718 mov 7(\DPTR, \DLEN, 1), %al
719 dec \DLEN
720 jnz _read_next_byte_\@
721 vpinsrq $1, %rax, \XMMDst, \XMMDst
722 jmp _done_read_partial_block_\@
723_read_lt8_\@:
724 xor %eax, %eax
725_read_next_byte_lt8_\@:
726 shl $8, %rax
727 mov -1(\DPTR, \DLEN, 1), %al
728 dec \DLEN
729 jnz _read_next_byte_lt8_\@
730 vpinsrq $0, %rax, \XMMDst, \XMMDst
731_done_read_partial_block_\@:
732.endm
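# A hedged C equivalent of READ_PARTIAL_BLOCK for reference (dst stands in for
# the xmm destination; the macro builds the same layout, the first DLEN bytes
# of the source in the low bytes and zeros above them):
#
#     #include <string.h>
#     static void read_partial_block(unsigned char dst[16],
#                                    const unsigned char *src, int len)
#     {
#         memset(dst, 0, 16);      /* 0 < len < 16              */
#         memcpy(dst, src, len);   /* remaining bytes stay zero */
#     }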
733
734# PARTIAL_BLOCK: Handles the encryption/decryption and tag update for any
735# partial block carried over between update calls.
736# Requires the input data be at least 1 byte long due to READ_PARTIAL_BLOCK
737# Outputs encrypted bytes, and updates the hash and partial-block info in gcm_context_data
738# Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13
739.macro PARTIAL_BLOCK GHASH_MUL CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \
740 AAD_HASH ENC_DEC
741 mov PBlockLen(arg2), %r13
742 test %r13, %r13
743 je _partial_block_done_\@ # Leave Macro if no partial blocks
744 # Read in input data without over reading
745 cmp $16, \PLAIN_CYPH_LEN
746 jl _fewer_than_16_bytes_\@
747 vmovdqu (\PLAIN_CYPH_IN), %xmm1 # If more than 16 bytes, just fill xmm
748 jmp _data_read_\@
749
750_fewer_than_16_bytes_\@:
751 lea (\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10
752 mov \PLAIN_CYPH_LEN, %r12
753 READ_PARTIAL_BLOCK %r10 %r12 %xmm1
754
755 mov PBlockLen(arg2), %r13
756
757_data_read_\@: # Finished reading in data
758
759 vmovdqu PBlockEncKey(arg2), %xmm9
760 vmovdqu HashKey(arg2), %xmm13
761
762 lea SHIFT_MASK(%rip), %r12
763
764	# adjust the shuffle mask pointer to be able to shift r13 bytes
765	# (r13 is the length of the previous partial block)
766 add %r13, %r12
767 vmovdqu (%r12), %xmm2 # get the appropriate shuffle mask
768 vpshufb %xmm2, %xmm9, %xmm9 # shift right r13 bytes
769
770.if \ENC_DEC == DEC
771 vmovdqa %xmm1, %xmm3
772 pxor %xmm1, %xmm9 # Cyphertext XOR E(K, Yn)
773
774 mov \PLAIN_CYPH_LEN, %r10
775 add %r13, %r10
776 # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
777 sub $16, %r10
778	# Determine whether the partial block is not being completely filled and
779	# adjust the shift mask accordingly
780 jge _no_extra_mask_1_\@
781 sub %r10, %r12
782_no_extra_mask_1_\@:
783
784 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1
785 # get the appropriate mask to mask out bottom r13 bytes of xmm9
786 vpand %xmm1, %xmm9, %xmm9 # mask out bottom r13 bytes of xmm9
787
788 vpand %xmm1, %xmm3, %xmm3
789 vmovdqa SHUF_MASK(%rip), %xmm10
790 vpshufb %xmm10, %xmm3, %xmm3
791 vpshufb %xmm2, %xmm3, %xmm3
792 vpxor %xmm3, \AAD_HASH, \AAD_HASH
793
794 test %r10, %r10
795 jl _partial_incomplete_1_\@
796
797 # GHASH computation for the last <16 Byte block
798 \GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
799 xor %eax,%eax
800
801 mov %rax, PBlockLen(arg2)
802 jmp _dec_done_\@
803_partial_incomplete_1_\@:
804 add \PLAIN_CYPH_LEN, PBlockLen(arg2)
805_dec_done_\@:
806 vmovdqu \AAD_HASH, AadHash(arg2)
807.else
808 vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
809
810 mov \PLAIN_CYPH_LEN, %r10
811 add %r13, %r10
812 # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
813 sub $16, %r10
814	# Determine whether the partial block is not being completely filled and
815	# adjust the shift mask accordingly
816 jge _no_extra_mask_2_\@
817 sub %r10, %r12
818_no_extra_mask_2_\@:
819
820 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1
821 # get the appropriate mask to mask out bottom r13 bytes of xmm9
822 vpand %xmm1, %xmm9, %xmm9
823
824 vmovdqa SHUF_MASK(%rip), %xmm1
825 vpshufb %xmm1, %xmm9, %xmm9
826 vpshufb %xmm2, %xmm9, %xmm9
827 vpxor %xmm9, \AAD_HASH, \AAD_HASH
828
829 test %r10, %r10
830 jl _partial_incomplete_2_\@
831
832 # GHASH computation for the last <16 Byte block
833 \GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
834 xor %eax,%eax
835
836 mov %rax, PBlockLen(arg2)
837 jmp _encode_done_\@
838_partial_incomplete_2_\@:
839 add \PLAIN_CYPH_LEN, PBlockLen(arg2)
840_encode_done_\@:
841 vmovdqu \AAD_HASH, AadHash(arg2)
842
843 vmovdqa SHUF_MASK(%rip), %xmm10
844 # shuffle xmm9 back to output as ciphertext
845 vpshufb %xmm10, %xmm9, %xmm9
846 vpshufb %xmm2, %xmm9, %xmm9
847.endif
848 # output encrypted Bytes
849 test %r10, %r10
850 jl _partial_fill_\@
851 mov %r13, %r12
852 mov $16, %r13
853 # Set r13 to be the number of bytes to write out
854 sub %r12, %r13
855 jmp _count_set_\@
856_partial_fill_\@:
857 mov \PLAIN_CYPH_LEN, %r13
858_count_set_\@:
859 vmovdqa %xmm9, %xmm0
860 vmovq %xmm0, %rax
861 cmp $8, %r13
862 jle _less_than_8_bytes_left_\@
863
864 mov %rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
865 add $8, \DATA_OFFSET
866 psrldq $8, %xmm0
867 vmovq %xmm0, %rax
868 sub $8, %r13
869_less_than_8_bytes_left_\@:
870 movb %al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
871 add $1, \DATA_OFFSET
872 shr $8, %rax
873 sub $1, %r13
874 jne _less_than_8_bytes_left_\@
875_partial_block_done_\@:
876.endm # PARTIAL_BLOCK
877
878###############################################################################
879# GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
880# Input: A and B (128-bits each, bit-reflected)
881# Output: C = A*B*x mod poly, (i.e. >>1 )
882# To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
883# GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
884###############################################################################
885.macro GHASH_MUL_AVX GH HK T1 T2 T3 T4 T5
886
887 vpshufd $0b01001110, \GH, \T2
888 vpshufd $0b01001110, \HK, \T3
889 vpxor \GH , \T2, \T2 # T2 = (a1+a0)
890 vpxor \HK , \T3, \T3 # T3 = (b1+b0)
891
892 vpclmulqdq $0x11, \HK, \GH, \T1 # T1 = a1*b1
893 vpclmulqdq $0x00, \HK, \GH, \GH # GH = a0*b0
894 vpclmulqdq $0x00, \T3, \T2, \T2 # T2 = (a1+a0)*(b1+b0)
895 vpxor \GH, \T2,\T2
896 vpxor \T1, \T2,\T2 # T2 = a0*b1+a1*b0
897
898 vpslldq $8, \T2,\T3 # shift-L T3 2 DWs
899 vpsrldq $8, \T2,\T2 # shift-R T2 2 DWs
900 vpxor \T3, \GH, \GH
901 vpxor \T2, \T1, \T1 # <T1:GH> = GH x HK
902
903 #first phase of the reduction
904	vpslld $31, \GH, \T2			# packed left shifting << 31
905	vpslld $30, \GH, \T3			# packed left shifting << 30
906	vpslld $25, \GH, \T4			# packed left shifting << 25
907
908 vpxor \T3, \T2, \T2 # xor the shifted versions
909 vpxor \T4, \T2, \T2
910
911 vpsrldq $4, \T2, \T5 # shift-R T5 1 DW
912
913 vpslldq $12, \T2, \T2 # shift-L T2 3 DWs
914 vpxor \T2, \GH, \GH # first phase of the reduction complete
915
916 #second phase of the reduction
917
918	vpsrld $1,\GH, \T2			# packed right shifting >> 1
919	vpsrld $2,\GH, \T3			# packed right shifting >> 2
920	vpsrld $7,\GH, \T4			# packed right shifting >> 7
921 vpxor \T3, \T2, \T2 # xor the shifted versions
922 vpxor \T4, \T2, \T2
923
924 vpxor \T5, \T2, \T2
925 vpxor \T2, \GH, \GH
926 vpxor \T1, \GH, \GH # the result is in GH
927
928
929.endm
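# A hedged intrinsics sketch of the Karatsuba carry-less multiply used above
# (the reduction is omitted; it proceeds exactly as in the macro). The
# function name is illustrative:
#
#     #include <immintrin.h>
#     static void ghash_karatsuba(__m128i gh, __m128i hk,
#                                 __m128i *hi, __m128i *lo, __m128i *mid)
#     {
#         __m128i a = _mm_xor_si128(gh, _mm_shuffle_epi32(gh, 0x4E)); /* a1+a0 */
#         __m128i b = _mm_xor_si128(hk, _mm_shuffle_epi32(hk, 0x4E)); /* b1+b0 */
#         *hi  = _mm_clmulepi64_si128(gh, hk, 0x11);                  /* a1*b1 */
#         *lo  = _mm_clmulepi64_si128(gh, hk, 0x00);                  /* a0*b0 */
#         *mid = _mm_xor_si128(_mm_clmulepi64_si128(a, b, 0x00),      /* (a1+a0)*(b1+b0) */
#                              _mm_xor_si128(*hi, *lo));              /* = a0*b1 + a1*b0 */
#     }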
930
931.macro PRECOMPUTE_AVX HK T1 T2 T3 T4 T5 T6
932
933	# HashKey_i_k holds the XORed values of the low and high parts of HashKey^i
934 vmovdqa \HK, \T5
935
936 vpshufd $0b01001110, \T5, \T1
937 vpxor \T5, \T1, \T1
938 vmovdqu \T1, HashKey_k(arg2)
939
940 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly
941 vmovdqu \T5, HashKey_2(arg2) # [HashKey_2] = HashKey^2<<1 mod poly
942 vpshufd $0b01001110, \T5, \T1
943 vpxor \T5, \T1, \T1
944 vmovdqu \T1, HashKey_2_k(arg2)
945
946 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly
947 vmovdqu \T5, HashKey_3(arg2)
948 vpshufd $0b01001110, \T5, \T1
949 vpxor \T5, \T1, \T1
950 vmovdqu \T1, HashKey_3_k(arg2)
951
952 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly
953 vmovdqu \T5, HashKey_4(arg2)
954 vpshufd $0b01001110, \T5, \T1
955 vpxor \T5, \T1, \T1
956 vmovdqu \T1, HashKey_4_k(arg2)
957
958 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly
959 vmovdqu \T5, HashKey_5(arg2)
960 vpshufd $0b01001110, \T5, \T1
961 vpxor \T5, \T1, \T1
962 vmovdqu \T1, HashKey_5_k(arg2)
963
964 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly
965 vmovdqu \T5, HashKey_6(arg2)
966 vpshufd $0b01001110, \T5, \T1
967 vpxor \T5, \T1, \T1
968 vmovdqu \T1, HashKey_6_k(arg2)
969
970 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly
971 vmovdqu \T5, HashKey_7(arg2)
972 vpshufd $0b01001110, \T5, \T1
973 vpxor \T5, \T1, \T1
974 vmovdqu \T1, HashKey_7_k(arg2)
975
976 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly
977 vmovdqu \T5, HashKey_8(arg2)
978 vpshufd $0b01001110, \T5, \T1
979 vpxor \T5, \T1, \T1
980 vmovdqu \T1, HashKey_8_k(arg2)
981
982.endm
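# The powers H^1..H^8 (and their Karatsuba halves) are precomputed so that the
# 8-block main loop can fold eight ciphertext blocks into the hash with
# independent multiplies:
#   X_new = (X ^ C1)*H^8 ^ C2*H^7 ^ ... ^ C7*H^2 ^ C8*H   (all in GF(2^128))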
983
984## if a = number of total plaintext bytes
985##    b = floor(a/16)
986##    num_initial_blocks = b mod 8
987## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
988## r10, r11, r12, rax are clobbered
989## arg1, arg2, arg3, arg4 are used as pointers only, not modified
990
991.macro INITIAL_BLOCKS_AVX REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC
992 i = (8-\num_initial_blocks)
993 setreg
994 vmovdqu AadHash(arg2), reg_i
995
996 # start AES for num_initial_blocks blocks
997 vmovdqu CurCount(arg2), \CTR
998
999 i = (9-\num_initial_blocks)
1000 setreg
1001.rep \num_initial_blocks
1002 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1003 vmovdqa \CTR, reg_i
1004 vpshufb SHUF_MASK(%rip), reg_i, reg_i # perform a 16Byte swap
1005 i = (i+1)
1006 setreg
1007.endr
1008
1009 vmovdqa (arg1), \T_key
1010 i = (9-\num_initial_blocks)
1011 setreg
1012.rep \num_initial_blocks
1013 vpxor \T_key, reg_i, reg_i
1014 i = (i+1)
1015 setreg
1016.endr
1017
1018 j = 1
1019 setreg
1020.rep \REP
1021 vmovdqa 16*j(arg1), \T_key
1022 i = (9-\num_initial_blocks)
1023 setreg
1024.rep \num_initial_blocks
1025 vaesenc \T_key, reg_i, reg_i
1026 i = (i+1)
1027 setreg
1028.endr
1029
1030 j = (j+1)
1031 setreg
1032.endr
1033
1034 vmovdqa 16*j(arg1), \T_key
1035 i = (9-\num_initial_blocks)
1036 setreg
1037.rep \num_initial_blocks
1038 vaesenclast \T_key, reg_i, reg_i
1039 i = (i+1)
1040 setreg
1041.endr
1042
1043 i = (9-\num_initial_blocks)
1044 setreg
1045.rep \num_initial_blocks
1046 vmovdqu (arg4, %r11), \T1
1047 vpxor \T1, reg_i, reg_i
1048 vmovdqu reg_i, (arg3 , %r11) # write back ciphertext for num_initial_blocks blocks
1049 add $16, %r11
1050.if \ENC_DEC == DEC
1051 vmovdqa \T1, reg_i
1052.endif
1053 vpshufb SHUF_MASK(%rip), reg_i, reg_i # prepare ciphertext for GHASH computations
1054 i = (i+1)
1055 setreg
1056.endr
1057
1058
1059 i = (8-\num_initial_blocks)
1060 j = (9-\num_initial_blocks)
1061 setreg
1062
1063.rep \num_initial_blocks
1064 vpxor reg_i, reg_j, reg_j
1065 GHASH_MUL_AVX reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks
1066 i = (i+1)
1067 j = (j+1)
1068 setreg
1069.endr
1070 # XMM8 has the combined result here
1071
1072 vmovdqa \XMM8, TMP1(%rsp)
1073 vmovdqa \XMM8, \T3
1074
1075 cmp $128, %r13
1076 jl _initial_blocks_done\@ # no need for precomputed constants
1077
1078###############################################################################
1079# Prepare and encrypt 8 counter blocks for the first iteration of the main loop
1080 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1081 vmovdqa \CTR, \XMM1
1082 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
1083
1084 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1085 vmovdqa \CTR, \XMM2
1086 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
1087
1088 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1089 vmovdqa \CTR, \XMM3
1090 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
1091
1092 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1093 vmovdqa \CTR, \XMM4
1094 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
1095
1096 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1097 vmovdqa \CTR, \XMM5
1098 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
1099
1100 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1101 vmovdqa \CTR, \XMM6
1102 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
1103
1104 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1105 vmovdqa \CTR, \XMM7
1106 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
1107
1108 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1109 vmovdqa \CTR, \XMM8
1110 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
1111
1112 vmovdqa (arg1), \T_key
1113 vpxor \T_key, \XMM1, \XMM1
1114 vpxor \T_key, \XMM2, \XMM2
1115 vpxor \T_key, \XMM3, \XMM3
1116 vpxor \T_key, \XMM4, \XMM4
1117 vpxor \T_key, \XMM5, \XMM5
1118 vpxor \T_key, \XMM6, \XMM6
1119 vpxor \T_key, \XMM7, \XMM7
1120 vpxor \T_key, \XMM8, \XMM8
1121
1122 i = 1
1123 setreg
1124.rep \REP # do REP rounds
1125 vmovdqa 16*i(arg1), \T_key
1126 vaesenc \T_key, \XMM1, \XMM1
1127 vaesenc \T_key, \XMM2, \XMM2
1128 vaesenc \T_key, \XMM3, \XMM3
1129 vaesenc \T_key, \XMM4, \XMM4
1130 vaesenc \T_key, \XMM5, \XMM5
1131 vaesenc \T_key, \XMM6, \XMM6
1132 vaesenc \T_key, \XMM7, \XMM7
1133 vaesenc \T_key, \XMM8, \XMM8
1134 i = (i+1)
1135 setreg
1136.endr
1137
1138 vmovdqa 16*i(arg1), \T_key
1139 vaesenclast \T_key, \XMM1, \XMM1
1140 vaesenclast \T_key, \XMM2, \XMM2
1141 vaesenclast \T_key, \XMM3, \XMM3
1142 vaesenclast \T_key, \XMM4, \XMM4
1143 vaesenclast \T_key, \XMM5, \XMM5
1144 vaesenclast \T_key, \XMM6, \XMM6
1145 vaesenclast \T_key, \XMM7, \XMM7
1146 vaesenclast \T_key, \XMM8, \XMM8
1147
1148 vmovdqu (arg4, %r11), \T1
1149 vpxor \T1, \XMM1, \XMM1
1150 vmovdqu \XMM1, (arg3 , %r11)
1151 .if \ENC_DEC == DEC
1152 vmovdqa \T1, \XMM1
1153 .endif
1154
1155 vmovdqu 16*1(arg4, %r11), \T1
1156 vpxor \T1, \XMM2, \XMM2
1157 vmovdqu \XMM2, 16*1(arg3 , %r11)
1158 .if \ENC_DEC == DEC
1159 vmovdqa \T1, \XMM2
1160 .endif
1161
1162 vmovdqu 16*2(arg4, %r11), \T1
1163 vpxor \T1, \XMM3, \XMM3
1164 vmovdqu \XMM3, 16*2(arg3 , %r11)
1165 .if \ENC_DEC == DEC
1166 vmovdqa \T1, \XMM3
1167 .endif
1168
1169 vmovdqu 16*3(arg4, %r11), \T1
1170 vpxor \T1, \XMM4, \XMM4
1171 vmovdqu \XMM4, 16*3(arg3 , %r11)
1172 .if \ENC_DEC == DEC
1173 vmovdqa \T1, \XMM4
1174 .endif
1175
1176 vmovdqu 16*4(arg4, %r11), \T1
1177 vpxor \T1, \XMM5, \XMM5
1178 vmovdqu \XMM5, 16*4(arg3 , %r11)
1179 .if \ENC_DEC == DEC
1180 vmovdqa \T1, \XMM5
1181 .endif
1182
1183 vmovdqu 16*5(arg4, %r11), \T1
1184 vpxor \T1, \XMM6, \XMM6
1185 vmovdqu \XMM6, 16*5(arg3 , %r11)
1186 .if \ENC_DEC == DEC
1187 vmovdqa \T1, \XMM6
1188 .endif
1189
1190 vmovdqu 16*6(arg4, %r11), \T1
1191 vpxor \T1, \XMM7, \XMM7
1192 vmovdqu \XMM7, 16*6(arg3 , %r11)
1193 .if \ENC_DEC == DEC
1194 vmovdqa \T1, \XMM7
1195 .endif
1196
1197 vmovdqu 16*7(arg4, %r11), \T1
1198 vpxor \T1, \XMM8, \XMM8
1199 vmovdqu \XMM8, 16*7(arg3 , %r11)
1200 .if \ENC_DEC == DEC
1201 vmovdqa \T1, \XMM8
1202 .endif
1203
1204 add $128, %r11
1205
1206 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
1207 vpxor TMP1(%rsp), \XMM1, \XMM1 # combine GHASHed value with the corresponding ciphertext
1208 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
1209 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
1210 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
1211 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
1212 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
1213 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
1214 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
1215
1216###############################################################################
1217
1218_initial_blocks_done\@:
1219
1220.endm
1221
1222# encrypt 8 blocks at a time
1223# ghash the 8 previously encrypted ciphertext blocks
1224# arg1, arg2, arg3, arg4 are used as pointers only, not modified
1225# r11 is the data offset value
1226.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
1227
1228 vmovdqa \XMM1, \T2
1229 vmovdqa \XMM2, TMP2(%rsp)
1230 vmovdqa \XMM3, TMP3(%rsp)
1231 vmovdqa \XMM4, TMP4(%rsp)
1232 vmovdqa \XMM5, TMP5(%rsp)
1233 vmovdqa \XMM6, TMP6(%rsp)
1234 vmovdqa \XMM7, TMP7(%rsp)
1235 vmovdqa \XMM8, TMP8(%rsp)
1236
1237.if \loop_idx == in_order
1238 vpaddd ONE(%rip), \CTR, \XMM1 # INCR CNT
1239 vpaddd ONE(%rip), \XMM1, \XMM2
1240 vpaddd ONE(%rip), \XMM2, \XMM3
1241 vpaddd ONE(%rip), \XMM3, \XMM4
1242 vpaddd ONE(%rip), \XMM4, \XMM5
1243 vpaddd ONE(%rip), \XMM5, \XMM6
1244 vpaddd ONE(%rip), \XMM6, \XMM7
1245 vpaddd ONE(%rip), \XMM7, \XMM8
1246 vmovdqa \XMM8, \CTR
1247
1248 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
1249 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
1250 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
1251 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
1252 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
1253 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
1254 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
1255 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
1256.else
1257 vpaddd ONEf(%rip), \CTR, \XMM1 # INCR CNT
1258 vpaddd ONEf(%rip), \XMM1, \XMM2
1259 vpaddd ONEf(%rip), \XMM2, \XMM3
1260 vpaddd ONEf(%rip), \XMM3, \XMM4
1261 vpaddd ONEf(%rip), \XMM4, \XMM5
1262 vpaddd ONEf(%rip), \XMM5, \XMM6
1263 vpaddd ONEf(%rip), \XMM6, \XMM7
1264 vpaddd ONEf(%rip), \XMM7, \XMM8
1265 vmovdqa \XMM8, \CTR
1266.endif
1267
1268
1269 #######################################################################
1270
1271 vmovdqu (arg1), \T1
1272 vpxor \T1, \XMM1, \XMM1
1273 vpxor \T1, \XMM2, \XMM2
1274 vpxor \T1, \XMM3, \XMM3
1275 vpxor \T1, \XMM4, \XMM4
1276 vpxor \T1, \XMM5, \XMM5
1277 vpxor \T1, \XMM6, \XMM6
1278 vpxor \T1, \XMM7, \XMM7
1279 vpxor \T1, \XMM8, \XMM8
1280
1281 #######################################################################
1282
1283
1284
1285
1286
1287 vmovdqu 16*1(arg1), \T1
1288 vaesenc \T1, \XMM1, \XMM1
1289 vaesenc \T1, \XMM2, \XMM2
1290 vaesenc \T1, \XMM3, \XMM3
1291 vaesenc \T1, \XMM4, \XMM4
1292 vaesenc \T1, \XMM5, \XMM5
1293 vaesenc \T1, \XMM6, \XMM6
1294 vaesenc \T1, \XMM7, \XMM7
1295 vaesenc \T1, \XMM8, \XMM8
1296
1297 vmovdqu 16*2(arg1), \T1
1298 vaesenc \T1, \XMM1, \XMM1
1299 vaesenc \T1, \XMM2, \XMM2
1300 vaesenc \T1, \XMM3, \XMM3
1301 vaesenc \T1, \XMM4, \XMM4
1302 vaesenc \T1, \XMM5, \XMM5
1303 vaesenc \T1, \XMM6, \XMM6
1304 vaesenc \T1, \XMM7, \XMM7
1305 vaesenc \T1, \XMM8, \XMM8
1306
1307
1308 #######################################################################
1309
1310 vmovdqu HashKey_8(arg2), \T5
1311 vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1
1312 vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0
1313
1314 vpshufd $0b01001110, \T2, \T6
1315 vpxor \T2, \T6, \T6
1316
1317 vmovdqu HashKey_8_k(arg2), \T5
1318 vpclmulqdq $0x00, \T5, \T6, \T6
1319
1320 vmovdqu 16*3(arg1), \T1
1321 vaesenc \T1, \XMM1, \XMM1
1322 vaesenc \T1, \XMM2, \XMM2
1323 vaesenc \T1, \XMM3, \XMM3
1324 vaesenc \T1, \XMM4, \XMM4
1325 vaesenc \T1, \XMM5, \XMM5
1326 vaesenc \T1, \XMM6, \XMM6
1327 vaesenc \T1, \XMM7, \XMM7
1328 vaesenc \T1, \XMM8, \XMM8
1329
1330 vmovdqa TMP2(%rsp), \T1
1331 vmovdqu HashKey_7(arg2), \T5
1332 vpclmulqdq $0x11, \T5, \T1, \T3
1333 vpxor \T3, \T4, \T4
1334 vpclmulqdq $0x00, \T5, \T1, \T3
1335 vpxor \T3, \T7, \T7
1336
1337 vpshufd $0b01001110, \T1, \T3
1338 vpxor \T1, \T3, \T3
1339 vmovdqu HashKey_7_k(arg2), \T5
1340 vpclmulqdq $0x10, \T5, \T3, \T3
1341 vpxor \T3, \T6, \T6
1342
1343 vmovdqu 16*4(arg1), \T1
1344 vaesenc \T1, \XMM1, \XMM1
1345 vaesenc \T1, \XMM2, \XMM2
1346 vaesenc \T1, \XMM3, \XMM3
1347 vaesenc \T1, \XMM4, \XMM4
1348 vaesenc \T1, \XMM5, \XMM5
1349 vaesenc \T1, \XMM6, \XMM6
1350 vaesenc \T1, \XMM7, \XMM7
1351 vaesenc \T1, \XMM8, \XMM8
1352
1353 #######################################################################
1354
1355 vmovdqa TMP3(%rsp), \T1
1356 vmovdqu HashKey_6(arg2), \T5
1357 vpclmulqdq $0x11, \T5, \T1, \T3
1358 vpxor \T3, \T4, \T4
1359 vpclmulqdq $0x00, \T5, \T1, \T3
1360 vpxor \T3, \T7, \T7
1361
1362 vpshufd $0b01001110, \T1, \T3
1363 vpxor \T1, \T3, \T3
1364 vmovdqu HashKey_6_k(arg2), \T5
1365 vpclmulqdq $0x10, \T5, \T3, \T3
1366 vpxor \T3, \T6, \T6
1367
1368 vmovdqu 16*5(arg1), \T1
1369 vaesenc \T1, \XMM1, \XMM1
1370 vaesenc \T1, \XMM2, \XMM2
1371 vaesenc \T1, \XMM3, \XMM3
1372 vaesenc \T1, \XMM4, \XMM4
1373 vaesenc \T1, \XMM5, \XMM5
1374 vaesenc \T1, \XMM6, \XMM6
1375 vaesenc \T1, \XMM7, \XMM7
1376 vaesenc \T1, \XMM8, \XMM8
1377
1378 vmovdqa TMP4(%rsp), \T1
1379 vmovdqu HashKey_5(arg2), \T5
1380 vpclmulqdq $0x11, \T5, \T1, \T3
1381 vpxor \T3, \T4, \T4
1382 vpclmulqdq $0x00, \T5, \T1, \T3
1383 vpxor \T3, \T7, \T7
1384
1385 vpshufd $0b01001110, \T1, \T3
1386 vpxor \T1, \T3, \T3
1387 vmovdqu HashKey_5_k(arg2), \T5
1388 vpclmulqdq $0x10, \T5, \T3, \T3
1389 vpxor \T3, \T6, \T6
1390
1391 vmovdqu 16*6(arg1), \T1
1392 vaesenc \T1, \XMM1, \XMM1
1393 vaesenc \T1, \XMM2, \XMM2
1394 vaesenc \T1, \XMM3, \XMM3
1395 vaesenc \T1, \XMM4, \XMM4
1396 vaesenc \T1, \XMM5, \XMM5
1397 vaesenc \T1, \XMM6, \XMM6
1398 vaesenc \T1, \XMM7, \XMM7
1399 vaesenc \T1, \XMM8, \XMM8
1400
1401
1402 vmovdqa TMP5(%rsp), \T1
1403 vmovdqu HashKey_4(arg2), \T5
1404 vpclmulqdq $0x11, \T5, \T1, \T3
1405 vpxor \T3, \T4, \T4
1406 vpclmulqdq $0x00, \T5, \T1, \T3
1407 vpxor \T3, \T7, \T7
1408
1409 vpshufd $0b01001110, \T1, \T3
1410 vpxor \T1, \T3, \T3
1411 vmovdqu HashKey_4_k(arg2), \T5
1412 vpclmulqdq $0x10, \T5, \T3, \T3
1413 vpxor \T3, \T6, \T6
1414
1415 vmovdqu 16*7(arg1), \T1
1416 vaesenc \T1, \XMM1, \XMM1
1417 vaesenc \T1, \XMM2, \XMM2
1418 vaesenc \T1, \XMM3, \XMM3
1419 vaesenc \T1, \XMM4, \XMM4
1420 vaesenc \T1, \XMM5, \XMM5
1421 vaesenc \T1, \XMM6, \XMM6
1422 vaesenc \T1, \XMM7, \XMM7
1423 vaesenc \T1, \XMM8, \XMM8
1424
1425 vmovdqa TMP6(%rsp), \T1
1426 vmovdqu HashKey_3(arg2), \T5
1427 vpclmulqdq $0x11, \T5, \T1, \T3
1428 vpxor \T3, \T4, \T4
1429 vpclmulqdq $0x00, \T5, \T1, \T3
1430 vpxor \T3, \T7, \T7
1431
1432 vpshufd $0b01001110, \T1, \T3
1433 vpxor \T1, \T3, \T3
1434 vmovdqu HashKey_3_k(arg2), \T5
1435 vpclmulqdq $0x10, \T5, \T3, \T3
1436 vpxor \T3, \T6, \T6
1437
1438
1439 vmovdqu 16*8(arg1), \T1
1440 vaesenc \T1, \XMM1, \XMM1
1441 vaesenc \T1, \XMM2, \XMM2
1442 vaesenc \T1, \XMM3, \XMM3
1443 vaesenc \T1, \XMM4, \XMM4
1444 vaesenc \T1, \XMM5, \XMM5
1445 vaesenc \T1, \XMM6, \XMM6
1446 vaesenc \T1, \XMM7, \XMM7
1447 vaesenc \T1, \XMM8, \XMM8
1448
1449 vmovdqa TMP7(%rsp), \T1
1450 vmovdqu HashKey_2(arg2), \T5
1451 vpclmulqdq $0x11, \T5, \T1, \T3
1452 vpxor \T3, \T4, \T4
1453 vpclmulqdq $0x00, \T5, \T1, \T3
1454 vpxor \T3, \T7, \T7
1455
1456 vpshufd $0b01001110, \T1, \T3
1457 vpxor \T1, \T3, \T3
1458 vmovdqu HashKey_2_k(arg2), \T5
1459 vpclmulqdq $0x10, \T5, \T3, \T3
1460 vpxor \T3, \T6, \T6
1461
1462 #######################################################################
1463
1464 vmovdqu 16*9(arg1), \T5
1465 vaesenc \T5, \XMM1, \XMM1
1466 vaesenc \T5, \XMM2, \XMM2
1467 vaesenc \T5, \XMM3, \XMM3
1468 vaesenc \T5, \XMM4, \XMM4
1469 vaesenc \T5, \XMM5, \XMM5
1470 vaesenc \T5, \XMM6, \XMM6
1471 vaesenc \T5, \XMM7, \XMM7
1472 vaesenc \T5, \XMM8, \XMM8
1473
1474 vmovdqa TMP8(%rsp), \T1
1475 vmovdqu HashKey(arg2), \T5
1476 vpclmulqdq $0x11, \T5, \T1, \T3
1477 vpxor \T3, \T4, \T4
1478 vpclmulqdq $0x00, \T5, \T1, \T3
1479 vpxor \T3, \T7, \T7
1480
1481 vpshufd $0b01001110, \T1, \T3
1482 vpxor \T1, \T3, \T3
1483 vmovdqu HashKey_k(arg2), \T5
1484 vpclmulqdq $0x10, \T5, \T3, \T3
1485 vpxor \T3, \T6, \T6
1486
1487 vpxor \T4, \T6, \T6
1488 vpxor \T7, \T6, \T6
1489
1490 vmovdqu 16*10(arg1), \T5
1491
1492 i = 11
1493 setreg
1494.rep (\REP-9)
1495
1496 vaesenc \T5, \XMM1, \XMM1
1497 vaesenc \T5, \XMM2, \XMM2
1498 vaesenc \T5, \XMM3, \XMM3
1499 vaesenc \T5, \XMM4, \XMM4
1500 vaesenc \T5, \XMM5, \XMM5
1501 vaesenc \T5, \XMM6, \XMM6
1502 vaesenc \T5, \XMM7, \XMM7
1503 vaesenc \T5, \XMM8, \XMM8
1504
1505 vmovdqu 16*i(arg1), \T5
1506 i = i + 1
1507 setreg
1508.endr
1509
1510 i = 0
1511 j = 1
1512 setreg
1513.rep 8
1514 vpxor 16*i(arg4, %r11), \T5, \T2
1515 .if \ENC_DEC == ENC
1516 vaesenclast \T2, reg_j, reg_j
1517 .else
1518 vaesenclast \T2, reg_j, \T3
1519 vmovdqu 16*i(arg4, %r11), reg_j
1520 vmovdqu \T3, 16*i(arg3, %r11)
1521 .endif
1522 i = (i+1)
1523 j = (j+1)
1524 setreg
1525.endr
1526 #######################################################################
1527
1528
1529	vpslldq	$8, \T6, \T3				# shift-L T3 2 DWs
1530	vpsrldq	$8, \T6, \T6				# shift-R T6 2 DWs
1531 vpxor \T3, \T7, \T7
1532 vpxor \T4, \T6, \T6 # accumulate the results in T6:T7
1533
1534
1535
1536 #######################################################################
1537 #first phase of the reduction
1538 #######################################################################
1539	vpslld	$31, \T7, \T2				# packed left shifting << 31
1540	vpslld	$30, \T7, \T3				# packed left shifting << 30
1541	vpslld	$25, \T7, \T4				# packed left shifting << 25
1542
1543 vpxor \T3, \T2, \T2 # xor the shifted versions
1544 vpxor \T4, \T2, \T2
1545
1546 vpsrldq $4, \T2, \T1 # shift-R T1 1 DW
1547
1548 vpslldq $12, \T2, \T2 # shift-L T2 3 DWs
1549 vpxor \T2, \T7, \T7 # first phase of the reduction complete
1550 #######################################################################
1551 .if \ENC_DEC == ENC
1552 vmovdqu \XMM1, 16*0(arg3,%r11) # Write to the Ciphertext buffer
1553 vmovdqu \XMM2, 16*1(arg3,%r11) # Write to the Ciphertext buffer
1554 vmovdqu \XMM3, 16*2(arg3,%r11) # Write to the Ciphertext buffer
1555 vmovdqu \XMM4, 16*3(arg3,%r11) # Write to the Ciphertext buffer
1556 vmovdqu \XMM5, 16*4(arg3,%r11) # Write to the Ciphertext buffer
1557 vmovdqu \XMM6, 16*5(arg3,%r11) # Write to the Ciphertext buffer
1558 vmovdqu \XMM7, 16*6(arg3,%r11) # Write to the Ciphertext buffer
1559 vmovdqu \XMM8, 16*7(arg3,%r11) # Write to the Ciphertext buffer
1560 .endif
1561
1562 #######################################################################
1563 #second phase of the reduction
1564	vpsrld	$1, \T7, \T2				# packed right shifting >> 1
1565	vpsrld	$2, \T7, \T3				# packed right shifting >> 2
1566	vpsrld	$7, \T7, \T4				# packed right shifting >> 7
1567 vpxor \T3, \T2, \T2 # xor the shifted versions
1568 vpxor \T4, \T2, \T2
1569
1570 vpxor \T1, \T2, \T2
1571 vpxor \T2, \T7, \T7
1572 vpxor \T7, \T6, \T6 # the result is in T6
1573 #######################################################################
1574
1575 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
1576 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
1577 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
1578 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
1579 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
1580 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
1581 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
1582 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
1583
1584
1585 vpxor \T6, \XMM1, \XMM1
1586
1587
1588
1589.endm
1590
1591
1592# GHASH the last 8 ciphertext blocks.
1593.macro GHASH_LAST_8_AVX T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8
1594
1595 ## Karatsuba Method
1596
1597
1598 vpshufd $0b01001110, \XMM1, \T2
1599 vpxor \XMM1, \T2, \T2
1600 vmovdqu HashKey_8(arg2), \T5
1601 vpclmulqdq $0x11, \T5, \XMM1, \T6
1602 vpclmulqdq $0x00, \T5, \XMM1, \T7
1603
1604 vmovdqu HashKey_8_k(arg2), \T3
1605 vpclmulqdq $0x00, \T3, \T2, \XMM1
1606
1607 ######################
1608
1609 vpshufd $0b01001110, \XMM2, \T2
1610 vpxor \XMM2, \T2, \T2
1611 vmovdqu HashKey_7(arg2), \T5
1612 vpclmulqdq $0x11, \T5, \XMM2, \T4
1613 vpxor \T4, \T6, \T6
1614
1615 vpclmulqdq $0x00, \T5, \XMM2, \T4
1616 vpxor \T4, \T7, \T7
1617
1618 vmovdqu HashKey_7_k(arg2), \T3
1619 vpclmulqdq $0x00, \T3, \T2, \T2
1620 vpxor \T2, \XMM1, \XMM1
1621
1622 ######################
1623
1624 vpshufd $0b01001110, \XMM3, \T2
1625 vpxor \XMM3, \T2, \T2
1626 vmovdqu HashKey_6(arg2), \T5
1627 vpclmulqdq $0x11, \T5, \XMM3, \T4
1628 vpxor \T4, \T6, \T6
1629
1630 vpclmulqdq $0x00, \T5, \XMM3, \T4
1631 vpxor \T4, \T7, \T7
1632
1633 vmovdqu HashKey_6_k(arg2), \T3
1634 vpclmulqdq $0x00, \T3, \T2, \T2
1635 vpxor \T2, \XMM1, \XMM1
1636
1637 ######################
1638
1639 vpshufd $0b01001110, \XMM4, \T2
1640 vpxor \XMM4, \T2, \T2
1641 vmovdqu HashKey_5(arg2), \T5
1642 vpclmulqdq $0x11, \T5, \XMM4, \T4
1643 vpxor \T4, \T6, \T6
1644
1645 vpclmulqdq $0x00, \T5, \XMM4, \T4
1646 vpxor \T4, \T7, \T7
1647
1648 vmovdqu HashKey_5_k(arg2), \T3
1649 vpclmulqdq $0x00, \T3, \T2, \T2
1650 vpxor \T2, \XMM1, \XMM1
1651
1652 ######################
1653
1654 vpshufd $0b01001110, \XMM5, \T2
1655 vpxor \XMM5, \T2, \T2
1656 vmovdqu HashKey_4(arg2), \T5
1657 vpclmulqdq $0x11, \T5, \XMM5, \T4
1658 vpxor \T4, \T6, \T6
1659
1660 vpclmulqdq $0x00, \T5, \XMM5, \T4
1661 vpxor \T4, \T7, \T7
1662
1663 vmovdqu HashKey_4_k(arg2), \T3
1664 vpclmulqdq $0x00, \T3, \T2, \T2
1665 vpxor \T2, \XMM1, \XMM1
1666
1667 ######################
1668
1669 vpshufd $0b01001110, \XMM6, \T2
1670 vpxor \XMM6, \T2, \T2
1671 vmovdqu HashKey_3(arg2), \T5
1672 vpclmulqdq $0x11, \T5, \XMM6, \T4
1673 vpxor \T4, \T6, \T6
1674
1675 vpclmulqdq $0x00, \T5, \XMM6, \T4
1676 vpxor \T4, \T7, \T7
1677
1678 vmovdqu HashKey_3_k(arg2), \T3
1679 vpclmulqdq $0x00, \T3, \T2, \T2
1680 vpxor \T2, \XMM1, \XMM1
1681
1682 ######################
1683
1684 vpshufd $0b01001110, \XMM7, \T2
1685 vpxor \XMM7, \T2, \T2
1686 vmovdqu HashKey_2(arg2), \T5
1687 vpclmulqdq $0x11, \T5, \XMM7, \T4
1688 vpxor \T4, \T6, \T6
1689
1690 vpclmulqdq $0x00, \T5, \XMM7, \T4
1691 vpxor \T4, \T7, \T7
1692
1693 vmovdqu HashKey_2_k(arg2), \T3
1694 vpclmulqdq $0x00, \T3, \T2, \T2
1695 vpxor \T2, \XMM1, \XMM1
1696
1697 ######################
1698
1699 vpshufd $0b01001110, \XMM8, \T2
1700 vpxor \XMM8, \T2, \T2
1701 vmovdqu HashKey(arg2), \T5
1702 vpclmulqdq $0x11, \T5, \XMM8, \T4
1703 vpxor \T4, \T6, \T6
1704
1705 vpclmulqdq $0x00, \T5, \XMM8, \T4
1706 vpxor \T4, \T7, \T7
1707
1708 vmovdqu HashKey_k(arg2), \T3
1709 vpclmulqdq $0x00, \T3, \T2, \T2
1710
1711 vpxor \T2, \XMM1, \XMM1
1712 vpxor \T6, \XMM1, \XMM1
1713 vpxor \T7, \XMM1, \T2
1714
1715
1716
1717
1718 vpslldq $8, \T2, \T4
1719 vpsrldq $8, \T2, \T2
1720
1721 vpxor \T4, \T7, \T7
1722 vpxor \T2, \T6, \T6 # <T6:T7> holds the result of
1723 # the accumulated carry-less multiplications
1724
1725 #######################################################################
1726 #first phase of the reduction
1727	vpslld	$31, \T7, \T2				# packed left shifting << 31
1728	vpslld	$30, \T7, \T3				# packed left shifting << 30
1729	vpslld	$25, \T7, \T4				# packed left shifting << 25
1730
1731 vpxor \T3, \T2, \T2 # xor the shifted versions
1732 vpxor \T4, \T2, \T2
1733
1734 vpsrldq $4, \T2, \T1 # shift-R T1 1 DW
1735
1736 vpslldq $12, \T2, \T2 # shift-L T2 3 DWs
1737 vpxor \T2, \T7, \T7 # first phase of the reduction complete
1738 #######################################################################
1739
1740
1741 #second phase of the reduction
1742	vpsrld	$1, \T7, \T2				# packed right shifting >> 1
1743	vpsrld	$2, \T7, \T3				# packed right shifting >> 2
1744	vpsrld	$7, \T7, \T4				# packed right shifting >> 7
1745 vpxor \T3, \T2, \T2 # xor the shifted versions
1746 vpxor \T4, \T2, \T2
1747
1748 vpxor \T1, \T2, \T2
1749 vpxor \T2, \T7, \T7
1750 vpxor \T7, \T6, \T6 # the result is in T6
1751
1752.endm
1753
1754#############################################################
1755#void   aesni_gcm_init_avx_gen2
1756#        (gcm_data     *my_ctx_data,
1757#         gcm_context_data *data,
1758#        u8      *iv, /* Pre-counter block j0: 4 byte salt
1759#			(from Security Association) concatenated with 8 byte
1760#			Initialisation Vector (from IPSec ESP Payload)
1761#			concatenated with 0x00000001. 16-byte aligned pointer. */
1762#        u8     *hash_subkey, /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
1763#        const   u8 *aad, /* Additional Authentication Data (AAD) */
1764#        u64     aad_len) /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
1765#############################################################
1766SYM_FUNC_START(aesni_gcm_init_avx_gen2)
1767 FUNC_SAVE
1768 INIT GHASH_MUL_AVX, PRECOMPUTE_AVX
1769 FUNC_RESTORE
1770 RET
1771SYM_FUNC_END(aesni_gcm_init_avx_gen2)
1772
1773###############################################################################
1774#void aesni_gcm_enc_update_avx_gen2(
1775# gcm_data *my_ctx_data, /* aligned to 16 Bytes */
1776# gcm_context_data *data,
1777# u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */
1778# const u8 *in, /* Plaintext input */
1779# u64 plaintext_len) /* Length of data in Bytes for encryption. */
1780###############################################################################
1781SYM_FUNC_START(aesni_gcm_enc_update_avx_gen2)
1782 FUNC_SAVE
1783 mov keysize, %eax
1784 cmp $32, %eax
1785 je key_256_enc_update
1786 cmp $16, %eax
1787 je key_128_enc_update
1788 # must be 192
1789 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 11
1790 FUNC_RESTORE
1791 RET
1792key_128_enc_update:
1793 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 9
1794 FUNC_RESTORE
1795 RET
1796key_256_enc_update:
1797 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 13
1798 FUNC_RESTORE
1799 RET
1800SYM_FUNC_END(aesni_gcm_enc_update_avx_gen2)
1801
1802###############################################################################
1803#void aesni_gcm_dec_update_avx_gen2(
1804# gcm_data *my_ctx_data, /* aligned to 16 Bytes */
1805# gcm_context_data *data,
1806# u8 *out, /* Plaintext output. Decrypt in-place is allowed. */
1807# const u8 *in, /* Ciphertext input */
1808#        u64     plaintext_len) /* Length of data in Bytes for decryption. */
1809###############################################################################
1810SYM_FUNC_START(aesni_gcm_dec_update_avx_gen2)
1811 FUNC_SAVE
1812 mov keysize,%eax
1813 cmp $32, %eax
1814 je key_256_dec_update
1815 cmp $16, %eax
1816 je key_128_dec_update
1817 # must be 192
1818 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 11
1819 FUNC_RESTORE
1820 RET
1821key_128_dec_update:
1822 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 9
1823 FUNC_RESTORE
1824 RET
1825key_256_dec_update:
1826 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 13
1827 FUNC_RESTORE
1828 RET
1829SYM_FUNC_END(aesni_gcm_dec_update_avx_gen2)
1830
1831###############################################################################
1832#void aesni_gcm_finalize_avx_gen2(
1833# gcm_data *my_ctx_data, /* aligned to 16 Bytes */
1834# gcm_context_data *data,
1835# u8 *auth_tag, /* Authenticated Tag output. */
1836#        u64     auth_tag_len) /* Authenticated Tag Length in bytes.
1837# Valid values are 16 (most likely), 12 or 8. */
1838###############################################################################
1839SYM_FUNC_START(aesni_gcm_finalize_avx_gen2)
1840 FUNC_SAVE
1841 mov keysize,%eax
1842 cmp $32, %eax
1843 je key_256_finalize
1844 cmp $16, %eax
1845 je key_128_finalize
1846 # must be 192
1847 GCM_COMPLETE GHASH_MUL_AVX, 11, arg3, arg4
1848 FUNC_RESTORE
1849 RET
1850key_128_finalize:
1851 GCM_COMPLETE GHASH_MUL_AVX, 9, arg3, arg4
1852 FUNC_RESTORE
1853 RET
1854key_256_finalize:
1855 GCM_COMPLETE GHASH_MUL_AVX, 13, arg3, arg4
1856 FUNC_RESTORE
1857 RET
1858SYM_FUNC_END(aesni_gcm_finalize_avx_gen2)
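#
# Expected calling pattern for the three gen2 entry points above: init once,
# enc_update (or dec_update) as data becomes available, then finalize to emit
# the tag.  A hedged C sketch only -- the context types are collapsed to
# void * and the extern prototypes are paraphrased from the comment blocks in
# this file rather than copied from a real header.
#
#	#include <stdint.h>
#
#	extern void aesni_gcm_init_avx_gen2(void *ctx, void *data, uint8_t *iv,
#					    uint8_t *hash_subkey,
#					    const uint8_t *aad, uint64_t aad_len);
#	extern void aesni_gcm_enc_update_avx_gen2(void *ctx, void *data,
#						   uint8_t *out, const uint8_t *in,
#						   uint64_t len);
#	extern void aesni_gcm_finalize_avx_gen2(void *ctx, void *data,
#						uint8_t *tag, uint64_t tag_len);
#
#	static void gcm_enc_one_shot(void *ctx, void *data, uint8_t *iv,
#				     uint8_t *hash_subkey, const uint8_t *aad,
#				     uint64_t aad_len, uint8_t *out,
#				     const uint8_t *in, uint64_t len,
#				     uint8_t *tag, uint64_t tag_len)
#	{
#		aesni_gcm_init_avx_gen2(ctx, data, iv, hash_subkey, aad, aad_len);
#		aesni_gcm_enc_update_avx_gen2(ctx, data, out, in, len);
#		aesni_gcm_finalize_avx_gen2(ctx, data, tag, tag_len);
#	}
#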
1859
1860###############################################################################
1861# GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
1862# Input: A and B (128-bits each, bit-reflected)
1863# Output: C = A*B*x mod poly, (i.e. the product shifted right by one bit)
1864# To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
1865# GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
1866###############################################################################
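#
# For reference, a plain bitwise C implementation of the same multiplication
# in GCM's bit ordering (NIST SP 800-38D style, reduction by 0xE1 || 0^120).
# It is only a sketch of the math; the macro below computes the equivalent
# product on bit-reflected operands with vpclmulqdq and the two-phase POLY2
# reduction, which is much faster but harder to follow.
#
#	#include <stdint.h>
#	#include <string.h>
#
#	/* Z = X * Y in GF(2^128); GCM convention: bit 0 is the MSB of byte 0 */
#	void gf128_mul(const uint8_t X[16], const uint8_t Y[16], uint8_t Z[16])
#	{
#		uint8_t V[16];
#		int i, j, lsb;
#
#		memcpy(V, Y, 16);
#		memset(Z, 0, 16);
#		for (i = 0; i < 128; i++) {
#			if (X[i / 8] & (0x80 >> (i % 8)))	/* bit i of X set? */
#				for (j = 0; j < 16; j++)
#					Z[j] ^= V[j];
#			lsb = V[15] & 1;		/* V = V * x, then reduce */
#			for (j = 15; j > 0; j--)
#				V[j] = (V[j] >> 1) | (V[j - 1] << 7);
#			V[0] >>= 1;
#			if (lsb)
#				V[0] ^= 0xE1;
#		}
#	}
#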
1867.macro GHASH_MUL_AVX2 GH HK T1 T2 T3 T4 T5
1868
1869 vpclmulqdq $0x11,\HK,\GH,\T1 # T1 = a1*b1
1870 vpclmulqdq $0x00,\HK,\GH,\T2 # T2 = a0*b0
1871 vpclmulqdq $0x01,\HK,\GH,\T3 # T3 = a1*b0
1872 vpclmulqdq $0x10,\HK,\GH,\GH # GH = a0*b1
1873 vpxor \T3, \GH, \GH
1874
1875
1876 vpsrldq $8 , \GH, \T3 # shift-R GH 2 DWs
1877 vpslldq $8 , \GH, \GH # shift-L GH 2 DWs
1878
1879 vpxor \T3, \T1, \T1
1880 vpxor \T2, \GH, \GH
1881
1882 #######################################################################
1883 #first phase of the reduction
1884 vmovdqa POLY2(%rip), \T3
1885
1886 vpclmulqdq $0x01, \GH, \T3, \T2
1887 vpslldq $8, \T2, \T2 # shift-L T2 2 DWs
1888
1889 vpxor \T2, \GH, \GH # first phase of the reduction complete
1890 #######################################################################
1891 #second phase of the reduction
1892 vpclmulqdq $0x00, \GH, \T3, \T2
1893 vpsrldq $4, \T2, \T2 # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
1894
1895 vpclmulqdq $0x10, \GH, \T3, \GH
1896 vpslldq $4, \GH, \GH # shift-L GH 1 DW (Shift-L 1-DW to obtain result with no shifts)
1897
1898 vpxor \T2, \GH, \GH # second phase of the reduction complete
1899 #######################################################################
1900 vpxor \T1, \GH, \GH # the result is in GH
1901
1902
1903.endm
1904
1905.macro PRECOMPUTE_AVX2 HK T1 T2 T3 T4 T5 T6
1906
1907        # Precompute HashKey^2 through HashKey^8, each <<1 mod poly, for the 8-block parallel GHASH
1908 vmovdqa \HK, \T5
1909 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly
1910 vmovdqu \T5, HashKey_2(arg2) # [HashKey_2] = HashKey^2<<1 mod poly
1911
1912 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly
1913 vmovdqu \T5, HashKey_3(arg2)
1914
1915 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly
1916 vmovdqu \T5, HashKey_4(arg2)
1917
1918 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly
1919 vmovdqu \T5, HashKey_5(arg2)
1920
1921 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly
1922 vmovdqu \T5, HashKey_6(arg2)
1923
1924 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly
1925 vmovdqu \T5, HashKey_7(arg2)
1926
1927 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly
1928 vmovdqu \T5, HashKey_8(arg2)
1929
1930.endm
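#
# What the precompute buys us: with H^1 through H^8 on hand, eight GHASH
# updates can be folded into one sum with a single reduction at the end
# ("aggregated reduction", used by the 8-way loop below).  A hedged C sketch
# of the plain powers, reusing gf128_mul() from the reference sketch above;
# note the assembly actually stores each power as HashKey^i<<1 mod poly in
# the bit-reflected domain, which this sketch does not model.
#
#	#include <stdint.h>
#	#include <string.h>
#
#	void gf128_mul(const uint8_t X[16], const uint8_t Y[16], uint8_t Z[16]);
#
#	/* Hpow[0] = H^1, Hpow[1] = H^2, ..., Hpow[7] = H^8 */
#	static void precompute_hashkeys(const uint8_t H[16], uint8_t Hpow[8][16])
#	{
#		int i;
#
#		memcpy(Hpow[0], H, 16);
#		for (i = 1; i < 8; i++)
#			gf128_mul(Hpow[i - 1], H, Hpow[i]);
#	}
#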
1931
1932## if a = number of total plaintext bytes
1933## b = floor(a/16)
1934## num_initial_blocks = b mod 8
1935## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
1936## r10, r11, r12, rax are clobbered
1937## arg1, arg2, arg3, arg4 are used as pointers only, not modified
1938
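#
# Counter handling in the macro below, stated in C for clarity: CurCount keeps
# the 32-bit block counter in its low dword, is bumped by one per block
# (vpaddd ONE), and is byte-reflected with SHUF_MASK before being run through
# AES.  Viewed on the wire-format block j0 this is just a 32-bit big-endian
# increment of the last four bytes; a hedged reference sketch:
#
#	#include <stdint.h>
#
#	/* increment the rightmost 32 bits of a big-endian counter block */
#	static void ctr32_inc(uint8_t block[16])
#	{
#		int i;
#
#		for (i = 15; i >= 12; i--)
#			if (++block[i] != 0)
#				break;
#	}
#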
1939.macro INITIAL_BLOCKS_AVX2 REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER
1940 i = (8-\num_initial_blocks)
1941 setreg
1942 vmovdqu AadHash(arg2), reg_i
1943
1944 # start AES for num_initial_blocks blocks
1945 vmovdqu CurCount(arg2), \CTR
1946
1947 i = (9-\num_initial_blocks)
1948 setreg
1949.rep \num_initial_blocks
1950 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1951 vmovdqa \CTR, reg_i
1952 vpshufb SHUF_MASK(%rip), reg_i, reg_i # perform a 16Byte swap
1953 i = (i+1)
1954 setreg
1955.endr
1956
1957 vmovdqa (arg1), \T_key
1958 i = (9-\num_initial_blocks)
1959 setreg
1960.rep \num_initial_blocks
1961 vpxor \T_key, reg_i, reg_i
1962 i = (i+1)
1963 setreg
1964.endr
1965
1966 j = 1
1967 setreg
1968.rep \REP
1969 vmovdqa 16*j(arg1), \T_key
1970 i = (9-\num_initial_blocks)
1971 setreg
1972.rep \num_initial_blocks
1973 vaesenc \T_key, reg_i, reg_i
1974 i = (i+1)
1975 setreg
1976.endr
1977
1978 j = (j+1)
1979 setreg
1980.endr
1981
1982
1983 vmovdqa 16*j(arg1), \T_key
1984 i = (9-\num_initial_blocks)
1985 setreg
1986.rep \num_initial_blocks
1987 vaesenclast \T_key, reg_i, reg_i
1988 i = (i+1)
1989 setreg
1990.endr
1991
1992 i = (9-\num_initial_blocks)
1993 setreg
1994.rep \num_initial_blocks
1995 vmovdqu (arg4, %r11), \T1
1996 vpxor \T1, reg_i, reg_i
1997 vmovdqu reg_i, (arg3 , %r11) # write back ciphertext for
1998 # num_initial_blocks blocks
1999 add $16, %r11
2000.if \ENC_DEC == DEC
2001 vmovdqa \T1, reg_i
2002.endif
2003 vpshufb SHUF_MASK(%rip), reg_i, reg_i # prepare ciphertext for GHASH computations
2004 i = (i+1)
2005 setreg
2006.endr
2007
2008
2009 i = (8-\num_initial_blocks)
2010 j = (9-\num_initial_blocks)
2011 setreg
2012
2013.rep \num_initial_blocks
2014 vpxor reg_i, reg_j, reg_j
2015 GHASH_MUL_AVX2 reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks
2016 i = (i+1)
2017 j = (j+1)
2018 setreg
2019.endr
2020 # XMM8 has the combined result here
2021
2022 vmovdqa \XMM8, TMP1(%rsp)
2023 vmovdqa \XMM8, \T3
2024
2025 cmp $128, %r13
2026 jl _initial_blocks_done\@ # no need for precomputed constants
2027
2028###############################################################################
2029# generate 8 counter blocks, AES-encrypt them and XOR with the next 8 blocks of input
2030 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
2031 vmovdqa \CTR, \XMM1
2032 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
2033
2034 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
2035 vmovdqa \CTR, \XMM2
2036 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
2037
2038 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
2039 vmovdqa \CTR, \XMM3
2040 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
2041
2042 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
2043 vmovdqa \CTR, \XMM4
2044 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
2045
2046 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
2047 vmovdqa \CTR, \XMM5
2048 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
2049
2050 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
2051 vmovdqa \CTR, \XMM6
2052 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
2053
2054 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
2055 vmovdqa \CTR, \XMM7
2056 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
2057
2058 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
2059 vmovdqa \CTR, \XMM8
2060 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
2061
2062 vmovdqa (arg1), \T_key
2063 vpxor \T_key, \XMM1, \XMM1
2064 vpxor \T_key, \XMM2, \XMM2
2065 vpxor \T_key, \XMM3, \XMM3
2066 vpxor \T_key, \XMM4, \XMM4
2067 vpxor \T_key, \XMM5, \XMM5
2068 vpxor \T_key, \XMM6, \XMM6
2069 vpxor \T_key, \XMM7, \XMM7
2070 vpxor \T_key, \XMM8, \XMM8
2071
2072 i = 1
2073 setreg
2074.rep \REP # do REP rounds
2075 vmovdqa 16*i(arg1), \T_key
2076 vaesenc \T_key, \XMM1, \XMM1
2077 vaesenc \T_key, \XMM2, \XMM2
2078 vaesenc \T_key, \XMM3, \XMM3
2079 vaesenc \T_key, \XMM4, \XMM4
2080 vaesenc \T_key, \XMM5, \XMM5
2081 vaesenc \T_key, \XMM6, \XMM6
2082 vaesenc \T_key, \XMM7, \XMM7
2083 vaesenc \T_key, \XMM8, \XMM8
2084 i = (i+1)
2085 setreg
2086.endr
2087
2088
2089 vmovdqa 16*i(arg1), \T_key
2090 vaesenclast \T_key, \XMM1, \XMM1
2091 vaesenclast \T_key, \XMM2, \XMM2
2092 vaesenclast \T_key, \XMM3, \XMM3
2093 vaesenclast \T_key, \XMM4, \XMM4
2094 vaesenclast \T_key, \XMM5, \XMM5
2095 vaesenclast \T_key, \XMM6, \XMM6
2096 vaesenclast \T_key, \XMM7, \XMM7
2097 vaesenclast \T_key, \XMM8, \XMM8
2098
2099 vmovdqu (arg4, %r11), \T1
2100 vpxor \T1, \XMM1, \XMM1
2101 vmovdqu \XMM1, (arg3 , %r11)
2102 .if \ENC_DEC == DEC
2103 vmovdqa \T1, \XMM1
2104 .endif
2105
2106 vmovdqu 16*1(arg4, %r11), \T1
2107 vpxor \T1, \XMM2, \XMM2
2108 vmovdqu \XMM2, 16*1(arg3 , %r11)
2109 .if \ENC_DEC == DEC
2110 vmovdqa \T1, \XMM2
2111 .endif
2112
2113 vmovdqu 16*2(arg4, %r11), \T1
2114 vpxor \T1, \XMM3, \XMM3
2115 vmovdqu \XMM3, 16*2(arg3 , %r11)
2116 .if \ENC_DEC == DEC
2117 vmovdqa \T1, \XMM3
2118 .endif
2119
2120 vmovdqu 16*3(arg4, %r11), \T1
2121 vpxor \T1, \XMM4, \XMM4
2122 vmovdqu \XMM4, 16*3(arg3 , %r11)
2123 .if \ENC_DEC == DEC
2124 vmovdqa \T1, \XMM4
2125 .endif
2126
2127 vmovdqu 16*4(arg4, %r11), \T1
2128 vpxor \T1, \XMM5, \XMM5
2129 vmovdqu \XMM5, 16*4(arg3 , %r11)
2130 .if \ENC_DEC == DEC
2131 vmovdqa \T1, \XMM5
2132 .endif
2133
2134 vmovdqu 16*5(arg4, %r11), \T1
2135 vpxor \T1, \XMM6, \XMM6
2136 vmovdqu \XMM6, 16*5(arg3 , %r11)
2137 .if \ENC_DEC == DEC
2138 vmovdqa \T1, \XMM6
2139 .endif
2140
2141 vmovdqu 16*6(arg4, %r11), \T1
2142 vpxor \T1, \XMM7, \XMM7
2143 vmovdqu \XMM7, 16*6(arg3 , %r11)
2144 .if \ENC_DEC == DEC
2145 vmovdqa \T1, \XMM7
2146 .endif
2147
2148 vmovdqu 16*7(arg4, %r11), \T1
2149 vpxor \T1, \XMM8, \XMM8
2150 vmovdqu \XMM8, 16*7(arg3 , %r11)
2151 .if \ENC_DEC == DEC
2152 vmovdqa \T1, \XMM8
2153 .endif
2154
2155 add $128, %r11
2156
2157 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
2158 vpxor TMP1(%rsp), \XMM1, \XMM1 # combine GHASHed value with
2159 # the corresponding ciphertext
2160 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
2161 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
2162 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
2163 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
2164 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
2165 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
2166 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
2167
2168###############################################################################
2169
2170_initial_blocks_done\@:
2171
2172
2173.endm
2174
2175
2176
2177# encrypt 8 blocks at a time
2178# ghash the 8 previously encrypted ciphertext blocks
2179# arg1, arg2, arg3, arg4 are used as pointers only, not modified
2180# r11 is the data offset value
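#
# The body of this macro does two jobs at once: it runs the AES rounds for the
# next 8 counter blocks while folding the previous 8 ciphertext blocks into
# the GHASH accumulator.  The GHASH side computes the aggregated form
#     acc = (acc ^ C1)*H^8  ^  C2*H^7  ^ ... ^  C8*H
# with the polynomial reduction done once at the end.  A hedged C sketch of
# just that folding, reusing gf128_mul() from the reference sketch earlier
# (the real code also defers the reduction across all eight products, which
# this sketch does not model):
#
#	#include <stdint.h>
#	#include <string.h>
#
#	void gf128_mul(const uint8_t X[16], const uint8_t Y[16], uint8_t Z[16]);
#
#	/* c[0] is the oldest ciphertext block; Hpow[i] = H^(i+1) */
#	static void ghash_8_blocks(uint8_t acc[16], const uint8_t c[8][16],
#				   const uint8_t Hpow[8][16])
#	{
#		uint8_t t[16], prod[16], sum[16];
#		int i, j;
#
#		memset(sum, 0, 16);
#		for (i = 0; i < 8; i++) {
#			memcpy(t, c[i], 16);
#			if (i == 0)
#				for (j = 0; j < 16; j++)
#					t[j] ^= acc[j];	/* fold in the old accumulator */
#			gf128_mul(t, Hpow[7 - i], prod);	/* multiply by H^(8-i) */
#			for (j = 0; j < 16; j++)
#				sum[j] ^= prod[j];
#		}
#		memcpy(acc, sum, 16);
#	}
#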
2181.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX2 REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
2182
2183 vmovdqa \XMM1, \T2
2184 vmovdqa \XMM2, TMP2(%rsp)
2185 vmovdqa \XMM3, TMP3(%rsp)
2186 vmovdqa \XMM4, TMP4(%rsp)
2187 vmovdqa \XMM5, TMP5(%rsp)
2188 vmovdqa \XMM6, TMP6(%rsp)
2189 vmovdqa \XMM7, TMP7(%rsp)
2190 vmovdqa \XMM8, TMP8(%rsp)
2191
2192.if \loop_idx == in_order
2193 vpaddd ONE(%rip), \CTR, \XMM1 # INCR CNT
2194 vpaddd ONE(%rip), \XMM1, \XMM2
2195 vpaddd ONE(%rip), \XMM2, \XMM3
2196 vpaddd ONE(%rip), \XMM3, \XMM4
2197 vpaddd ONE(%rip), \XMM4, \XMM5
2198 vpaddd ONE(%rip), \XMM5, \XMM6
2199 vpaddd ONE(%rip), \XMM6, \XMM7
2200 vpaddd ONE(%rip), \XMM7, \XMM8
2201 vmovdqa \XMM8, \CTR
2202
2203 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
2204 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
2205 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
2206 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
2207 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
2208 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
2209 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
2210 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
2211.else
2212 vpaddd ONEf(%rip), \CTR, \XMM1 # INCR CNT
2213 vpaddd ONEf(%rip), \XMM1, \XMM2
2214 vpaddd ONEf(%rip), \XMM2, \XMM3
2215 vpaddd ONEf(%rip), \XMM3, \XMM4
2216 vpaddd ONEf(%rip), \XMM4, \XMM5
2217 vpaddd ONEf(%rip), \XMM5, \XMM6
2218 vpaddd ONEf(%rip), \XMM6, \XMM7
2219 vpaddd ONEf(%rip), \XMM7, \XMM8
2220 vmovdqa \XMM8, \CTR
2221.endif
2222
2223
2224 #######################################################################
2225
2226 vmovdqu (arg1), \T1
2227 vpxor \T1, \XMM1, \XMM1
2228 vpxor \T1, \XMM2, \XMM2
2229 vpxor \T1, \XMM3, \XMM3
2230 vpxor \T1, \XMM4, \XMM4
2231 vpxor \T1, \XMM5, \XMM5
2232 vpxor \T1, \XMM6, \XMM6
2233 vpxor \T1, \XMM7, \XMM7
2234 vpxor \T1, \XMM8, \XMM8
2235
2236 #######################################################################
2237
2238
2239
2240
2241
2242 vmovdqu 16*1(arg1), \T1
2243 vaesenc \T1, \XMM1, \XMM1
2244 vaesenc \T1, \XMM2, \XMM2
2245 vaesenc \T1, \XMM3, \XMM3
2246 vaesenc \T1, \XMM4, \XMM4
2247 vaesenc \T1, \XMM5, \XMM5
2248 vaesenc \T1, \XMM6, \XMM6
2249 vaesenc \T1, \XMM7, \XMM7
2250 vaesenc \T1, \XMM8, \XMM8
2251
2252 vmovdqu 16*2(arg1), \T1
2253 vaesenc \T1, \XMM1, \XMM1
2254 vaesenc \T1, \XMM2, \XMM2
2255 vaesenc \T1, \XMM3, \XMM3
2256 vaesenc \T1, \XMM4, \XMM4
2257 vaesenc \T1, \XMM5, \XMM5
2258 vaesenc \T1, \XMM6, \XMM6
2259 vaesenc \T1, \XMM7, \XMM7
2260 vaesenc \T1, \XMM8, \XMM8
2261
2262
2263 #######################################################################
2264
2265 vmovdqu HashKey_8(arg2), \T5
2266 vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1
2267 vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0
2268 vpclmulqdq $0x01, \T5, \T2, \T6 # T6 = a1*b0
2269 vpclmulqdq $0x10, \T5, \T2, \T5 # T5 = a0*b1
2270 vpxor \T5, \T6, \T6
2271
2272 vmovdqu 16*3(arg1), \T1
2273 vaesenc \T1, \XMM1, \XMM1
2274 vaesenc \T1, \XMM2, \XMM2
2275 vaesenc \T1, \XMM3, \XMM3
2276 vaesenc \T1, \XMM4, \XMM4
2277 vaesenc \T1, \XMM5, \XMM5
2278 vaesenc \T1, \XMM6, \XMM6
2279 vaesenc \T1, \XMM7, \XMM7
2280 vaesenc \T1, \XMM8, \XMM8
2281
2282 vmovdqa TMP2(%rsp), \T1
2283 vmovdqu HashKey_7(arg2), \T5
2284 vpclmulqdq $0x11, \T5, \T1, \T3
2285 vpxor \T3, \T4, \T4
2286
2287 vpclmulqdq $0x00, \T5, \T1, \T3
2288 vpxor \T3, \T7, \T7
2289
2290 vpclmulqdq $0x01, \T5, \T1, \T3
2291 vpxor \T3, \T6, \T6
2292
2293 vpclmulqdq $0x10, \T5, \T1, \T3
2294 vpxor \T3, \T6, \T6
2295
2296 vmovdqu 16*4(arg1), \T1
2297 vaesenc \T1, \XMM1, \XMM1
2298 vaesenc \T1, \XMM2, \XMM2
2299 vaesenc \T1, \XMM3, \XMM3
2300 vaesenc \T1, \XMM4, \XMM4
2301 vaesenc \T1, \XMM5, \XMM5
2302 vaesenc \T1, \XMM6, \XMM6
2303 vaesenc \T1, \XMM7, \XMM7
2304 vaesenc \T1, \XMM8, \XMM8
2305
2306 #######################################################################
2307
2308 vmovdqa TMP3(%rsp), \T1
2309 vmovdqu HashKey_6(arg2), \T5
2310 vpclmulqdq $0x11, \T5, \T1, \T3
2311 vpxor \T3, \T4, \T4
2312
2313 vpclmulqdq $0x00, \T5, \T1, \T3
2314 vpxor \T3, \T7, \T7
2315
2316 vpclmulqdq $0x01, \T5, \T1, \T3
2317 vpxor \T3, \T6, \T6
2318
2319 vpclmulqdq $0x10, \T5, \T1, \T3
2320 vpxor \T3, \T6, \T6
2321
2322 vmovdqu 16*5(arg1), \T1
2323 vaesenc \T1, \XMM1, \XMM1
2324 vaesenc \T1, \XMM2, \XMM2
2325 vaesenc \T1, \XMM3, \XMM3
2326 vaesenc \T1, \XMM4, \XMM4
2327 vaesenc \T1, \XMM5, \XMM5
2328 vaesenc \T1, \XMM6, \XMM6
2329 vaesenc \T1, \XMM7, \XMM7
2330 vaesenc \T1, \XMM8, \XMM8
2331
2332 vmovdqa TMP4(%rsp), \T1
2333 vmovdqu HashKey_5(arg2), \T5
2334 vpclmulqdq $0x11, \T5, \T1, \T3
2335 vpxor \T3, \T4, \T4
2336
2337 vpclmulqdq $0x00, \T5, \T1, \T3
2338 vpxor \T3, \T7, \T7
2339
2340 vpclmulqdq $0x01, \T5, \T1, \T3
2341 vpxor \T3, \T6, \T6
2342
2343 vpclmulqdq $0x10, \T5, \T1, \T3
2344 vpxor \T3, \T6, \T6
2345
2346 vmovdqu 16*6(arg1), \T1
2347 vaesenc \T1, \XMM1, \XMM1
2348 vaesenc \T1, \XMM2, \XMM2
2349 vaesenc \T1, \XMM3, \XMM3
2350 vaesenc \T1, \XMM4, \XMM4
2351 vaesenc \T1, \XMM5, \XMM5
2352 vaesenc \T1, \XMM6, \XMM6
2353 vaesenc \T1, \XMM7, \XMM7
2354 vaesenc \T1, \XMM8, \XMM8
2355
2356
2357 vmovdqa TMP5(%rsp), \T1
2358 vmovdqu HashKey_4(arg2), \T5
2359 vpclmulqdq $0x11, \T5, \T1, \T3
2360 vpxor \T3, \T4, \T4
2361
2362 vpclmulqdq $0x00, \T5, \T1, \T3
2363 vpxor \T3, \T7, \T7
2364
2365 vpclmulqdq $0x01, \T5, \T1, \T3
2366 vpxor \T3, \T6, \T6
2367
2368 vpclmulqdq $0x10, \T5, \T1, \T3
2369 vpxor \T3, \T6, \T6
2370
2371 vmovdqu 16*7(arg1), \T1
2372 vaesenc \T1, \XMM1, \XMM1
2373 vaesenc \T1, \XMM2, \XMM2
2374 vaesenc \T1, \XMM3, \XMM3
2375 vaesenc \T1, \XMM4, \XMM4
2376 vaesenc \T1, \XMM5, \XMM5
2377 vaesenc \T1, \XMM6, \XMM6
2378 vaesenc \T1, \XMM7, \XMM7
2379 vaesenc \T1, \XMM8, \XMM8
2380
2381 vmovdqa TMP6(%rsp), \T1
2382 vmovdqu HashKey_3(arg2), \T5
2383 vpclmulqdq $0x11, \T5, \T1, \T3
2384 vpxor \T3, \T4, \T4
2385
2386 vpclmulqdq $0x00, \T5, \T1, \T3
2387 vpxor \T3, \T7, \T7
2388
2389 vpclmulqdq $0x01, \T5, \T1, \T3
2390 vpxor \T3, \T6, \T6
2391
2392 vpclmulqdq $0x10, \T5, \T1, \T3
2393 vpxor \T3, \T6, \T6
2394
2395 vmovdqu 16*8(arg1), \T1
2396 vaesenc \T1, \XMM1, \XMM1
2397 vaesenc \T1, \XMM2, \XMM2
2398 vaesenc \T1, \XMM3, \XMM3
2399 vaesenc \T1, \XMM4, \XMM4
2400 vaesenc \T1, \XMM5, \XMM5
2401 vaesenc \T1, \XMM6, \XMM6
2402 vaesenc \T1, \XMM7, \XMM7
2403 vaesenc \T1, \XMM8, \XMM8
2404
2405 vmovdqa TMP7(%rsp), \T1
2406 vmovdqu HashKey_2(arg2), \T5
2407 vpclmulqdq $0x11, \T5, \T1, \T3
2408 vpxor \T3, \T4, \T4
2409
2410 vpclmulqdq $0x00, \T5, \T1, \T3
2411 vpxor \T3, \T7, \T7
2412
2413 vpclmulqdq $0x01, \T5, \T1, \T3
2414 vpxor \T3, \T6, \T6
2415
2416 vpclmulqdq $0x10, \T5, \T1, \T3
2417 vpxor \T3, \T6, \T6
2418
2419
2420 #######################################################################
2421
2422 vmovdqu 16*9(arg1), \T5
2423 vaesenc \T5, \XMM1, \XMM1
2424 vaesenc \T5, \XMM2, \XMM2
2425 vaesenc \T5, \XMM3, \XMM3
2426 vaesenc \T5, \XMM4, \XMM4
2427 vaesenc \T5, \XMM5, \XMM5
2428 vaesenc \T5, \XMM6, \XMM6
2429 vaesenc \T5, \XMM7, \XMM7
2430 vaesenc \T5, \XMM8, \XMM8
2431
2432 vmovdqa TMP8(%rsp), \T1
2433 vmovdqu HashKey(arg2), \T5
2434
2435 vpclmulqdq $0x00, \T5, \T1, \T3
2436 vpxor \T3, \T7, \T7
2437
2438 vpclmulqdq $0x01, \T5, \T1, \T3
2439 vpxor \T3, \T6, \T6
2440
2441 vpclmulqdq $0x10, \T5, \T1, \T3
2442 vpxor \T3, \T6, \T6
2443
2444 vpclmulqdq $0x11, \T5, \T1, \T3
2445 vpxor \T3, \T4, \T1
2446
2447
2448 vmovdqu 16*10(arg1), \T5
2449
2450 i = 11
2451 setreg
2452.rep (\REP-9)
2453 vaesenc \T5, \XMM1, \XMM1
2454 vaesenc \T5, \XMM2, \XMM2
2455 vaesenc \T5, \XMM3, \XMM3
2456 vaesenc \T5, \XMM4, \XMM4
2457 vaesenc \T5, \XMM5, \XMM5
2458 vaesenc \T5, \XMM6, \XMM6
2459 vaesenc \T5, \XMM7, \XMM7
2460 vaesenc \T5, \XMM8, \XMM8
2461
2462 vmovdqu 16*i(arg1), \T5
2463 i = i + 1
2464 setreg
2465.endr
2466
2467 i = 0
2468 j = 1
2469 setreg
2470.rep 8
2471 vpxor 16*i(arg4, %r11), \T5, \T2
2472 .if \ENC_DEC == ENC
2473 vaesenclast \T2, reg_j, reg_j
2474 .else
2475 vaesenclast \T2, reg_j, \T3
2476 vmovdqu 16*i(arg4, %r11), reg_j
2477 vmovdqu \T3, 16*i(arg3, %r11)
2478 .endif
2479 i = (i+1)
2480 j = (j+1)
2481 setreg
2482.endr
2483 #######################################################################
2484
2485
2486        vpslldq $8, \T6, \T3                            # shift-L T6 2 DWs (low half into T3)
2487        vpsrldq $8, \T6, \T6                            # shift-R T6 2 DWs
2488 vpxor \T3, \T7, \T7
2489 vpxor \T6, \T1, \T1 # accumulate the results in T1:T7
2490
2491
2492
2493 #######################################################################
2494 #first phase of the reduction
2495 vmovdqa POLY2(%rip), \T3
2496
2497 vpclmulqdq $0x01, \T7, \T3, \T2
2498 vpslldq $8, \T2, \T2 # shift-L xmm2 2 DWs
2499
2500 vpxor \T2, \T7, \T7 # first phase of the reduction complete
2501 #######################################################################
2502 .if \ENC_DEC == ENC
2503 vmovdqu \XMM1, 16*0(arg3,%r11) # Write to the Ciphertext buffer
2504 vmovdqu \XMM2, 16*1(arg3,%r11) # Write to the Ciphertext buffer
2505 vmovdqu \XMM3, 16*2(arg3,%r11) # Write to the Ciphertext buffer
2506 vmovdqu \XMM4, 16*3(arg3,%r11) # Write to the Ciphertext buffer
2507 vmovdqu \XMM5, 16*4(arg3,%r11) # Write to the Ciphertext buffer
2508 vmovdqu \XMM6, 16*5(arg3,%r11) # Write to the Ciphertext buffer
2509 vmovdqu \XMM7, 16*6(arg3,%r11) # Write to the Ciphertext buffer
2510 vmovdqu \XMM8, 16*7(arg3,%r11) # Write to the Ciphertext buffer
2511 .endif
2512
2513 #######################################################################
2514 #second phase of the reduction
2515 vpclmulqdq $0x00, \T7, \T3, \T2
2516 vpsrldq $4, \T2, \T2 # shift-R xmm2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
2517
2518 vpclmulqdq $0x10, \T7, \T3, \T4
2519 vpslldq $4, \T4, \T4 # shift-L xmm0 1 DW (Shift-L 1-DW to obtain result with no shifts)
2520
2521 vpxor \T2, \T4, \T4 # second phase of the reduction complete
2522 #######################################################################
2523 vpxor \T4, \T1, \T1 # the result is in T1
2524
2525 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
2526 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
2527 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
2528 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
2529 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
2530 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
2531 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
2532 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
2533
2534
2535 vpxor \T1, \XMM1, \XMM1
2536
2537
2538
2539.endm
2540
2541
2542# GHASH the last 8 ciphertext blocks.
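#
# The macro below uses Karatsuba to cut each 128x128 carry-less multiply from
# four vpclmulqdq operations to three: with A = a1:a0 and B = b1:b0,
#     A*B = a1*b1 << 128  ^  a0*b0  ^  ((a1^a0)*(b1^b0) ^ a1*b1 ^ a0*b0) << 64
# A hedged, purely illustrative C sketch of that identity using a bitwise
# 64x64 carry-less multiply (no GCM reduction shown here):
#
#	#include <stdint.h>
#
#	/* r[0] = low, r[1] = high 64 bits of the carry-less product of a and b */
#	static void clmul64(uint64_t a, uint64_t b, uint64_t r[2])
#	{
#		uint64_t hi = 0, lo = 0;
#		int i;
#
#		for (i = 0; i < 64; i++)
#			if (b & (1ULL << i)) {
#				lo ^= a << i;
#				if (i)
#					hi ^= a >> (64 - i);
#			}
#		r[0] = lo;
#		r[1] = hi;
#	}
#
#	/* 256-bit product r[0..3] (low to high) of A = a[1]:a[0], B = b[1]:b[0] */
#	static void clmul128_karatsuba(const uint64_t a[2], const uint64_t b[2],
#				       uint64_t r[4])
#	{
#		uint64_t hh[2], ll[2], mm[2];
#
#		clmul64(a[1], b[1], hh);		/* a1*b1 */
#		clmul64(a[0], b[0], ll);		/* a0*b0 */
#		clmul64(a[1] ^ a[0], b[1] ^ b[0], mm);	/* (a1^a0)*(b1^b0) */
#		mm[0] ^= hh[0] ^ ll[0];			/* recover a1*b0 ^ a0*b1 */
#		mm[1] ^= hh[1] ^ ll[1];
#		r[0] = ll[0];
#		r[1] = ll[1] ^ mm[0];
#		r[2] = hh[0] ^ mm[1];
#		r[3] = hh[1];
#	}
#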
2543.macro GHASH_LAST_8_AVX2 T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8
2544
2545 ## Karatsuba Method
2546
2547 vmovdqu HashKey_8(arg2), \T5
2548
2549 vpshufd $0b01001110, \XMM1, \T2
2550 vpshufd $0b01001110, \T5, \T3
2551 vpxor \XMM1, \T2, \T2
2552 vpxor \T5, \T3, \T3
2553
2554 vpclmulqdq $0x11, \T5, \XMM1, \T6
2555 vpclmulqdq $0x00, \T5, \XMM1, \T7
2556
2557 vpclmulqdq $0x00, \T3, \T2, \XMM1
2558
2559 ######################
2560
2561 vmovdqu HashKey_7(arg2), \T5
2562 vpshufd $0b01001110, \XMM2, \T2
2563 vpshufd $0b01001110, \T5, \T3
2564 vpxor \XMM2, \T2, \T2
2565 vpxor \T5, \T3, \T3
2566
2567 vpclmulqdq $0x11, \T5, \XMM2, \T4
2568 vpxor \T4, \T6, \T6
2569
2570 vpclmulqdq $0x00, \T5, \XMM2, \T4
2571 vpxor \T4, \T7, \T7
2572
2573 vpclmulqdq $0x00, \T3, \T2, \T2
2574
2575 vpxor \T2, \XMM1, \XMM1
2576
2577 ######################
2578
2579 vmovdqu HashKey_6(arg2), \T5
2580 vpshufd $0b01001110, \XMM3, \T2
2581 vpshufd $0b01001110, \T5, \T3
2582 vpxor \XMM3, \T2, \T2
2583 vpxor \T5, \T3, \T3
2584
2585 vpclmulqdq $0x11, \T5, \XMM3, \T4
2586 vpxor \T4, \T6, \T6
2587
2588 vpclmulqdq $0x00, \T5, \XMM3, \T4
2589 vpxor \T4, \T7, \T7
2590
2591 vpclmulqdq $0x00, \T3, \T2, \T2
2592
2593 vpxor \T2, \XMM1, \XMM1
2594
2595 ######################
2596
2597 vmovdqu HashKey_5(arg2), \T5
2598 vpshufd $0b01001110, \XMM4, \T2
2599 vpshufd $0b01001110, \T5, \T3
2600 vpxor \XMM4, \T2, \T2
2601 vpxor \T5, \T3, \T3
2602
2603 vpclmulqdq $0x11, \T5, \XMM4, \T4
2604 vpxor \T4, \T6, \T6
2605
2606 vpclmulqdq $0x00, \T5, \XMM4, \T4
2607 vpxor \T4, \T7, \T7
2608
2609 vpclmulqdq $0x00, \T3, \T2, \T2
2610
2611 vpxor \T2, \XMM1, \XMM1
2612
2613 ######################
2614
2615 vmovdqu HashKey_4(arg2), \T5
2616 vpshufd $0b01001110, \XMM5, \T2
2617 vpshufd $0b01001110, \T5, \T3
2618 vpxor \XMM5, \T2, \T2
2619 vpxor \T5, \T3, \T3
2620
2621 vpclmulqdq $0x11, \T5, \XMM5, \T4
2622 vpxor \T4, \T6, \T6
2623
2624 vpclmulqdq $0x00, \T5, \XMM5, \T4
2625 vpxor \T4, \T7, \T7
2626
2627 vpclmulqdq $0x00, \T3, \T2, \T2
2628
2629 vpxor \T2, \XMM1, \XMM1
2630
2631 ######################
2632
2633 vmovdqu HashKey_3(arg2), \T5
2634 vpshufd $0b01001110, \XMM6, \T2
2635 vpshufd $0b01001110, \T5, \T3
2636 vpxor \XMM6, \T2, \T2
2637 vpxor \T5, \T3, \T3
2638
2639 vpclmulqdq $0x11, \T5, \XMM6, \T4
2640 vpxor \T4, \T6, \T6
2641
2642 vpclmulqdq $0x00, \T5, \XMM6, \T4
2643 vpxor \T4, \T7, \T7
2644
2645 vpclmulqdq $0x00, \T3, \T2, \T2
2646
2647 vpxor \T2, \XMM1, \XMM1
2648
2649 ######################
2650
2651 vmovdqu HashKey_2(arg2), \T5
2652 vpshufd $0b01001110, \XMM7, \T2
2653 vpshufd $0b01001110, \T5, \T3
2654 vpxor \XMM7, \T2, \T2
2655 vpxor \T5, \T3, \T3
2656
2657 vpclmulqdq $0x11, \T5, \XMM7, \T4
2658 vpxor \T4, \T6, \T6
2659
2660 vpclmulqdq $0x00, \T5, \XMM7, \T4
2661 vpxor \T4, \T7, \T7
2662
2663 vpclmulqdq $0x00, \T3, \T2, \T2
2664
2665 vpxor \T2, \XMM1, \XMM1
2666
2667 ######################
2668
2669 vmovdqu HashKey(arg2), \T5
2670 vpshufd $0b01001110, \XMM8, \T2
2671 vpshufd $0b01001110, \T5, \T3
2672 vpxor \XMM8, \T2, \T2
2673 vpxor \T5, \T3, \T3
2674
2675 vpclmulqdq $0x11, \T5, \XMM8, \T4
2676 vpxor \T4, \T6, \T6
2677
2678 vpclmulqdq $0x00, \T5, \XMM8, \T4
2679 vpxor \T4, \T7, \T7
2680
2681 vpclmulqdq $0x00, \T3, \T2, \T2
2682
2683 vpxor \T2, \XMM1, \XMM1
2684 vpxor \T6, \XMM1, \XMM1
2685 vpxor \T7, \XMM1, \T2
2686
2687
2688
2689
2690 vpslldq $8, \T2, \T4
2691 vpsrldq $8, \T2, \T2
2692
2693 vpxor \T4, \T7, \T7
2694 vpxor \T2, \T6, \T6 # <T6:T7> holds the result of the
2695 # accumulated carry-less multiplications
2696
2697 #######################################################################
2698 #first phase of the reduction
2699 vmovdqa POLY2(%rip), \T3
2700
2701 vpclmulqdq $0x01, \T7, \T3, \T2
2702 vpslldq $8, \T2, \T2 # shift-L xmm2 2 DWs
2703
2704 vpxor \T2, \T7, \T7 # first phase of the reduction complete
2705 #######################################################################
2706
2707
2708 #second phase of the reduction
2709 vpclmulqdq $0x00, \T7, \T3, \T2
2710 vpsrldq $4, \T2, \T2 # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
2711
2712 vpclmulqdq $0x10, \T7, \T3, \T4
2713 vpslldq $4, \T4, \T4 # shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts)
2714
2715 vpxor \T2, \T4, \T4 # second phase of the reduction complete
2716 #######################################################################
2717 vpxor \T4, \T6, \T6 # the result is in T6
2718.endm
2719
2720
2721
2722#############################################################
2723#void aesni_gcm_init_avx_gen4
2724# (gcm_data *my_ctx_data,
2725# gcm_context_data *data,
2726# u8 *iv, /* Pre-counter block j0: 4 byte salt
2727# (from Security Association) concatenated with 8 byte
2728# Initialisation Vector (from IPSec ESP Payload)
2729# concatenated with 0x00000001. 16-byte aligned pointer. */
2730#        u8      *hash_subkey, /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
2731# const u8 *aad, /* Additional Authentication Data (AAD)*/
2732# u64 aad_len) /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
2733#############################################################
2734SYM_FUNC_START(aesni_gcm_init_avx_gen4)
2735 FUNC_SAVE
2736 INIT GHASH_MUL_AVX2, PRECOMPUTE_AVX2
2737 FUNC_RESTORE
2738 RET
2739SYM_FUNC_END(aesni_gcm_init_avx_gen4)
2740
2741###############################################################################
2742#void   aesni_gcm_enc_update_avx_gen4(
2743# gcm_data *my_ctx_data, /* aligned to 16 Bytes */
2744# gcm_context_data *data,
2745# u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */
2746# const u8 *in, /* Plaintext input */
2747# u64 plaintext_len) /* Length of data in Bytes for encryption. */
2748###############################################################################
2749SYM_FUNC_START(aesni_gcm_enc_update_avx_gen4)
2750 FUNC_SAVE
2751 mov keysize,%eax
2752 cmp $32, %eax
2753 je key_256_enc_update4
2754 cmp $16, %eax
2755 je key_128_enc_update4
2756 # must be 192
2757 GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 11
2758 FUNC_RESTORE
2759 RET
2760key_128_enc_update4:
2761 GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 9
2762 FUNC_RESTORE
2763 RET
2764key_256_enc_update4:
2765 GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 13
2766 FUNC_RESTORE
2767 RET
2768SYM_FUNC_END(aesni_gcm_enc_update_avx_gen4)
2769
2770###############################################################################
2771#void aesni_gcm_dec_update_avx_gen4(
2772# gcm_data *my_ctx_data, /* aligned to 16 Bytes */
2773# gcm_context_data *data,
2774# u8 *out, /* Plaintext output. Decrypt in-place is allowed. */
2775# const u8 *in, /* Ciphertext input */
2776#        u64     plaintext_len) /* Length of data in Bytes for decryption. */
2777###############################################################################
2778SYM_FUNC_START(aesni_gcm_dec_update_avx_gen4)
2779 FUNC_SAVE
2780 mov keysize,%eax
2781 cmp $32, %eax
2782 je key_256_dec_update4
2783 cmp $16, %eax
2784 je key_128_dec_update4
2785 # must be 192
2786 GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 11
2787 FUNC_RESTORE
2788 RET
2789key_128_dec_update4:
2790 GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 9
2791 FUNC_RESTORE
2792 RET
2793key_256_dec_update4:
2794 GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 13
2795 FUNC_RESTORE
2796 RET
2797SYM_FUNC_END(aesni_gcm_dec_update_avx_gen4)
2798
2799###############################################################################
2800#void aesni_gcm_finalize_avx_gen4(
2801# gcm_data *my_ctx_data, /* aligned to 16 Bytes */
2802# gcm_context_data *data,
2803# u8 *auth_tag, /* Authenticated Tag output. */
2804#        u64     auth_tag_len) /* Authenticated Tag Length in bytes.
2805# Valid values are 16 (most likely), 12 or 8. */
2806###############################################################################
2807SYM_FUNC_START(aesni_gcm_finalize_avx_gen4)
2808 FUNC_SAVE
2809 mov keysize,%eax
2810 cmp $32, %eax
2811 je key_256_finalize4
2812 cmp $16, %eax
2813 je key_128_finalize4
2814 # must be 192
2815 GCM_COMPLETE GHASH_MUL_AVX2, 11, arg3, arg4
2816 FUNC_RESTORE
2817 RET
2818key_128_finalize4:
2819 GCM_COMPLETE GHASH_MUL_AVX2, 9, arg3, arg4
2820 FUNC_RESTORE
2821 RET
2822key_256_finalize4:
2823 GCM_COMPLETE GHASH_MUL_AVX2, 13, arg3, arg4
2824 FUNC_RESTORE
2825 RET
2826SYM_FUNC_END(aesni_gcm_finalize_avx_gen4)
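#
# What "finalize" amounts to in GCM terms: hash in a final block carrying the
# bit lengths of the AAD and of the ciphertext, XOR the result with E(K, j0),
# and emit the first auth_tag_len bytes (16, 12 or 8).  A hedged C sketch of
# that length block, big-endian as the GCM spec requires (the helper name is
# made up for illustration):
#
#	#include <stdint.h>
#
#	static void make_len_block(uint8_t blk[16], uint64_t aad_len_bytes,
#				   uint64_t text_len_bytes)
#	{
#		uint64_t aad_bits = aad_len_bytes * 8;
#		uint64_t text_bits = text_len_bytes * 8;
#		int i;
#
#		for (i = 0; i < 8; i++) {
#			blk[i] = (uint8_t)(aad_bits >> (56 - 8 * i));
#			blk[8 + i] = (uint8_t)(text_bits >> (56 - 8 * i));
#		}
#	}
#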