   1/*
   2 * Implement AES algorithm in Intel AES-NI instructions.
   3 *
   4 * The white paper of AES-NI instructions can be downloaded from:
   5 *   http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
   6 *
   7 * Copyright (C) 2008, Intel Corp.
   8 *    Author: Huang Ying <ying.huang@intel.com>
   9 *            Vinodh Gopal <vinodh.gopal@intel.com>
  10 *            Kahraman Akdemir
  11 *
  12 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
  13 * interface for 64-bit kernels.
  14 *    Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
  15 *             Aidan O'Mahony (aidan.o.mahony@intel.com)
  16 *             Adrian Hoban <adrian.hoban@intel.com>
  17 *             James Guilford (james.guilford@intel.com)
  18 *             Gabriele Paoloni <gabriele.paoloni@intel.com>
  19 *             Tadeusz Struk (tadeusz.struk@intel.com)
  20 *             Wajdi Feghali (wajdi.k.feghali@intel.com)
  21 *    Copyright (c) 2010, Intel Corporation.
  22 *
  23 * Ported x86_64 version to x86:
  24 *    Author: Mathias Krause <minipli@googlemail.com>
  25 *
  26 * This program is free software; you can redistribute it and/or modify
  27 * it under the terms of the GNU General Public License as published by
  28 * the Free Software Foundation; either version 2 of the License, or
  29 * (at your option) any later version.
  30 */
  31
  32#include <linux/linkage.h>
  33#include <asm/inst.h>
  34
  35#ifdef __x86_64__
  36.data
  37.align 16
  38.Lgf128mul_x_ble_mask:
  39	.octa 0x00000000000000010000000000000087
  40
  41POLY:   .octa 0xC2000000000000000000000000000001
  42TWOONE: .octa 0x00000001000000000000000000000001
  43
  44# order of these constants should not change.
  45# more specifically, ALL_F should follow SHIFT_MASK,
  46# and ZERO should follow ALL_F
  47
  48SHUF_MASK:  .octa 0x000102030405060708090A0B0C0D0E0F
  49MASK1:      .octa 0x0000000000000000ffffffffffffffff
  50MASK2:      .octa 0xffffffffffffffff0000000000000000
  51SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
  52ALL_F:      .octa 0xffffffffffffffffffffffffffffffff
  53ZERO:       .octa 0x00000000000000000000000000000000
  54ONE:        .octa 0x00000000000000000000000000000001
  55F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
  56dec:        .octa 0x1
  57enc:        .octa 0x2
  58
  59
  60.text
  61
  62
  63#define	STACK_OFFSET    8*3
  64#define	HashKey		16*0	// store HashKey <<1 mod poly here
  65#define	HashKey_2	16*1	// store HashKey^2 <<1 mod poly here
  66#define	HashKey_3	16*2	// store HashKey^3 <<1 mod poly here
  67#define	HashKey_4	16*3	// store HashKey^4 <<1 mod poly here
  68#define	HashKey_k	16*4	// store XOR of High 64 bits and Low 64
  69				// bits of  HashKey <<1 mod poly here
  70				//(for Karatsuba purposes)
  71#define	HashKey_2_k	16*5	// store XOR of High 64 bits and Low 64
  72				// bits of  HashKey^2 <<1 mod poly here
  73				// (for Karatsuba purposes)
  74#define	HashKey_3_k	16*6	// store XOR of High 64 bits and Low 64
  75				// bits of  HashKey^3 <<1 mod poly here
  76				// (for Karatsuba purposes)
  77#define	HashKey_4_k	16*7	// store XOR of High 64 bits and Low 64
  78				// bits of  HashKey^4 <<1 mod poly here
  79				// (for Karatsuba purposes)
  80#define	VARIABLE_OFFSET	16*8
  81
  82#define arg1 rdi
  83#define arg2 rsi
  84#define arg3 rdx
  85#define arg4 rcx
  86#define arg5 r8
  87#define arg6 r9
  88#define arg7 STACK_OFFSET+8(%r14)
  89#define arg8 STACK_OFFSET+16(%r14)
  90#define arg9 STACK_OFFSET+24(%r14)
  91#define arg10 STACK_OFFSET+32(%r14)
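# Note: arg7..arg10 are stack-passed arguments.  The GCM entry points set
# %r14 = %rsp right after pushing %r12, %r13 and %r14 (hence STACK_OFFSET =
# 8*3), so arg7 = STACK_OFFSET+8(%r14) skips the three saved registers plus
# the return address to reach the first argument on the stack.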
  92#endif
  93
  94
  95#define STATE1	%xmm0
  96#define STATE2	%xmm4
  97#define STATE3	%xmm5
  98#define STATE4	%xmm6
  99#define STATE	STATE1
 100#define IN1	%xmm1
 101#define IN2	%xmm7
 102#define IN3	%xmm8
 103#define IN4	%xmm9
 104#define IN	IN1
 105#define KEY	%xmm2
 106#define IV	%xmm3
 107
 108#define BSWAP_MASK %xmm10
 109#define CTR	%xmm11
 110#define INC	%xmm12
 111
 112#define GF128MUL_MASK %xmm10
 113
 114#ifdef __x86_64__
 115#define AREG	%rax
 116#define KEYP	%rdi
 117#define OUTP	%rsi
 118#define UKEYP	OUTP
 119#define INP	%rdx
 120#define LEN	%rcx
 121#define IVP	%r8
 122#define KLEN	%r9d
 123#define T1	%r10
 124#define TKEYP	T1
 125#define T2	%r11
 126#define TCTR_LOW T2
 127#else
 128#define AREG	%eax
 129#define KEYP	%edi
 130#define OUTP	AREG
 131#define UKEYP	OUTP
 132#define INP	%edx
 133#define LEN	%esi
 134#define IVP	%ebp
 135#define KLEN	%ebx
 136#define T1	%ecx
 137#define TKEYP	T1
 138#endif
 139
 140
 141#ifdef __x86_64__
 142/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
 143*
 144*
 145* Input: A and B (128-bits each, bit-reflected)
 146* Output: C = A*B*x mod poly, (i.e. >>1 )
 147* To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
 148* GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
 149*
 150*/
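/*
* For reference, a bit-serial C model of the same GF(2^128) multiplication as
* defined by the GCM spec (illustrative only -- it is not part of this file;
* it uses the spec's byte order, whereas the macro below operates on
* bit-reflected data so that PCLMULQDQ and the shift-based reduction apply):
*
*	typedef struct { unsigned char b[16]; } be128;
*
*	static void ghash_mul_ref(const be128 *x, const be128 *y, be128 *z)
*	{
*		be128 v = *y;
*		int i, j, lsb;
*
*		for (j = 0; j < 16; j++)
*			z->b[j] = 0;
*		for (i = 0; i < 128; i++) {
*			if ((x->b[i / 8] >> (7 - (i % 8))) & 1)
*				for (j = 0; j < 16; j++)
*					z->b[j] ^= v.b[j];
*			lsb = v.b[15] & 1;
*			for (j = 15; j > 0; j--)	// v >>= 1 over 128 bits
*				v.b[j] = (v.b[j] >> 1) | (v.b[j - 1] << 7);
*			v.b[0] >>= 1;
*			if (lsb)			// reduce by the GCM poly
*				v.b[0] ^= 0xe1;
*		}
*	}
*/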
 151.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
 152	movdqa	  \GH, \TMP1
 153	pshufd	  $78, \GH, \TMP2
 154	pshufd	  $78, \HK, \TMP3
 155	pxor	  \GH, \TMP2            # TMP2 = a1+a0
 156	pxor	  \HK, \TMP3            # TMP3 = b1+b0
 157	PCLMULQDQ 0x11, \HK, \TMP1     # TMP1 = a1*b1
 158	PCLMULQDQ 0x00, \HK, \GH       # GH = a0*b0
 159	PCLMULQDQ 0x00, \TMP3, \TMP2   # TMP2 = (a0+a1)*(b1+b0)
 160	pxor	  \GH, \TMP2
  161	pxor	  \TMP1, \TMP2          # TMP2 = (a1*b0)+(a0*b1)
 162	movdqa	  \TMP2, \TMP3
 163	pslldq	  $8, \TMP3             # left shift TMP3 2 DWs
 164	psrldq	  $8, \TMP2             # right shift TMP2 2 DWs
 165	pxor	  \TMP3, \GH
  166	pxor	  \TMP2, \TMP1          # TMP1:GH holds the result of GH*HK
 167
 168        # first phase of the reduction
 169
 170	movdqa    \GH, \TMP2
 171	movdqa    \GH, \TMP3
 172	movdqa    \GH, \TMP4            # copy GH into TMP2,TMP3 and TMP4
  173					# in order to perform
 174					# independent shifts
  175	pslld     $31, \TMP2            # packed left shift <<31
  176	pslld     $30, \TMP3            # packed left shift <<30
  177	pslld     $25, \TMP4            # packed left shift <<25
 178	pxor      \TMP3, \TMP2          # xor the shifted versions
 179	pxor      \TMP4, \TMP2
 180	movdqa    \TMP2, \TMP5
 181	psrldq    $4, \TMP5             # right shift TMP5 1 DW
 182	pslldq    $12, \TMP2            # left shift TMP2 3 DWs
 183	pxor      \TMP2, \GH
 184
 185        # second phase of the reduction
 186
 187	movdqa    \GH,\TMP2             # copy GH into TMP2,TMP3 and TMP4
  188					# in order to perform
 189					# independent shifts
 190	movdqa    \GH,\TMP3
 191	movdqa    \GH,\TMP4
  192	psrld     $1,\TMP2              # packed right shift >>1
  193	psrld     $2,\TMP3              # packed right shift >>2
  194	psrld     $7,\TMP4              # packed right shift >>7
 195	pxor      \TMP3,\TMP2		# xor the shifted versions
 196	pxor      \TMP4,\TMP2
 197	pxor      \TMP5, \TMP2
 198	pxor      \TMP2, \GH
  199	pxor      \TMP1, \GH            # result is in GH
 200.endm
 201
 202/*
 203* if a = number of total plaintext bytes
 204* b = floor(a/16)
 205* num_initial_blocks = b mod 4
  206* decrypt the initial num_initial_blocks blocks and apply GHASH on
  207* the ciphertext
  208* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
  209* are clobbered
  210* arg1, %arg2, %arg3, %r14 are used as pointers only, not modified
 211*/
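/*
* Worked example: for a 100-byte plaintext, a = 100 and b = floor(100/16) = 6,
* so num_initial_blocks = 6 mod 4 = 2; two blocks are handled here and the
* remaining full blocks by the 4-blocks-at-a-time macro further below.
*/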
 212
 213
 214.macro INITIAL_BLOCKS_DEC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
 215XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
 216	mov	   arg7, %r10           # %r10 = AAD
 217	mov	   arg8, %r12           # %r12 = aadLen
 218	mov	   %r12, %r11
 219	pxor	   %xmm\i, %xmm\i
 220_get_AAD_loop\num_initial_blocks\operation:
 221	movd	   (%r10), \TMP1
 222	pslldq	   $12, \TMP1
 223	psrldq	   $4, %xmm\i
 224	pxor	   \TMP1, %xmm\i
 225	add	   $4, %r10
 226	sub	   $4, %r12
 227	jne	   _get_AAD_loop\num_initial_blocks\operation
 228	cmp	   $16, %r11
 229	je	   _get_AAD_loop2_done\num_initial_blocks\operation
 230	mov	   $16, %r12
 231_get_AAD_loop2\num_initial_blocks\operation:
 232	psrldq	   $4, %xmm\i
 233	sub	   $4, %r12
 234	cmp	   %r11, %r12
 235	jne	   _get_AAD_loop2\num_initial_blocks\operation
 236_get_AAD_loop2_done\num_initial_blocks\operation:
 237        movdqa     SHUF_MASK(%rip), %xmm14
 238	PSHUFB_XMM   %xmm14, %xmm\i # byte-reflect the AAD data
 239
 240	xor	   %r11, %r11 # initialise the data pointer offset as zero
 241
 242        # start AES for num_initial_blocks blocks
 243
 244	mov	   %arg5, %rax                      # %rax = *Y0
 245	movdqu	   (%rax), \XMM0                    # XMM0 = Y0
 246        movdqa     SHUF_MASK(%rip), %xmm14
 247	PSHUFB_XMM   %xmm14, \XMM0
 248
 249.if (\i == 5) || (\i == 6) || (\i == 7)
 250.irpc index, \i_seq
 251	paddd	   ONE(%rip), \XMM0                 # INCR Y0
 252	movdqa	   \XMM0, %xmm\index
 253        movdqa     SHUF_MASK(%rip), %xmm14
 254	PSHUFB_XMM   %xmm14, %xmm\index      # perform a 16 byte swap
 255
 256.endr
 257.irpc index, \i_seq
 258	pxor	   16*0(%arg1), %xmm\index
 259.endr
 260.irpc index, \i_seq
 261	movaps 0x10(%rdi), \TMP1
 262	AESENC     \TMP1, %xmm\index          # Round 1
 263.endr
 264.irpc index, \i_seq
 265	movaps 0x20(%arg1), \TMP1
 266	AESENC     \TMP1, %xmm\index          # Round 2
 267.endr
 268.irpc index, \i_seq
 269	movaps 0x30(%arg1), \TMP1
  270	AESENC     \TMP1, %xmm\index          # Round 3
 271.endr
 272.irpc index, \i_seq
 273	movaps 0x40(%arg1), \TMP1
  274	AESENC     \TMP1, %xmm\index          # Round 4
 275.endr
 276.irpc index, \i_seq
 277	movaps 0x50(%arg1), \TMP1
  278	AESENC     \TMP1, %xmm\index          # Round 5
 279.endr
 280.irpc index, \i_seq
 281	movaps 0x60(%arg1), \TMP1
  282	AESENC     \TMP1, %xmm\index          # Round 6
 283.endr
 284.irpc index, \i_seq
 285	movaps 0x70(%arg1), \TMP1
  286	AESENC     \TMP1, %xmm\index          # Round 7
 287.endr
 288.irpc index, \i_seq
 289	movaps 0x80(%arg1), \TMP1
  290	AESENC     \TMP1, %xmm\index          # Round 8
 291.endr
 292.irpc index, \i_seq
 293	movaps 0x90(%arg1), \TMP1
  294	AESENC     \TMP1, %xmm\index          # Round 9
 295.endr
 296.irpc index, \i_seq
 297	movaps 0xa0(%arg1), \TMP1
 298	AESENCLAST \TMP1, %xmm\index         # Round 10
 299.endr
 300.irpc index, \i_seq
 301	movdqu	   (%arg3 , %r11, 1), \TMP1
 302	pxor	   \TMP1, %xmm\index
 303	movdqu	   %xmm\index, (%arg2 , %r11, 1)
 304	# write back plaintext/ciphertext for num_initial_blocks
 305	add	   $16, %r11
 306
 307	movdqa     \TMP1, %xmm\index
 308        movdqa     SHUF_MASK(%rip), %xmm14
 309	PSHUFB_XMM	   %xmm14, %xmm\index
 310
 311		# prepare plaintext/ciphertext for GHASH computation
 312.endr
 313.endif
 314	GHASH_MUL  %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 315        # apply GHASH on num_initial_blocks blocks
 316
 317.if \i == 5
 318        pxor       %xmm5, %xmm6
 319	GHASH_MUL  %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 320        pxor       %xmm6, %xmm7
 321	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 322        pxor       %xmm7, %xmm8
 323	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 324.elseif \i == 6
 325        pxor       %xmm6, %xmm7
 326	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 327        pxor       %xmm7, %xmm8
 328	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 329.elseif \i == 7
 330        pxor       %xmm7, %xmm8
 331	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 332.endif
 333	cmp	   $64, %r13
 334	jl	_initial_blocks_done\num_initial_blocks\operation
 335	# no need for precomputed values
 336/*
 337*
 338* Precomputations for HashKey parallel with encryption of first 4 blocks.
  339* HashKey_i_k holds the XOR of the low and high parts of HashKey^i
 340*/
 341	paddd	   ONE(%rip), \XMM0              # INCR Y0
 342	movdqa	   \XMM0, \XMM1
 343        movdqa     SHUF_MASK(%rip), %xmm14
 344	PSHUFB_XMM  %xmm14, \XMM1        # perform a 16 byte swap
 345
 346	paddd	   ONE(%rip), \XMM0              # INCR Y0
 347	movdqa	   \XMM0, \XMM2
 348        movdqa     SHUF_MASK(%rip), %xmm14
 349	PSHUFB_XMM  %xmm14, \XMM2        # perform a 16 byte swap
 350
 351	paddd	   ONE(%rip), \XMM0              # INCR Y0
 352	movdqa	   \XMM0, \XMM3
 353        movdqa     SHUF_MASK(%rip), %xmm14
 354	PSHUFB_XMM %xmm14, \XMM3        # perform a 16 byte swap
 355
 356	paddd	   ONE(%rip), \XMM0              # INCR Y0
 357	movdqa	   \XMM0, \XMM4
 358        movdqa     SHUF_MASK(%rip), %xmm14
 359	PSHUFB_XMM %xmm14, \XMM4        # perform a 16 byte swap
 360
 361	pxor	   16*0(%arg1), \XMM1
 362	pxor	   16*0(%arg1), \XMM2
 363	pxor	   16*0(%arg1), \XMM3
 364	pxor	   16*0(%arg1), \XMM4
 365	movdqa	   \TMP3, \TMP5
 366	pshufd	   $78, \TMP3, \TMP1
 367	pxor	   \TMP3, \TMP1
 368	movdqa	   \TMP1, HashKey_k(%rsp)
 369	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
 370# TMP5 = HashKey^2<<1 (mod poly)
 371	movdqa	   \TMP5, HashKey_2(%rsp)
 372# HashKey_2 = HashKey^2<<1 (mod poly)
 373	pshufd	   $78, \TMP5, \TMP1
 374	pxor	   \TMP5, \TMP1
 375	movdqa	   \TMP1, HashKey_2_k(%rsp)
 376.irpc index, 1234 # do 4 rounds
 377	movaps 0x10*\index(%arg1), \TMP1
 378	AESENC	   \TMP1, \XMM1
 379	AESENC	   \TMP1, \XMM2
 380	AESENC	   \TMP1, \XMM3
 381	AESENC	   \TMP1, \XMM4
 382.endr
 383	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
 384# TMP5 = HashKey^3<<1 (mod poly)
 385	movdqa	   \TMP5, HashKey_3(%rsp)
 386	pshufd	   $78, \TMP5, \TMP1
 387	pxor	   \TMP5, \TMP1
 388	movdqa	   \TMP1, HashKey_3_k(%rsp)
 389.irpc index, 56789 # do next 5 rounds
 390	movaps 0x10*\index(%arg1), \TMP1
 391	AESENC	   \TMP1, \XMM1
 392	AESENC	   \TMP1, \XMM2
 393	AESENC	   \TMP1, \XMM3
 394	AESENC	   \TMP1, \XMM4
 395.endr
 396	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
  397# TMP5 = HashKey^4<<1 (mod poly)
 398	movdqa	   \TMP5, HashKey_4(%rsp)
 399	pshufd	   $78, \TMP5, \TMP1
 400	pxor	   \TMP5, \TMP1
 401	movdqa	   \TMP1, HashKey_4_k(%rsp)
 402	movaps 0xa0(%arg1), \TMP2
 403	AESENCLAST \TMP2, \XMM1
 404	AESENCLAST \TMP2, \XMM2
 405	AESENCLAST \TMP2, \XMM3
 406	AESENCLAST \TMP2, \XMM4
 407	movdqu	   16*0(%arg3 , %r11 , 1), \TMP1
 408	pxor	   \TMP1, \XMM1
 409	movdqu	   \XMM1, 16*0(%arg2 , %r11 , 1)
 410	movdqa     \TMP1, \XMM1
 411	movdqu	   16*1(%arg3 , %r11 , 1), \TMP1
 412	pxor	   \TMP1, \XMM2
 413	movdqu	   \XMM2, 16*1(%arg2 , %r11 , 1)
 414	movdqa     \TMP1, \XMM2
 415	movdqu	   16*2(%arg3 , %r11 , 1), \TMP1
 416	pxor	   \TMP1, \XMM3
 417	movdqu	   \XMM3, 16*2(%arg2 , %r11 , 1)
 418	movdqa     \TMP1, \XMM3
 419	movdqu	   16*3(%arg3 , %r11 , 1), \TMP1
 420	pxor	   \TMP1, \XMM4
 421	movdqu	   \XMM4, 16*3(%arg2 , %r11 , 1)
 422	movdqa     \TMP1, \XMM4
 423	add	   $64, %r11
 424        movdqa     SHUF_MASK(%rip), %xmm14
 425	PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
 426	pxor	   \XMMDst, \XMM1
 427# combine GHASHed value with the corresponding ciphertext
 428        movdqa     SHUF_MASK(%rip), %xmm14
 429	PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
 430        movdqa     SHUF_MASK(%rip), %xmm14
 431	PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
 432        movdqa     SHUF_MASK(%rip), %xmm14
 433	PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
 434
 435_initial_blocks_done\num_initial_blocks\operation:
 436
 437.endm
 438
 439
 440/*
 441* if a = number of total plaintext bytes
 442* b = floor(a/16)
 443* num_initial_blocks = b mod 4
 444* encrypt the initial num_initial_blocks blocks and apply ghash on
 445* the ciphertext
 446* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
 447* are clobbered
  448* arg1, %arg2, %arg3, %r14 are used as pointers only, not modified
 449*/
 450
 451
 452.macro INITIAL_BLOCKS_ENC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
 453XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
 454	mov	   arg7, %r10           # %r10 = AAD
 455	mov	   arg8, %r12           # %r12 = aadLen
 456	mov	   %r12, %r11
 457	pxor	   %xmm\i, %xmm\i
 458_get_AAD_loop\num_initial_blocks\operation:
 459	movd	   (%r10), \TMP1
 460	pslldq	   $12, \TMP1
 461	psrldq	   $4, %xmm\i
 462	pxor	   \TMP1, %xmm\i
 463	add	   $4, %r10
 464	sub	   $4, %r12
 465	jne	   _get_AAD_loop\num_initial_blocks\operation
 466	cmp	   $16, %r11
 467	je	   _get_AAD_loop2_done\num_initial_blocks\operation
 468	mov	   $16, %r12
 469_get_AAD_loop2\num_initial_blocks\operation:
 470	psrldq	   $4, %xmm\i
 471	sub	   $4, %r12
 472	cmp	   %r11, %r12
 473	jne	   _get_AAD_loop2\num_initial_blocks\operation
 474_get_AAD_loop2_done\num_initial_blocks\operation:
 475        movdqa     SHUF_MASK(%rip), %xmm14
 476	PSHUFB_XMM   %xmm14, %xmm\i # byte-reflect the AAD data
 477
 478	xor	   %r11, %r11 # initialise the data pointer offset as zero
 479
 480        # start AES for num_initial_blocks blocks
 481
 482	mov	   %arg5, %rax                      # %rax = *Y0
 483	movdqu	   (%rax), \XMM0                    # XMM0 = Y0
 484        movdqa     SHUF_MASK(%rip), %xmm14
 485	PSHUFB_XMM   %xmm14, \XMM0
 486
 487.if (\i == 5) || (\i == 6) || (\i == 7)
 488.irpc index, \i_seq
 489	paddd	   ONE(%rip), \XMM0                 # INCR Y0
 490	movdqa	   \XMM0, %xmm\index
 491        movdqa     SHUF_MASK(%rip), %xmm14
 492	PSHUFB_XMM   %xmm14, %xmm\index      # perform a 16 byte swap
 493
 494.endr
 495.irpc index, \i_seq
 496	pxor	   16*0(%arg1), %xmm\index
 497.endr
 498.irpc index, \i_seq
 499	movaps 0x10(%rdi), \TMP1
 500	AESENC     \TMP1, %xmm\index          # Round 1
 501.endr
 502.irpc index, \i_seq
 503	movaps 0x20(%arg1), \TMP1
 504	AESENC     \TMP1, %xmm\index          # Round 2
 505.endr
 506.irpc index, \i_seq
 507	movaps 0x30(%arg1), \TMP1
  508	AESENC     \TMP1, %xmm\index          # Round 3
 509.endr
 510.irpc index, \i_seq
 511	movaps 0x40(%arg1), \TMP1
  512	AESENC     \TMP1, %xmm\index          # Round 4
 513.endr
 514.irpc index, \i_seq
 515	movaps 0x50(%arg1), \TMP1
  516	AESENC     \TMP1, %xmm\index          # Round 5
 517.endr
 518.irpc index, \i_seq
 519	movaps 0x60(%arg1), \TMP1
  520	AESENC     \TMP1, %xmm\index          # Round 6
 521.endr
 522.irpc index, \i_seq
 523	movaps 0x70(%arg1), \TMP1
  524	AESENC     \TMP1, %xmm\index          # Round 7
 525.endr
 526.irpc index, \i_seq
 527	movaps 0x80(%arg1), \TMP1
  528	AESENC     \TMP1, %xmm\index          # Round 8
 529.endr
 530.irpc index, \i_seq
 531	movaps 0x90(%arg1), \TMP1
  532	AESENC     \TMP1, %xmm\index          # Round 9
 533.endr
 534.irpc index, \i_seq
 535	movaps 0xa0(%arg1), \TMP1
 536	AESENCLAST \TMP1, %xmm\index         # Round 10
 537.endr
 538.irpc index, \i_seq
 539	movdqu	   (%arg3 , %r11, 1), \TMP1
 540	pxor	   \TMP1, %xmm\index
 541	movdqu	   %xmm\index, (%arg2 , %r11, 1)
 542	# write back plaintext/ciphertext for num_initial_blocks
 543	add	   $16, %r11
 544
 545        movdqa     SHUF_MASK(%rip), %xmm14
 546	PSHUFB_XMM	   %xmm14, %xmm\index
 547
 548		# prepare plaintext/ciphertext for GHASH computation
 549.endr
 550.endif
 551	GHASH_MUL  %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 552        # apply GHASH on num_initial_blocks blocks
 553
 554.if \i == 5
 555        pxor       %xmm5, %xmm6
 556	GHASH_MUL  %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 557        pxor       %xmm6, %xmm7
 558	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 559        pxor       %xmm7, %xmm8
 560	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 561.elseif \i == 6
 562        pxor       %xmm6, %xmm7
 563	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 564        pxor       %xmm7, %xmm8
 565	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 566.elseif \i == 7
 567        pxor       %xmm7, %xmm8
 568	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 569.endif
 570	cmp	   $64, %r13
 571	jl	_initial_blocks_done\num_initial_blocks\operation
 572	# no need for precomputed values
 573/*
 574*
 575* Precomputations for HashKey parallel with encryption of first 4 blocks.
  576* HashKey_i_k holds the XOR of the low and high parts of HashKey^i
 577*/
 578	paddd	   ONE(%rip), \XMM0              # INCR Y0
 579	movdqa	   \XMM0, \XMM1
 580        movdqa     SHUF_MASK(%rip), %xmm14
 581	PSHUFB_XMM  %xmm14, \XMM1        # perform a 16 byte swap
 582
 583	paddd	   ONE(%rip), \XMM0              # INCR Y0
 584	movdqa	   \XMM0, \XMM2
 585        movdqa     SHUF_MASK(%rip), %xmm14
 586	PSHUFB_XMM  %xmm14, \XMM2        # perform a 16 byte swap
 587
 588	paddd	   ONE(%rip), \XMM0              # INCR Y0
 589	movdqa	   \XMM0, \XMM3
 590        movdqa     SHUF_MASK(%rip), %xmm14
 591	PSHUFB_XMM %xmm14, \XMM3        # perform a 16 byte swap
 592
 593	paddd	   ONE(%rip), \XMM0              # INCR Y0
 594	movdqa	   \XMM0, \XMM4
 595        movdqa     SHUF_MASK(%rip), %xmm14
 596	PSHUFB_XMM %xmm14, \XMM4        # perform a 16 byte swap
 597
 598	pxor	   16*0(%arg1), \XMM1
 599	pxor	   16*0(%arg1), \XMM2
 600	pxor	   16*0(%arg1), \XMM3
 601	pxor	   16*0(%arg1), \XMM4
 602	movdqa	   \TMP3, \TMP5
 603	pshufd	   $78, \TMP3, \TMP1
 604	pxor	   \TMP3, \TMP1
 605	movdqa	   \TMP1, HashKey_k(%rsp)
 606	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
 607# TMP5 = HashKey^2<<1 (mod poly)
 608	movdqa	   \TMP5, HashKey_2(%rsp)
 609# HashKey_2 = HashKey^2<<1 (mod poly)
 610	pshufd	   $78, \TMP5, \TMP1
 611	pxor	   \TMP5, \TMP1
 612	movdqa	   \TMP1, HashKey_2_k(%rsp)
 613.irpc index, 1234 # do 4 rounds
 614	movaps 0x10*\index(%arg1), \TMP1
 615	AESENC	   \TMP1, \XMM1
 616	AESENC	   \TMP1, \XMM2
 617	AESENC	   \TMP1, \XMM3
 618	AESENC	   \TMP1, \XMM4
 619.endr
 620	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
 621# TMP5 = HashKey^3<<1 (mod poly)
 622	movdqa	   \TMP5, HashKey_3(%rsp)
 623	pshufd	   $78, \TMP5, \TMP1
 624	pxor	   \TMP5, \TMP1
 625	movdqa	   \TMP1, HashKey_3_k(%rsp)
 626.irpc index, 56789 # do next 5 rounds
 627	movaps 0x10*\index(%arg1), \TMP1
 628	AESENC	   \TMP1, \XMM1
 629	AESENC	   \TMP1, \XMM2
 630	AESENC	   \TMP1, \XMM3
 631	AESENC	   \TMP1, \XMM4
 632.endr
 633	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
  634# TMP5 = HashKey^4<<1 (mod poly)
 635	movdqa	   \TMP5, HashKey_4(%rsp)
 636	pshufd	   $78, \TMP5, \TMP1
 637	pxor	   \TMP5, \TMP1
 638	movdqa	   \TMP1, HashKey_4_k(%rsp)
 639	movaps 0xa0(%arg1), \TMP2
 640	AESENCLAST \TMP2, \XMM1
 641	AESENCLAST \TMP2, \XMM2
 642	AESENCLAST \TMP2, \XMM3
 643	AESENCLAST \TMP2, \XMM4
 644	movdqu	   16*0(%arg3 , %r11 , 1), \TMP1
 645	pxor	   \TMP1, \XMM1
 646	movdqu	   16*1(%arg3 , %r11 , 1), \TMP1
 647	pxor	   \TMP1, \XMM2
 648	movdqu	   16*2(%arg3 , %r11 , 1), \TMP1
 649	pxor	   \TMP1, \XMM3
 650	movdqu	   16*3(%arg3 , %r11 , 1), \TMP1
 651	pxor	   \TMP1, \XMM4
 652	movdqu     \XMM1, 16*0(%arg2 , %r11 , 1)
 653	movdqu     \XMM2, 16*1(%arg2 , %r11 , 1)
 654	movdqu     \XMM3, 16*2(%arg2 , %r11 , 1)
 655	movdqu     \XMM4, 16*3(%arg2 , %r11 , 1)
 656
 657	add	   $64, %r11
 658        movdqa     SHUF_MASK(%rip), %xmm14
 659	PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
 660	pxor	   \XMMDst, \XMM1
 661# combine GHASHed value with the corresponding ciphertext
 662        movdqa     SHUF_MASK(%rip), %xmm14
 663	PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
 664        movdqa     SHUF_MASK(%rip), %xmm14
 665	PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
 666        movdqa     SHUF_MASK(%rip), %xmm14
 667	PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
 668
 669_initial_blocks_done\num_initial_blocks\operation:
 670
 671.endm
 672
 673/*
 674* encrypt 4 blocks at a time
 675* ghash the 4 previously encrypted ciphertext blocks
 676* arg1, %arg2, %arg3 are used as pointers only, not modified
 677* %r11 is the data offset value
 678*/
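/*
* Schematic of one iteration (a simplified view; the real code interleaves
* the PCLMULQDQ work with the AES rounds for throughput):
*
*	XMM5..XMM8 <- the four byte-reflected blocks from the last iteration
*	CTR_i      <- byteswap(Y0 + i),  i = 1..4
*	XMM1..XMM4 <- AES(K, CTR_i) XOR input block i
*	running hash accumulates XMM5*H^4 + XMM6*H^3 + XMM7*H^2 + XMM8*H
*	                                                     (in GF(2^128))
*/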
 679.macro GHASH_4_ENCRYPT_4_PARALLEL_ENC TMP1 TMP2 TMP3 TMP4 TMP5 \
 680TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
 681
 682	movdqa	  \XMM1, \XMM5
 683	movdqa	  \XMM2, \XMM6
 684	movdqa	  \XMM3, \XMM7
 685	movdqa	  \XMM4, \XMM8
 686
 687        movdqa    SHUF_MASK(%rip), %xmm15
 688        # multiply TMP5 * HashKey using karatsuba
 689
 690	movdqa	  \XMM5, \TMP4
 691	pshufd	  $78, \XMM5, \TMP6
 692	pxor	  \XMM5, \TMP6
 693	paddd     ONE(%rip), \XMM0		# INCR CNT
 694	movdqa	  HashKey_4(%rsp), \TMP5
 695	PCLMULQDQ 0x11, \TMP5, \TMP4           # TMP4 = a1*b1
 696	movdqa    \XMM0, \XMM1
 697	paddd     ONE(%rip), \XMM0		# INCR CNT
 698	movdqa    \XMM0, \XMM2
 699	paddd     ONE(%rip), \XMM0		# INCR CNT
 700	movdqa    \XMM0, \XMM3
 701	paddd     ONE(%rip), \XMM0		# INCR CNT
 702	movdqa    \XMM0, \XMM4
 703	PSHUFB_XMM %xmm15, \XMM1	# perform a 16 byte swap
 704	PCLMULQDQ 0x00, \TMP5, \XMM5           # XMM5 = a0*b0
 705	PSHUFB_XMM %xmm15, \XMM2	# perform a 16 byte swap
 706	PSHUFB_XMM %xmm15, \XMM3	# perform a 16 byte swap
 707	PSHUFB_XMM %xmm15, \XMM4	# perform a 16 byte swap
 708
 709	pxor	  (%arg1), \XMM1
 710	pxor	  (%arg1), \XMM2
 711	pxor	  (%arg1), \XMM3
 712	pxor	  (%arg1), \XMM4
 713	movdqa	  HashKey_4_k(%rsp), \TMP5
 714	PCLMULQDQ 0x00, \TMP5, \TMP6           # TMP6 = (a1+a0)*(b1+b0)
 715	movaps 0x10(%arg1), \TMP1
 716	AESENC	  \TMP1, \XMM1              # Round 1
 717	AESENC	  \TMP1, \XMM2
 718	AESENC	  \TMP1, \XMM3
 719	AESENC	  \TMP1, \XMM4
 720	movaps 0x20(%arg1), \TMP1
 721	AESENC	  \TMP1, \XMM1              # Round 2
 722	AESENC	  \TMP1, \XMM2
 723	AESENC	  \TMP1, \XMM3
 724	AESENC	  \TMP1, \XMM4
 725	movdqa	  \XMM6, \TMP1
 726	pshufd	  $78, \XMM6, \TMP2
 727	pxor	  \XMM6, \TMP2
 728	movdqa	  HashKey_3(%rsp), \TMP5
 729	PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1 * b1
 730	movaps 0x30(%arg1), \TMP3
 731	AESENC    \TMP3, \XMM1              # Round 3
 732	AESENC    \TMP3, \XMM2
 733	AESENC    \TMP3, \XMM3
 734	AESENC    \TMP3, \XMM4
 735	PCLMULQDQ 0x00, \TMP5, \XMM6           # XMM6 = a0*b0
 736	movaps 0x40(%arg1), \TMP3
 737	AESENC	  \TMP3, \XMM1              # Round 4
 738	AESENC	  \TMP3, \XMM2
 739	AESENC	  \TMP3, \XMM3
 740	AESENC	  \TMP3, \XMM4
 741	movdqa	  HashKey_3_k(%rsp), \TMP5
 742	PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
 743	movaps 0x50(%arg1), \TMP3
 744	AESENC	  \TMP3, \XMM1              # Round 5
 745	AESENC	  \TMP3, \XMM2
 746	AESENC	  \TMP3, \XMM3
 747	AESENC	  \TMP3, \XMM4
 748	pxor	  \TMP1, \TMP4
 749# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
 750	pxor	  \XMM6, \XMM5
 751	pxor	  \TMP2, \TMP6
 752	movdqa	  \XMM7, \TMP1
 753	pshufd	  $78, \XMM7, \TMP2
 754	pxor	  \XMM7, \TMP2
 755	movdqa	  HashKey_2(%rsp ), \TMP5
 756
 757        # Multiply TMP5 * HashKey using karatsuba
 758
 759	PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1*b1
 760	movaps 0x60(%arg1), \TMP3
 761	AESENC	  \TMP3, \XMM1              # Round 6
 762	AESENC	  \TMP3, \XMM2
 763	AESENC	  \TMP3, \XMM3
 764	AESENC	  \TMP3, \XMM4
 765	PCLMULQDQ 0x00, \TMP5, \XMM7           # XMM7 = a0*b0
 766	movaps 0x70(%arg1), \TMP3
 767	AESENC	  \TMP3, \XMM1             # Round 7
 768	AESENC	  \TMP3, \XMM2
 769	AESENC	  \TMP3, \XMM3
 770	AESENC	  \TMP3, \XMM4
 771	movdqa	  HashKey_2_k(%rsp), \TMP5
 772	PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
 773	movaps 0x80(%arg1), \TMP3
 774	AESENC	  \TMP3, \XMM1             # Round 8
 775	AESENC	  \TMP3, \XMM2
 776	AESENC	  \TMP3, \XMM3
 777	AESENC	  \TMP3, \XMM4
 778	pxor	  \TMP1, \TMP4
 779# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
 780	pxor	  \XMM7, \XMM5
 781	pxor	  \TMP2, \TMP6
 782
 783        # Multiply XMM8 * HashKey
 784        # XMM8 and TMP5 hold the values for the two operands
 785
 786	movdqa	  \XMM8, \TMP1
 787	pshufd	  $78, \XMM8, \TMP2
 788	pxor	  \XMM8, \TMP2
 789	movdqa	  HashKey(%rsp), \TMP5
 790	PCLMULQDQ 0x11, \TMP5, \TMP1          # TMP1 = a1*b1
 791	movaps 0x90(%arg1), \TMP3
 792	AESENC	  \TMP3, \XMM1            # Round 9
 793	AESENC	  \TMP3, \XMM2
 794	AESENC	  \TMP3, \XMM3
 795	AESENC	  \TMP3, \XMM4
 796	PCLMULQDQ 0x00, \TMP5, \XMM8          # XMM8 = a0*b0
 797	movaps 0xa0(%arg1), \TMP3
 798	AESENCLAST \TMP3, \XMM1           # Round 10
 799	AESENCLAST \TMP3, \XMM2
 800	AESENCLAST \TMP3, \XMM3
 801	AESENCLAST \TMP3, \XMM4
 802	movdqa    HashKey_k(%rsp), \TMP5
 803	PCLMULQDQ 0x00, \TMP5, \TMP2          # TMP2 = (a1+a0)*(b1+b0)
 804	movdqu	  (%arg3,%r11,1), \TMP3
 805	pxor	  \TMP3, \XMM1                 # Ciphertext/Plaintext XOR EK
 806	movdqu	  16(%arg3,%r11,1), \TMP3
 807	pxor	  \TMP3, \XMM2                 # Ciphertext/Plaintext XOR EK
 808	movdqu	  32(%arg3,%r11,1), \TMP3
 809	pxor	  \TMP3, \XMM3                 # Ciphertext/Plaintext XOR EK
 810	movdqu	  48(%arg3,%r11,1), \TMP3
 811	pxor	  \TMP3, \XMM4                 # Ciphertext/Plaintext XOR EK
 812        movdqu    \XMM1, (%arg2,%r11,1)        # Write to the ciphertext buffer
 813        movdqu    \XMM2, 16(%arg2,%r11,1)      # Write to the ciphertext buffer
 814        movdqu    \XMM3, 32(%arg2,%r11,1)      # Write to the ciphertext buffer
 815        movdqu    \XMM4, 48(%arg2,%r11,1)      # Write to the ciphertext buffer
 816	PSHUFB_XMM %xmm15, \XMM1        # perform a 16 byte swap
 817	PSHUFB_XMM %xmm15, \XMM2	# perform a 16 byte swap
 818	PSHUFB_XMM %xmm15, \XMM3	# perform a 16 byte swap
 819	PSHUFB_XMM %xmm15, \XMM4	# perform a 16 byte swap
 820
 821	pxor	  \TMP4, \TMP1
 822	pxor	  \XMM8, \XMM5
 823	pxor	  \TMP6, \TMP2
 824	pxor	  \TMP1, \TMP2
 825	pxor	  \XMM5, \TMP2
 826	movdqa	  \TMP2, \TMP3
 827	pslldq	  $8, \TMP3                    # left shift TMP3 2 DWs
 828	psrldq	  $8, \TMP2                    # right shift TMP2 2 DWs
 829	pxor	  \TMP3, \XMM5
 830	pxor	  \TMP2, \TMP1	  # accumulate the results in TMP1:XMM5
 831
 832        # first phase of reduction
 833
 834	movdqa    \XMM5, \TMP2
 835	movdqa    \XMM5, \TMP3
 836	movdqa    \XMM5, \TMP4
 837# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
  838	pslld     $31, \TMP2                   # packed left shift << 31
  839	pslld     $30, \TMP3                   # packed left shift << 30
  840	pslld     $25, \TMP4                   # packed left shift << 25
 841	pxor      \TMP3, \TMP2	               # xor the shifted versions
 842	pxor      \TMP4, \TMP2
 843	movdqa    \TMP2, \TMP5
 844	psrldq    $4, \TMP5                    # right shift T5 1 DW
 845	pslldq    $12, \TMP2                   # left shift T2 3 DWs
 846	pxor      \TMP2, \XMM5
 847
 848        # second phase of reduction
 849
 850	movdqa    \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
 851	movdqa    \XMM5,\TMP3
 852	movdqa    \XMM5,\TMP4
  853	psrld     $1, \TMP2                    # packed right shift >>1
  854	psrld     $2, \TMP3                    # packed right shift >>2
  855	psrld     $7, \TMP4                    # packed right shift >>7
 856	pxor      \TMP3,\TMP2		       # xor the shifted versions
 857	pxor      \TMP4,\TMP2
 858	pxor      \TMP5, \TMP2
 859	pxor      \TMP2, \XMM5
  860	pxor      \TMP1, \XMM5                 # result is in XMM5
 861
 862	pxor	  \XMM5, \XMM1
 863.endm
 864
 865/*
 866* decrypt 4 blocks at a time
 867* ghash the 4 previously decrypted ciphertext blocks
 868* arg1, %arg2, %arg3 are used as pointers only, not modified
 869* %r11 is the data offset value
 870*/
 871.macro GHASH_4_ENCRYPT_4_PARALLEL_DEC TMP1 TMP2 TMP3 TMP4 TMP5 \
 872TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
 873
 874	movdqa	  \XMM1, \XMM5
 875	movdqa	  \XMM2, \XMM6
 876	movdqa	  \XMM3, \XMM7
 877	movdqa	  \XMM4, \XMM8
 878
 879        movdqa    SHUF_MASK(%rip), %xmm15
 880        # multiply TMP5 * HashKey using karatsuba
 881
 882	movdqa	  \XMM5, \TMP4
 883	pshufd	  $78, \XMM5, \TMP6
 884	pxor	  \XMM5, \TMP6
 885	paddd     ONE(%rip), \XMM0		# INCR CNT
 886	movdqa	  HashKey_4(%rsp), \TMP5
 887	PCLMULQDQ 0x11, \TMP5, \TMP4           # TMP4 = a1*b1
 888	movdqa    \XMM0, \XMM1
 889	paddd     ONE(%rip), \XMM0		# INCR CNT
 890	movdqa    \XMM0, \XMM2
 891	paddd     ONE(%rip), \XMM0		# INCR CNT
 892	movdqa    \XMM0, \XMM3
 893	paddd     ONE(%rip), \XMM0		# INCR CNT
 894	movdqa    \XMM0, \XMM4
 895	PSHUFB_XMM %xmm15, \XMM1	# perform a 16 byte swap
 896	PCLMULQDQ 0x00, \TMP5, \XMM5           # XMM5 = a0*b0
 897	PSHUFB_XMM %xmm15, \XMM2	# perform a 16 byte swap
 898	PSHUFB_XMM %xmm15, \XMM3	# perform a 16 byte swap
 899	PSHUFB_XMM %xmm15, \XMM4	# perform a 16 byte swap
 900
 901	pxor	  (%arg1), \XMM1
 902	pxor	  (%arg1), \XMM2
 903	pxor	  (%arg1), \XMM3
 904	pxor	  (%arg1), \XMM4
 905	movdqa	  HashKey_4_k(%rsp), \TMP5
 906	PCLMULQDQ 0x00, \TMP5, \TMP6           # TMP6 = (a1+a0)*(b1+b0)
 907	movaps 0x10(%arg1), \TMP1
 908	AESENC	  \TMP1, \XMM1              # Round 1
 909	AESENC	  \TMP1, \XMM2
 910	AESENC	  \TMP1, \XMM3
 911	AESENC	  \TMP1, \XMM4
 912	movaps 0x20(%arg1), \TMP1
 913	AESENC	  \TMP1, \XMM1              # Round 2
 914	AESENC	  \TMP1, \XMM2
 915	AESENC	  \TMP1, \XMM3
 916	AESENC	  \TMP1, \XMM4
 917	movdqa	  \XMM6, \TMP1
 918	pshufd	  $78, \XMM6, \TMP2
 919	pxor	  \XMM6, \TMP2
 920	movdqa	  HashKey_3(%rsp), \TMP5
 921	PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1 * b1
 922	movaps 0x30(%arg1), \TMP3
 923	AESENC    \TMP3, \XMM1              # Round 3
 924	AESENC    \TMP3, \XMM2
 925	AESENC    \TMP3, \XMM3
 926	AESENC    \TMP3, \XMM4
 927	PCLMULQDQ 0x00, \TMP5, \XMM6           # XMM6 = a0*b0
 928	movaps 0x40(%arg1), \TMP3
 929	AESENC	  \TMP3, \XMM1              # Round 4
 930	AESENC	  \TMP3, \XMM2
 931	AESENC	  \TMP3, \XMM3
 932	AESENC	  \TMP3, \XMM4
 933	movdqa	  HashKey_3_k(%rsp), \TMP5
 934	PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
 935	movaps 0x50(%arg1), \TMP3
 936	AESENC	  \TMP3, \XMM1              # Round 5
 937	AESENC	  \TMP3, \XMM2
 938	AESENC	  \TMP3, \XMM3
 939	AESENC	  \TMP3, \XMM4
 940	pxor	  \TMP1, \TMP4
 941# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
 942	pxor	  \XMM6, \XMM5
 943	pxor	  \TMP2, \TMP6
 944	movdqa	  \XMM7, \TMP1
 945	pshufd	  $78, \XMM7, \TMP2
 946	pxor	  \XMM7, \TMP2
 947	movdqa	  HashKey_2(%rsp ), \TMP5
 948
 949        # Multiply TMP5 * HashKey using karatsuba
 950
 951	PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1*b1
 952	movaps 0x60(%arg1), \TMP3
 953	AESENC	  \TMP3, \XMM1              # Round 6
 954	AESENC	  \TMP3, \XMM2
 955	AESENC	  \TMP3, \XMM3
 956	AESENC	  \TMP3, \XMM4
 957	PCLMULQDQ 0x00, \TMP5, \XMM7           # XMM7 = a0*b0
 958	movaps 0x70(%arg1), \TMP3
 959	AESENC	  \TMP3, \XMM1             # Round 7
 960	AESENC	  \TMP3, \XMM2
 961	AESENC	  \TMP3, \XMM3
 962	AESENC	  \TMP3, \XMM4
 963	movdqa	  HashKey_2_k(%rsp), \TMP5
 964	PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
 965	movaps 0x80(%arg1), \TMP3
 966	AESENC	  \TMP3, \XMM1             # Round 8
 967	AESENC	  \TMP3, \XMM2
 968	AESENC	  \TMP3, \XMM3
 969	AESENC	  \TMP3, \XMM4
 970	pxor	  \TMP1, \TMP4
 971# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
 972	pxor	  \XMM7, \XMM5
 973	pxor	  \TMP2, \TMP6
 974
 975        # Multiply XMM8 * HashKey
 976        # XMM8 and TMP5 hold the values for the two operands
 977
 978	movdqa	  \XMM8, \TMP1
 979	pshufd	  $78, \XMM8, \TMP2
 980	pxor	  \XMM8, \TMP2
 981	movdqa	  HashKey(%rsp), \TMP5
 982	PCLMULQDQ 0x11, \TMP5, \TMP1          # TMP1 = a1*b1
 983	movaps 0x90(%arg1), \TMP3
 984	AESENC	  \TMP3, \XMM1            # Round 9
 985	AESENC	  \TMP3, \XMM2
 986	AESENC	  \TMP3, \XMM3
 987	AESENC	  \TMP3, \XMM4
 988	PCLMULQDQ 0x00, \TMP5, \XMM8          # XMM8 = a0*b0
 989	movaps 0xa0(%arg1), \TMP3
 990	AESENCLAST \TMP3, \XMM1           # Round 10
 991	AESENCLAST \TMP3, \XMM2
 992	AESENCLAST \TMP3, \XMM3
 993	AESENCLAST \TMP3, \XMM4
 994	movdqa    HashKey_k(%rsp), \TMP5
 995	PCLMULQDQ 0x00, \TMP5, \TMP2          # TMP2 = (a1+a0)*(b1+b0)
 996	movdqu	  (%arg3,%r11,1), \TMP3
 997	pxor	  \TMP3, \XMM1                 # Ciphertext/Plaintext XOR EK
 998	movdqu	  \XMM1, (%arg2,%r11,1)        # Write to plaintext buffer
 999	movdqa    \TMP3, \XMM1
1000	movdqu	  16(%arg3,%r11,1), \TMP3
1001	pxor	  \TMP3, \XMM2                 # Ciphertext/Plaintext XOR EK
1002	movdqu	  \XMM2, 16(%arg2,%r11,1)      # Write to plaintext buffer
1003	movdqa    \TMP3, \XMM2
1004	movdqu	  32(%arg3,%r11,1), \TMP3
1005	pxor	  \TMP3, \XMM3                 # Ciphertext/Plaintext XOR EK
1006	movdqu	  \XMM3, 32(%arg2,%r11,1)      # Write to plaintext buffer
1007	movdqa    \TMP3, \XMM3
1008	movdqu	  48(%arg3,%r11,1), \TMP3
1009	pxor	  \TMP3, \XMM4                 # Ciphertext/Plaintext XOR EK
1010	movdqu	  \XMM4, 48(%arg2,%r11,1)      # Write to plaintext buffer
1011	movdqa    \TMP3, \XMM4
1012	PSHUFB_XMM %xmm15, \XMM1        # perform a 16 byte swap
1013	PSHUFB_XMM %xmm15, \XMM2	# perform a 16 byte swap
1014	PSHUFB_XMM %xmm15, \XMM3	# perform a 16 byte swap
1015	PSHUFB_XMM %xmm15, \XMM4	# perform a 16 byte swap
1016
1017	pxor	  \TMP4, \TMP1
1018	pxor	  \XMM8, \XMM5
1019	pxor	  \TMP6, \TMP2
1020	pxor	  \TMP1, \TMP2
1021	pxor	  \XMM5, \TMP2
1022	movdqa	  \TMP2, \TMP3
1023	pslldq	  $8, \TMP3                    # left shift TMP3 2 DWs
1024	psrldq	  $8, \TMP2                    # right shift TMP2 2 DWs
1025	pxor	  \TMP3, \XMM5
1026	pxor	  \TMP2, \TMP1	  # accumulate the results in TMP1:XMM5
1027
1028        # first phase of reduction
1029
1030	movdqa    \XMM5, \TMP2
1031	movdqa    \XMM5, \TMP3
1032	movdqa    \XMM5, \TMP4
1033# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
 1034	pslld     $31, \TMP2                   # packed left shift << 31
 1035	pslld     $30, \TMP3                   # packed left shift << 30
 1036	pslld     $25, \TMP4                   # packed left shift << 25
1037	pxor      \TMP3, \TMP2	               # xor the shifted versions
1038	pxor      \TMP4, \TMP2
1039	movdqa    \TMP2, \TMP5
1040	psrldq    $4, \TMP5                    # right shift T5 1 DW
1041	pslldq    $12, \TMP2                   # left shift T2 3 DWs
1042	pxor      \TMP2, \XMM5
1043
1044        # second phase of reduction
1045
1046	movdqa    \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
1047	movdqa    \XMM5,\TMP3
1048	movdqa    \XMM5,\TMP4
 1049	psrld     $1, \TMP2                    # packed right shift >>1
 1050	psrld     $2, \TMP3                    # packed right shift >>2
 1051	psrld     $7, \TMP4                    # packed right shift >>7
1052	pxor      \TMP3,\TMP2		       # xor the shifted versions
1053	pxor      \TMP4,\TMP2
1054	pxor      \TMP5, \TMP2
1055	pxor      \TMP2, \XMM5
 1056	pxor      \TMP1, \XMM5                 # result is in XMM5
1057
1058	pxor	  \XMM5, \XMM1
1059.endm
1060
1061/* GHASH the last 4 ciphertext blocks. */
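/*
* In GF(2^128) terms, with XMM1..XMM4 holding the byte-reflected ciphertext
* blocks, the macro accumulates a single product before one reduction:
*
*	XMMDst = XMM1*H^4 + XMM2*H^3 + XMM3*H^2 + XMM4*H
*/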
1062.macro	GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
1063TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
1064
1065        # Multiply TMP6 * HashKey (using Karatsuba)
1066
1067	movdqa	  \XMM1, \TMP6
1068	pshufd	  $78, \XMM1, \TMP2
1069	pxor	  \XMM1, \TMP2
1070	movdqa	  HashKey_4(%rsp), \TMP5
1071	PCLMULQDQ 0x11, \TMP5, \TMP6       # TMP6 = a1*b1
1072	PCLMULQDQ 0x00, \TMP5, \XMM1       # XMM1 = a0*b0
1073	movdqa	  HashKey_4_k(%rsp), \TMP4
1074	PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1075	movdqa	  \XMM1, \XMMDst
1076	movdqa	  \TMP2, \XMM1              # result in TMP6, XMMDst, XMM1
1077
1078        # Multiply TMP1 * HashKey (using Karatsuba)
1079
1080	movdqa	  \XMM2, \TMP1
1081	pshufd	  $78, \XMM2, \TMP2
1082	pxor	  \XMM2, \TMP2
1083	movdqa	  HashKey_3(%rsp), \TMP5
1084	PCLMULQDQ 0x11, \TMP5, \TMP1       # TMP1 = a1*b1
1085	PCLMULQDQ 0x00, \TMP5, \XMM2       # XMM2 = a0*b0
1086	movdqa	  HashKey_3_k(%rsp), \TMP4
1087	PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1088	pxor	  \TMP1, \TMP6
1089	pxor	  \XMM2, \XMMDst
1090	pxor	  \TMP2, \XMM1
1091# results accumulated in TMP6, XMMDst, XMM1
1092
1093        # Multiply TMP1 * HashKey (using Karatsuba)
1094
1095	movdqa	  \XMM3, \TMP1
1096	pshufd	  $78, \XMM3, \TMP2
1097	pxor	  \XMM3, \TMP2
1098	movdqa	  HashKey_2(%rsp), \TMP5
1099	PCLMULQDQ 0x11, \TMP5, \TMP1       # TMP1 = a1*b1
1100	PCLMULQDQ 0x00, \TMP5, \XMM3       # XMM3 = a0*b0
1101	movdqa	  HashKey_2_k(%rsp), \TMP4
1102	PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1103	pxor	  \TMP1, \TMP6
1104	pxor	  \XMM3, \XMMDst
1105	pxor	  \TMP2, \XMM1   # results accumulated in TMP6, XMMDst, XMM1
1106
1107        # Multiply TMP1 * HashKey (using Karatsuba)
1108	movdqa	  \XMM4, \TMP1
1109	pshufd	  $78, \XMM4, \TMP2
1110	pxor	  \XMM4, \TMP2
1111	movdqa	  HashKey(%rsp), \TMP5
1112	PCLMULQDQ 0x11, \TMP5, \TMP1	    # TMP1 = a1*b1
1113	PCLMULQDQ 0x00, \TMP5, \XMM4       # XMM4 = a0*b0
1114	movdqa	  HashKey_k(%rsp), \TMP4
1115	PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1116	pxor	  \TMP1, \TMP6
1117	pxor	  \XMM4, \XMMDst
1118	pxor	  \XMM1, \TMP2
1119	pxor	  \TMP6, \TMP2
1120	pxor	  \XMMDst, \TMP2
1121	# middle section of the temp results combined as in karatsuba algorithm
1122	movdqa	  \TMP2, \TMP4
1123	pslldq	  $8, \TMP4                 # left shift TMP4 2 DWs
1124	psrldq	  $8, \TMP2                 # right shift TMP2 2 DWs
1125	pxor	  \TMP4, \XMMDst
1126	pxor	  \TMP2, \TMP6
1127# TMP6:XMMDst holds the result of the accumulated carry-less multiplications
1128	# first phase of the reduction
1129	movdqa    \XMMDst, \TMP2
1130	movdqa    \XMMDst, \TMP3
1131	movdqa    \XMMDst, \TMP4
1132# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
 1133	pslld     $31, \TMP2                # packed left shifting << 31
 1134	pslld     $30, \TMP3                # packed left shifting << 30
 1135	pslld     $25, \TMP4                # packed left shifting << 25
1136	pxor      \TMP3, \TMP2              # xor the shifted versions
1137	pxor      \TMP4, \TMP2
1138	movdqa    \TMP2, \TMP7
1139	psrldq    $4, \TMP7                 # right shift TMP7 1 DW
1140	pslldq    $12, \TMP2                # left shift TMP2 3 DWs
1141	pxor      \TMP2, \XMMDst
1142
1143        # second phase of the reduction
1144	movdqa    \XMMDst, \TMP2
1145	# make 3 copies of XMMDst for doing 3 shift operations
1146	movdqa    \XMMDst, \TMP3
1147	movdqa    \XMMDst, \TMP4
 1148	psrld     $1, \TMP2                 # packed right shift >> 1
 1149	psrld     $2, \TMP3                 # packed right shift >> 2
 1150	psrld     $7, \TMP4                 # packed right shift >> 7
1151	pxor      \TMP3, \TMP2              # xor the shifted versions
1152	pxor      \TMP4, \TMP2
1153	pxor      \TMP7, \TMP2
1154	pxor      \TMP2, \XMMDst
1155	pxor      \TMP6, \XMMDst            # reduced result is in XMMDst
1156.endm
1157
 1158/* Encrypt a single block. */
1159.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1
1160
1161	pxor	(%arg1), \XMM0
1162        movaps 16(%arg1), \TMP1
1163	AESENC	\TMP1, \XMM0
1164        movaps 32(%arg1), \TMP1
1165	AESENC	\TMP1, \XMM0
1166        movaps 48(%arg1), \TMP1
1167	AESENC	\TMP1, \XMM0
1168        movaps 64(%arg1), \TMP1
1169	AESENC	\TMP1, \XMM0
1170        movaps 80(%arg1), \TMP1
1171	AESENC	\TMP1, \XMM0
1172        movaps 96(%arg1), \TMP1
1173	AESENC	\TMP1, \XMM0
1174        movaps 112(%arg1), \TMP1
1175	AESENC	\TMP1, \XMM0
1176        movaps 128(%arg1), \TMP1
1177	AESENC	\TMP1, \XMM0
1178        movaps 144(%arg1), \TMP1
1179	AESENC	\TMP1, \XMM0
1180        movaps 160(%arg1), \TMP1
1181	AESENCLAST	\TMP1, \XMM0
1182.endm
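/*
* Intrinsics sketch of the macro above (assumes AES-128, i.e. the 11 round
* keys rk[0..10] that arg1 points to; illustrative only):
*
*	__m128i b = _mm_xor_si128(block, rk[0]);
*	for (int r = 1; r <= 9; r++)
*		b = _mm_aesenc_si128(b, rk[r]);
*	b = _mm_aesenclast_si128(b, rk[10]);
*/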
1183
1184
1185/*****************************************************************************
1186* void aesni_gcm_dec(void *aes_ctx,    // AES Key schedule. Starts on a 16 byte boundary.
 1187*                   u8 *out,           // Plaintext output. Decrypt in-place is allowed.
1188*                   const u8 *in,      // Ciphertext input
1189*                   u64 plaintext_len, // Length of data in bytes for decryption.
1190*                   u8 *iv,            // Pre-counter block j0: 4 byte salt (from Security Association)
1191*                                      // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1192*                                      // concatenated with 0x00000001. 16-byte aligned pointer.
1193*                   u8 *hash_subkey,   // H, the Hash sub key input. Data starts on a 16-byte boundary.
1194*                   const u8 *aad,     // Additional Authentication Data (AAD)
1195*                   u64 aad_len,       // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1196*                   u8  *auth_tag,     // Authenticated Tag output. The driver will compare this to the
1197*                                      // given authentication tag and only return the plaintext if they match.
1198*                   u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
1199*                                      // (most likely), 12 or 8.
1200*
1201* Assumptions:
1202*
1203* keys:
1204*       keys are pre-expanded and aligned to 16 bytes. we are using the first
1205*       set of 11 keys in the data structure void *aes_ctx
1206*
1207* iv:
1208*       0                   1                   2                   3
1209*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1210*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1211*       |                             Salt  (From the SA)               |
1212*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1213*       |                     Initialization Vector                     |
1214*       |         (This is the sequence number from IPSec header)       |
1215*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1216*       |                              0x1                              |
1217*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1218*
1219*
1220*
1221* AAD:
1222*       AAD padded to 128 bits with 0
1223*       for example, assume AAD is a u32 vector
1224*
1225*       if AAD is 8 bytes:
 1226*       AAD[2] = {A0, A1};
1227*       padded AAD in xmm register = {A1 A0 0 0}
1228*
1229*       0                   1                   2                   3
1230*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1231*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1232*       |                               SPI (A1)                        |
1233*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1234*       |                     32-bit Sequence Number (A0)               |
1235*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1236*       |                              0x0                              |
1237*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1238*
1239*                                       AAD Format with 32-bit Sequence Number
1240*
1241*       if AAD is 12 bytes:
1242*       AAD[3] = {A0, A1, A2};
1243*       padded AAD in xmm register = {A2 A1 A0 0}
1244*
1245*       0                   1                   2                   3
1246*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1247*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1250*       |                               SPI (A2)                        |
1251*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1252*       |                 64-bit Extended Sequence Number {A1,A0}       |
1253*       |                                                               |
1254*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1255*       |                              0x0                              |
1256*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1257*
1258*                        AAD Format with 64-bit Extended Sequence Number
1259*
1260* aadLen:
1261*       from the definition of the spec, aadLen can only be 8 or 12 bytes.
 1262*       The code also supports an aadLen of 16; for any other size it will fail.
1263*
1264* TLen:
1265*       from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
1266*       For other sizes, the code will fail.
1267*
1268* poly = x^128 + x^127 + x^126 + x^121 + 1
1269*
1270*****************************************************************************/
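/*
* Call sketch for the prototype above (illustrative only: buffer names and
* lengths are made up, and the real callers are the aesni-intel glue
* routines, which bracket the call with kernel_fpu_begin()/kernel_fpu_end()):
*
*	u8 iv[16] __attribute__((aligned(16)));	// salt | IV | 0x00000001
*	u8 tag[16];
*	aesni_gcm_dec(aes_ctx, plaintext, ciphertext, ciphertext_len,
*		      iv, hash_subkey, aad, 8, tag, 16);
*	// compare 'tag' with the received ICV before trusting the plaintext
*/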
1271ENTRY(aesni_gcm_dec)
1272	push	%r12
1273	push	%r13
1274	push	%r14
1275	mov	%rsp, %r14
1276/*
1277* states of %xmm registers %xmm6:%xmm15 not saved
1278* all %xmm registers are clobbered
1279*/
1280	sub	$VARIABLE_OFFSET, %rsp
1281	and	$~63, %rsp                        # align rsp to 64 bytes
1282	mov	%arg6, %r12
1283	movdqu	(%r12), %xmm13			  # %xmm13 = HashKey
1284        movdqa  SHUF_MASK(%rip), %xmm2
1285	PSHUFB_XMM %xmm2, %xmm13
1286
1287
1288# Precompute HashKey<<1 (mod poly) from the hash key (required for GHASH)
1289
1290	movdqa	%xmm13, %xmm2
1291	psllq	$1, %xmm13
1292	psrlq	$63, %xmm2
1293	movdqa	%xmm2, %xmm1
1294	pslldq	$8, %xmm2
1295	psrldq	$8, %xmm1
1296	por	%xmm2, %xmm13
1297
1298        # Reduction
1299
1300	pshufd	$0x24, %xmm1, %xmm2
1301	pcmpeqd TWOONE(%rip), %xmm2
1302	pand	POLY(%rip), %xmm2
1303	pxor	%xmm2, %xmm13     # %xmm13 holds the HashKey<<1 (mod poly)
1304
1305
1306        # Decrypt first few blocks
1307
1308	movdqa %xmm13, HashKey(%rsp)           # store HashKey<<1 (mod poly)
1309	mov %arg4, %r13    # save the number of bytes of plaintext/ciphertext
1310	and $-16, %r13                      # %r13 = %r13 - (%r13 mod 16)
1311	mov %r13, %r12
1312	and $(3<<4), %r12
1313	jz _initial_num_blocks_is_0_decrypt
1314	cmp $(2<<4), %r12
1315	jb _initial_num_blocks_is_1_decrypt
1316	je _initial_num_blocks_is_2_decrypt
1317_initial_num_blocks_is_3_decrypt:
1318	INITIAL_BLOCKS_DEC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1319%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, dec
1320	sub	$48, %r13
1321	jmp	_initial_blocks_decrypted
1322_initial_num_blocks_is_2_decrypt:
1323	INITIAL_BLOCKS_DEC	2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1324%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, dec
1325	sub	$32, %r13
1326	jmp	_initial_blocks_decrypted
1327_initial_num_blocks_is_1_decrypt:
1328	INITIAL_BLOCKS_DEC	1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1329%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, dec
1330	sub	$16, %r13
1331	jmp	_initial_blocks_decrypted
1332_initial_num_blocks_is_0_decrypt:
1333	INITIAL_BLOCKS_DEC	0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1334%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, dec
1335_initial_blocks_decrypted:
1336	cmp	$0, %r13
1337	je	_zero_cipher_left_decrypt
1338	sub	$64, %r13
1339	je	_four_cipher_left_decrypt
1340_decrypt_by_4:
1341	GHASH_4_ENCRYPT_4_PARALLEL_DEC	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
1342%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, dec
1343	add	$64, %r11
1344	sub	$64, %r13
1345	jne	_decrypt_by_4
1346_four_cipher_left_decrypt:
1347	GHASH_LAST_4	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
1348%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
1349_zero_cipher_left_decrypt:
1350	mov	%arg4, %r13
1351	and	$15, %r13				# %r13 = arg4 (mod 16)
1352	je	_multiple_of_16_bytes_decrypt
1353
1354        # Handle the last <16 byte block separately
1355
1356	paddd ONE(%rip), %xmm0         # increment CNT to get Yn
1357        movdqa SHUF_MASK(%rip), %xmm10
1358	PSHUFB_XMM %xmm10, %xmm0
1359
1360	ENCRYPT_SINGLE_BLOCK  %xmm0, %xmm1    # E(K, Yn)
1361	sub $16, %r11
1362	add %r13, %r11
1363	movdqu (%arg3,%r11,1), %xmm1   # receive the last <16 byte block
1364	lea SHIFT_MASK+16(%rip), %r12
1365	sub %r13, %r12
1366# adjust the shuffle mask pointer to be able to shift 16-%r13 bytes
1367# (%r13 is the number of bytes in plaintext mod 16)
1368	movdqu (%r12), %xmm2           # get the appropriate shuffle mask
 1369	PSHUFB_XMM %xmm2, %xmm1            # right shift 16-%r13 bytes
1370
1371	movdqa  %xmm1, %xmm2
1372	pxor %xmm1, %xmm0            # Ciphertext XOR E(K, Yn)
1373	movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
1374	# get the appropriate mask to mask out top 16-%r13 bytes of %xmm0
1375	pand %xmm1, %xmm0            # mask out top 16-%r13 bytes of %xmm0
1376	pand    %xmm1, %xmm2
1377        movdqa SHUF_MASK(%rip), %xmm10
1378	PSHUFB_XMM %xmm10 ,%xmm2
1379
1380	pxor %xmm2, %xmm8
1381	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1382	          # GHASH computation for the last <16 byte block
1383	sub %r13, %r11
1384	add $16, %r11
1385
1386        # output %r13 bytes
1387	MOVQ_R64_XMM	%xmm0, %rax
1388	cmp	$8, %r13
1389	jle	_less_than_8_bytes_left_decrypt
1390	mov	%rax, (%arg2 , %r11, 1)
1391	add	$8, %r11
1392	psrldq	$8, %xmm0
1393	MOVQ_R64_XMM	%xmm0, %rax
1394	sub	$8, %r13
1395_less_than_8_bytes_left_decrypt:
1396	mov	%al,  (%arg2, %r11, 1)
1397	add	$1, %r11
1398	shr	$8, %rax
1399	sub	$1, %r13
1400	jne	_less_than_8_bytes_left_decrypt
1401_multiple_of_16_bytes_decrypt:
 1402	mov	arg8, %r12		  # %r12 = aadLen (number of bytes)
1403	shl	$3, %r12		  # convert into number of bits
1404	movd	%r12d, %xmm15		  # len(A) in %xmm15
 1405	shl	$3, %arg4		  # len(C) in bits (*8)
1406	MOVQ_R64_XMM	%arg4, %xmm1
1407	pslldq	$8, %xmm15		  # %xmm15 = len(A)||0x0000000000000000
1408	pxor	%xmm1, %xmm15		  # %xmm15 = len(A)||len(C)
1409	pxor	%xmm15, %xmm8
1410	GHASH_MUL	%xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1411	         # final GHASH computation
1412        movdqa SHUF_MASK(%rip), %xmm10
1413	PSHUFB_XMM %xmm10, %xmm8
1414
1415	mov	%arg5, %rax		  # %rax = *Y0
1416	movdqu	(%rax), %xmm0		  # %xmm0 = Y0
1417	ENCRYPT_SINGLE_BLOCK	%xmm0,  %xmm1	  # E(K, Y0)
1418	pxor	%xmm8, %xmm0
1419_return_T_decrypt:
1420	mov	arg9, %r10                # %r10 = authTag
1421	mov	arg10, %r11               # %r11 = auth_tag_len
1422	cmp	$16, %r11
1423	je	_T_16_decrypt
1424	cmp	$12, %r11
1425	je	_T_12_decrypt
1426_T_8_decrypt:
1427	MOVQ_R64_XMM	%xmm0, %rax
1428	mov	%rax, (%r10)
1429	jmp	_return_T_done_decrypt
1430_T_12_decrypt:
1431	MOVQ_R64_XMM	%xmm0, %rax
1432	mov	%rax, (%r10)
1433	psrldq	$8, %xmm0
1434	movd	%xmm0, %eax
1435	mov	%eax, 8(%r10)
1436	jmp	_return_T_done_decrypt
1437_T_16_decrypt:
1438	movdqu	%xmm0, (%r10)
1439_return_T_done_decrypt:
1440	mov	%r14, %rsp
1441	pop	%r14
1442	pop	%r13
1443	pop	%r12
1444	ret
1445ENDPROC(aesni_gcm_dec)
1446
1447
1448/*****************************************************************************
1449* void aesni_gcm_enc(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1450*                    u8 *out,            // Ciphertext output. Encrypt in-place is allowed.
1451*                    const u8 *in,       // Plaintext input
1452*                    u64 plaintext_len,  // Length of data in bytes for encryption.
1453*                    u8 *iv,             // Pre-counter block j0: 4 byte salt (from Security Association)
1454*                                        // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1455*                                        // concatenated with 0x00000001. 16-byte aligned pointer.
1456*                    u8 *hash_subkey,    // H, the Hash sub key input. Data starts on a 16-byte boundary.
1457*                    const u8 *aad,      // Additional Authentication Data (AAD)
1458*                    u64 aad_len,        // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1459*                    u8 *auth_tag,       // Authenticated Tag output.
1460*                    u64 auth_tag_len);  // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
1461*                                        // 12 or 8.
1462*
1463* Assumptions:
1464*
1465* keys:
1466*       keys are pre-expanded and aligned to 16 bytes. we are using the
1467*       first set of 11 keys in the data structure void *aes_ctx
1468*
1469*
1470* iv:
1471*       0                   1                   2                   3
1472*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1473*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1474*       |                             Salt  (From the SA)               |
1475*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1476*       |                     Initialization Vector                     |
1477*       |         (This is the sequence number from IPSec header)       |
1478*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1479*       |                              0x1                              |
1480*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1481*
1482*
1483*
1484* AAD:
1485*       AAD padded to 128 bits with 0
1486*       for example, assume AAD is a u32 vector
1487*
1488*       if AAD is 8 bytes:
 1489*       AAD[2] = {A0, A1};
1490*       padded AAD in xmm register = {A1 A0 0 0}
1491*
1492*       0                   1                   2                   3
1493*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1494*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1495*       |                               SPI (A1)                        |
1496*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1497*       |                     32-bit Sequence Number (A0)               |
1498*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1499*       |                              0x0                              |
1500*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1501*
1502*                                 AAD Format with 32-bit Sequence Number
1503*
1504*       if AAD is 12 bytes:
1505*       AAD[3] = {A0, A1, A2};
1506*       padded AAD in xmm register = {A2 A1 A0 0}
1507*
1508*       0                   1                   2                   3
1509*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1510*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1511*       |                               SPI (A2)                        |
1512*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1513*       |                 64-bit Extended Sequence Number {A1,A0}       |
1514*       |                                                               |
1515*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1516*       |                              0x0                              |
1517*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1518*
1519*                         AAD Format with 64-bit Extended Sequence Number
1520*
1521* aadLen:
1522*       From the definition of the spec, aadLen can only be 8 or 12 bytes.
1523*       The code supports 16 too, but for other sizes it will fail.
1524*
1525* TLen:
1526*       From the definition of the spec, TLen can only be 8, 12 or 16 bytes.
1527*       For other sizes, the code will fail.
1528*
1529* poly = x^128 + x^127 + x^126 + x^121 + 1
1530***************************************************************************/
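/*
 * Illustrative caller-side sketch (not part of this file): how the
 * prototype documented above might be driven from C for an RFC4106-style
 * request.  Buffer names are hypothetical; only the argument order and the
 * pre-counter block layout (salt || IV || 0x00000001) come from the comment
 * block above.
 *
 *	u8 j0[16] __attribute__((aligned(16)));	// pre-counter block
 *	memcpy(j0, salt, 4);			// 4-byte salt from the SA
 *	memcpy(j0 + 4, esp_iv, 8);		// 8-byte IV from the ESP payload
 *	j0[12] = 0; j0[13] = 0; j0[14] = 0; j0[15] = 1;
 *
 *	aesni_gcm_enc(aes_ctx, dst, src, src_len, j0,
 *		      hash_subkey, aad, aad_len, auth_tag, 16);
 */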
1531ENTRY(aesni_gcm_enc)
1532	push	%r12
1533	push	%r13
1534	push	%r14
1535	mov	%rsp, %r14
1536#
1537# states of %xmm registers %xmm6:%xmm15 not saved
1538# all %xmm registers are clobbered
1539#
1540	sub	$VARIABLE_OFFSET, %rsp
1541	and	$~63, %rsp
1542	mov	%arg6, %r12
1543	movdqu	(%r12), %xmm13
1544        movdqa  SHUF_MASK(%rip), %xmm2
1545	PSHUFB_XMM %xmm2, %xmm13
1546
1547
1548# precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
1549
1550	movdqa	%xmm13, %xmm2
1551	psllq	$1, %xmm13
1552	psrlq	$63, %xmm2
1553	movdqa	%xmm2, %xmm1
1554	pslldq	$8, %xmm2
1555	psrldq	$8, %xmm1
1556	por	%xmm2, %xmm13
1557
1558        # reduce HashKey<<1
1559
1560	pshufd	$0x24, %xmm1, %xmm2
1561	pcmpeqd TWOONE(%rip), %xmm2
1562	pand	POLY(%rip), %xmm2
1563	pxor	%xmm2, %xmm13
1564	movdqa	%xmm13, HashKey(%rsp)
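	# Viewed as arithmetic on the 128-bit value (hi:lo) now held in
	# %xmm13, the doubling and reduction above are roughly the following
	# C sketch (hi/lo name the two qwords of the byte-swapped HashKey;
	# the two constants are the halves of POLY):
	#
	#	carry = hi >> 63;
	#	hi    = (hi << 1) | (lo >> 63);
	#	lo    = lo << 1;
	#	if (carry) {
	#		hi ^= 0xC200000000000000ULL;
	#		lo ^= 0x0000000000000001ULL;
	#	}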
1565	mov	%arg4, %r13            # %xmm13 holds HashKey<<1 (mod poly)
1566	and	$-16, %r13
1567	mov	%r13, %r12
1568
1569        # Encrypt first few blocks
1570
1571	and	$(3<<4), %r12
1572	jz	_initial_num_blocks_is_0_encrypt
1573	cmp	$(2<<4), %r12
1574	jb	_initial_num_blocks_is_1_encrypt
1575	je	_initial_num_blocks_is_2_encrypt
1576_initial_num_blocks_is_3_encrypt:
1577	INITIAL_BLOCKS_ENC	3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1578%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, enc
1579	sub	$48, %r13
1580	jmp	_initial_blocks_encrypted
1581_initial_num_blocks_is_2_encrypt:
1582	INITIAL_BLOCKS_ENC	2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1583%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, enc
1584	sub	$32, %r13
1585	jmp	_initial_blocks_encrypted
1586_initial_num_blocks_is_1_encrypt:
1587	INITIAL_BLOCKS_ENC	1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1588%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, enc
1589	sub	$16, %r13
1590	jmp	_initial_blocks_encrypted
1591_initial_num_blocks_is_0_encrypt:
1592	INITIAL_BLOCKS_ENC	0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1593%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, enc
1594_initial_blocks_encrypted:
1595
1596        # Main loop - Encrypt remaining blocks
1597
1598	cmp	$0, %r13
1599	je	_zero_cipher_left_encrypt
1600	sub	$64, %r13
1601	je	_four_cipher_left_encrypt
1602_encrypt_by_4_encrypt:
1603	GHASH_4_ENCRYPT_4_PARALLEL_ENC	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
1604%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, enc
1605	add	$64, %r11
1606	sub	$64, %r13
1607	jne	_encrypt_by_4_encrypt
1608_four_cipher_left_encrypt:
1609	GHASH_LAST_4	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
1610%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
1611_zero_cipher_left_encrypt:
1612	mov	%arg4, %r13
1613	and	$15, %r13			# %r13 = arg4 (mod 16)
1614	je	_multiple_of_16_bytes_encrypt
1615
1616         # Handle the last <16 Byte block separately
1617	paddd ONE(%rip), %xmm0                # INCR CNT to get Yn
1618        movdqa SHUF_MASK(%rip), %xmm10
1619	PSHUFB_XMM %xmm10, %xmm0
1620
1621
1622	ENCRYPT_SINGLE_BLOCK	%xmm0, %xmm1        # Encrypt(K, Yn)
1623	sub $16, %r11
1624	add %r13, %r11
1625	movdqu (%arg3,%r11,1), %xmm1     # receive the last <16 byte blocks
1626	lea SHIFT_MASK+16(%rip), %r12
1627	sub %r13, %r12
1628	# adjust the shuffle mask pointer to be able to shift 16-r13 bytes
1629	# (%r13 is the number of bytes in plaintext mod 16)
1630	movdqu	(%r12), %xmm2           # get the appropriate shuffle mask
1631	PSHUFB_XMM	%xmm2, %xmm1            # shift right 16-r13 byte
1632	pxor	%xmm1, %xmm0            # Plaintext XOR Encrypt(K, Yn)
1633	movdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
1634	# get the appropriate mask to mask out top 16-r13 bytes of xmm0
1635	pand	%xmm1, %xmm0            # mask out top 16-r13 bytes of xmm0
1636        movdqa SHUF_MASK(%rip), %xmm10
1637	PSHUFB_XMM %xmm10,%xmm0
1638
1639	pxor	%xmm0, %xmm8
1640	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1641	# GHASH computation for the last <16 byte block
1642	sub	%r13, %r11
1643	add	$16, %r11
1644
1645	movdqa SHUF_MASK(%rip), %xmm10
1646	PSHUFB_XMM %xmm10, %xmm0
1647
1648	# shuffle xmm0 back to output as ciphertext
1649
1650        # Output %r13 bytes
1651	MOVQ_R64_XMM %xmm0, %rax
1652	cmp $8, %r13
1653	jle _less_than_8_bytes_left_encrypt
1654	mov %rax, (%arg2 , %r11, 1)
1655	add $8, %r11
1656	psrldq $8, %xmm0
1657	MOVQ_R64_XMM %xmm0, %rax
1658	sub $8, %r13
1659_less_than_8_bytes_left_encrypt:
1660	mov %al,  (%arg2, %r11, 1)
1661	add $1, %r11
1662	shr $8, %rax
1663	sub $1, %r13
1664	jne _less_than_8_bytes_left_encrypt
1665_multiple_of_16_bytes_encrypt:
1666	mov	arg8, %r12    # %r12 = aadLen (number of bytes)
1667	shl	$3, %r12
1668	movd	%r12d, %xmm15       # len(A) in %xmm15
1669	shl	$3, %arg4               # len(C) in bits (*8)
1670	MOVQ_R64_XMM	%arg4, %xmm1
1671	pslldq	$8, %xmm15          # %xmm15 = len(A)||0x0000000000000000
1672	pxor	%xmm1, %xmm15       # %xmm15 = len(A)||len(C)
1673	pxor	%xmm15, %xmm8
1674	GHASH_MUL	%xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1675	# final GHASH computation
1676        movdqa SHUF_MASK(%rip), %xmm10
1677	PSHUFB_XMM %xmm10, %xmm8         # perform a 16 byte swap
1678
1679	mov	%arg5, %rax		       # %rax  = *Y0
1680	movdqu	(%rax), %xmm0		       # %xmm0 = Y0
1681	ENCRYPT_SINGLE_BLOCK	%xmm0, %xmm15         # Encrypt(K, Y0)
1682	pxor	%xmm8, %xmm0
1683_return_T_encrypt:
1684	mov	arg9, %r10                     # %r10 = authTag
1685	mov	arg10, %r11                    # %r11 = auth_tag_len
1686	cmp	$16, %r11
1687	je	_T_16_encrypt
1688	cmp	$12, %r11
1689	je	_T_12_encrypt
1690_T_8_encrypt:
1691	MOVQ_R64_XMM	%xmm0, %rax
1692	mov	%rax, (%r10)
1693	jmp	_return_T_done_encrypt
1694_T_12_encrypt:
1695	MOVQ_R64_XMM	%xmm0, %rax
1696	mov	%rax, (%r10)
1697	psrldq	$8, %xmm0
1698	movd	%xmm0, %eax
1699	mov	%eax, 8(%r10)
1700	jmp	_return_T_done_encrypt
1701_T_16_encrypt:
1702	movdqu	%xmm0, (%r10)
1703_return_T_done_encrypt:
1704	mov	%r14, %rsp
1705	pop	%r14
1706	pop	%r13
1707	pop	%r12
1708	ret
1709ENDPROC(aesni_gcm_enc)
1710
1711#endif
1712
1713
1714.align 4
1715_key_expansion_128:
1716_key_expansion_256a:
1717	pshufd $0b11111111, %xmm1, %xmm1
1718	shufps $0b00010000, %xmm0, %xmm4
1719	pxor %xmm4, %xmm0
1720	shufps $0b10001100, %xmm0, %xmm4
1721	pxor %xmm4, %xmm0
1722	pxor %xmm1, %xmm0
1723	movaps %xmm0, (TKEYP)
1724	add $0x10, TKEYP
1725	ret
1726ENDPROC(_key_expansion_128)
1727ENDPROC(_key_expansion_256a)
1728
1729.align 4
1730_key_expansion_192a:
1731	pshufd $0b01010101, %xmm1, %xmm1
1732	shufps $0b00010000, %xmm0, %xmm4
1733	pxor %xmm4, %xmm0
1734	shufps $0b10001100, %xmm0, %xmm4
1735	pxor %xmm4, %xmm0
1736	pxor %xmm1, %xmm0
1737
1738	movaps %xmm2, %xmm5
1739	movaps %xmm2, %xmm6
1740	pslldq $4, %xmm5
1741	pshufd $0b11111111, %xmm0, %xmm3
1742	pxor %xmm3, %xmm2
1743	pxor %xmm5, %xmm2
1744
1745	movaps %xmm0, %xmm1
1746	shufps $0b01000100, %xmm0, %xmm6
1747	movaps %xmm6, (TKEYP)
1748	shufps $0b01001110, %xmm2, %xmm1
1749	movaps %xmm1, 0x10(TKEYP)
1750	add $0x20, TKEYP
1751	ret
1752ENDPROC(_key_expansion_192a)
1753
1754.align 4
1755_key_expansion_192b:
1756	pshufd $0b01010101, %xmm1, %xmm1
1757	shufps $0b00010000, %xmm0, %xmm4
1758	pxor %xmm4, %xmm0
1759	shufps $0b10001100, %xmm0, %xmm4
1760	pxor %xmm4, %xmm0
1761	pxor %xmm1, %xmm0
1762
1763	movaps %xmm2, %xmm5
1764	pslldq $4, %xmm5
1765	pshufd $0b11111111, %xmm0, %xmm3
1766	pxor %xmm3, %xmm2
1767	pxor %xmm5, %xmm2
1768
1769	movaps %xmm0, (TKEYP)
1770	add $0x10, TKEYP
1771	ret
1772ENDPROC(_key_expansion_192b)
1773
1774.align 4
1775_key_expansion_256b:
1776	pshufd $0b10101010, %xmm1, %xmm1
1777	shufps $0b00010000, %xmm2, %xmm4
1778	pxor %xmm4, %xmm2
1779	shufps $0b10001100, %xmm2, %xmm4
1780	pxor %xmm4, %xmm2
1781	pxor %xmm1, %xmm2
1782	movaps %xmm2, (TKEYP)
1783	add $0x10, TKEYP
1784	ret
1785ENDPROC(_key_expansion_256b)
1786
1787/*
1788 * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
1789 *                   unsigned int key_len)
1790 */
1791ENTRY(aesni_set_key)
1792#ifndef __x86_64__
1793	pushl KEYP
1794	movl 8(%esp), KEYP		# ctx
1795	movl 12(%esp), UKEYP		# in_key
1796	movl 16(%esp), %edx		# key_len
1797#endif
1798	movups (UKEYP), %xmm0		# user key (first 16 bytes)
1799	movaps %xmm0, (KEYP)
1800	lea 0x10(KEYP), TKEYP		# key addr
1801	movl %edx, 480(KEYP)
1802	pxor %xmm4, %xmm4		# xmm4 is assumed 0 in _key_expansion_x
1803	cmp $24, %dl
1804	jb .Lenc_key128
1805	je .Lenc_key192
1806	movups 0x10(UKEYP), %xmm2	# other user key
1807	movaps %xmm2, (TKEYP)
1808	add $0x10, TKEYP
1809	AESKEYGENASSIST 0x1 %xmm2 %xmm1		# round 1
1810	call _key_expansion_256a
1811	AESKEYGENASSIST 0x1 %xmm0 %xmm1
1812	call _key_expansion_256b
1813	AESKEYGENASSIST 0x2 %xmm2 %xmm1		# round 2
1814	call _key_expansion_256a
1815	AESKEYGENASSIST 0x2 %xmm0 %xmm1
1816	call _key_expansion_256b
1817	AESKEYGENASSIST 0x4 %xmm2 %xmm1		# round 3
1818	call _key_expansion_256a
1819	AESKEYGENASSIST 0x4 %xmm0 %xmm1
1820	call _key_expansion_256b
1821	AESKEYGENASSIST 0x8 %xmm2 %xmm1		# round 4
1822	call _key_expansion_256a
1823	AESKEYGENASSIST 0x8 %xmm0 %xmm1
1824	call _key_expansion_256b
1825	AESKEYGENASSIST 0x10 %xmm2 %xmm1	# round 5
1826	call _key_expansion_256a
1827	AESKEYGENASSIST 0x10 %xmm0 %xmm1
1828	call _key_expansion_256b
1829	AESKEYGENASSIST 0x20 %xmm2 %xmm1	# round 6
1830	call _key_expansion_256a
1831	AESKEYGENASSIST 0x20 %xmm0 %xmm1
1832	call _key_expansion_256b
1833	AESKEYGENASSIST 0x40 %xmm2 %xmm1	# round 7
1834	call _key_expansion_256a
1835	jmp .Ldec_key
1836.Lenc_key192:
1837	movq 0x10(UKEYP), %xmm2		# other user key
1838	AESKEYGENASSIST 0x1 %xmm2 %xmm1		# round 1
1839	call _key_expansion_192a
1840	AESKEYGENASSIST 0x2 %xmm2 %xmm1		# round 2
1841	call _key_expansion_192b
1842	AESKEYGENASSIST 0x4 %xmm2 %xmm1		# round 3
1843	call _key_expansion_192a
1844	AESKEYGENASSIST 0x8 %xmm2 %xmm1		# round 4
1845	call _key_expansion_192b
1846	AESKEYGENASSIST 0x10 %xmm2 %xmm1	# round 5
1847	call _key_expansion_192a
1848	AESKEYGENASSIST 0x20 %xmm2 %xmm1	# round 6
1849	call _key_expansion_192b
1850	AESKEYGENASSIST 0x40 %xmm2 %xmm1	# round 7
1851	call _key_expansion_192a
1852	AESKEYGENASSIST 0x80 %xmm2 %xmm1	# round 8
1853	call _key_expansion_192b
1854	jmp .Ldec_key
1855.Lenc_key128:
1856	AESKEYGENASSIST 0x1 %xmm0 %xmm1		# round 1
1857	call _key_expansion_128
1858	AESKEYGENASSIST 0x2 %xmm0 %xmm1		# round 2
1859	call _key_expansion_128
1860	AESKEYGENASSIST 0x4 %xmm0 %xmm1		# round 3
1861	call _key_expansion_128
1862	AESKEYGENASSIST 0x8 %xmm0 %xmm1		# round 4
1863	call _key_expansion_128
1864	AESKEYGENASSIST 0x10 %xmm0 %xmm1	# round 5
1865	call _key_expansion_128
1866	AESKEYGENASSIST 0x20 %xmm0 %xmm1	# round 6
1867	call _key_expansion_128
1868	AESKEYGENASSIST 0x40 %xmm0 %xmm1	# round 7
1869	call _key_expansion_128
1870	AESKEYGENASSIST 0x80 %xmm0 %xmm1	# round 8
1871	call _key_expansion_128
1872	AESKEYGENASSIST 0x1b %xmm0 %xmm1	# round 9
1873	call _key_expansion_128
1874	AESKEYGENASSIST 0x36 %xmm0 %xmm1	# round 10
1875	call _key_expansion_128
1876.Ldec_key:
1877	sub $0x10, TKEYP
1878	movaps (KEYP), %xmm0
1879	movaps (TKEYP), %xmm1
1880	movaps %xmm0, 240(TKEYP)
1881	movaps %xmm1, 240(KEYP)
1882	add $0x10, KEYP
1883	lea 240-16(TKEYP), UKEYP
1884.align 4
1885.Ldec_key_loop:
1886	movaps (KEYP), %xmm0
1887	AESIMC %xmm0 %xmm1
1888	movaps %xmm1, (UKEYP)
1889	add $0x10, KEYP
1890	sub $0x10, UKEYP
1891	cmp TKEYP, KEYP
1892	jb .Ldec_key_loop
1893	xor AREG, AREG
1894#ifndef __x86_64__
1895	popl KEYP
1896#endif
1897	ret
1898ENDPROC(aesni_set_key)
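/*
 * The .Ldec_key block above builds the "equivalent inverse cipher" key
 * schedule used by AESDEC.  A minimal C sketch of the same derivation with
 * AES-NI intrinsics (nr, enc[] and dec[] are hypothetical names;
 * _mm_aesimc_si128 comes from <wmmintrin.h>):
 *
 *	dec[0]  = enc[nr];
 *	for (int i = 1; i < nr; i++)
 *		dec[i] = _mm_aesimc_si128(enc[nr - i]);
 *	dec[nr] = enc[0];
 */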
1899
1900/*
1901 * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
1902 */
1903ENTRY(aesni_enc)
1904#ifndef __x86_64__
1905	pushl KEYP
1906	pushl KLEN
1907	movl 12(%esp), KEYP
1908	movl 16(%esp), OUTP
1909	movl 20(%esp), INP
1910#endif
1911	movl 480(KEYP), KLEN		# key length
1912	movups (INP), STATE		# input
1913	call _aesni_enc1
1914	movups STATE, (OUTP)		# output
1915#ifndef __x86_64__
1916	popl KLEN
1917	popl KEYP
1918#endif
1919	ret
1920ENDPROC(aesni_enc)
1921
1922/*
1923 * _aesni_enc1:		internal ABI
1924 * input:
1925 *	KEYP:		key struct pointer
1926 *	KLEN:		key length
1927 *	STATE:		initial state (input)
1928 * output:
1929 *	STATE:		final state (output)
1930 * changed:
1931 *	KEY
1932 *	TKEYP (T1)
1933 */
1934.align 4
1935_aesni_enc1:
1936	movaps (KEYP), KEY		# key
1937	mov KEYP, TKEYP
1938	pxor KEY, STATE		# round 0
1939	add $0x30, TKEYP
1940	cmp $24, KLEN
1941	jb .Lenc128
1942	lea 0x20(TKEYP), TKEYP
1943	je .Lenc192
1944	add $0x20, TKEYP
1945	movaps -0x60(TKEYP), KEY
1946	AESENC KEY STATE
1947	movaps -0x50(TKEYP), KEY
1948	AESENC KEY STATE
1949.align 4
1950.Lenc192:
1951	movaps -0x40(TKEYP), KEY
1952	AESENC KEY STATE
1953	movaps -0x30(TKEYP), KEY
1954	AESENC KEY STATE
1955.align 4
1956.Lenc128:
1957	movaps -0x20(TKEYP), KEY
1958	AESENC KEY STATE
1959	movaps -0x10(TKEYP), KEY
1960	AESENC KEY STATE
1961	movaps (TKEYP), KEY
1962	AESENC KEY STATE
1963	movaps 0x10(TKEYP), KEY
1964	AESENC KEY STATE
1965	movaps 0x20(TKEYP), KEY
1966	AESENC KEY STATE
1967	movaps 0x30(TKEYP), KEY
1968	AESENC KEY STATE
1969	movaps 0x40(TKEYP), KEY
1970	AESENC KEY STATE
1971	movaps 0x50(TKEYP), KEY
1972	AESENC KEY STATE
1973	movaps 0x60(TKEYP), KEY
1974	AESENC KEY STATE
1975	movaps 0x70(TKEYP), KEY
1976	AESENCLAST KEY STATE
1977	ret
1978ENDPROC(_aesni_enc1)
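/*
 * _aesni_enc1 above is the textbook AES-NI round sequence.  A self-contained
 * C sketch of the 10-round (128-bit key) case, with rk[] a hypothetical
 * array of 11 expanded round keys:
 *
 *	#include <wmmintrin.h>
 *
 *	static __m128i aes128_enc_block(__m128i block, const __m128i rk[11])
 *	{
 *		block = _mm_xor_si128(block, rk[0]);	// round 0
 *		for (int i = 1; i < 10; i++)
 *			block = _mm_aesenc_si128(block, rk[i]);
 *		return _mm_aesenclast_si128(block, rk[10]);
 *	}
 *
 * The 192- and 256-bit cases only insert extra _mm_aesenc_si128 rounds,
 * which is what the KLEN comparison at the top of the routine selects.
 */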
1979
1980/*
1981 * _aesni_enc4:	internal ABI
1982 * input:
1983 *	KEYP:		key struct pointer
1984 *	KLEN:		key length
1985 *	STATE1:		initial state (input)
1986 *	STATE2
1987 *	STATE3
1988 *	STATE4
1989 * output:
1990 *	STATE1:		final state (output)
1991 *	STATE2
1992 *	STATE3
1993 *	STATE4
1994 * changed:
1995 *	KEY
1996 *	TKEYP (T1)
1997 */
1998.align 4
1999_aesni_enc4:
2000	movaps (KEYP), KEY		# key
2001	mov KEYP, TKEYP
2002	pxor KEY, STATE1		# round 0
2003	pxor KEY, STATE2
2004	pxor KEY, STATE3
2005	pxor KEY, STATE4
2006	add $0x30, TKEYP
2007	cmp $24, KLEN
2008	jb .L4enc128
2009	lea 0x20(TKEYP), TKEYP
2010	je .L4enc192
2011	add $0x20, TKEYP
2012	movaps -0x60(TKEYP), KEY
2013	AESENC KEY STATE1
2014	AESENC KEY STATE2
2015	AESENC KEY STATE3
2016	AESENC KEY STATE4
2017	movaps -0x50(TKEYP), KEY
2018	AESENC KEY STATE1
2019	AESENC KEY STATE2
2020	AESENC KEY STATE3
2021	AESENC KEY STATE4
2022#.align 4
2023.L4enc192:
2024	movaps -0x40(TKEYP), KEY
2025	AESENC KEY STATE1
2026	AESENC KEY STATE2
2027	AESENC KEY STATE3
2028	AESENC KEY STATE4
2029	movaps -0x30(TKEYP), KEY
2030	AESENC KEY STATE1
2031	AESENC KEY STATE2
2032	AESENC KEY STATE3
2033	AESENC KEY STATE4
2034#.align 4
2035.L4enc128:
2036	movaps -0x20(TKEYP), KEY
2037	AESENC KEY STATE1
2038	AESENC KEY STATE2
2039	AESENC KEY STATE3
2040	AESENC KEY STATE4
2041	movaps -0x10(TKEYP), KEY
2042	AESENC KEY STATE1
2043	AESENC KEY STATE2
2044	AESENC KEY STATE3
2045	AESENC KEY STATE4
2046	movaps (TKEYP), KEY
2047	AESENC KEY STATE1
2048	AESENC KEY STATE2
2049	AESENC KEY STATE3
2050	AESENC KEY STATE4
2051	movaps 0x10(TKEYP), KEY
2052	AESENC KEY STATE1
2053	AESENC KEY STATE2
2054	AESENC KEY STATE3
2055	AESENC KEY STATE4
2056	movaps 0x20(TKEYP), KEY
2057	AESENC KEY STATE1
2058	AESENC KEY STATE2
2059	AESENC KEY STATE3
2060	AESENC KEY STATE4
2061	movaps 0x30(TKEYP), KEY
2062	AESENC KEY STATE1
2063	AESENC KEY STATE2
2064	AESENC KEY STATE3
2065	AESENC KEY STATE4
2066	movaps 0x40(TKEYP), KEY
2067	AESENC KEY STATE1
2068	AESENC KEY STATE2
2069	AESENC KEY STATE3
2070	AESENC KEY STATE4
2071	movaps 0x50(TKEYP), KEY
2072	AESENC KEY STATE1
2073	AESENC KEY STATE2
2074	AESENC KEY STATE3
2075	AESENC KEY STATE4
2076	movaps 0x60(TKEYP), KEY
2077	AESENC KEY STATE1
2078	AESENC KEY STATE2
2079	AESENC KEY STATE3
2080	AESENC KEY STATE4
2081	movaps 0x70(TKEYP), KEY
2082	AESENCLAST KEY STATE1		# last round
2083	AESENCLAST KEY STATE2
2084	AESENCLAST KEY STATE3
2085	AESENCLAST KEY STATE4
2086	ret
2087ENDPROC(_aesni_enc4)
2088
2089/*
2090 * void aesni_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
2091 */
2092ENTRY(aesni_dec)
2093#ifndef __x86_64__
2094	pushl KEYP
2095	pushl KLEN
2096	movl 12(%esp), KEYP
2097	movl 16(%esp), OUTP
2098	movl 20(%esp), INP
2099#endif
2100	mov 480(KEYP), KLEN		# key length
2101	add $240, KEYP
2102	movups (INP), STATE		# input
2103	call _aesni_dec1
2104	movups STATE, (OUTP)		#output
2105#ifndef __x86_64__
2106	popl KLEN
2107	popl KEYP
2108#endif
2109	ret
2110ENDPROC(aesni_dec)
2111
2112/*
2113 * _aesni_dec1:		internal ABI
2114 * input:
2115 *	KEYP:		key struct pointer
2116 *	KLEN:		key length
2117 *	STATE:		initial state (input)
2118 * output:
2119 *	STATE:		final state (output)
2120 * changed:
2121 *	KEY
2122 *	TKEYP (T1)
2123 */
2124.align 4
2125_aesni_dec1:
2126	movaps (KEYP), KEY		# key
2127	mov KEYP, TKEYP
2128	pxor KEY, STATE		# round 0
2129	add $0x30, TKEYP
2130	cmp $24, KLEN
2131	jb .Ldec128
2132	lea 0x20(TKEYP), TKEYP
2133	je .Ldec192
2134	add $0x20, TKEYP
2135	movaps -0x60(TKEYP), KEY
2136	AESDEC KEY STATE
2137	movaps -0x50(TKEYP), KEY
2138	AESDEC KEY STATE
2139.align 4
2140.Ldec192:
2141	movaps -0x40(TKEYP), KEY
2142	AESDEC KEY STATE
2143	movaps -0x30(TKEYP), KEY
2144	AESDEC KEY STATE
2145.align 4
2146.Ldec128:
2147	movaps -0x20(TKEYP), KEY
2148	AESDEC KEY STATE
2149	movaps -0x10(TKEYP), KEY
2150	AESDEC KEY STATE
2151	movaps (TKEYP), KEY
2152	AESDEC KEY STATE
2153	movaps 0x10(TKEYP), KEY
2154	AESDEC KEY STATE
2155	movaps 0x20(TKEYP), KEY
2156	AESDEC KEY STATE
2157	movaps 0x30(TKEYP), KEY
2158	AESDEC KEY STATE
2159	movaps 0x40(TKEYP), KEY
2160	AESDEC KEY STATE
2161	movaps 0x50(TKEYP), KEY
2162	AESDEC KEY STATE
2163	movaps 0x60(TKEYP), KEY
2164	AESDEC KEY STATE
2165	movaps 0x70(TKEYP), KEY
2166	AESDECLAST KEY STATE
2167	ret
2168ENDPROC(_aesni_dec1)
2169
2170/*
2171 * _aesni_dec4:	internal ABI
2172 * input:
2173 *	KEYP:		key struct pointer
2174 *	KLEN:		key length
2175 *	STATE1:		initial state (input)
2176 *	STATE2
2177 *	STATE3
2178 *	STATE4
2179 * output:
2180 *	STATE1:		final state (output)
2181 *	STATE2
2182 *	STATE3
2183 *	STATE4
2184 * changed:
2185 *	KEY
2186 *	TKEYP (T1)
2187 */
2188.align 4
2189_aesni_dec4:
2190	movaps (KEYP), KEY		# key
2191	mov KEYP, TKEYP
2192	pxor KEY, STATE1		# round 0
2193	pxor KEY, STATE2
2194	pxor KEY, STATE3
2195	pxor KEY, STATE4
2196	add $0x30, TKEYP
2197	cmp $24, KLEN
2198	jb .L4dec128
2199	lea 0x20(TKEYP), TKEYP
2200	je .L4dec192
2201	add $0x20, TKEYP
2202	movaps -0x60(TKEYP), KEY
2203	AESDEC KEY STATE1
2204	AESDEC KEY STATE2
2205	AESDEC KEY STATE3
2206	AESDEC KEY STATE4
2207	movaps -0x50(TKEYP), KEY
2208	AESDEC KEY STATE1
2209	AESDEC KEY STATE2
2210	AESDEC KEY STATE3
2211	AESDEC KEY STATE4
2212.align 4
2213.L4dec192:
2214	movaps -0x40(TKEYP), KEY
2215	AESDEC KEY STATE1
2216	AESDEC KEY STATE2
2217	AESDEC KEY STATE3
2218	AESDEC KEY STATE4
2219	movaps -0x30(TKEYP), KEY
2220	AESDEC KEY STATE1
2221	AESDEC KEY STATE2
2222	AESDEC KEY STATE3
2223	AESDEC KEY STATE4
2224.align 4
2225.L4dec128:
2226	movaps -0x20(TKEYP), KEY
2227	AESDEC KEY STATE1
2228	AESDEC KEY STATE2
2229	AESDEC KEY STATE3
2230	AESDEC KEY STATE4
2231	movaps -0x10(TKEYP), KEY
2232	AESDEC KEY STATE1
2233	AESDEC KEY STATE2
2234	AESDEC KEY STATE3
2235	AESDEC KEY STATE4
2236	movaps (TKEYP), KEY
2237	AESDEC KEY STATE1
2238	AESDEC KEY STATE2
2239	AESDEC KEY STATE3
2240	AESDEC KEY STATE4
2241	movaps 0x10(TKEYP), KEY
2242	AESDEC KEY STATE1
2243	AESDEC KEY STATE2
2244	AESDEC KEY STATE3
2245	AESDEC KEY STATE4
2246	movaps 0x20(TKEYP), KEY
2247	AESDEC KEY STATE1
2248	AESDEC KEY STATE2
2249	AESDEC KEY STATE3
2250	AESDEC KEY STATE4
2251	movaps 0x30(TKEYP), KEY
2252	AESDEC KEY STATE1
2253	AESDEC KEY STATE2
2254	AESDEC KEY STATE3
2255	AESDEC KEY STATE4
2256	movaps 0x40(TKEYP), KEY
2257	AESDEC KEY STATE1
2258	AESDEC KEY STATE2
2259	AESDEC KEY STATE3
2260	AESDEC KEY STATE4
2261	movaps 0x50(TKEYP), KEY
2262	AESDEC KEY STATE1
2263	AESDEC KEY STATE2
2264	AESDEC KEY STATE3
2265	AESDEC KEY STATE4
2266	movaps 0x60(TKEYP), KEY
2267	AESDEC KEY STATE1
2268	AESDEC KEY STATE2
2269	AESDEC KEY STATE3
2270	AESDEC KEY STATE4
2271	movaps 0x70(TKEYP), KEY
2272	AESDECLAST KEY STATE1		# last round
2273	AESDECLAST KEY STATE2
2274	AESDECLAST KEY STATE3
2275	AESDECLAST KEY STATE4
2276	ret
2277ENDPROC(_aesni_dec4)
2278
2279/*
2280 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2281 *		      size_t len)
2282 */
2283ENTRY(aesni_ecb_enc)
2284#ifndef __x86_64__
2285	pushl LEN
2286	pushl KEYP
2287	pushl KLEN
2288	movl 16(%esp), KEYP
2289	movl 20(%esp), OUTP
2290	movl 24(%esp), INP
2291	movl 28(%esp), LEN
2292#endif
2293	test LEN, LEN		# check length
2294	jz .Lecb_enc_ret
2295	mov 480(KEYP), KLEN
2296	cmp $16, LEN
2297	jb .Lecb_enc_ret
2298	cmp $64, LEN
2299	jb .Lecb_enc_loop1
2300.align 4
2301.Lecb_enc_loop4:
2302	movups (INP), STATE1
2303	movups 0x10(INP), STATE2
2304	movups 0x20(INP), STATE3
2305	movups 0x30(INP), STATE4
2306	call _aesni_enc4
2307	movups STATE1, (OUTP)
2308	movups STATE2, 0x10(OUTP)
2309	movups STATE3, 0x20(OUTP)
2310	movups STATE4, 0x30(OUTP)
2311	sub $64, LEN
2312	add $64, INP
2313	add $64, OUTP
2314	cmp $64, LEN
2315	jge .Lecb_enc_loop4
2316	cmp $16, LEN
2317	jb .Lecb_enc_ret
2318.align 4
2319.Lecb_enc_loop1:
2320	movups (INP), STATE1
2321	call _aesni_enc1
2322	movups STATE1, (OUTP)
2323	sub $16, LEN
2324	add $16, INP
2325	add $16, OUTP
2326	cmp $16, LEN
2327	jge .Lecb_enc_loop1
2328.Lecb_enc_ret:
2329#ifndef __x86_64__
2330	popl KLEN
2331	popl KEYP
2332	popl LEN
2333#endif
2334	ret
2335ENDPROC(aesni_ecb_enc)
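/*
 * The control flow above is the usual "wide, then narrow" chunking.  A C
 * sketch of the same loop structure (aes_enc4/aes_enc1 are hypothetical
 * stand-ins for _aesni_enc4/_aesni_enc1):
 *
 *	while (len >= 64) {
 *		aes_enc4(ctx, out, in);		// four blocks in parallel
 *		in += 64; out += 64; len -= 64;
 *	}
 *	while (len >= 16) {
 *		aes_enc1(ctx, out, in);		// one block at a time
 *		in += 16; out += 16; len -= 16;
 *	}
 *	// any trailing partial block (< 16 bytes) is left untouched, as above
 */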
2336
2337/*
2338 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2339 *		      size_t len);
2340 */
2341ENTRY(aesni_ecb_dec)
2342#ifndef __x86_64__
2343	pushl LEN
2344	pushl KEYP
2345	pushl KLEN
2346	movl 16(%esp), KEYP
2347	movl 20(%esp), OUTP
2348	movl 24(%esp), INP
2349	movl 28(%esp), LEN
2350#endif
2351	test LEN, LEN
2352	jz .Lecb_dec_ret
2353	mov 480(KEYP), KLEN
2354	add $240, KEYP
2355	cmp $16, LEN
2356	jb .Lecb_dec_ret
2357	cmp $64, LEN
2358	jb .Lecb_dec_loop1
2359.align 4
2360.Lecb_dec_loop4:
2361	movups (INP), STATE1
2362	movups 0x10(INP), STATE2
2363	movups 0x20(INP), STATE3
2364	movups 0x30(INP), STATE4
2365	call _aesni_dec4
2366	movups STATE1, (OUTP)
2367	movups STATE2, 0x10(OUTP)
2368	movups STATE3, 0x20(OUTP)
2369	movups STATE4, 0x30(OUTP)
2370	sub $64, LEN
2371	add $64, INP
2372	add $64, OUTP
2373	cmp $64, LEN
2374	jge .Lecb_dec_loop4
2375	cmp $16, LEN
2376	jb .Lecb_dec_ret
2377.align 4
2378.Lecb_dec_loop1:
2379	movups (INP), STATE1
2380	call _aesni_dec1
2381	movups STATE1, (OUTP)
2382	sub $16, LEN
2383	add $16, INP
2384	add $16, OUTP
2385	cmp $16, LEN
2386	jge .Lecb_dec_loop1
2387.Lecb_dec_ret:
2388#ifndef __x86_64__
2389	popl KLEN
2390	popl KEYP
2391	popl LEN
2392#endif
2393	ret
2394ENDPROC(aesni_ecb_dec)
2395
2396/*
2397 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2398 *		      size_t len, u8 *iv)
2399 */
2400ENTRY(aesni_cbc_enc)
2401#ifndef __x86_64__
2402	pushl IVP
2403	pushl LEN
2404	pushl KEYP
2405	pushl KLEN
2406	movl 20(%esp), KEYP
2407	movl 24(%esp), OUTP
2408	movl 28(%esp), INP
2409	movl 32(%esp), LEN
2410	movl 36(%esp), IVP
2411#endif
2412	cmp $16, LEN
2413	jb .Lcbc_enc_ret
2414	mov 480(KEYP), KLEN
2415	movups (IVP), STATE	# load iv as initial state
2416.align 4
2417.Lcbc_enc_loop:
2418	movups (INP), IN	# load input
2419	pxor IN, STATE
2420	call _aesni_enc1
2421	movups STATE, (OUTP)	# store output
2422	sub $16, LEN
2423	add $16, INP
2424	add $16, OUTP
2425	cmp $16, LEN
2426	jge .Lcbc_enc_loop
2427	movups STATE, (IVP)
2428.Lcbc_enc_ret:
2429#ifndef __x86_64__
2430	popl KLEN
2431	popl KEYP
2432	popl LEN
2433	popl IVP
2434#endif
2435	ret
2436ENDPROC(aesni_cbc_enc)
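/*
 * CBC encryption is inherently serial; the loop above keeps the running
 * chaining value in STATE.  Per block, in C terms (xor_block and
 * aes_enc_block are hypothetical helpers):
 *
 *	xor_block(chain, in);		// chain ^= plaintext block
 *	aes_enc_block(ctx, chain);	// chain = E_K(chain)
 *	memcpy(out, chain, 16);		// ciphertext block
 *	// ...and the final chain value is written back through IVP
 */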
2437
2438/*
2439 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2440 *		      size_t len, u8 *iv)
2441 */
2442ENTRY(aesni_cbc_dec)
2443#ifndef __x86_64__
2444	pushl IVP
2445	pushl LEN
2446	pushl KEYP
2447	pushl KLEN
2448	movl 20(%esp), KEYP
2449	movl 24(%esp), OUTP
2450	movl 28(%esp), INP
2451	movl 32(%esp), LEN
2452	movl 36(%esp), IVP
2453#endif
2454	cmp $16, LEN
2455	jb .Lcbc_dec_just_ret
2456	mov 480(KEYP), KLEN
2457	add $240, KEYP
2458	movups (IVP), IV
2459	cmp $64, LEN
2460	jb .Lcbc_dec_loop1
2461.align 4
2462.Lcbc_dec_loop4:
2463	movups (INP), IN1
2464	movaps IN1, STATE1
2465	movups 0x10(INP), IN2
2466	movaps IN2, STATE2
2467#ifdef __x86_64__
2468	movups 0x20(INP), IN3
2469	movaps IN3, STATE3
2470	movups 0x30(INP), IN4
2471	movaps IN4, STATE4
2472#else
2473	movups 0x20(INP), IN1
2474	movaps IN1, STATE3
2475	movups 0x30(INP), IN2
2476	movaps IN2, STATE4
2477#endif
2478	call _aesni_dec4
2479	pxor IV, STATE1
2480#ifdef __x86_64__
2481	pxor IN1, STATE2
2482	pxor IN2, STATE3
2483	pxor IN3, STATE4
2484	movaps IN4, IV
2485#else
2486	pxor IN1, STATE4
2487	movaps IN2, IV
2488	movups (INP), IN1
2489	pxor IN1, STATE2
2490	movups 0x10(INP), IN2
2491	pxor IN2, STATE3
2492#endif
2493	movups STATE1, (OUTP)
2494	movups STATE2, 0x10(OUTP)
2495	movups STATE3, 0x20(OUTP)
2496	movups STATE4, 0x30(OUTP)
2497	sub $64, LEN
2498	add $64, INP
2499	add $64, OUTP
2500	cmp $64, LEN
2501	jge .Lcbc_dec_loop4
2502	cmp $16, LEN
2503	jb .Lcbc_dec_ret
2504.align 4
2505.Lcbc_dec_loop1:
2506	movups (INP), IN
2507	movaps IN, STATE
2508	call _aesni_dec1
2509	pxor IV, STATE
2510	movups STATE, (OUTP)
2511	movaps IN, IV
2512	sub $16, LEN
2513	add $16, INP
2514	add $16, OUTP
2515	cmp $16, LEN
2516	jge .Lcbc_dec_loop1
2517.Lcbc_dec_ret:
2518	movups IV, (IVP)
2519.Lcbc_dec_just_ret:
2520#ifndef __x86_64__
2521	popl KLEN
2522	popl KEYP
2523	popl LEN
2524	popl IVP
2525#endif
2526	ret
2527ENDPROC(aesni_cbc_dec)
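/*
 * CBC decryption above keeps the previous ciphertext block in IV so that
 * in-place operation (out == in) still works.  Per block, in C terms
 * (aes_dec_block/xor_block are hypothetical helpers):
 *
 *	aes_dec_block(ctx, tmp, in);	// tmp = D_K(C_i)
 *	xor_block(tmp, prev);		// P_i = D_K(C_i) ^ C_{i-1}
 *	memcpy(prev, in, 16);		// saved before 'out' may overwrite it
 *	memcpy(out, tmp, 16);
 */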
2528
2529#ifdef __x86_64__
2530.align 16
2531.Lbswap_mask:
2532	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
2533
2534/*
2535 * _aesni_inc_init:	internal ABI
2536 *	setup registers used by _aesni_inc
2537 * input:
2538 *	IV
2539 * output:
2540 *	CTR:	== IV, in little endian
2541 *	TCTR_LOW: == lower qword of CTR
2542 *	INC:	== 1, in little endian
2543 *	BSWAP_MASK == endian swapping mask
2544 */
2545.align 4
2546_aesni_inc_init:
2547	movaps .Lbswap_mask, BSWAP_MASK
2548	movaps IV, CTR
2549	PSHUFB_XMM BSWAP_MASK CTR
2550	mov $1, TCTR_LOW
2551	MOVQ_R64_XMM TCTR_LOW INC
2552	MOVQ_R64_XMM CTR TCTR_LOW
2553	ret
2554ENDPROC(_aesni_inc_init)
2555
2556/*
2557 * _aesni_inc:		internal ABI
2558 *	Increase IV by 1; IV is in big endian
2559 * input:
2560 *	IV
2561 *	CTR:	== IV, in little endian
2562 *	TCTR_LOW: == lower qword of CTR
2563 *	INC:	== 1, in little endian
2564 *	BSWAP_MASK == endian swapping mask
2565 * output:
2566 *	IV:	Increased by 1
2567 * changed:
2568 *	CTR:	== output IV, in little endian
2569 *	TCTR_LOW: == lower qword of CTR
2570 */
2571.align 4
2572_aesni_inc:
2573	paddq INC, CTR
2574	add $1, TCTR_LOW
2575	jnc .Linc_low
2576	pslldq $8, INC
2577	paddq INC, CTR
2578	psrldq $8, INC
2579.Linc_low:
2580	movaps CTR, IV
2581	PSHUFB_XMM BSWAP_MASK IV
2582	ret
2583ENDPROC(_aesni_inc)
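/*
 * _aesni_inc above performs a 128-bit increment on the little-endian copy
 * of the counter, with the carry out of the low qword handled explicitly.
 * The same logic in C (ctr_lo/ctr_hi are hypothetical stand-ins for the two
 * CTR halves; the result is byte-swapped back into IV exactly as the code
 * does):
 *
 *	ctr_lo += 1;
 *	if (ctr_lo == 0)	// low qword wrapped: propagate the carry
 *		ctr_hi += 1;
 */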
2584
2585/*
2586 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2587 *		      size_t len, u8 *iv)
2588 */
2589ENTRY(aesni_ctr_enc)
2590	cmp $16, LEN
2591	jb .Lctr_enc_just_ret
2592	mov 480(KEYP), KLEN
2593	movups (IVP), IV
2594	call _aesni_inc_init
2595	cmp $64, LEN
2596	jb .Lctr_enc_loop1
2597.align 4
2598.Lctr_enc_loop4:
2599	movaps IV, STATE1
2600	call _aesni_inc
2601	movups (INP), IN1
2602	movaps IV, STATE2
2603	call _aesni_inc
2604	movups 0x10(INP), IN2
2605	movaps IV, STATE3
2606	call _aesni_inc
2607	movups 0x20(INP), IN3
2608	movaps IV, STATE4
2609	call _aesni_inc
2610	movups 0x30(INP), IN4
2611	call _aesni_enc4
2612	pxor IN1, STATE1
2613	movups STATE1, (OUTP)
2614	pxor IN2, STATE2
2615	movups STATE2, 0x10(OUTP)
2616	pxor IN3, STATE3
2617	movups STATE3, 0x20(OUTP)
2618	pxor IN4, STATE4
2619	movups STATE4, 0x30(OUTP)
2620	sub $64, LEN
2621	add $64, INP
2622	add $64, OUTP
2623	cmp $64, LEN
2624	jge .Lctr_enc_loop4
2625	cmp $16, LEN
2626	jb .Lctr_enc_ret
2627.align 4
2628.Lctr_enc_loop1:
2629	movaps IV, STATE
2630	call _aesni_inc
2631	movups (INP), IN
2632	call _aesni_enc1
2633	pxor IN, STATE
2634	movups STATE, (OUTP)
2635	sub $16, LEN
2636	add $16, INP
2637	add $16, OUTP
2638	cmp $16, LEN
2639	jge .Lctr_enc_loop1
2640.Lctr_enc_ret:
2641	movups IV, (IVP)
2642.Lctr_enc_just_ret:
2643	ret
2644ENDPROC(aesni_ctr_enc)
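/*
 * CTR mode above only ever runs the block cipher forward, over counter
 * blocks.  Per block, in C terms (ctr_inc/aes_enc_block/xor_block are
 * hypothetical helpers; ctr_inc is the big-endian +1 of _aesni_inc):
 *
 *	ctr_inc(counter);
 *	aes_enc_block(ctx, keystream, counter);
 *	xor_block(out, in, keystream);		// C_i = P_i ^ E_K(counter)
 */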
2645
2646/*
2647 * _aesni_gf128mul_x_ble:		internal ABI
2648 *	Multiply in GF(2^128) for XTS IVs
2649 * input:
2650 *	IV:	current IV
2651 *	GF128MUL_MASK == mask with 0x87 and 0x01
2652 * output:
2653 *	IV:	next IV
2654 * changed:
2655 *	CTR:	== temporary value
2656 */
2657#define _aesni_gf128mul_x_ble() \
2658	pshufd $0x13, IV, CTR; \
2659	paddq IV, IV; \
2660	psrad $31, CTR; \
2661	pand GF128MUL_MASK, CTR; \
2662	pxor CTR, IV;
2663
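/*
 * The macro above doubles the XTS tweak in GF(2^128).  Treating the 16-byte
 * IV as a 128-bit little-endian integer split into two qwords (lo, hi), a C
 * sketch of the same operation:
 *
 *	uint64_t carry = hi >> 63;		// bit shifted out of the top
 *	hi = (hi << 1) | (lo >> 63);
 *	lo = (lo << 1) ^ (carry ? 0x87 : 0);	// x^128 = x^7 + x^2 + x + 1
 */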
2664/*
2665 * void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2666 *			 bool enc, u8 *iv)
2667 */
2668ENTRY(aesni_xts_crypt8)
2669	cmpb $0, %cl
2670	movl $0, %ecx
2671	movl $240, %r10d
2672	leaq _aesni_enc4, %r11
2673	leaq _aesni_dec4, %rax
2674	cmovel %r10d, %ecx
2675	cmoveq %rax, %r11
2676
2677	movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
2678	movups (IVP), IV
2679
2680	mov 480(KEYP), KLEN
2681	addq %rcx, KEYP
2682
2683	movdqa IV, STATE1
2684	movdqu 0x00(INP), INC
2685	pxor INC, STATE1
2686	movdqu IV, 0x00(OUTP)
2687
2688	_aesni_gf128mul_x_ble()
2689	movdqa IV, STATE2
2690	movdqu 0x10(INP), INC
2691	pxor INC, STATE2
2692	movdqu IV, 0x10(OUTP)
2693
2694	_aesni_gf128mul_x_ble()
2695	movdqa IV, STATE3
2696	movdqu 0x20(INP), INC
2697	pxor INC, STATE3
2698	movdqu IV, 0x20(OUTP)
2699
2700	_aesni_gf128mul_x_ble()
2701	movdqa IV, STATE4
2702	movdqu 0x30(INP), INC
2703	pxor INC, STATE4
2704	movdqu IV, 0x30(OUTP)
2705
2706	call *%r11
2707
2708	movdqu 0x00(OUTP), INC
2709	pxor INC, STATE1
2710	movdqu STATE1, 0x00(OUTP)
2711
2712	_aesni_gf128mul_x_ble()
2713	movdqa IV, STATE1
2714	movdqu 0x40(INP), INC
2715	pxor INC, STATE1
2716	movdqu IV, 0x40(OUTP)
2717
2718	movdqu 0x10(OUTP), INC
2719	pxor INC, STATE2
2720	movdqu STATE2, 0x10(OUTP)
2721
2722	_aesni_gf128mul_x_ble()
2723	movdqa IV, STATE2
2724	movdqu 0x50(INP), INC
2725	pxor INC, STATE2
2726	movdqu IV, 0x50(OUTP)
2727
2728	movdqu 0x20(OUTP), INC
2729	pxor INC, STATE3
2730	movdqu STATE3, 0x20(OUTP)
2731
2732	_aesni_gf128mul_x_ble()
2733	movdqa IV, STATE3
2734	movdqu 0x60(INP), INC
2735	pxor INC, STATE3
2736	movdqu IV, 0x60(OUTP)
2737
2738	movdqu 0x30(OUTP), INC
2739	pxor INC, STATE4
2740	movdqu STATE4, 0x30(OUTP)
2741
2742	_aesni_gf128mul_x_ble()
2743	movdqa IV, STATE4
2744	movdqu 0x70(INP), INC
2745	pxor INC, STATE4
2746	movdqu IV, 0x70(OUTP)
2747
2748	_aesni_gf128mul_x_ble()
2749	movups IV, (IVP)
2750
2751	call *%r11
2752
2753	movdqu 0x40(OUTP), INC
2754	pxor INC, STATE1
2755	movdqu STATE1, 0x40(OUTP)
2756
2757	movdqu 0x50(OUTP), INC
2758	pxor INC, STATE2
2759	movdqu STATE2, 0x50(OUTP)
2760
2761	movdqu 0x60(OUTP), INC
2762	pxor INC, STATE3
2763	movdqu STATE3, 0x60(OUTP)
2764
2765	movdqu 0x70(OUTP), INC
2766	pxor INC, STATE4
2767	movdqu STATE4, 0x70(OUTP)
2768
2769	ret
2770ENDPROC(aesni_xts_crypt8)
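/*
 * Each of the eight blocks handled above follows the standard XTS pattern;
 * per block, in C terms (xor_block/aes_crypt_block/gf128mul_x_ble are
 * hypothetical helpers matching the steps in the routine, with
 * aes_crypt_block being E_K1 or D_K1 depending on 'enc'):
 *
 *	xor_block(buf, in, tweak);		// P ^ T
 *	aes_crypt_block(ctx, buf, buf);
 *	xor_block(out, buf, tweak);		// C = E(P ^ T) ^ T
 *	gf128mul_x_ble(tweak);			// advance to the next tweak
 */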
2771
2772#endif