   1/*
   2 * Implement AES algorithm in Intel AES-NI instructions.
   3 *
   4 * The white paper of AES-NI instructions can be downloaded from:
   5 *   http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
   6 *
   7 * Copyright (C) 2008, Intel Corp.
   8 *    Author: Huang Ying <ying.huang@intel.com>
   9 *            Vinodh Gopal <vinodh.gopal@intel.com>
  10 *            Kahraman Akdemir
  11 *
  12 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
  13 * interface for 64-bit kernels.
  14 *    Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
  15 *             Aidan O'Mahony (aidan.o.mahony@intel.com)
  16 *             Adrian Hoban <adrian.hoban@intel.com>
  17 *             James Guilford (james.guilford@intel.com)
  18 *             Gabriele Paoloni <gabriele.paoloni@intel.com>
  19 *             Tadeusz Struk (tadeusz.struk@intel.com)
  20 *             Wajdi Feghali (wajdi.k.feghali@intel.com)
  21 *    Copyright (c) 2010, Intel Corporation.
  22 *
  23 * Ported x86_64 version to x86:
  24 *    Author: Mathias Krause <minipli@googlemail.com>
  25 *
  26 * This program is free software; you can redistribute it and/or modify
  27 * it under the terms of the GNU General Public License as published by
  28 * the Free Software Foundation; either version 2 of the License, or
  29 * (at your option) any later version.
  30 */
  31
  32#include <linux/linkage.h>
  33#include <asm/inst.h>
  34
  35#ifdef __x86_64__
  36.data
  37POLY:   .octa 0xC2000000000000000000000000000001
  38TWOONE: .octa 0x00000001000000000000000000000001
  39
  40# order of these constants should not change.
  41# more specifically, ALL_F should follow SHIFT_MASK,
  42# and ZERO should follow ALL_F
  43
  44SHUF_MASK:  .octa 0x000102030405060708090A0B0C0D0E0F
  45MASK1:      .octa 0x0000000000000000ffffffffffffffff
  46MASK2:      .octa 0xffffffffffffffff0000000000000000
  47SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
  48ALL_F:      .octa 0xffffffffffffffffffffffffffffffff
  49ZERO:       .octa 0x00000000000000000000000000000000
  50ONE:        .octa 0x00000000000000000000000000000001
  51F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
  52dec:        .octa 0x1
  53enc:        .octa 0x2
  54
  55
  56.text
  57
  58
  59#define	STACK_OFFSET    8*3
  60#define	HashKey		16*0	// store HashKey <<1 mod poly here
  61#define	HashKey_2	16*1	// store HashKey^2 <<1 mod poly here
  62#define	HashKey_3	16*2	// store HashKey^3 <<1 mod poly here
  63#define	HashKey_4	16*3	// store HashKey^4 <<1 mod poly here
  64#define	HashKey_k	16*4	// store XOR of High 64 bits and Low 64
  65				// bits of  HashKey <<1 mod poly here
  66				//(for Karatsuba purposes)
  67#define	HashKey_2_k	16*5	// store XOR of High 64 bits and Low 64
  68				// bits of  HashKey^2 <<1 mod poly here
  69				// (for Karatsuba purposes)
  70#define	HashKey_3_k	16*6	// store XOR of High 64 bits and Low 64
  71				// bits of  HashKey^3 <<1 mod poly here
  72				// (for Karatsuba purposes)
  73#define	HashKey_4_k	16*7	// store XOR of High 64 bits and Low 64
  74				// bits of  HashKey^4 <<1 mod poly here
  75				// (for Karatsuba purposes)
  76#define	VARIABLE_OFFSET	16*8
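				// Summary of the layout above: these eight
				// 16-byte slots sit in the stack scratch area
				// reserved with "sub $VARIABLE_OFFSET, %rsp"
				// in aesni_gcm_enc/dec below; they cache
				// HashKey^1..HashKey^4 (each <<1 mod poly) and
				// the XORed high/low halves reused by the
				// Karatsuba multiplications.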
  77
  78#define arg1 rdi
  79#define arg2 rsi
  80#define arg3 rdx
  81#define arg4 rcx
  82#define arg5 r8
  83#define arg6 r9
  84#define arg7 STACK_OFFSET+8(%r14)
  85#define arg8 STACK_OFFSET+16(%r14)
  86#define arg9 STACK_OFFSET+24(%r14)
  87#define arg10 STACK_OFFSET+32(%r14)
  88#endif
  89
  90
  91#define STATE1	%xmm0
  92#define STATE2	%xmm4
  93#define STATE3	%xmm5
  94#define STATE4	%xmm6
  95#define STATE	STATE1
  96#define IN1	%xmm1
  97#define IN2	%xmm7
  98#define IN3	%xmm8
  99#define IN4	%xmm9
 100#define IN	IN1
 101#define KEY	%xmm2
 102#define IV	%xmm3
 103
 104#define BSWAP_MASK %xmm10
 105#define CTR	%xmm11
 106#define INC	%xmm12
 107
 108#ifdef __x86_64__
 109#define AREG	%rax
 110#define KEYP	%rdi
 111#define OUTP	%rsi
 112#define UKEYP	OUTP
 113#define INP	%rdx
 114#define LEN	%rcx
 115#define IVP	%r8
 116#define KLEN	%r9d
 117#define T1	%r10
 118#define TKEYP	T1
 119#define T2	%r11
 120#define TCTR_LOW T2
 121#else
 122#define AREG	%eax
 123#define KEYP	%edi
 124#define OUTP	AREG
 125#define UKEYP	OUTP
 126#define INP	%edx
 127#define LEN	%esi
 128#define IVP	%ebp
 129#define KLEN	%ebx
 130#define T1	%ecx
 131#define TKEYP	T1
 132#endif
 133
 134
 135#ifdef __x86_64__
 136/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
 137*
 138*
 139* Input: A and B (128-bits each, bit-reflected)
 140* Output: C = A*B*x mod poly, (i.e. >>1 )
 141* To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
 142* GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
 143*
 144*/
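/*
* Sketch of the identity used below (Karatsuba over GF(2^128)): writing
* A = a1*x^64 + a0 and B = b1*x^64 + b0,
*
*   A*B = a1*b1*x^128 + ((a1+a0)*(b1+b0) + a1*b1 + a0*b0)*x^64 + a0*b0
*
* where all additions are XOR, so three PCLMULQDQs replace four. The shift/XOR
* sequences labelled "first/second phase of the reduction" then fold the
* 256-bit product back modulo x^128 + x^127 + x^126 + x^121 + 1.
*/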
 145.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
 146	movdqa	  \GH, \TMP1
 147	pshufd	  $78, \GH, \TMP2
 148	pshufd	  $78, \HK, \TMP3
 149	pxor	  \GH, \TMP2            # TMP2 = a1+a0
 150	pxor	  \HK, \TMP3            # TMP3 = b1+b0
 151	PCLMULQDQ 0x11, \HK, \TMP1     # TMP1 = a1*b1
 152	PCLMULQDQ 0x00, \HK, \GH       # GH = a0*b0
 153	PCLMULQDQ 0x00, \TMP3, \TMP2   # TMP2 = (a0+a1)*(b1+b0)
 154	pxor	  \GH, \TMP2
  155	pxor	  \TMP1, \TMP2          # TMP2 = a1*b0 + a0*b1 (middle Karatsuba term)
 156	movdqa	  \TMP2, \TMP3
 157	pslldq	  $8, \TMP3             # left shift TMP3 2 DWs
 158	psrldq	  $8, \TMP2             # right shift TMP2 2 DWs
 159	pxor	  \TMP3, \GH
  160	pxor	  \TMP2, \TMP1          # TMP1:GH holds the result of GH*HK
 161
 162        # first phase of the reduction
 163
 164	movdqa    \GH, \TMP2
 165	movdqa    \GH, \TMP3
 166	movdqa    \GH, \TMP4            # copy GH into TMP2,TMP3 and TMP4
  167					# in order to perform
 168					# independent shifts
  169	pslld     $31, \TMP2            # packed left shift <<31
  170	pslld     $30, \TMP3            # packed left shift <<30
  171	pslld     $25, \TMP4            # packed left shift <<25
 172	pxor      \TMP3, \TMP2          # xor the shifted versions
 173	pxor      \TMP4, \TMP2
 174	movdqa    \TMP2, \TMP5
 175	psrldq    $4, \TMP5             # right shift TMP5 1 DW
 176	pslldq    $12, \TMP2            # left shift TMP2 3 DWs
 177	pxor      \TMP2, \GH
 178
 179        # second phase of the reduction
 180
 181	movdqa    \GH,\TMP2             # copy GH into TMP2,TMP3 and TMP4
  182					# in order to perform
 183					# independent shifts
 184	movdqa    \GH,\TMP3
 185	movdqa    \GH,\TMP4
  186	psrld     $1,\TMP2              # packed right shift >>1
  187	psrld     $2,\TMP3              # packed right shift >>2
  188	psrld     $7,\TMP4              # packed right shift >>7
 189	pxor      \TMP3,\TMP2		# xor the shifted versions
 190	pxor      \TMP4,\TMP2
 191	pxor      \TMP5, \TMP2
 192	pxor      \TMP2, \GH
  193	pxor      \TMP1, \GH            # result is in GH
 194.endm
 195
 196/*
 197* if a = number of total plaintext bytes
 198* b = floor(a/16)
 199* num_initial_blocks = b mod 4
 200* encrypt the initial num_initial_blocks blocks and apply ghash on
 201* the ciphertext
 202* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
 203* are clobbered
  204* %arg1, %arg2, %arg3, %r14 are used as pointers only, not modified
 205*/
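/*
* Worked example of the block count above (illustrative): for a = 100 bytes,
* b = floor(100/16) = 6 full blocks and num_initial_blocks = 6 mod 4 = 2,
* so this macro handles those two blocks (plus, when at least 64 bytes remain,
* the first group of four in its precomputation section), and any further
* groups of four are processed by the parallel loop in the main routine.
*/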
 206
 207
 208.macro INITIAL_BLOCKS_DEC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
 209XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
 210	mov	   arg7, %r10           # %r10 = AAD
 211	mov	   arg8, %r12           # %r12 = aadLen
 212	mov	   %r12, %r11
 213	pxor	   %xmm\i, %xmm\i
 214_get_AAD_loop\num_initial_blocks\operation:
 215	movd	   (%r10), \TMP1
 216	pslldq	   $12, \TMP1
 217	psrldq	   $4, %xmm\i
 218	pxor	   \TMP1, %xmm\i
 219	add	   $4, %r10
 220	sub	   $4, %r12
 221	jne	   _get_AAD_loop\num_initial_blocks\operation
 222	cmp	   $16, %r11
 223	je	   _get_AAD_loop2_done\num_initial_blocks\operation
 224	mov	   $16, %r12
 225_get_AAD_loop2\num_initial_blocks\operation:
 226	psrldq	   $4, %xmm\i
 227	sub	   $4, %r12
 228	cmp	   %r11, %r12
 229	jne	   _get_AAD_loop2\num_initial_blocks\operation
 230_get_AAD_loop2_done\num_initial_blocks\operation:
 231        movdqa     SHUF_MASK(%rip), %xmm14
 232	PSHUFB_XMM   %xmm14, %xmm\i # byte-reflect the AAD data
 233
 234	xor	   %r11, %r11 # initialise the data pointer offset as zero
 235
 236        # start AES for num_initial_blocks blocks
 237
 238	mov	   %arg5, %rax                      # %rax = *Y0
 239	movdqu	   (%rax), \XMM0                    # XMM0 = Y0
 240        movdqa     SHUF_MASK(%rip), %xmm14
 241	PSHUFB_XMM   %xmm14, \XMM0
 242
 243.if (\i == 5) || (\i == 6) || (\i == 7)
 244.irpc index, \i_seq
 245	paddd	   ONE(%rip), \XMM0                 # INCR Y0
 246	movdqa	   \XMM0, %xmm\index
 247        movdqa     SHUF_MASK(%rip), %xmm14
 248	PSHUFB_XMM   %xmm14, %xmm\index      # perform a 16 byte swap
 249
 250.endr
 251.irpc index, \i_seq
 252	pxor	   16*0(%arg1), %xmm\index
 253.endr
 254.irpc index, \i_seq
 255	movaps 0x10(%rdi), \TMP1
 256	AESENC     \TMP1, %xmm\index          # Round 1
 257.endr
 258.irpc index, \i_seq
 259	movaps 0x20(%arg1), \TMP1
 260	AESENC     \TMP1, %xmm\index          # Round 2
 261.endr
 262.irpc index, \i_seq
 263	movaps 0x30(%arg1), \TMP1
  264	AESENC     \TMP1, %xmm\index          # Round 3
 265.endr
 266.irpc index, \i_seq
 267	movaps 0x40(%arg1), \TMP1
  268	AESENC     \TMP1, %xmm\index          # Round 4
 269.endr
 270.irpc index, \i_seq
 271	movaps 0x50(%arg1), \TMP1
  272	AESENC     \TMP1, %xmm\index          # Round 5
 273.endr
 274.irpc index, \i_seq
 275	movaps 0x60(%arg1), \TMP1
  276	AESENC     \TMP1, %xmm\index          # Round 6
 277.endr
 278.irpc index, \i_seq
 279	movaps 0x70(%arg1), \TMP1
  280	AESENC     \TMP1, %xmm\index          # Round 7
 281.endr
 282.irpc index, \i_seq
 283	movaps 0x80(%arg1), \TMP1
  284	AESENC     \TMP1, %xmm\index          # Round 8
 285.endr
 286.irpc index, \i_seq
 287	movaps 0x90(%arg1), \TMP1
  288	AESENC     \TMP1, %xmm\index          # Round 9
 289.endr
 290.irpc index, \i_seq
 291	movaps 0xa0(%arg1), \TMP1
 292	AESENCLAST \TMP1, %xmm\index         # Round 10
 293.endr
 294.irpc index, \i_seq
 295	movdqu	   (%arg3 , %r11, 1), \TMP1
 296	pxor	   \TMP1, %xmm\index
 297	movdqu	   %xmm\index, (%arg2 , %r11, 1)
 298	# write back plaintext/ciphertext for num_initial_blocks
 299	add	   $16, %r11
 300
 301	movdqa     \TMP1, %xmm\index
 302        movdqa     SHUF_MASK(%rip), %xmm14
 303	PSHUFB_XMM	   %xmm14, %xmm\index
 304
 305		# prepare plaintext/ciphertext for GHASH computation
 306.endr
 307.endif
 308	GHASH_MUL  %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 309        # apply GHASH on num_initial_blocks blocks
 310
 311.if \i == 5
 312        pxor       %xmm5, %xmm6
 313	GHASH_MUL  %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 314        pxor       %xmm6, %xmm7
 315	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 316        pxor       %xmm7, %xmm8
 317	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 318.elseif \i == 6
 319        pxor       %xmm6, %xmm7
 320	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 321        pxor       %xmm7, %xmm8
 322	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 323.elseif \i == 7
 324        pxor       %xmm7, %xmm8
 325	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 326.endif
 327	cmp	   $64, %r13
 328	jl	_initial_blocks_done\num_initial_blocks\operation
 329	# no need for precomputed values
 330/*
 331*
 332* Precomputations for HashKey parallel with encryption of first 4 blocks.
  333* HashKey_i_k holds the XORed values of the low and high parts of HashKey_i
 334*/
 335	paddd	   ONE(%rip), \XMM0              # INCR Y0
 336	movdqa	   \XMM0, \XMM1
 337        movdqa     SHUF_MASK(%rip), %xmm14
 338	PSHUFB_XMM  %xmm14, \XMM1        # perform a 16 byte swap
 339
 340	paddd	   ONE(%rip), \XMM0              # INCR Y0
 341	movdqa	   \XMM0, \XMM2
 342        movdqa     SHUF_MASK(%rip), %xmm14
 343	PSHUFB_XMM  %xmm14, \XMM2        # perform a 16 byte swap
 344
 345	paddd	   ONE(%rip), \XMM0              # INCR Y0
 346	movdqa	   \XMM0, \XMM3
 347        movdqa     SHUF_MASK(%rip), %xmm14
 348	PSHUFB_XMM %xmm14, \XMM3        # perform a 16 byte swap
 349
 350	paddd	   ONE(%rip), \XMM0              # INCR Y0
 351	movdqa	   \XMM0, \XMM4
 352        movdqa     SHUF_MASK(%rip), %xmm14
 353	PSHUFB_XMM %xmm14, \XMM4        # perform a 16 byte swap
 354
 355	pxor	   16*0(%arg1), \XMM1
 356	pxor	   16*0(%arg1), \XMM2
 357	pxor	   16*0(%arg1), \XMM3
 358	pxor	   16*0(%arg1), \XMM4
 359	movdqa	   \TMP3, \TMP5
 360	pshufd	   $78, \TMP3, \TMP1
 361	pxor	   \TMP3, \TMP1
 362	movdqa	   \TMP1, HashKey_k(%rsp)
 363	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
 364# TMP5 = HashKey^2<<1 (mod poly)
 365	movdqa	   \TMP5, HashKey_2(%rsp)
 366# HashKey_2 = HashKey^2<<1 (mod poly)
 367	pshufd	   $78, \TMP5, \TMP1
 368	pxor	   \TMP5, \TMP1
 369	movdqa	   \TMP1, HashKey_2_k(%rsp)
 370.irpc index, 1234 # do 4 rounds
 371	movaps 0x10*\index(%arg1), \TMP1
 372	AESENC	   \TMP1, \XMM1
 373	AESENC	   \TMP1, \XMM2
 374	AESENC	   \TMP1, \XMM3
 375	AESENC	   \TMP1, \XMM4
 376.endr
 377	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
 378# TMP5 = HashKey^3<<1 (mod poly)
 379	movdqa	   \TMP5, HashKey_3(%rsp)
 380	pshufd	   $78, \TMP5, \TMP1
 381	pxor	   \TMP5, \TMP1
 382	movdqa	   \TMP1, HashKey_3_k(%rsp)
 383.irpc index, 56789 # do next 5 rounds
 384	movaps 0x10*\index(%arg1), \TMP1
 385	AESENC	   \TMP1, \XMM1
 386	AESENC	   \TMP1, \XMM2
 387	AESENC	   \TMP1, \XMM3
 388	AESENC	   \TMP1, \XMM4
 389.endr
 390	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
  391# TMP5 = HashKey^4<<1 (mod poly)
 392	movdqa	   \TMP5, HashKey_4(%rsp)
 393	pshufd	   $78, \TMP5, \TMP1
 394	pxor	   \TMP5, \TMP1
 395	movdqa	   \TMP1, HashKey_4_k(%rsp)
 396	movaps 0xa0(%arg1), \TMP2
 397	AESENCLAST \TMP2, \XMM1
 398	AESENCLAST \TMP2, \XMM2
 399	AESENCLAST \TMP2, \XMM3
 400	AESENCLAST \TMP2, \XMM4
 401	movdqu	   16*0(%arg3 , %r11 , 1), \TMP1
 402	pxor	   \TMP1, \XMM1
 403	movdqu	   \XMM1, 16*0(%arg2 , %r11 , 1)
 404	movdqa     \TMP1, \XMM1
 405	movdqu	   16*1(%arg3 , %r11 , 1), \TMP1
 406	pxor	   \TMP1, \XMM2
 407	movdqu	   \XMM2, 16*1(%arg2 , %r11 , 1)
 408	movdqa     \TMP1, \XMM2
 409	movdqu	   16*2(%arg3 , %r11 , 1), \TMP1
 410	pxor	   \TMP1, \XMM3
 411	movdqu	   \XMM3, 16*2(%arg2 , %r11 , 1)
 412	movdqa     \TMP1, \XMM3
 413	movdqu	   16*3(%arg3 , %r11 , 1), \TMP1
 414	pxor	   \TMP1, \XMM4
 415	movdqu	   \XMM4, 16*3(%arg2 , %r11 , 1)
 416	movdqa     \TMP1, \XMM4
 417	add	   $64, %r11
 418        movdqa     SHUF_MASK(%rip), %xmm14
 419	PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
 420	pxor	   \XMMDst, \XMM1
 421# combine GHASHed value with the corresponding ciphertext
 422        movdqa     SHUF_MASK(%rip), %xmm14
 423	PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
 424        movdqa     SHUF_MASK(%rip), %xmm14
 425	PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
 426        movdqa     SHUF_MASK(%rip), %xmm14
 427	PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
 428
 429_initial_blocks_done\num_initial_blocks\operation:
 430
 431.endm
 432
 433
 434/*
 435* if a = number of total plaintext bytes
 436* b = floor(a/16)
 437* num_initial_blocks = b mod 4
 438* encrypt the initial num_initial_blocks blocks and apply ghash on
 439* the ciphertext
 440* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
 441* are clobbered
  442* %arg1, %arg2, %arg3, %r14 are used as pointers only, not modified
 443*/
 444
 445
 446.macro INITIAL_BLOCKS_ENC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
 447XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
 448	mov	   arg7, %r10           # %r10 = AAD
 449	mov	   arg8, %r12           # %r12 = aadLen
 450	mov	   %r12, %r11
 451	pxor	   %xmm\i, %xmm\i
 452_get_AAD_loop\num_initial_blocks\operation:
 453	movd	   (%r10), \TMP1
 454	pslldq	   $12, \TMP1
 455	psrldq	   $4, %xmm\i
 456	pxor	   \TMP1, %xmm\i
 457	add	   $4, %r10
 458	sub	   $4, %r12
 459	jne	   _get_AAD_loop\num_initial_blocks\operation
 460	cmp	   $16, %r11
 461	je	   _get_AAD_loop2_done\num_initial_blocks\operation
 462	mov	   $16, %r12
 463_get_AAD_loop2\num_initial_blocks\operation:
 464	psrldq	   $4, %xmm\i
 465	sub	   $4, %r12
 466	cmp	   %r11, %r12
 467	jne	   _get_AAD_loop2\num_initial_blocks\operation
 468_get_AAD_loop2_done\num_initial_blocks\operation:
 469        movdqa     SHUF_MASK(%rip), %xmm14
 470	PSHUFB_XMM   %xmm14, %xmm\i # byte-reflect the AAD data
 471
 472	xor	   %r11, %r11 # initialise the data pointer offset as zero
 473
 474        # start AES for num_initial_blocks blocks
 475
 476	mov	   %arg5, %rax                      # %rax = *Y0
 477	movdqu	   (%rax), \XMM0                    # XMM0 = Y0
 478        movdqa     SHUF_MASK(%rip), %xmm14
 479	PSHUFB_XMM   %xmm14, \XMM0
 480
 481.if (\i == 5) || (\i == 6) || (\i == 7)
 482.irpc index, \i_seq
 483	paddd	   ONE(%rip), \XMM0                 # INCR Y0
 484	movdqa	   \XMM0, %xmm\index
 485        movdqa     SHUF_MASK(%rip), %xmm14
 486	PSHUFB_XMM   %xmm14, %xmm\index      # perform a 16 byte swap
 487
 488.endr
 489.irpc index, \i_seq
 490	pxor	   16*0(%arg1), %xmm\index
 491.endr
 492.irpc index, \i_seq
 493	movaps 0x10(%rdi), \TMP1
 494	AESENC     \TMP1, %xmm\index          # Round 1
 495.endr
 496.irpc index, \i_seq
 497	movaps 0x20(%arg1), \TMP1
 498	AESENC     \TMP1, %xmm\index          # Round 2
 499.endr
 500.irpc index, \i_seq
 501	movaps 0x30(%arg1), \TMP1
  502	AESENC     \TMP1, %xmm\index          # Round 3
 503.endr
 504.irpc index, \i_seq
 505	movaps 0x40(%arg1), \TMP1
  506	AESENC     \TMP1, %xmm\index          # Round 4
 507.endr
 508.irpc index, \i_seq
 509	movaps 0x50(%arg1), \TMP1
  510	AESENC     \TMP1, %xmm\index          # Round 5
 511.endr
 512.irpc index, \i_seq
 513	movaps 0x60(%arg1), \TMP1
  514	AESENC     \TMP1, %xmm\index          # Round 6
 515.endr
 516.irpc index, \i_seq
 517	movaps 0x70(%arg1), \TMP1
  518	AESENC     \TMP1, %xmm\index          # Round 7
 519.endr
 520.irpc index, \i_seq
 521	movaps 0x80(%arg1), \TMP1
  522	AESENC     \TMP1, %xmm\index          # Round 8
 523.endr
 524.irpc index, \i_seq
 525	movaps 0x90(%arg1), \TMP1
  526	AESENC     \TMP1, %xmm\index          # Round 9
 527.endr
 528.irpc index, \i_seq
 529	movaps 0xa0(%arg1), \TMP1
 530	AESENCLAST \TMP1, %xmm\index         # Round 10
 531.endr
 532.irpc index, \i_seq
 533	movdqu	   (%arg3 , %r11, 1), \TMP1
 534	pxor	   \TMP1, %xmm\index
 535	movdqu	   %xmm\index, (%arg2 , %r11, 1)
 536	# write back plaintext/ciphertext for num_initial_blocks
 537	add	   $16, %r11
 538
 539        movdqa     SHUF_MASK(%rip), %xmm14
 540	PSHUFB_XMM	   %xmm14, %xmm\index
 541
 542		# prepare plaintext/ciphertext for GHASH computation
 543.endr
 544.endif
 545	GHASH_MUL  %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 546        # apply GHASH on num_initial_blocks blocks
 547
 548.if \i == 5
 549        pxor       %xmm5, %xmm6
 550	GHASH_MUL  %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 551        pxor       %xmm6, %xmm7
 552	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 553        pxor       %xmm7, %xmm8
 554	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 555.elseif \i == 6
 556        pxor       %xmm6, %xmm7
 557	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 558        pxor       %xmm7, %xmm8
 559	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 560.elseif \i == 7
 561        pxor       %xmm7, %xmm8
 562	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 563.endif
 564	cmp	   $64, %r13
 565	jl	_initial_blocks_done\num_initial_blocks\operation
 566	# no need for precomputed values
 567/*
 568*
 569* Precomputations for HashKey parallel with encryption of first 4 blocks.
  570* HashKey_i_k holds the XORed values of the low and high parts of HashKey_i
 571*/
 572	paddd	   ONE(%rip), \XMM0              # INCR Y0
 573	movdqa	   \XMM0, \XMM1
 574        movdqa     SHUF_MASK(%rip), %xmm14
 575	PSHUFB_XMM  %xmm14, \XMM1        # perform a 16 byte swap
 576
 577	paddd	   ONE(%rip), \XMM0              # INCR Y0
 578	movdqa	   \XMM0, \XMM2
 579        movdqa     SHUF_MASK(%rip), %xmm14
 580	PSHUFB_XMM  %xmm14, \XMM2        # perform a 16 byte swap
 581
 582	paddd	   ONE(%rip), \XMM0              # INCR Y0
 583	movdqa	   \XMM0, \XMM3
 584        movdqa     SHUF_MASK(%rip), %xmm14
 585	PSHUFB_XMM %xmm14, \XMM3        # perform a 16 byte swap
 586
 587	paddd	   ONE(%rip), \XMM0              # INCR Y0
 588	movdqa	   \XMM0, \XMM4
 589        movdqa     SHUF_MASK(%rip), %xmm14
 590	PSHUFB_XMM %xmm14, \XMM4        # perform a 16 byte swap
 591
 592	pxor	   16*0(%arg1), \XMM1
 593	pxor	   16*0(%arg1), \XMM2
 594	pxor	   16*0(%arg1), \XMM3
 595	pxor	   16*0(%arg1), \XMM4
 596	movdqa	   \TMP3, \TMP5
 597	pshufd	   $78, \TMP3, \TMP1
 598	pxor	   \TMP3, \TMP1
 599	movdqa	   \TMP1, HashKey_k(%rsp)
 600	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
 601# TMP5 = HashKey^2<<1 (mod poly)
 602	movdqa	   \TMP5, HashKey_2(%rsp)
 603# HashKey_2 = HashKey^2<<1 (mod poly)
 604	pshufd	   $78, \TMP5, \TMP1
 605	pxor	   \TMP5, \TMP1
 606	movdqa	   \TMP1, HashKey_2_k(%rsp)
 607.irpc index, 1234 # do 4 rounds
 608	movaps 0x10*\index(%arg1), \TMP1
 609	AESENC	   \TMP1, \XMM1
 610	AESENC	   \TMP1, \XMM2
 611	AESENC	   \TMP1, \XMM3
 612	AESENC	   \TMP1, \XMM4
 613.endr
 614	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
 615# TMP5 = HashKey^3<<1 (mod poly)
 616	movdqa	   \TMP5, HashKey_3(%rsp)
 617	pshufd	   $78, \TMP5, \TMP1
 618	pxor	   \TMP5, \TMP1
 619	movdqa	   \TMP1, HashKey_3_k(%rsp)
 620.irpc index, 56789 # do next 5 rounds
 621	movaps 0x10*\index(%arg1), \TMP1
 622	AESENC	   \TMP1, \XMM1
 623	AESENC	   \TMP1, \XMM2
 624	AESENC	   \TMP1, \XMM3
 625	AESENC	   \TMP1, \XMM4
 626.endr
 627	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
  628# TMP5 = HashKey^4<<1 (mod poly)
 629	movdqa	   \TMP5, HashKey_4(%rsp)
 630	pshufd	   $78, \TMP5, \TMP1
 631	pxor	   \TMP5, \TMP1
 632	movdqa	   \TMP1, HashKey_4_k(%rsp)
 633	movaps 0xa0(%arg1), \TMP2
 634	AESENCLAST \TMP2, \XMM1
 635	AESENCLAST \TMP2, \XMM2
 636	AESENCLAST \TMP2, \XMM3
 637	AESENCLAST \TMP2, \XMM4
 638	movdqu	   16*0(%arg3 , %r11 , 1), \TMP1
 639	pxor	   \TMP1, \XMM1
 640	movdqu	   16*1(%arg3 , %r11 , 1), \TMP1
 641	pxor	   \TMP1, \XMM2
 642	movdqu	   16*2(%arg3 , %r11 , 1), \TMP1
 643	pxor	   \TMP1, \XMM3
 644	movdqu	   16*3(%arg3 , %r11 , 1), \TMP1
 645	pxor	   \TMP1, \XMM4
 646	movdqu     \XMM1, 16*0(%arg2 , %r11 , 1)
 647	movdqu     \XMM2, 16*1(%arg2 , %r11 , 1)
 648	movdqu     \XMM3, 16*2(%arg2 , %r11 , 1)
 649	movdqu     \XMM4, 16*3(%arg2 , %r11 , 1)
 650
 651	add	   $64, %r11
 652        movdqa     SHUF_MASK(%rip), %xmm14
 653	PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
 654	pxor	   \XMMDst, \XMM1
 655# combine GHASHed value with the corresponding ciphertext
 656        movdqa     SHUF_MASK(%rip), %xmm14
 657	PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
 658        movdqa     SHUF_MASK(%rip), %xmm14
 659	PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
 660        movdqa     SHUF_MASK(%rip), %xmm14
 661	PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
 662
 663_initial_blocks_done\num_initial_blocks\operation:
 664
 665.endm
 666
 667/*
 668* encrypt 4 blocks at a time
 669* ghash the 4 previously encrypted ciphertext blocks
 670* arg1, %arg2, %arg3 are used as pointers only, not modified
 671* %r11 is the data offset value
 672*/
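/*
* Per-iteration data flow, in counter-mode terms: the counter block is
* incremented four times (paddd ONE), each copy runs through the ten AES
* rounds and is XORed with the input to produce output, while the four blocks
* produced by the previous iteration are folded into the GHASH state with
* PCLMULQDQ, so the carry-less multiplies overlap the AES rounds.
*/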
 673.macro GHASH_4_ENCRYPT_4_PARALLEL_ENC TMP1 TMP2 TMP3 TMP4 TMP5 \
 674TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
 675
 676	movdqa	  \XMM1, \XMM5
 677	movdqa	  \XMM2, \XMM6
 678	movdqa	  \XMM3, \XMM7
 679	movdqa	  \XMM4, \XMM8
 680
 681        movdqa    SHUF_MASK(%rip), %xmm15
 682        # multiply TMP5 * HashKey using karatsuba
 683
 684	movdqa	  \XMM5, \TMP4
 685	pshufd	  $78, \XMM5, \TMP6
 686	pxor	  \XMM5, \TMP6
 687	paddd     ONE(%rip), \XMM0		# INCR CNT
 688	movdqa	  HashKey_4(%rsp), \TMP5
 689	PCLMULQDQ 0x11, \TMP5, \TMP4           # TMP4 = a1*b1
 690	movdqa    \XMM0, \XMM1
 691	paddd     ONE(%rip), \XMM0		# INCR CNT
 692	movdqa    \XMM0, \XMM2
 693	paddd     ONE(%rip), \XMM0		# INCR CNT
 694	movdqa    \XMM0, \XMM3
 695	paddd     ONE(%rip), \XMM0		# INCR CNT
 696	movdqa    \XMM0, \XMM4
 697	PSHUFB_XMM %xmm15, \XMM1	# perform a 16 byte swap
 698	PCLMULQDQ 0x00, \TMP5, \XMM5           # XMM5 = a0*b0
 699	PSHUFB_XMM %xmm15, \XMM2	# perform a 16 byte swap
 700	PSHUFB_XMM %xmm15, \XMM3	# perform a 16 byte swap
 701	PSHUFB_XMM %xmm15, \XMM4	# perform a 16 byte swap
 702
 703	pxor	  (%arg1), \XMM1
 704	pxor	  (%arg1), \XMM2
 705	pxor	  (%arg1), \XMM3
 706	pxor	  (%arg1), \XMM4
 707	movdqa	  HashKey_4_k(%rsp), \TMP5
 708	PCLMULQDQ 0x00, \TMP5, \TMP6           # TMP6 = (a1+a0)*(b1+b0)
 709	movaps 0x10(%arg1), \TMP1
 710	AESENC	  \TMP1, \XMM1              # Round 1
 711	AESENC	  \TMP1, \XMM2
 712	AESENC	  \TMP1, \XMM3
 713	AESENC	  \TMP1, \XMM4
 714	movaps 0x20(%arg1), \TMP1
 715	AESENC	  \TMP1, \XMM1              # Round 2
 716	AESENC	  \TMP1, \XMM2
 717	AESENC	  \TMP1, \XMM3
 718	AESENC	  \TMP1, \XMM4
 719	movdqa	  \XMM6, \TMP1
 720	pshufd	  $78, \XMM6, \TMP2
 721	pxor	  \XMM6, \TMP2
 722	movdqa	  HashKey_3(%rsp), \TMP5
 723	PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1 * b1
 724	movaps 0x30(%arg1), \TMP3
 725	AESENC    \TMP3, \XMM1              # Round 3
 726	AESENC    \TMP3, \XMM2
 727	AESENC    \TMP3, \XMM3
 728	AESENC    \TMP3, \XMM4
 729	PCLMULQDQ 0x00, \TMP5, \XMM6           # XMM6 = a0*b0
 730	movaps 0x40(%arg1), \TMP3
 731	AESENC	  \TMP3, \XMM1              # Round 4
 732	AESENC	  \TMP3, \XMM2
 733	AESENC	  \TMP3, \XMM3
 734	AESENC	  \TMP3, \XMM4
 735	movdqa	  HashKey_3_k(%rsp), \TMP5
 736	PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
 737	movaps 0x50(%arg1), \TMP3
 738	AESENC	  \TMP3, \XMM1              # Round 5
 739	AESENC	  \TMP3, \XMM2
 740	AESENC	  \TMP3, \XMM3
 741	AESENC	  \TMP3, \XMM4
 742	pxor	  \TMP1, \TMP4
 743# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
 744	pxor	  \XMM6, \XMM5
 745	pxor	  \TMP2, \TMP6
 746	movdqa	  \XMM7, \TMP1
 747	pshufd	  $78, \XMM7, \TMP2
 748	pxor	  \XMM7, \TMP2
 749	movdqa	  HashKey_2(%rsp ), \TMP5
 750
 751        # Multiply TMP5 * HashKey using karatsuba
 752
 753	PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1*b1
 754	movaps 0x60(%arg1), \TMP3
 755	AESENC	  \TMP3, \XMM1              # Round 6
 756	AESENC	  \TMP3, \XMM2
 757	AESENC	  \TMP3, \XMM3
 758	AESENC	  \TMP3, \XMM4
 759	PCLMULQDQ 0x00, \TMP5, \XMM7           # XMM7 = a0*b0
 760	movaps 0x70(%arg1), \TMP3
 761	AESENC	  \TMP3, \XMM1             # Round 7
 762	AESENC	  \TMP3, \XMM2
 763	AESENC	  \TMP3, \XMM3
 764	AESENC	  \TMP3, \XMM4
 765	movdqa	  HashKey_2_k(%rsp), \TMP5
 766	PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
 767	movaps 0x80(%arg1), \TMP3
 768	AESENC	  \TMP3, \XMM1             # Round 8
 769	AESENC	  \TMP3, \XMM2
 770	AESENC	  \TMP3, \XMM3
 771	AESENC	  \TMP3, \XMM4
 772	pxor	  \TMP1, \TMP4
 773# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
 774	pxor	  \XMM7, \XMM5
 775	pxor	  \TMP2, \TMP6
 776
 777        # Multiply XMM8 * HashKey
 778        # XMM8 and TMP5 hold the values for the two operands
 779
 780	movdqa	  \XMM8, \TMP1
 781	pshufd	  $78, \XMM8, \TMP2
 782	pxor	  \XMM8, \TMP2
 783	movdqa	  HashKey(%rsp), \TMP5
 784	PCLMULQDQ 0x11, \TMP5, \TMP1          # TMP1 = a1*b1
 785	movaps 0x90(%arg1), \TMP3
 786	AESENC	  \TMP3, \XMM1            # Round 9
 787	AESENC	  \TMP3, \XMM2
 788	AESENC	  \TMP3, \XMM3
 789	AESENC	  \TMP3, \XMM4
 790	PCLMULQDQ 0x00, \TMP5, \XMM8          # XMM8 = a0*b0
 791	movaps 0xa0(%arg1), \TMP3
 792	AESENCLAST \TMP3, \XMM1           # Round 10
 793	AESENCLAST \TMP3, \XMM2
 794	AESENCLAST \TMP3, \XMM3
 795	AESENCLAST \TMP3, \XMM4
 796	movdqa    HashKey_k(%rsp), \TMP5
 797	PCLMULQDQ 0x00, \TMP5, \TMP2          # TMP2 = (a1+a0)*(b1+b0)
 798	movdqu	  (%arg3,%r11,1), \TMP3
 799	pxor	  \TMP3, \XMM1                 # Ciphertext/Plaintext XOR EK
 800	movdqu	  16(%arg3,%r11,1), \TMP3
 801	pxor	  \TMP3, \XMM2                 # Ciphertext/Plaintext XOR EK
 802	movdqu	  32(%arg3,%r11,1), \TMP3
 803	pxor	  \TMP3, \XMM3                 # Ciphertext/Plaintext XOR EK
 804	movdqu	  48(%arg3,%r11,1), \TMP3
 805	pxor	  \TMP3, \XMM4                 # Ciphertext/Plaintext XOR EK
 806        movdqu    \XMM1, (%arg2,%r11,1)        # Write to the ciphertext buffer
 807        movdqu    \XMM2, 16(%arg2,%r11,1)      # Write to the ciphertext buffer
 808        movdqu    \XMM3, 32(%arg2,%r11,1)      # Write to the ciphertext buffer
 809        movdqu    \XMM4, 48(%arg2,%r11,1)      # Write to the ciphertext buffer
 810	PSHUFB_XMM %xmm15, \XMM1        # perform a 16 byte swap
 811	PSHUFB_XMM %xmm15, \XMM2	# perform a 16 byte swap
 812	PSHUFB_XMM %xmm15, \XMM3	# perform a 16 byte swap
 813	PSHUFB_XMM %xmm15, \XMM4	# perform a 16 byte swap
 814
 815	pxor	  \TMP4, \TMP1
 816	pxor	  \XMM8, \XMM5
 817	pxor	  \TMP6, \TMP2
 818	pxor	  \TMP1, \TMP2
 819	pxor	  \XMM5, \TMP2
 820	movdqa	  \TMP2, \TMP3
 821	pslldq	  $8, \TMP3                    # left shift TMP3 2 DWs
 822	psrldq	  $8, \TMP2                    # right shift TMP2 2 DWs
 823	pxor	  \TMP3, \XMM5
 824	pxor	  \TMP2, \TMP1	  # accumulate the results in TMP1:XMM5
 825
 826        # first phase of reduction
 827
 828	movdqa    \XMM5, \TMP2
 829	movdqa    \XMM5, \TMP3
 830	movdqa    \XMM5, \TMP4
 831# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
  832	pslld     $31, \TMP2                   # packed left shift << 31
  833	pslld     $30, \TMP3                   # packed left shift << 30
  834	pslld     $25, \TMP4                   # packed left shift << 25
 835	pxor      \TMP3, \TMP2	               # xor the shifted versions
 836	pxor      \TMP4, \TMP2
 837	movdqa    \TMP2, \TMP5
 838	psrldq    $4, \TMP5                    # right shift T5 1 DW
 839	pslldq    $12, \TMP2                   # left shift T2 3 DWs
 840	pxor      \TMP2, \XMM5
 841
 842        # second phase of reduction
 843
 844	movdqa    \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
 845	movdqa    \XMM5,\TMP3
 846	movdqa    \XMM5,\TMP4
  847	psrld     $1, \TMP2                    # packed right shift >>1
  848	psrld     $2, \TMP3                    # packed right shift >>2
  849	psrld     $7, \TMP4                    # packed right shift >>7
 850	pxor      \TMP3,\TMP2		       # xor the shifted versions
 851	pxor      \TMP4,\TMP2
 852	pxor      \TMP5, \TMP2
 853	pxor      \TMP2, \XMM5
  854	pxor      \TMP1, \XMM5                 # result is in XMM5
 855
 856	pxor	  \XMM5, \XMM1
 857.endm
 858
 859/*
 860* decrypt 4 blocks at a time
 861* ghash the 4 previously decrypted ciphertext blocks
 862* arg1, %arg2, %arg3 are used as pointers only, not modified
 863* %r11 is the data offset value
 864*/
 865.macro GHASH_4_ENCRYPT_4_PARALLEL_DEC TMP1 TMP2 TMP3 TMP4 TMP5 \
 866TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
 867
 868	movdqa	  \XMM1, \XMM5
 869	movdqa	  \XMM2, \XMM6
 870	movdqa	  \XMM3, \XMM7
 871	movdqa	  \XMM4, \XMM8
 872
 873        movdqa    SHUF_MASK(%rip), %xmm15
 874        # multiply TMP5 * HashKey using karatsuba
 875
 876	movdqa	  \XMM5, \TMP4
 877	pshufd	  $78, \XMM5, \TMP6
 878	pxor	  \XMM5, \TMP6
 879	paddd     ONE(%rip), \XMM0		# INCR CNT
 880	movdqa	  HashKey_4(%rsp), \TMP5
 881	PCLMULQDQ 0x11, \TMP5, \TMP4           # TMP4 = a1*b1
 882	movdqa    \XMM0, \XMM1
 883	paddd     ONE(%rip), \XMM0		# INCR CNT
 884	movdqa    \XMM0, \XMM2
 885	paddd     ONE(%rip), \XMM0		# INCR CNT
 886	movdqa    \XMM0, \XMM3
 887	paddd     ONE(%rip), \XMM0		# INCR CNT
 888	movdqa    \XMM0, \XMM4
 889	PSHUFB_XMM %xmm15, \XMM1	# perform a 16 byte swap
 890	PCLMULQDQ 0x00, \TMP5, \XMM5           # XMM5 = a0*b0
 891	PSHUFB_XMM %xmm15, \XMM2	# perform a 16 byte swap
 892	PSHUFB_XMM %xmm15, \XMM3	# perform a 16 byte swap
 893	PSHUFB_XMM %xmm15, \XMM4	# perform a 16 byte swap
 894
 895	pxor	  (%arg1), \XMM1
 896	pxor	  (%arg1), \XMM2
 897	pxor	  (%arg1), \XMM3
 898	pxor	  (%arg1), \XMM4
 899	movdqa	  HashKey_4_k(%rsp), \TMP5
 900	PCLMULQDQ 0x00, \TMP5, \TMP6           # TMP6 = (a1+a0)*(b1+b0)
 901	movaps 0x10(%arg1), \TMP1
 902	AESENC	  \TMP1, \XMM1              # Round 1
 903	AESENC	  \TMP1, \XMM2
 904	AESENC	  \TMP1, \XMM3
 905	AESENC	  \TMP1, \XMM4
 906	movaps 0x20(%arg1), \TMP1
 907	AESENC	  \TMP1, \XMM1              # Round 2
 908	AESENC	  \TMP1, \XMM2
 909	AESENC	  \TMP1, \XMM3
 910	AESENC	  \TMP1, \XMM4
 911	movdqa	  \XMM6, \TMP1
 912	pshufd	  $78, \XMM6, \TMP2
 913	pxor	  \XMM6, \TMP2
 914	movdqa	  HashKey_3(%rsp), \TMP5
 915	PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1 * b1
 916	movaps 0x30(%arg1), \TMP3
 917	AESENC    \TMP3, \XMM1              # Round 3
 918	AESENC    \TMP3, \XMM2
 919	AESENC    \TMP3, \XMM3
 920	AESENC    \TMP3, \XMM4
 921	PCLMULQDQ 0x00, \TMP5, \XMM6           # XMM6 = a0*b0
 922	movaps 0x40(%arg1), \TMP3
 923	AESENC	  \TMP3, \XMM1              # Round 4
 924	AESENC	  \TMP3, \XMM2
 925	AESENC	  \TMP3, \XMM3
 926	AESENC	  \TMP3, \XMM4
 927	movdqa	  HashKey_3_k(%rsp), \TMP5
 928	PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
 929	movaps 0x50(%arg1), \TMP3
 930	AESENC	  \TMP3, \XMM1              # Round 5
 931	AESENC	  \TMP3, \XMM2
 932	AESENC	  \TMP3, \XMM3
 933	AESENC	  \TMP3, \XMM4
 934	pxor	  \TMP1, \TMP4
 935# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
 936	pxor	  \XMM6, \XMM5
 937	pxor	  \TMP2, \TMP6
 938	movdqa	  \XMM7, \TMP1
 939	pshufd	  $78, \XMM7, \TMP2
 940	pxor	  \XMM7, \TMP2
 941	movdqa	  HashKey_2(%rsp ), \TMP5
 942
 943        # Multiply TMP5 * HashKey using karatsuba
 944
 945	PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1*b1
 946	movaps 0x60(%arg1), \TMP3
 947	AESENC	  \TMP3, \XMM1              # Round 6
 948	AESENC	  \TMP3, \XMM2
 949	AESENC	  \TMP3, \XMM3
 950	AESENC	  \TMP3, \XMM4
 951	PCLMULQDQ 0x00, \TMP5, \XMM7           # XMM7 = a0*b0
 952	movaps 0x70(%arg1), \TMP3
 953	AESENC	  \TMP3, \XMM1             # Round 7
 954	AESENC	  \TMP3, \XMM2
 955	AESENC	  \TMP3, \XMM3
 956	AESENC	  \TMP3, \XMM4
 957	movdqa	  HashKey_2_k(%rsp), \TMP5
 958	PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
 959	movaps 0x80(%arg1), \TMP3
 960	AESENC	  \TMP3, \XMM1             # Round 8
 961	AESENC	  \TMP3, \XMM2
 962	AESENC	  \TMP3, \XMM3
 963	AESENC	  \TMP3, \XMM4
 964	pxor	  \TMP1, \TMP4
 965# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
 966	pxor	  \XMM7, \XMM5
 967	pxor	  \TMP2, \TMP6
 968
 969        # Multiply XMM8 * HashKey
 970        # XMM8 and TMP5 hold the values for the two operands
 971
 972	movdqa	  \XMM8, \TMP1
 973	pshufd	  $78, \XMM8, \TMP2
 974	pxor	  \XMM8, \TMP2
 975	movdqa	  HashKey(%rsp), \TMP5
 976	PCLMULQDQ 0x11, \TMP5, \TMP1          # TMP1 = a1*b1
 977	movaps 0x90(%arg1), \TMP3
 978	AESENC	  \TMP3, \XMM1            # Round 9
 979	AESENC	  \TMP3, \XMM2
 980	AESENC	  \TMP3, \XMM3
 981	AESENC	  \TMP3, \XMM4
 982	PCLMULQDQ 0x00, \TMP5, \XMM8          # XMM8 = a0*b0
 983	movaps 0xa0(%arg1), \TMP3
 984	AESENCLAST \TMP3, \XMM1           # Round 10
 985	AESENCLAST \TMP3, \XMM2
 986	AESENCLAST \TMP3, \XMM3
 987	AESENCLAST \TMP3, \XMM4
 988	movdqa    HashKey_k(%rsp), \TMP5
 989	PCLMULQDQ 0x00, \TMP5, \TMP2          # TMP2 = (a1+a0)*(b1+b0)
 990	movdqu	  (%arg3,%r11,1), \TMP3
 991	pxor	  \TMP3, \XMM1                 # Ciphertext/Plaintext XOR EK
 992	movdqu	  \XMM1, (%arg2,%r11,1)        # Write to plaintext buffer
 993	movdqa    \TMP3, \XMM1
 994	movdqu	  16(%arg3,%r11,1), \TMP3
 995	pxor	  \TMP3, \XMM2                 # Ciphertext/Plaintext XOR EK
 996	movdqu	  \XMM2, 16(%arg2,%r11,1)      # Write to plaintext buffer
 997	movdqa    \TMP3, \XMM2
 998	movdqu	  32(%arg3,%r11,1), \TMP3
 999	pxor	  \TMP3, \XMM3                 # Ciphertext/Plaintext XOR EK
1000	movdqu	  \XMM3, 32(%arg2,%r11,1)      # Write to plaintext buffer
1001	movdqa    \TMP3, \XMM3
1002	movdqu	  48(%arg3,%r11,1), \TMP3
1003	pxor	  \TMP3, \XMM4                 # Ciphertext/Plaintext XOR EK
1004	movdqu	  \XMM4, 48(%arg2,%r11,1)      # Write to plaintext buffer
1005	movdqa    \TMP3, \XMM4
1006	PSHUFB_XMM %xmm15, \XMM1        # perform a 16 byte swap
1007	PSHUFB_XMM %xmm15, \XMM2	# perform a 16 byte swap
1008	PSHUFB_XMM %xmm15, \XMM3	# perform a 16 byte swap
1009	PSHUFB_XMM %xmm15, \XMM4	# perform a 16 byte swap
1010
1011	pxor	  \TMP4, \TMP1
1012	pxor	  \XMM8, \XMM5
1013	pxor	  \TMP6, \TMP2
1014	pxor	  \TMP1, \TMP2
1015	pxor	  \XMM5, \TMP2
1016	movdqa	  \TMP2, \TMP3
1017	pslldq	  $8, \TMP3                    # left shift TMP3 2 DWs
1018	psrldq	  $8, \TMP2                    # right shift TMP2 2 DWs
1019	pxor	  \TMP3, \XMM5
1020	pxor	  \TMP2, \TMP1	  # accumulate the results in TMP1:XMM5
1021
1022        # first phase of reduction
1023
1024	movdqa    \XMM5, \TMP2
1025	movdqa    \XMM5, \TMP3
1026	movdqa    \XMM5, \TMP4
1027# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
 1028	pslld     $31, \TMP2                   # packed left shift << 31
 1029	pslld     $30, \TMP3                   # packed left shift << 30
 1030	pslld     $25, \TMP4                   # packed left shift << 25
1031	pxor      \TMP3, \TMP2	               # xor the shifted versions
1032	pxor      \TMP4, \TMP2
1033	movdqa    \TMP2, \TMP5
1034	psrldq    $4, \TMP5                    # right shift T5 1 DW
1035	pslldq    $12, \TMP2                   # left shift T2 3 DWs
1036	pxor      \TMP2, \XMM5
1037
1038        # second phase of reduction
1039
1040	movdqa    \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
1041	movdqa    \XMM5,\TMP3
1042	movdqa    \XMM5,\TMP4
 1043	psrld     $1, \TMP2                    # packed right shift >>1
 1044	psrld     $2, \TMP3                    # packed right shift >>2
 1045	psrld     $7, \TMP4                    # packed right shift >>7
1046	pxor      \TMP3,\TMP2		       # xor the shifted versions
1047	pxor      \TMP4,\TMP2
1048	pxor      \TMP5, \TMP2
1049	pxor      \TMP2, \XMM5
 1050	pxor      \TMP1, \XMM5                 # result is in XMM5
1051
1052	pxor	  \XMM5, \XMM1
1053.endm
1054
1055/* GHASH the last 4 ciphertext blocks. */
1056.macro	GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
1057TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
1058
1059        # Multiply TMP6 * HashKey (using Karatsuba)
1060
1061	movdqa	  \XMM1, \TMP6
1062	pshufd	  $78, \XMM1, \TMP2
1063	pxor	  \XMM1, \TMP2
1064	movdqa	  HashKey_4(%rsp), \TMP5
1065	PCLMULQDQ 0x11, \TMP5, \TMP6       # TMP6 = a1*b1
1066	PCLMULQDQ 0x00, \TMP5, \XMM1       # XMM1 = a0*b0
1067	movdqa	  HashKey_4_k(%rsp), \TMP4
1068	PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1069	movdqa	  \XMM1, \XMMDst
1070	movdqa	  \TMP2, \XMM1              # result in TMP6, XMMDst, XMM1
1071
1072        # Multiply TMP1 * HashKey (using Karatsuba)
1073
1074	movdqa	  \XMM2, \TMP1
1075	pshufd	  $78, \XMM2, \TMP2
1076	pxor	  \XMM2, \TMP2
1077	movdqa	  HashKey_3(%rsp), \TMP5
1078	PCLMULQDQ 0x11, \TMP5, \TMP1       # TMP1 = a1*b1
1079	PCLMULQDQ 0x00, \TMP5, \XMM2       # XMM2 = a0*b0
1080	movdqa	  HashKey_3_k(%rsp), \TMP4
1081	PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1082	pxor	  \TMP1, \TMP6
1083	pxor	  \XMM2, \XMMDst
1084	pxor	  \TMP2, \XMM1
1085# results accumulated in TMP6, XMMDst, XMM1
1086
1087        # Multiply TMP1 * HashKey (using Karatsuba)
1088
1089	movdqa	  \XMM3, \TMP1
1090	pshufd	  $78, \XMM3, \TMP2
1091	pxor	  \XMM3, \TMP2
1092	movdqa	  HashKey_2(%rsp), \TMP5
1093	PCLMULQDQ 0x11, \TMP5, \TMP1       # TMP1 = a1*b1
1094	PCLMULQDQ 0x00, \TMP5, \XMM3       # XMM3 = a0*b0
1095	movdqa	  HashKey_2_k(%rsp), \TMP4
1096	PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1097	pxor	  \TMP1, \TMP6
1098	pxor	  \XMM3, \XMMDst
1099	pxor	  \TMP2, \XMM1   # results accumulated in TMP6, XMMDst, XMM1
1100
1101        # Multiply TMP1 * HashKey (using Karatsuba)
1102	movdqa	  \XMM4, \TMP1
1103	pshufd	  $78, \XMM4, \TMP2
1104	pxor	  \XMM4, \TMP2
1105	movdqa	  HashKey(%rsp), \TMP5
1106	PCLMULQDQ 0x11, \TMP5, \TMP1	    # TMP1 = a1*b1
1107	PCLMULQDQ 0x00, \TMP5, \XMM4       # XMM4 = a0*b0
1108	movdqa	  HashKey_k(%rsp), \TMP4
1109	PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1110	pxor	  \TMP1, \TMP6
1111	pxor	  \XMM4, \XMMDst
1112	pxor	  \XMM1, \TMP2
1113	pxor	  \TMP6, \TMP2
1114	pxor	  \XMMDst, \TMP2
1115	# middle section of the temp results combined as in karatsuba algorithm
1116	movdqa	  \TMP2, \TMP4
1117	pslldq	  $8, \TMP4                 # left shift TMP4 2 DWs
1118	psrldq	  $8, \TMP2                 # right shift TMP2 2 DWs
1119	pxor	  \TMP4, \XMMDst
1120	pxor	  \TMP2, \TMP6
1121# TMP6:XMMDst holds the result of the accumulated carry-less multiplications
1122	# first phase of the reduction
1123	movdqa    \XMMDst, \TMP2
1124	movdqa    \XMMDst, \TMP3
1125	movdqa    \XMMDst, \TMP4
1126# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
 1127	pslld     $31, \TMP2                # packed left shifting << 31
 1128	pslld     $30, \TMP3                # packed left shifting << 30
 1129	pslld     $25, \TMP4                # packed left shifting << 25
1130	pxor      \TMP3, \TMP2              # xor the shifted versions
1131	pxor      \TMP4, \TMP2
1132	movdqa    \TMP2, \TMP7
1133	psrldq    $4, \TMP7                 # right shift TMP7 1 DW
1134	pslldq    $12, \TMP2                # left shift TMP2 3 DWs
1135	pxor      \TMP2, \XMMDst
1136
1137        # second phase of the reduction
1138	movdqa    \XMMDst, \TMP2
1139	# make 3 copies of XMMDst for doing 3 shift operations
1140	movdqa    \XMMDst, \TMP3
1141	movdqa    \XMMDst, \TMP4
 1142	psrld     $1, \TMP2                 # packed right shift >> 1
 1143	psrld     $2, \TMP3                 # packed right shift >> 2
 1144	psrld     $7, \TMP4                 # packed right shift >> 7
1145	pxor      \TMP3, \TMP2              # xor the shifted versions
1146	pxor      \TMP4, \TMP2
1147	pxor      \TMP7, \TMP2
1148	pxor      \TMP2, \XMMDst
1149	pxor      \TMP6, \XMMDst            # reduced result is in XMMDst
1150.endm
1151
 1152/* Encryption of a single block */
1153.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1
1154
1155	pxor	(%arg1), \XMM0
1156        movaps 16(%arg1), \TMP1
1157	AESENC	\TMP1, \XMM0
1158        movaps 32(%arg1), \TMP1
1159	AESENC	\TMP1, \XMM0
1160        movaps 48(%arg1), \TMP1
1161	AESENC	\TMP1, \XMM0
1162        movaps 64(%arg1), \TMP1
1163	AESENC	\TMP1, \XMM0
1164        movaps 80(%arg1), \TMP1
1165	AESENC	\TMP1, \XMM0
1166        movaps 96(%arg1), \TMP1
1167	AESENC	\TMP1, \XMM0
1168        movaps 112(%arg1), \TMP1
1169	AESENC	\TMP1, \XMM0
1170        movaps 128(%arg1), \TMP1
1171	AESENC	\TMP1, \XMM0
1172        movaps 144(%arg1), \TMP1
1173	AESENC	\TMP1, \XMM0
1174        movaps 160(%arg1), \TMP1
1175	AESENCLAST	\TMP1, \XMM0
1176.endm
1177
1178
1179/*****************************************************************************
1180* void aesni_gcm_dec(void *aes_ctx,    // AES Key schedule. Starts on a 16 byte boundary.
1181*                   u8 *out,           // Plaintext output. Encrypt in-place is allowed.
1182*                   const u8 *in,      // Ciphertext input
1183*                   u64 plaintext_len, // Length of data in bytes for decryption.
1184*                   u8 *iv,            // Pre-counter block j0: 4 byte salt (from Security Association)
1185*                                      // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1186*                                      // concatenated with 0x00000001. 16-byte aligned pointer.
1187*                   u8 *hash_subkey,   // H, the Hash sub key input. Data starts on a 16-byte boundary.
1188*                   const u8 *aad,     // Additional Authentication Data (AAD)
1189*                   u64 aad_len,       // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1190*                   u8  *auth_tag,     // Authenticated Tag output. The driver will compare this to the
1191*                                      // given authentication tag and only return the plaintext if they match.
1192*                   u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
1193*                                      // (most likely), 12 or 8.
1194*
1195* Assumptions:
1196*
1197* keys:
1198*       keys are pre-expanded and aligned to 16 bytes. we are using the first
1199*       set of 11 keys in the data structure void *aes_ctx
1200*
1201* iv:
1202*       0                   1                   2                   3
1203*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1204*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1205*       |                             Salt  (From the SA)               |
1206*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1207*       |                     Initialization Vector                     |
1208*       |         (This is the sequence number from IPSec header)       |
1209*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1210*       |                              0x1                              |
1211*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1212*
1213*
1214*
1215* AAD:
1216*       AAD padded to 128 bits with 0
1217*       for example, assume AAD is a u32 vector
1218*
1219*       if AAD is 8 bytes:
1220*       AAD[3] = {A0, A1};
1221*       padded AAD in xmm register = {A1 A0 0 0}
1222*
1223*       0                   1                   2                   3
1224*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1225*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1226*       |                               SPI (A1)                        |
1227*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1228*       |                     32-bit Sequence Number (A0)               |
1229*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1230*       |                              0x0                              |
1231*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1232*
1233*                                       AAD Format with 32-bit Sequence Number
1234*
1235*       if AAD is 12 bytes:
1236*       AAD[3] = {A0, A1, A2};
1237*       padded AAD in xmm register = {A2 A1 A0 0}
1238*
1239*       0                   1                   2                   3
1240*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1241*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1244*       |                               SPI (A2)                        |
1245*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1246*       |                 64-bit Extended Sequence Number {A1,A0}       |
1247*       |                                                               |
1248*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1249*       |                              0x0                              |
1250*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1251*
1252*                        AAD Format with 64-bit Extended Sequence Number
1253*
1254* aadLen:
1255*       from the definition of the spec, aadLen can only be 8 or 12 bytes.
1256*       The code supports 16 too but for other sizes, the code will fail.
1257*
1258* TLen:
1259*       from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
1260*       For other sizes, the code will fail.
1261*
1262* poly = x^128 + x^127 + x^126 + x^121 + 1
1263*
1264*****************************************************************************/
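/*
* Illustrative caller sketch based only on the prototype above (the real
* callers live in the C glue code; buffer names such as salt, esp_iv and
* aes_ctx here are hypothetical):
*
*	u8 iv[16];                          // 16-byte aligned in practice
*	u8 tag[16];
*	memcpy(iv, salt, 4);                // 4-byte salt from the SA
*	memcpy(iv + 4, esp_iv, 8);          // 8-byte IV from the ESP payload
*	iv[12] = 0; iv[13] = 0; iv[14] = 0; iv[15] = 1;  // trailing 0x00000001
*	aesni_gcm_dec(aes_ctx, out, in, len, iv, hash_subkey, aad, 8, tag, 16);
*	// compare tag with the received ICV; release out only if they match
*/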
1265
1266ENTRY(aesni_gcm_dec)
1267	push	%r12
1268	push	%r13
1269	push	%r14
1270	mov	%rsp, %r14
1271/*
1272* states of %xmm registers %xmm6:%xmm15 not saved
1273* all %xmm registers are clobbered
1274*/
1275	sub	$VARIABLE_OFFSET, %rsp
1276	and	$~63, %rsp                        # align rsp to 64 bytes
1277	mov	%arg6, %r12
1278	movdqu	(%r12), %xmm13			  # %xmm13 = HashKey
1279        movdqa  SHUF_MASK(%rip), %xmm2
1280	PSHUFB_XMM %xmm2, %xmm13
1281
1282
1283# Precompute HashKey<<1 (mod poly) from the hash key (required for GHASH)
1284
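# The doubling below is branchless: psllq/psrlq shift each 64-bit half of H
# left by one and recover the carried-out bits, and the pshufd/pcmpeqd/pand
# sequence XORs in POLY only when the dropped top bit of H was set, giving
# HashKey<<1 reduced mod the GCM polynomial without a conditional jump.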
1285	movdqa	%xmm13, %xmm2
1286	psllq	$1, %xmm13
1287	psrlq	$63, %xmm2
1288	movdqa	%xmm2, %xmm1
1289	pslldq	$8, %xmm2
1290	psrldq	$8, %xmm1
1291	por	%xmm2, %xmm13
1292
1293        # Reduction
1294
1295	pshufd	$0x24, %xmm1, %xmm2
1296	pcmpeqd TWOONE(%rip), %xmm2
1297	pand	POLY(%rip), %xmm2
1298	pxor	%xmm2, %xmm13     # %xmm13 holds the HashKey<<1 (mod poly)
1299
1300
1301        # Decrypt first few blocks
1302
1303	movdqa %xmm13, HashKey(%rsp)           # store HashKey<<1 (mod poly)
1304	mov %arg4, %r13    # save the number of bytes of plaintext/ciphertext
1305	and $-16, %r13                      # %r13 = %r13 - (%r13 mod 16)
1306	mov %r13, %r12
1307	and $(3<<4), %r12
1308	jz _initial_num_blocks_is_0_decrypt
1309	cmp $(2<<4), %r12
1310	jb _initial_num_blocks_is_1_decrypt
1311	je _initial_num_blocks_is_2_decrypt
1312_initial_num_blocks_is_3_decrypt:
1313	INITIAL_BLOCKS_DEC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1314%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, dec
1315	sub	$48, %r13
1316	jmp	_initial_blocks_decrypted
1317_initial_num_blocks_is_2_decrypt:
1318	INITIAL_BLOCKS_DEC	2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1319%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, dec
1320	sub	$32, %r13
1321	jmp	_initial_blocks_decrypted
1322_initial_num_blocks_is_1_decrypt:
1323	INITIAL_BLOCKS_DEC	1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1324%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, dec
1325	sub	$16, %r13
1326	jmp	_initial_blocks_decrypted
1327_initial_num_blocks_is_0_decrypt:
1328	INITIAL_BLOCKS_DEC	0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1329%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, dec
1330_initial_blocks_decrypted:
1331	cmp	$0, %r13
1332	je	_zero_cipher_left_decrypt
1333	sub	$64, %r13
1334	je	_four_cipher_left_decrypt
1335_decrypt_by_4:
1336	GHASH_4_ENCRYPT_4_PARALLEL_DEC	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
1337%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, dec
1338	add	$64, %r11
1339	sub	$64, %r13
1340	jne	_decrypt_by_4
1341_four_cipher_left_decrypt:
1342	GHASH_LAST_4	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
1343%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
1344_zero_cipher_left_decrypt:
1345	mov	%arg4, %r13
1346	and	$15, %r13				# %r13 = arg4 (mod 16)
1347	je	_multiple_of_16_bytes_decrypt
1348
1349        # Handle the last <16 byte block separately
1350
1351	paddd ONE(%rip), %xmm0         # increment CNT to get Yn
1352        movdqa SHUF_MASK(%rip), %xmm10
1353	PSHUFB_XMM %xmm10, %xmm0
1354
1355	ENCRYPT_SINGLE_BLOCK  %xmm0, %xmm1    # E(K, Yn)
1356	sub $16, %r11
1357	add %r13, %r11
1358	movdqu (%arg3,%r11,1), %xmm1   # receive the last <16 byte block
1359	lea SHIFT_MASK+16(%rip), %r12
1360	sub %r13, %r12
1361# adjust the shuffle mask pointer to be able to shift 16-%r13 bytes
1362# (%r13 is the number of bytes in plaintext mod 16)
1363	movdqu (%r12), %xmm2           # get the appropriate shuffle mask
 1364	PSHUFB_XMM %xmm2, %xmm1            # right shift 16-%r13 bytes
1365
1366	movdqa  %xmm1, %xmm2
1367	pxor %xmm1, %xmm0            # Ciphertext XOR E(K, Yn)
1368	movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
1369	# get the appropriate mask to mask out top 16-%r13 bytes of %xmm0
1370	pand %xmm1, %xmm0            # mask out top 16-%r13 bytes of %xmm0
1371	pand    %xmm1, %xmm2
1372        movdqa SHUF_MASK(%rip), %xmm10
1373	PSHUFB_XMM %xmm10 ,%xmm2
1374
1375	pxor %xmm2, %xmm8
1376	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1377	          # GHASH computation for the last <16 byte block
1378	sub %r13, %r11
1379	add $16, %r11
1380
1381        # output %r13 bytes
1382	MOVQ_R64_XMM	%xmm0, %rax
1383	cmp	$8, %r13
1384	jle	_less_than_8_bytes_left_decrypt
1385	mov	%rax, (%arg2 , %r11, 1)
1386	add	$8, %r11
1387	psrldq	$8, %xmm0
1388	MOVQ_R64_XMM	%xmm0, %rax
1389	sub	$8, %r13
1390_less_than_8_bytes_left_decrypt:
1391	mov	%al,  (%arg2, %r11, 1)
1392	add	$1, %r11
1393	shr	$8, %rax
1394	sub	$1, %r13
1395	jne	_less_than_8_bytes_left_decrypt
1396_multiple_of_16_bytes_decrypt:
 1397	mov	arg8, %r12		  # %r12 = aadLen (number of bytes)
1398	shl	$3, %r12		  # convert into number of bits
1399	movd	%r12d, %xmm15		  # len(A) in %xmm15
 1400	shl	$3, %arg4		  # len(C) in bits (*8)
1401	MOVQ_R64_XMM	%arg4, %xmm1
1402	pslldq	$8, %xmm15		  # %xmm15 = len(A)||0x0000000000000000
1403	pxor	%xmm1, %xmm15		  # %xmm15 = len(A)||len(C)
1404	pxor	%xmm15, %xmm8
1405	GHASH_MUL	%xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1406	         # final GHASH computation
1407        movdqa SHUF_MASK(%rip), %xmm10
1408	PSHUFB_XMM %xmm10, %xmm8
1409
1410	mov	%arg5, %rax		  # %rax = *Y0
1411	movdqu	(%rax), %xmm0		  # %xmm0 = Y0
1412	ENCRYPT_SINGLE_BLOCK	%xmm0,  %xmm1	  # E(K, Y0)
1413	pxor	%xmm8, %xmm0
1414_return_T_decrypt:
1415	mov	arg9, %r10                # %r10 = authTag
1416	mov	arg10, %r11               # %r11 = auth_tag_len
1417	cmp	$16, %r11
1418	je	_T_16_decrypt
1419	cmp	$12, %r11
1420	je	_T_12_decrypt
1421_T_8_decrypt:
1422	MOVQ_R64_XMM	%xmm0, %rax
1423	mov	%rax, (%r10)
1424	jmp	_return_T_done_decrypt
1425_T_12_decrypt:
1426	MOVQ_R64_XMM	%xmm0, %rax
1427	mov	%rax, (%r10)
1428	psrldq	$8, %xmm0
1429	movd	%xmm0, %eax
1430	mov	%eax, 8(%r10)
1431	jmp	_return_T_done_decrypt
1432_T_16_decrypt:
1433	movdqu	%xmm0, (%r10)
1434_return_T_done_decrypt:
1435	mov	%r14, %rsp
1436	pop	%r14
1437	pop	%r13
1438	pop	%r12
1439	ret
1440
1441
1442/*****************************************************************************
1443* void aesni_gcm_enc(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1444*                    u8 *out,            // Ciphertext output. Encrypt in-place is allowed.
1445*                    const u8 *in,       // Plaintext input
1446*                    u64 plaintext_len,  // Length of data in bytes for encryption.
1447*                    u8 *iv,             // Pre-counter block j0: 4 byte salt (from Security Association)
1448*                                        // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1449*                                        // concatenated with 0x00000001. 16-byte aligned pointer.
1450*                    u8 *hash_subkey,    // H, the Hash sub key input. Data starts on a 16-byte boundary.
1451*                    const u8 *aad,      // Additional Authentication Data (AAD)
1452*                    u64 aad_len,        // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1453*                    u8 *auth_tag,       // Authenticated Tag output.
1454*                    u64 auth_tag_len);  // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
1455*                                        // 12 or 8.
1456*
1457* Assumptions:
1458*
1459* keys:
1460*       keys are pre-expanded and aligned to 16 bytes. we are using the
1461*       first set of 11 keys in the data structure void *aes_ctx
1462*
1463*
1464* iv:
1465*       0                   1                   2                   3
1466*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1467*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1468*       |                             Salt  (From the SA)               |
1469*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1470*       |                     Initialization Vector                     |
1471*       |         (This is the sequence number from IPSec header)       |
1472*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1473*       |                              0x1                              |
1474*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1475*
1476*
1477*
1478* AAD:
1479*       AAD padded to 128 bits with 0
1480*       for example, assume AAD is a u32 vector
1481*
1482*       if AAD is 8 bytes:
1483*       AAD[3] = {A0, A1};
1484*       padded AAD in xmm register = {A1 A0 0 0}
1485*
1486*       0                   1                   2                   3
1487*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1488*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1489*       |                               SPI (A1)                        |
1490*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1491*       |                     32-bit Sequence Number (A0)               |
1492*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1493*       |                              0x0                              |
1494*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1495*
1496*                                 AAD Format with 32-bit Sequence Number
1497*
1498*       if AAD is 12 bytes:
1499*       AAD[3] = {A0, A1, A2};
1500*       padded AAD in xmm register = {A2 A1 A0 0}
1501*
1502*       0                   1                   2                   3
1503*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1504*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1505*       |                               SPI (A2)                        |
1506*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1507*       |                 64-bit Extended Sequence Number {A1,A0}       |
1508*       |                                                               |
1509*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1510*       |                              0x0                              |
1511*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1512*
1513*                         AAD Format with 64-bit Extended Sequence Number
1514*
1515* aadLen:
1516*       from the definition of the spec, aadLen can only be 8 or 12 bytes.
1517*       The code also supports 16 bytes; for any other size it will fail.
1518*
1519* TLen:
1520*       from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
1521*       For other sizes, the code will fail.
1522*
1523* poly = x^128 + x^127 + x^126 + x^121 + 1
1524***************************************************************************/
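/*
 * Illustrative calling sketch (not part of this file): how a C caller might
 * invoke the routine documented above.  The buffer names below (dst, src,
 * aad, iv, tag) and their sizes are hypothetical; in the kernel the real
 * call sites live in the aesni-intel glue code and take these values from
 * the AEAD request and the expanded key context.
 *
 *	u8 iv[16];	// 4 byte salt || 8 byte IV || 0x00000001
 *	u8 tag[16];
 *
 *	aesni_gcm_enc(aes_ctx, dst, src, src_len,
 *		      iv, hash_subkey, aad, 8,	// aad_len: 8 or 12 bytes
 *		      tag, 16);			// auth_tag_len: 16, 12 or 8
 */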
1525ENTRY(aesni_gcm_enc)
1526	push	%r12
1527	push	%r13
1528	push	%r14
1529	mov	%rsp, %r14
1530#
1531# states of %xmm registers %xmm6:%xmm15 not saved
1532# all %xmm registers are clobbered
1533#
1534	sub	$VARIABLE_OFFSET, %rsp
1535	and	$~63, %rsp
1536	mov	%arg6, %r12
1537	movdqu	(%r12), %xmm13
1538        movdqa  SHUF_MASK(%rip), %xmm2
1539	PSHUFB_XMM %xmm2, %xmm13
1540
1541
1542# precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
1543
1544	movdqa	%xmm13, %xmm2
1545	psllq	$1, %xmm13
1546	psrlq	$63, %xmm2
1547	movdqa	%xmm2, %xmm1
1548	pslldq	$8, %xmm2
1549	psrldq	$8, %xmm1
1550	por	%xmm2, %xmm13
1551
1552        # reduce HashKey<<1
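	# if the bit shifted out of position 127 was set, fold it back in by
	# XORing POLY (the bit-reflected GCM polynomial) into HashKey<<1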
1553
1554	pshufd	$0x24, %xmm1, %xmm2
1555	pcmpeqd TWOONE(%rip), %xmm2
1556	pand	POLY(%rip), %xmm2
1557	pxor	%xmm2, %xmm13
1558	movdqa	%xmm13, HashKey(%rsp)
1559	mov	%arg4, %r13            # %xmm13 holds HashKey<<1 (mod poly)
1560	and	$-16, %r13
1561	mov	%r13, %r12
1562
1563        # Encrypt first few blocks
1564
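	# %r12 = (number of complete 16-byte blocks mod 4) * 16; encrypt that
	# many blocks first so the main loop below can always handle 4 at a time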
1565	and	$(3<<4), %r12
1566	jz	_initial_num_blocks_is_0_encrypt
1567	cmp	$(2<<4), %r12
1568	jb	_initial_num_blocks_is_1_encrypt
1569	je	_initial_num_blocks_is_2_encrypt
1570_initial_num_blocks_is_3_encrypt:
1571	INITIAL_BLOCKS_ENC	3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1572%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, enc
1573	sub	$48, %r13
1574	jmp	_initial_blocks_encrypted
1575_initial_num_blocks_is_2_encrypt:
1576	INITIAL_BLOCKS_ENC	2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1577%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, enc
1578	sub	$32, %r13
1579	jmp	_initial_blocks_encrypted
1580_initial_num_blocks_is_1_encrypt:
1581	INITIAL_BLOCKS_ENC	1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1582%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, enc
1583	sub	$16, %r13
1584	jmp	_initial_blocks_encrypted
1585_initial_num_blocks_is_0_encrypt:
1586	INITIAL_BLOCKS_ENC	0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1587%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, enc
1588_initial_blocks_encrypted:
1589
1590        # Main loop - Encrypt remaining blocks
1591
1592	cmp	$0, %r13
1593	je	_zero_cipher_left_encrypt
1594	sub	$64, %r13
1595	je	_four_cipher_left_encrypt
1596_encrypt_by_4_encrypt:
1597	GHASH_4_ENCRYPT_4_PARALLEL_ENC	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
1598%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, enc
1599	add	$64, %r11
1600	sub	$64, %r13
1601	jne	_encrypt_by_4_encrypt
1602_four_cipher_left_encrypt:
1603	GHASH_LAST_4	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
1604%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
1605_zero_cipher_left_encrypt:
1606	mov	%arg4, %r13
1607	and	$15, %r13			# %r13 = arg4 (mod 16)
1608	je	_multiple_of_16_bytes_encrypt
1609
1610         # Handle the last <16 Byte block separately
1611	paddd ONE(%rip), %xmm0                # INCR CNT to get Yn
1612        movdqa SHUF_MASK(%rip), %xmm10
1613	PSHUFB_XMM %xmm10, %xmm0
1614
1615
1616	ENCRYPT_SINGLE_BLOCK	%xmm0, %xmm1        # Encrypt(K, Yn)
1617	sub $16, %r11
1618	add %r13, %r11
1619	movdqu (%arg3,%r11,1), %xmm1     # receive the last <16 byte blocks
1620	lea SHIFT_MASK+16(%rip), %r12
1621	sub %r13, %r12
1622	# adjust the shuffle mask pointer to be able to shift 16-r13 bytes
1623	# (%r13 is the number of bytes in plaintext mod 16)
1624	movdqu	(%r12), %xmm2           # get the appropriate shuffle mask
1625	PSHUFB_XMM	%xmm2, %xmm1            # shift right 16-r13 byte
1626	pxor	%xmm1, %xmm0            # Plaintext XOR Encrypt(K, Yn)
1627	movdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
1628	# get the appropriate mask to mask out top 16-r13 bytes of xmm0
1629	pand	%xmm1, %xmm0            # mask out top 16-r13 bytes of xmm0
1630        movdqa SHUF_MASK(%rip), %xmm10
1631	PSHUFB_XMM %xmm10,%xmm0
1632
1633	pxor	%xmm0, %xmm8
1634	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1635	# GHASH computation for the last <16 byte block
1636	sub	%r13, %r11
1637	add	$16, %r11
1638
1639	movdqa SHUF_MASK(%rip), %xmm10
1640	PSHUFB_XMM %xmm10, %xmm0
1641
1642	# shuffle xmm0 back to output as ciphertext
1643
1644        # Output %r13 bytes
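	# if more than 8 bytes remain, store the low quadword first, then emit
	# the remaining bytes one at a time from %rax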
1645	MOVQ_R64_XMM %xmm0, %rax
1646	cmp $8, %r13
1647	jle _less_than_8_bytes_left_encrypt
1648	mov %rax, (%arg2 , %r11, 1)
1649	add $8, %r11
1650	psrldq $8, %xmm0
1651	MOVQ_R64_XMM %xmm0, %rax
1652	sub $8, %r13
1653_less_than_8_bytes_left_encrypt:
1654	mov %al,  (%arg2, %r11, 1)
1655	add $1, %r11
1656	shr $8, %rax
1657	sub $1, %r13
1658	jne _less_than_8_bytes_left_encrypt
1659_multiple_of_16_bytes_encrypt:
1660	mov	arg8, %r12    # %r12 = aadLen (number of bytes)
1661	shl	$3, %r12
1662	movd	%r12d, %xmm15       # len(A) in %xmm15
1663	shl	$3, %arg4               # len(C) in bits (*8)
1664	MOVQ_R64_XMM	%arg4, %xmm1
1665	pslldq	$8, %xmm15          # %xmm15 = len(A)||0x0000000000000000
1666	pxor	%xmm1, %xmm15       # %xmm15 = len(A)||len(C)
1667	pxor	%xmm15, %xmm8
1668	GHASH_MUL	%xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1669	# final GHASH computation
1670        movdqa SHUF_MASK(%rip), %xmm10
1671	PSHUFB_XMM %xmm10, %xmm8         # perform a 16 byte swap
1672
1673	mov	%arg5, %rax		       # %rax  = *Y0
1674	movdqu	(%rax), %xmm0		       # %xmm0 = Y0
1675	ENCRYPT_SINGLE_BLOCK	%xmm0, %xmm15         # Encrypt(K, Y0)
1676	pxor	%xmm8, %xmm0
1677_return_T_encrypt:
1678	mov	arg9, %r10                     # %r10 = authTag
1679	mov	arg10, %r11                    # %r11 = auth_tag_len
1680	cmp	$16, %r11
1681	je	_T_16_encrypt
1682	cmp	$12, %r11
1683	je	_T_12_encrypt
1684_T_8_encrypt:
1685	MOVQ_R64_XMM	%xmm0, %rax
1686	mov	%rax, (%r10)
1687	jmp	_return_T_done_encrypt
1688_T_12_encrypt:
1689	MOVQ_R64_XMM	%xmm0, %rax
1690	mov	%rax, (%r10)
1691	psrldq	$8, %xmm0
1692	movd	%xmm0, %eax
1693	mov	%eax, 8(%r10)
1694	jmp	_return_T_done_encrypt
1695_T_16_encrypt:
1696	movdqu	%xmm0, (%r10)
1697_return_T_done_encrypt:
1698	mov	%r14, %rsp
1699	pop	%r14
1700	pop	%r13
1701	pop	%r12
1702	ret
1703
1704#endif
1705
1706
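/*
 * Key-expansion helpers.  Register contract (see the callers in
 * aesni_set_key below): %xmm0/%xmm2 hold the previous round key(s),
 * %xmm1 holds the AESKEYGENASSIST result, %xmm4 is assumed to be zero,
 * and each helper writes the newly derived round key(s) to (TKEYP) and
 * advances TKEYP.
 */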
1707_key_expansion_128:
1708_key_expansion_256a:
1709	pshufd $0b11111111, %xmm1, %xmm1
1710	shufps $0b00010000, %xmm0, %xmm4
1711	pxor %xmm4, %xmm0
1712	shufps $0b10001100, %xmm0, %xmm4
1713	pxor %xmm4, %xmm0
1714	pxor %xmm1, %xmm0
1715	movaps %xmm0, (TKEYP)
1716	add $0x10, TKEYP
1717	ret
 
 
1718
1719.align 4
1720_key_expansion_192a:
1721	pshufd $0b01010101, %xmm1, %xmm1
1722	shufps $0b00010000, %xmm0, %xmm4
1723	pxor %xmm4, %xmm0
1724	shufps $0b10001100, %xmm0, %xmm4
1725	pxor %xmm4, %xmm0
1726	pxor %xmm1, %xmm0
1727
1728	movaps %xmm2, %xmm5
1729	movaps %xmm2, %xmm6
1730	pslldq $4, %xmm5
1731	pshufd $0b11111111, %xmm0, %xmm3
1732	pxor %xmm3, %xmm2
1733	pxor %xmm5, %xmm2
1734
1735	movaps %xmm0, %xmm1
1736	shufps $0b01000100, %xmm0, %xmm6
1737	movaps %xmm6, (TKEYP)
1738	shufps $0b01001110, %xmm2, %xmm1
1739	movaps %xmm1, 0x10(TKEYP)
1740	add $0x20, TKEYP
1741	ret
 
1742
1743.align 4
1744_key_expansion_192b:
1745	pshufd $0b01010101, %xmm1, %xmm1
1746	shufps $0b00010000, %xmm0, %xmm4
1747	pxor %xmm4, %xmm0
1748	shufps $0b10001100, %xmm0, %xmm4
1749	pxor %xmm4, %xmm0
1750	pxor %xmm1, %xmm0
1751
1752	movaps %xmm2, %xmm5
1753	pslldq $4, %xmm5
1754	pshufd $0b11111111, %xmm0, %xmm3
1755	pxor %xmm3, %xmm2
1756	pxor %xmm5, %xmm2
1757
1758	movaps %xmm0, (TKEYP)
1759	add $0x10, TKEYP
1760	ret
 
1761
1762.align 4
1763_key_expansion_256b:
1764	pshufd $0b10101010, %xmm1, %xmm1
1765	shufps $0b00010000, %xmm2, %xmm4
1766	pxor %xmm4, %xmm2
1767	shufps $0b10001100, %xmm2, %xmm4
1768	pxor %xmm4, %xmm2
1769	pxor %xmm1, %xmm2
1770	movaps %xmm2, (TKEYP)
1771	add $0x10, TKEYP
1772	ret
 
1773
1774/*
1775 * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
1776 *                   unsigned int key_len)
1777 */
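/*
 * Assumed struct crypto_aes_ctx layout, matching the offsets used below:
 * encryption round keys at offset 0, decryption round keys at offset 240,
 * key_length (in bytes) at offset 480.
 */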
1778ENTRY(aesni_set_key)
 
1779#ifndef __x86_64__
1780	pushl KEYP
1781	movl 8(%esp), KEYP		# ctx
1782	movl 12(%esp), UKEYP		# in_key
1783	movl 16(%esp), %edx		# key_len
1784#endif
1785	movups (UKEYP), %xmm0		# user key (first 16 bytes)
1786	movaps %xmm0, (KEYP)
1787	lea 0x10(KEYP), TKEYP		# key addr
1788	movl %edx, 480(KEYP)
1789	pxor %xmm4, %xmm4		# xmm4 is assumed 0 in _key_expansion_x
1790	cmp $24, %dl
1791	jb .Lenc_key128
1792	je .Lenc_key192
1793	movups 0x10(UKEYP), %xmm2	# other user key
1794	movaps %xmm2, (TKEYP)
1795	add $0x10, TKEYP
1796	AESKEYGENASSIST 0x1 %xmm2 %xmm1		# round 1
1797	call _key_expansion_256a
1798	AESKEYGENASSIST 0x1 %xmm0 %xmm1
1799	call _key_expansion_256b
1800	AESKEYGENASSIST 0x2 %xmm2 %xmm1		# round 2
1801	call _key_expansion_256a
1802	AESKEYGENASSIST 0x2 %xmm0 %xmm1
1803	call _key_expansion_256b
1804	AESKEYGENASSIST 0x4 %xmm2 %xmm1		# round 3
1805	call _key_expansion_256a
1806	AESKEYGENASSIST 0x4 %xmm0 %xmm1
1807	call _key_expansion_256b
1808	AESKEYGENASSIST 0x8 %xmm2 %xmm1		# round 4
1809	call _key_expansion_256a
1810	AESKEYGENASSIST 0x8 %xmm0 %xmm1
1811	call _key_expansion_256b
1812	AESKEYGENASSIST 0x10 %xmm2 %xmm1	# round 5
1813	call _key_expansion_256a
1814	AESKEYGENASSIST 0x10 %xmm0 %xmm1
1815	call _key_expansion_256b
1816	AESKEYGENASSIST 0x20 %xmm2 %xmm1	# round 6
1817	call _key_expansion_256a
1818	AESKEYGENASSIST 0x20 %xmm0 %xmm1
1819	call _key_expansion_256b
1820	AESKEYGENASSIST 0x40 %xmm2 %xmm1	# round 7
1821	call _key_expansion_256a
1822	jmp .Ldec_key
1823.Lenc_key192:
1824	movq 0x10(UKEYP), %xmm2		# other user key
1825	AESKEYGENASSIST 0x1 %xmm2 %xmm1		# round 1
1826	call _key_expansion_192a
1827	AESKEYGENASSIST 0x2 %xmm2 %xmm1		# round 2
1828	call _key_expansion_192b
1829	AESKEYGENASSIST 0x4 %xmm2 %xmm1		# round 3
1830	call _key_expansion_192a
1831	AESKEYGENASSIST 0x8 %xmm2 %xmm1		# round 4
1832	call _key_expansion_192b
1833	AESKEYGENASSIST 0x10 %xmm2 %xmm1	# round 5
1834	call _key_expansion_192a
1835	AESKEYGENASSIST 0x20 %xmm2 %xmm1	# round 6
1836	call _key_expansion_192b
1837	AESKEYGENASSIST 0x40 %xmm2 %xmm1	# round 7
1838	call _key_expansion_192a
1839	AESKEYGENASSIST 0x80 %xmm2 %xmm1	# round 8
1840	call _key_expansion_192b
1841	jmp .Ldec_key
1842.Lenc_key128:
1843	AESKEYGENASSIST 0x1 %xmm0 %xmm1		# round 1
1844	call _key_expansion_128
1845	AESKEYGENASSIST 0x2 %xmm0 %xmm1		# round 2
1846	call _key_expansion_128
1847	AESKEYGENASSIST 0x4 %xmm0 %xmm1		# round 3
1848	call _key_expansion_128
1849	AESKEYGENASSIST 0x8 %xmm0 %xmm1		# round 4
1850	call _key_expansion_128
1851	AESKEYGENASSIST 0x10 %xmm0 %xmm1	# round 5
1852	call _key_expansion_128
1853	AESKEYGENASSIST 0x20 %xmm0 %xmm1	# round 6
1854	call _key_expansion_128
1855	AESKEYGENASSIST 0x40 %xmm0 %xmm1	# round 7
1856	call _key_expansion_128
1857	AESKEYGENASSIST 0x80 %xmm0 %xmm1	# round 8
1858	call _key_expansion_128
1859	AESKEYGENASSIST 0x1b %xmm0 %xmm1	# round 9
1860	call _key_expansion_128
1861	AESKEYGENASSIST 0x36 %xmm0 %xmm1	# round 10
1862	call _key_expansion_128
1863.Ldec_key:
1864	sub $0x10, TKEYP
1865	movaps (KEYP), %xmm0
1866	movaps (TKEYP), %xmm1
1867	movaps %xmm0, 240(TKEYP)
1868	movaps %xmm1, 240(KEYP)
1869	add $0x10, KEYP
1870	lea 240-16(TKEYP), UKEYP
1871.align 4
1872.Ldec_key_loop:
1873	movaps (KEYP), %xmm0
1874	AESIMC %xmm0 %xmm1
1875	movaps %xmm1, (UKEYP)
1876	add $0x10, KEYP
1877	sub $0x10, UKEYP
1878	cmp TKEYP, KEYP
1879	jb .Ldec_key_loop
1880	xor AREG, AREG
1881#ifndef __x86_64__
1882	popl KEYP
1883#endif
1884	ret
 
 
1885
1886/*
1887 * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
1888 */
1889ENTRY(aesni_enc)
 
1890#ifndef __x86_64__
1891	pushl KEYP
1892	pushl KLEN
1893	movl 12(%esp), KEYP
1894	movl 16(%esp), OUTP
1895	movl 20(%esp), INP
1896#endif
1897	movl 480(KEYP), KLEN		# key length
1898	movups (INP), STATE		# input
1899	call _aesni_enc1
1900	movups STATE, (OUTP)		# output
1901#ifndef __x86_64__
1902	popl KLEN
1903	popl KEYP
1904#endif
1905	ret
 
 
1906
1907/*
1908 * _aesni_enc1:		internal ABI
1909 * input:
1910 *	KEYP:		key struct pointer
1911 *	KLEN:		key length
1912 *	STATE:		initial state (input)
1913 * output:
1914 *	STATE:		final state (output)
1915 * changed:
1916 *	KEY
1917 *	TKEYP (T1)
1918 */
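/*
 * TKEYP is advanced according to KLEN so that the common tail at .Lenc128
 * can use the same fixed offsets for every key size; the additional rounds
 * needed by 192- and 256-bit keys are reached through the negative offsets
 * above .Lenc192/.Lenc128.
 */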
1919.align 4
1920_aesni_enc1:
1921	movaps (KEYP), KEY		# key
1922	mov KEYP, TKEYP
1923	pxor KEY, STATE		# round 0
1924	add $0x30, TKEYP
1925	cmp $24, KLEN
1926	jb .Lenc128
1927	lea 0x20(TKEYP), TKEYP
1928	je .Lenc192
1929	add $0x20, TKEYP
1930	movaps -0x60(TKEYP), KEY
1931	AESENC KEY STATE
1932	movaps -0x50(TKEYP), KEY
1933	AESENC KEY STATE
1934.align 4
1935.Lenc192:
1936	movaps -0x40(TKEYP), KEY
1937	AESENC KEY STATE
1938	movaps -0x30(TKEYP), KEY
1939	AESENC KEY STATE
1940.align 4
1941.Lenc128:
1942	movaps -0x20(TKEYP), KEY
1943	AESENC KEY STATE
1944	movaps -0x10(TKEYP), KEY
1945	AESENC KEY STATE
1946	movaps (TKEYP), KEY
1947	AESENC KEY STATE
1948	movaps 0x10(TKEYP), KEY
1949	AESENC KEY STATE
1950	movaps 0x20(TKEYP), KEY
1951	AESENC KEY STATE
1952	movaps 0x30(TKEYP), KEY
1953	AESENC KEY STATE
1954	movaps 0x40(TKEYP), KEY
1955	AESENC KEY STATE
1956	movaps 0x50(TKEYP), KEY
1957	AESENC KEY STATE
1958	movaps 0x60(TKEYP), KEY
1959	AESENC KEY STATE
1960	movaps 0x70(TKEYP), KEY
1961	AESENCLAST KEY STATE
1962	ret
 
1963
1964/*
1965 * _aesni_enc4:	internal ABI
1966 * input:
1967 *	KEYP:		key struct pointer
1968 *	KLEN:		key length
1969 *	STATE1:		initial state (input)
1970 *	STATE2
1971 *	STATE3
1972 *	STATE4
1973 * output:
1974 *	STATE1:		final state (output)
1975 *	STATE2
1976 *	STATE3
1977 *	STATE4
1978 * changed:
1979 *	KEY
1980 *	TKEYP (T1)
1981 */
1982.align 4
1983_aesni_enc4:
1984	movaps (KEYP), KEY		# key
1985	mov KEYP, TKEYP
1986	pxor KEY, STATE1		# round 0
1987	pxor KEY, STATE2
1988	pxor KEY, STATE3
1989	pxor KEY, STATE4
1990	add $0x30, TKEYP
1991	cmp $24, KLEN
1992	jb .L4enc128
1993	lea 0x20(TKEYP), TKEYP
1994	je .L4enc192
1995	add $0x20, TKEYP
1996	movaps -0x60(TKEYP), KEY
1997	AESENC KEY STATE1
1998	AESENC KEY STATE2
1999	AESENC KEY STATE3
2000	AESENC KEY STATE4
2001	movaps -0x50(TKEYP), KEY
2002	AESENC KEY STATE1
2003	AESENC KEY STATE2
2004	AESENC KEY STATE3
2005	AESENC KEY STATE4
2006#.align 4
2007.L4enc192:
2008	movaps -0x40(TKEYP), KEY
2009	AESENC KEY STATE1
2010	AESENC KEY STATE2
2011	AESENC KEY STATE3
2012	AESENC KEY STATE4
2013	movaps -0x30(TKEYP), KEY
2014	AESENC KEY STATE1
2015	AESENC KEY STATE2
2016	AESENC KEY STATE3
2017	AESENC KEY STATE4
2018#.align 4
2019.L4enc128:
2020	movaps -0x20(TKEYP), KEY
2021	AESENC KEY STATE1
2022	AESENC KEY STATE2
2023	AESENC KEY STATE3
2024	AESENC KEY STATE4
2025	movaps -0x10(TKEYP), KEY
2026	AESENC KEY STATE1
2027	AESENC KEY STATE2
2028	AESENC KEY STATE3
2029	AESENC KEY STATE4
2030	movaps (TKEYP), KEY
2031	AESENC KEY STATE1
2032	AESENC KEY STATE2
2033	AESENC KEY STATE3
2034	AESENC KEY STATE4
2035	movaps 0x10(TKEYP), KEY
2036	AESENC KEY STATE1
2037	AESENC KEY STATE2
2038	AESENC KEY STATE3
2039	AESENC KEY STATE4
2040	movaps 0x20(TKEYP), KEY
2041	AESENC KEY STATE1
2042	AESENC KEY STATE2
2043	AESENC KEY STATE3
2044	AESENC KEY STATE4
2045	movaps 0x30(TKEYP), KEY
2046	AESENC KEY STATE1
2047	AESENC KEY STATE2
2048	AESENC KEY STATE3
2049	AESENC KEY STATE4
2050	movaps 0x40(TKEYP), KEY
2051	AESENC KEY STATE1
2052	AESENC KEY STATE2
2053	AESENC KEY STATE3
2054	AESENC KEY STATE4
2055	movaps 0x50(TKEYP), KEY
2056	AESENC KEY STATE1
2057	AESENC KEY STATE2
2058	AESENC KEY STATE3
2059	AESENC KEY STATE4
2060	movaps 0x60(TKEYP), KEY
2061	AESENC KEY STATE1
2062	AESENC KEY STATE2
2063	AESENC KEY STATE3
2064	AESENC KEY STATE4
2065	movaps 0x70(TKEYP), KEY
2066	AESENCLAST KEY STATE1		# last round
2067	AESENCLAST KEY STATE2
2068	AESENCLAST KEY STATE3
2069	AESENCLAST KEY STATE4
2070	ret
 
2071
2072/*
2073 * void aesni_dec (struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
2074 */
2075ENTRY(aesni_dec)
 
2076#ifndef __x86_64__
2077	pushl KEYP
2078	pushl KLEN
2079	movl 12(%esp), KEYP
2080	movl 16(%esp), OUTP
2081	movl 20(%esp), INP
2082#endif
2083	mov 480(KEYP), KLEN		# key length
2084	add $240, KEYP
2085	movups (INP), STATE		# input
2086	call _aesni_dec1
2087	movups STATE, (OUTP)		#output
2088#ifndef __x86_64__
2089	popl KLEN
2090	popl KEYP
2091#endif
2092	ret
 
 
2093
2094/*
2095 * _aesni_dec1:		internal ABI
2096 * input:
2097 *	KEYP:		key struct pointer
2098 *	KLEN:		key length
2099 *	STATE:		initial state (input)
2100 * output:
2101 *	STATE:		final state (output)
2102 * changed:
2103 *	KEY
2104 *	TKEYP (T1)
2105 */
2106.align 4
2107_aesni_dec1:
2108	movaps (KEYP), KEY		# key
2109	mov KEYP, TKEYP
2110	pxor KEY, STATE		# round 0
2111	add $0x30, TKEYP
2112	cmp $24, KLEN
2113	jb .Ldec128
2114	lea 0x20(TKEYP), TKEYP
2115	je .Ldec192
2116	add $0x20, TKEYP
2117	movaps -0x60(TKEYP), KEY
2118	AESDEC KEY STATE
2119	movaps -0x50(TKEYP), KEY
2120	AESDEC KEY STATE
2121.align 4
2122.Ldec192:
2123	movaps -0x40(TKEYP), KEY
2124	AESDEC KEY STATE
2125	movaps -0x30(TKEYP), KEY
2126	AESDEC KEY STATE
2127.align 4
2128.Ldec128:
2129	movaps -0x20(TKEYP), KEY
2130	AESDEC KEY STATE
2131	movaps -0x10(TKEYP), KEY
2132	AESDEC KEY STATE
2133	movaps (TKEYP), KEY
2134	AESDEC KEY STATE
2135	movaps 0x10(TKEYP), KEY
2136	AESDEC KEY STATE
2137	movaps 0x20(TKEYP), KEY
2138	AESDEC KEY STATE
2139	movaps 0x30(TKEYP), KEY
2140	AESDEC KEY STATE
2141	movaps 0x40(TKEYP), KEY
2142	AESDEC KEY STATE
2143	movaps 0x50(TKEYP), KEY
2144	AESDEC KEY STATE
2145	movaps 0x60(TKEYP), KEY
2146	AESDEC KEY STATE
2147	movaps 0x70(TKEYP), KEY
2148	AESDECLAST KEY STATE
2149	ret
 
2150
2151/*
2152 * _aesni_dec4:	internal ABI
2153 * input:
2154 *	KEYP:		key struct pointer
2155 *	KLEN:		key length
2156 *	STATE1:		initial state (input)
2157 *	STATE2
2158 *	STATE3
2159 *	STATE4
2160 * output:
2161 *	STATE1:		final state (output)
2162 *	STATE2
2163 *	STATE3
2164 *	STATE4
2165 * changed:
2166 *	KEY
2167 *	TKEYP (T1)
2168 */
2169.align 4
2170_aesni_dec4:
2171	movaps (KEYP), KEY		# key
2172	mov KEYP, TKEYP
2173	pxor KEY, STATE1		# round 0
2174	pxor KEY, STATE2
2175	pxor KEY, STATE3
2176	pxor KEY, STATE4
2177	add $0x30, TKEYP
2178	cmp $24, KLEN
2179	jb .L4dec128
2180	lea 0x20(TKEYP), TKEYP
2181	je .L4dec192
2182	add $0x20, TKEYP
2183	movaps -0x60(TKEYP), KEY
2184	AESDEC KEY STATE1
2185	AESDEC KEY STATE2
2186	AESDEC KEY STATE3
2187	AESDEC KEY STATE4
2188	movaps -0x50(TKEYP), KEY
2189	AESDEC KEY STATE1
2190	AESDEC KEY STATE2
2191	AESDEC KEY STATE3
2192	AESDEC KEY STATE4
2193.align 4
2194.L4dec192:
2195	movaps -0x40(TKEYP), KEY
2196	AESDEC KEY STATE1
2197	AESDEC KEY STATE2
2198	AESDEC KEY STATE3
2199	AESDEC KEY STATE4
2200	movaps -0x30(TKEYP), KEY
2201	AESDEC KEY STATE1
2202	AESDEC KEY STATE2
2203	AESDEC KEY STATE3
2204	AESDEC KEY STATE4
2205.align 4
2206.L4dec128:
2207	movaps -0x20(TKEYP), KEY
2208	AESDEC KEY STATE1
2209	AESDEC KEY STATE2
2210	AESDEC KEY STATE3
2211	AESDEC KEY STATE4
2212	movaps -0x10(TKEYP), KEY
2213	AESDEC KEY STATE1
2214	AESDEC KEY STATE2
2215	AESDEC KEY STATE3
2216	AESDEC KEY STATE4
2217	movaps (TKEYP), KEY
2218	AESDEC KEY STATE1
2219	AESDEC KEY STATE2
2220	AESDEC KEY STATE3
2221	AESDEC KEY STATE4
2222	movaps 0x10(TKEYP), KEY
2223	AESDEC KEY STATE1
2224	AESDEC KEY STATE2
2225	AESDEC KEY STATE3
2226	AESDEC KEY STATE4
2227	movaps 0x20(TKEYP), KEY
2228	AESDEC KEY STATE1
2229	AESDEC KEY STATE2
2230	AESDEC KEY STATE3
2231	AESDEC KEY STATE4
2232	movaps 0x30(TKEYP), KEY
2233	AESDEC KEY STATE1
2234	AESDEC KEY STATE2
2235	AESDEC KEY STATE3
2236	AESDEC KEY STATE4
2237	movaps 0x40(TKEYP), KEY
2238	AESDEC KEY STATE1
2239	AESDEC KEY STATE2
2240	AESDEC KEY STATE3
2241	AESDEC KEY STATE4
2242	movaps 0x50(TKEYP), KEY
2243	AESDEC KEY STATE1
2244	AESDEC KEY STATE2
2245	AESDEC KEY STATE3
2246	AESDEC KEY STATE4
2247	movaps 0x60(TKEYP), KEY
2248	AESDEC KEY STATE1
2249	AESDEC KEY STATE2
2250	AESDEC KEY STATE3
2251	AESDEC KEY STATE4
2252	movaps 0x70(TKEYP), KEY
2253	AESDECLAST KEY STATE1		# last round
2254	AESDECLAST KEY STATE2
2255	AESDECLAST KEY STATE3
2256	AESDECLAST KEY STATE4
2257	ret
 
2258
2259/*
2260 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2261 *		      size_t len)
2262 */
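/*
 * Encrypts four blocks per iteration via _aesni_enc4 while at least 64
 * bytes remain, then single blocks; a trailing partial block (len not a
 * multiple of 16) is left untouched.
 */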
2263ENTRY(aesni_ecb_enc)
 
2264#ifndef __x86_64__
2265	pushl LEN
2266	pushl KEYP
2267	pushl KLEN
2268	movl 16(%esp), KEYP
2269	movl 20(%esp), OUTP
2270	movl 24(%esp), INP
2271	movl 28(%esp), LEN
2272#endif
2273	test LEN, LEN		# check length
2274	jz .Lecb_enc_ret
2275	mov 480(KEYP), KLEN
2276	cmp $16, LEN
2277	jb .Lecb_enc_ret
2278	cmp $64, LEN
2279	jb .Lecb_enc_loop1
2280.align 4
2281.Lecb_enc_loop4:
2282	movups (INP), STATE1
2283	movups 0x10(INP), STATE2
2284	movups 0x20(INP), STATE3
2285	movups 0x30(INP), STATE4
2286	call _aesni_enc4
2287	movups STATE1, (OUTP)
2288	movups STATE2, 0x10(OUTP)
2289	movups STATE3, 0x20(OUTP)
2290	movups STATE4, 0x30(OUTP)
2291	sub $64, LEN
2292	add $64, INP
2293	add $64, OUTP
2294	cmp $64, LEN
2295	jge .Lecb_enc_loop4
2296	cmp $16, LEN
2297	jb .Lecb_enc_ret
2298.align 4
2299.Lecb_enc_loop1:
2300	movups (INP), STATE1
2301	call _aesni_enc1
2302	movups STATE1, (OUTP)
2303	sub $16, LEN
2304	add $16, INP
2305	add $16, OUTP
2306	cmp $16, LEN
2307	jge .Lecb_enc_loop1
2308.Lecb_enc_ret:
2309#ifndef __x86_64__
2310	popl KLEN
2311	popl KEYP
2312	popl LEN
2313#endif
2314	ret
 
 
2315
2316/*
2317 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2318 *		      size_t len);
2319 */
2320ENTRY(aesni_ecb_dec)
 
2321#ifndef __x86_64__
2322	pushl LEN
2323	pushl KEYP
2324	pushl KLEN
2325	movl 16(%esp), KEYP
2326	movl 20(%esp), OUTP
2327	movl 24(%esp), INP
2328	movl 28(%esp), LEN
2329#endif
2330	test LEN, LEN
2331	jz .Lecb_dec_ret
2332	mov 480(KEYP), KLEN
2333	add $240, KEYP
2334	cmp $16, LEN
2335	jb .Lecb_dec_ret
2336	cmp $64, LEN
2337	jb .Lecb_dec_loop1
2338.align 4
2339.Lecb_dec_loop4:
2340	movups (INP), STATE1
2341	movups 0x10(INP), STATE2
2342	movups 0x20(INP), STATE3
2343	movups 0x30(INP), STATE4
2344	call _aesni_dec4
2345	movups STATE1, (OUTP)
2346	movups STATE2, 0x10(OUTP)
2347	movups STATE3, 0x20(OUTP)
2348	movups STATE4, 0x30(OUTP)
2349	sub $64, LEN
2350	add $64, INP
2351	add $64, OUTP
2352	cmp $64, LEN
2353	jge .Lecb_dec_loop4
2354	cmp $16, LEN
2355	jb .Lecb_dec_ret
2356.align 4
2357.Lecb_dec_loop1:
2358	movups (INP), STATE1
2359	call _aesni_dec1
2360	movups STATE1, (OUTP)
2361	sub $16, LEN
2362	add $16, INP
2363	add $16, OUTP
2364	cmp $16, LEN
2365	jge .Lecb_dec_loop1
2366.Lecb_dec_ret:
2367#ifndef __x86_64__
2368	popl KLEN
2369	popl KEYP
2370	popl LEN
2371#endif
2372	ret
 
 
2373
2374/*
2375 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2376 *		      size_t len, u8 *iv)
2377 */
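/*
 * CBC encryption is inherently serial (each block is XORed with the
 * previous ciphertext block before encryption), so there is no four-way
 * path here; the final ciphertext block is written back to *iv so that
 * chaining can continue across calls.
 */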
2378ENTRY(aesni_cbc_enc)
 
2379#ifndef __x86_64__
2380	pushl IVP
2381	pushl LEN
2382	pushl KEYP
2383	pushl KLEN
2384	movl 20(%esp), KEYP
2385	movl 24(%esp), OUTP
2386	movl 28(%esp), INP
2387	movl 32(%esp), LEN
2388	movl 36(%esp), IVP
2389#endif
2390	cmp $16, LEN
2391	jb .Lcbc_enc_ret
2392	mov 480(KEYP), KLEN
2393	movups (IVP), STATE	# load iv as initial state
2394.align 4
2395.Lcbc_enc_loop:
2396	movups (INP), IN	# load input
2397	pxor IN, STATE
2398	call _aesni_enc1
2399	movups STATE, (OUTP)	# store output
2400	sub $16, LEN
2401	add $16, INP
2402	add $16, OUTP
2403	cmp $16, LEN
2404	jge .Lcbc_enc_loop
2405	movups STATE, (IVP)
2406.Lcbc_enc_ret:
2407#ifndef __x86_64__
2408	popl KLEN
2409	popl KEYP
2410	popl LEN
2411	popl IVP
2412#endif
2413	ret
 
 
2414
2415/*
2416 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2417 *		      size_t len, u8 *iv)
2418 */
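/*
 * Unlike encryption, CBC decryption parallelises: each plaintext block
 * depends only on two ciphertext blocks, so four blocks are decrypted at
 * a time with _aesni_dec4 and then XORed with the preceding ciphertext
 * blocks.
 */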
2419ENTRY(aesni_cbc_dec)
 
2420#ifndef __x86_64__
2421	pushl IVP
2422	pushl LEN
2423	pushl KEYP
2424	pushl KLEN
2425	movl 20(%esp), KEYP
2426	movl 24(%esp), OUTP
2427	movl 28(%esp), INP
2428	movl 32(%esp), LEN
2429	movl 36(%esp), IVP
2430#endif
2431	cmp $16, LEN
2432	jb .Lcbc_dec_just_ret
2433	mov 480(KEYP), KLEN
2434	add $240, KEYP
2435	movups (IVP), IV
2436	cmp $64, LEN
2437	jb .Lcbc_dec_loop1
2438.align 4
2439.Lcbc_dec_loop4:
2440	movups (INP), IN1
2441	movaps IN1, STATE1
2442	movups 0x10(INP), IN2
2443	movaps IN2, STATE2
2444#ifdef __x86_64__
2445	movups 0x20(INP), IN3
2446	movaps IN3, STATE3
2447	movups 0x30(INP), IN4
2448	movaps IN4, STATE4
2449#else
2450	movups 0x20(INP), IN1
2451	movaps IN1, STATE3
2452	movups 0x30(INP), IN2
2453	movaps IN2, STATE4
2454#endif
2455	call _aesni_dec4
2456	pxor IV, STATE1
2457#ifdef __x86_64__
2458	pxor IN1, STATE2
2459	pxor IN2, STATE3
2460	pxor IN3, STATE4
2461	movaps IN4, IV
2462#else
2463	pxor (INP), STATE2
2464	pxor 0x10(INP), STATE3
2465	pxor IN1, STATE4
2466	movaps IN2, IV
2467#endif
2468	movups STATE1, (OUTP)
2469	movups STATE2, 0x10(OUTP)
2470	movups STATE3, 0x20(OUTP)
2471	movups STATE4, 0x30(OUTP)
2472	sub $64, LEN
2473	add $64, INP
2474	add $64, OUTP
2475	cmp $64, LEN
2476	jge .Lcbc_dec_loop4
2477	cmp $16, LEN
2478	jb .Lcbc_dec_ret
2479.align 4
2480.Lcbc_dec_loop1:
2481	movups (INP), IN
2482	movaps IN, STATE
2483	call _aesni_dec1
2484	pxor IV, STATE
2485	movups STATE, (OUTP)
2486	movaps IN, IV
2487	sub $16, LEN
2488	add $16, INP
2489	add $16, OUTP
2490	cmp $16, LEN
2491	jge .Lcbc_dec_loop1
2492.Lcbc_dec_ret:
2493	movups IV, (IVP)
2494.Lcbc_dec_just_ret:
2495#ifndef __x86_64__
2496	popl KLEN
2497	popl KEYP
2498	popl LEN
2499	popl IVP
2500#endif
2501	ret
 
 
2502
2503#ifdef __x86_64__
2504.align 16
2505.Lbswap_mask:
2506	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
 
 
2507
 
2508/*
2509 * _aesni_inc_init:	internal ABI
2510 *	setup registers used by _aesni_inc
2511 * input:
2512 *	IV
2513 * output:
2514 *	CTR:	== IV, in little endian
2515 *	TCTR_LOW: == lower qword of CTR
2516 *	INC:	== 1, in little endian
2517 *	BSWAP_MASK == endian swapping mask
2518 */
2519.align 4
2520_aesni_inc_init:
2521	movaps .Lbswap_mask, BSWAP_MASK
2522	movaps IV, CTR
2523	PSHUFB_XMM BSWAP_MASK CTR
2524	mov $1, TCTR_LOW
2525	MOVQ_R64_XMM TCTR_LOW INC
2526	MOVQ_R64_XMM CTR TCTR_LOW
2527	ret
 
2528
2529/*
2530 * _aesni_inc:		internal ABI
2531 *	Increase IV by 1, IV is in big endian
2532 * input:
2533 *	IV
2534 *	CTR:	== IV, in little endian
2535 *	TCTR_LOW: == lower qword of CTR
2536 *	INC:	== 1, in little endian
2537 *	BSWAP_MASK == endian swapping mask
2538 * output:
2539 *	IV:	Increase by 1
2540 * changed:
2541 *	CTR:	== output IV, in little endian
2542 *	TCTR_LOW: == lower qword of CTR
2543 */
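/*
 * TCTR_LOW mirrors the low qword of CTR in a general purpose register so
 * that carry out of the low qword is detected cheaply; on carry, INC is
 * shifted up by 8 bytes and added again to propagate the carry into the
 * high qword of CTR.
 */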
2544.align 4
2545_aesni_inc:
2546	paddq INC, CTR
2547	add $1, TCTR_LOW
2548	jnc .Linc_low
2549	pslldq $8, INC
2550	paddq INC, CTR
2551	psrldq $8, INC
2552.Linc_low:
2553	movaps CTR, IV
2554	PSHUFB_XMM BSWAP_MASK IV
2555	ret
 
2556
2557/*
2558 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2559 *		      size_t len, u8 *iv)
2560 */
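/*
 * CTR mode: successive counter blocks are encrypted and XORed with the
 * input, so the same routine serves for both encryption and decryption;
 * the incremented counter is written back to *iv on exit.
 */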
2561ENTRY(aesni_ctr_enc)
 
2562	cmp $16, LEN
2563	jb .Lctr_enc_just_ret
2564	mov 480(KEYP), KLEN
2565	movups (IVP), IV
2566	call _aesni_inc_init
2567	cmp $64, LEN
2568	jb .Lctr_enc_loop1
2569.align 4
2570.Lctr_enc_loop4:
2571	movaps IV, STATE1
2572	call _aesni_inc
2573	movups (INP), IN1
2574	movaps IV, STATE2
2575	call _aesni_inc
2576	movups 0x10(INP), IN2
2577	movaps IV, STATE3
2578	call _aesni_inc
2579	movups 0x20(INP), IN3
2580	movaps IV, STATE4
2581	call _aesni_inc
2582	movups 0x30(INP), IN4
2583	call _aesni_enc4
2584	pxor IN1, STATE1
2585	movups STATE1, (OUTP)
2586	pxor IN2, STATE2
2587	movups STATE2, 0x10(OUTP)
2588	pxor IN3, STATE3
2589	movups STATE3, 0x20(OUTP)
2590	pxor IN4, STATE4
2591	movups STATE4, 0x30(OUTP)
2592	sub $64, LEN
2593	add $64, INP
2594	add $64, OUTP
2595	cmp $64, LEN
2596	jge .Lctr_enc_loop4
2597	cmp $16, LEN
2598	jb .Lctr_enc_ret
2599.align 4
2600.Lctr_enc_loop1:
2601	movaps IV, STATE
2602	call _aesni_inc
2603	movups (INP), IN
2604	call _aesni_enc1
2605	pxor IN, STATE
2606	movups STATE, (OUTP)
2607	sub $16, LEN
2608	add $16, INP
2609	add $16, OUTP
2610	cmp $16, LEN
2611	jge .Lctr_enc_loop1
2612.Lctr_enc_ret:
2613	movups IV, (IVP)
2614.Lctr_enc_just_ret:
2615	ret
2616#endif
v6.8
   1/* SPDX-License-Identifier: GPL-2.0-or-later */
   2/*
   3 * Implement AES algorithm in Intel AES-NI instructions.
   4 *
   5 * The white paper of AES-NI instructions can be downloaded from:
   6 *   http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
   7 *
   8 * Copyright (C) 2008, Intel Corp.
   9 *    Author: Huang Ying <ying.huang@intel.com>
  10 *            Vinodh Gopal <vinodh.gopal@intel.com>
  11 *            Kahraman Akdemir
  12 *
  13 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
  14 * interface for 64-bit kernels.
  15 *    Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
  16 *             Aidan O'Mahony (aidan.o.mahony@intel.com)
  17 *             Adrian Hoban <adrian.hoban@intel.com>
  18 *             James Guilford (james.guilford@intel.com)
  19 *             Gabriele Paoloni <gabriele.paoloni@intel.com>
  20 *             Tadeusz Struk (tadeusz.struk@intel.com)
  21 *             Wajdi Feghali (wajdi.k.feghali@intel.com)
  22 *    Copyright (c) 2010, Intel Corporation.
  23 *
  24 * Ported x86_64 version to x86:
  25 *    Author: Mathias Krause <minipli@googlemail.com>
 
 
 
 
 
  26 */
  27
  28#include <linux/linkage.h>
  29#include <asm/frame.h>
  30#include <asm/nospec-branch.h>
  31
  32/*
  33 * The following macros are used to move an (un)aligned 16 byte value to/from
  34 * an XMM register.  This can done for either FP or integer values, for FP use
  35 * movaps (move aligned packed single) or integer use movdqa (move double quad
  36 * aligned).  It doesn't make a performance difference which instruction is used
  37 * since Nehalem (original Core i7) was released.  However, the movaps is a byte
  38 * shorter, so that is the one we'll use for now. (same for unaligned).
  39 */
  40#define MOVADQ	movaps
  41#define MOVUDQ	movups
  42
  43#ifdef __x86_64__
  44
  45# constants in mergeable sections, linker can reorder and merge
  46.section	.rodata.cst16.POLY, "aM", @progbits, 16
  47.align 16
  48POLY:   .octa 0xC2000000000000000000000000000001
  49.section	.rodata.cst16.TWOONE, "aM", @progbits, 16
  50.align 16
  51TWOONE: .octa 0x00000001000000000000000000000001
  52
  53.section	.rodata.cst16.SHUF_MASK, "aM", @progbits, 16
  54.align 16
 
 
  55SHUF_MASK:  .octa 0x000102030405060708090A0B0C0D0E0F
  56.section	.rodata.cst16.MASK1, "aM", @progbits, 16
  57.align 16
  58MASK1:      .octa 0x0000000000000000ffffffffffffffff
  59.section	.rodata.cst16.MASK2, "aM", @progbits, 16
  60.align 16
  61MASK2:      .octa 0xffffffffffffffff0000000000000000
  62.section	.rodata.cst16.ONE, "aM", @progbits, 16
  63.align 16
 
  64ONE:        .octa 0x00000000000000000000000000000001
  65.section	.rodata.cst16.F_MIN_MASK, "aM", @progbits, 16
  66.align 16
  67F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
  68.section	.rodata.cst16.dec, "aM", @progbits, 16
  69.align 16
  70dec:        .octa 0x1
  71.section	.rodata.cst16.enc, "aM", @progbits, 16
  72.align 16
  73enc:        .octa 0x2
  74
  75# order of these constants should not change.
  76# more specifically, ALL_F should follow SHIFT_MASK,
  77# and zero should follow ALL_F
  78.section	.rodata, "a", @progbits
  79.align 16
  80SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
  81ALL_F:      .octa 0xffffffffffffffffffffffffffffffff
  82            .octa 0x00000000000000000000000000000000
  83
  84.text
  85
  86
  87#define	STACK_OFFSET    8*3
  88
  89#define AadHash 16*0
  90#define AadLen 16*1
  91#define InLen (16*1)+8
  92#define PBlockEncKey 16*2
  93#define OrigIV 16*3
  94#define CurCount 16*4
  95#define PBlockLen 16*5
  96#define	HashKey		16*6	// store HashKey <<1 mod poly here
  97#define	HashKey_2	16*7	// store HashKey^2 <<1 mod poly here
  98#define	HashKey_3	16*8	// store HashKey^3 <<1 mod poly here
  99#define	HashKey_4	16*9	// store HashKey^4 <<1 mod poly here
 100#define	HashKey_k	16*10	// store XOR of High 64 bits and Low 64
 101				// bits of  HashKey <<1 mod poly here
 102				//(for Karatsuba purposes)
 103#define	HashKey_2_k	16*11	// store XOR of High 64 bits and Low 64
 104				// bits of  HashKey^2 <<1 mod poly here
 105				// (for Karatsuba purposes)
 106#define	HashKey_3_k	16*12	// store XOR of High 64 bits and Low 64
 107				// bits of  HashKey^3 <<1 mod poly here
 108				// (for Karatsuba purposes)
 109#define	HashKey_4_k	16*13	// store XOR of High 64 bits and Low 64
 110				// bits of  HashKey^4 <<1 mod poly here
 111				// (for Karatsuba purposes)
 
 112
 113#define arg1 rdi
 114#define arg2 rsi
 115#define arg3 rdx
 116#define arg4 rcx
 117#define arg5 r8
 118#define arg6 r9
 119#define arg7 STACK_OFFSET+8(%rsp)
 120#define arg8 STACK_OFFSET+16(%rsp)
 121#define arg9 STACK_OFFSET+24(%rsp)
 122#define arg10 STACK_OFFSET+32(%rsp)
 123#define arg11 STACK_OFFSET+40(%rsp)
 124#define keysize 2*15*16(%arg1)
 125#endif
 126
 127
 128#define STATE1	%xmm0
 129#define STATE2	%xmm4
 130#define STATE3	%xmm5
 131#define STATE4	%xmm6
 132#define STATE	STATE1
 133#define IN1	%xmm1
 134#define IN2	%xmm7
 135#define IN3	%xmm8
 136#define IN4	%xmm9
 137#define IN	IN1
 138#define KEY	%xmm2
 139#define IV	%xmm3
 140
 141#define BSWAP_MASK %xmm10
 142#define CTR	%xmm11
 143#define INC	%xmm12
 144
 145#define GF128MUL_MASK %xmm7
 146
 147#ifdef __x86_64__
 148#define AREG	%rax
 149#define KEYP	%rdi
 150#define OUTP	%rsi
 151#define UKEYP	OUTP
 152#define INP	%rdx
 153#define LEN	%rcx
 154#define IVP	%r8
 155#define KLEN	%r9d
 156#define T1	%r10
 157#define TKEYP	T1
 158#define T2	%r11
 159#define TCTR_LOW T2
 160#else
 161#define AREG	%eax
 162#define KEYP	%edi
 163#define OUTP	AREG
 164#define UKEYP	OUTP
 165#define INP	%edx
 166#define LEN	%esi
 167#define IVP	%ebp
 168#define KLEN	%ebx
 169#define T1	%ecx
 170#define TKEYP	T1
 171#endif
 172
 173.macro FUNC_SAVE
 174	push	%r12
 175	push	%r13
 176	push	%r14
 177#
 178# states of %xmm registers %xmm6:%xmm15 not saved
 179# all %xmm registers are clobbered
 180#
 181.endm
 182
 183
 184.macro FUNC_RESTORE
 185	pop	%r14
 186	pop	%r13
 187	pop	%r12
 188.endm
 189
 190# Precompute hashkeys.
 191# Input: Hash subkey.
 192# Output: HashKeys stored in gcm_context_data.  Only needs to be called
 193# once per key.
 194# clobbers r12, and tmp xmm registers.
 195.macro PRECOMPUTE SUBKEY TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 TMP7
 196	mov	\SUBKEY, %r12
 197	movdqu	(%r12), \TMP3
 198	movdqa	SHUF_MASK(%rip), \TMP2
 199	pshufb	\TMP2, \TMP3
 200
 201	# precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
 202
 203	movdqa	\TMP3, \TMP2
 204	psllq	$1, \TMP3
 205	psrlq	$63, \TMP2
 206	movdqa	\TMP2, \TMP1
 207	pslldq	$8, \TMP2
 208	psrldq	$8, \TMP1
 209	por	\TMP2, \TMP3
 210
 211	# reduce HashKey<<1
 212
 213	pshufd	$0x24, \TMP1, \TMP2
 214	pcmpeqd TWOONE(%rip), \TMP2
 215	pand	POLY(%rip), \TMP2
 216	pxor	\TMP2, \TMP3
 217	movdqu	\TMP3, HashKey(%arg2)
 218
 219	movdqa	   \TMP3, \TMP5
 220	pshufd	   $78, \TMP3, \TMP1
 221	pxor	   \TMP3, \TMP1
 222	movdqu	   \TMP1, HashKey_k(%arg2)
 223
 224	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
 225# TMP5 = HashKey^2<<1 (mod poly)
 226	movdqu	   \TMP5, HashKey_2(%arg2)
 227# HashKey_2 = HashKey^2<<1 (mod poly)
 228	pshufd	   $78, \TMP5, \TMP1
 229	pxor	   \TMP5, \TMP1
 230	movdqu	   \TMP1, HashKey_2_k(%arg2)
 231
 232	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
 233# TMP5 = HashKey^3<<1 (mod poly)
 234	movdqu	   \TMP5, HashKey_3(%arg2)
 235	pshufd	   $78, \TMP5, \TMP1
 236	pxor	   \TMP5, \TMP1
 237	movdqu	   \TMP1, HashKey_3_k(%arg2)
 238
 239	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
  240# TMP5 = HashKey^4<<1 (mod poly)
 241	movdqu	   \TMP5, HashKey_4(%arg2)
 242	pshufd	   $78, \TMP5, \TMP1
 243	pxor	   \TMP5, \TMP1
 244	movdqu	   \TMP1, HashKey_4_k(%arg2)
 245.endm
 246
 247# GCM_INIT initializes a gcm_context struct to prepare for encoding/decoding.
 248# Clobbers rax, r10-r13 and xmm0-xmm6, %xmm13
 249.macro GCM_INIT Iv SUBKEY AAD AADLEN
 250	mov \AADLEN, %r11
 251	mov %r11, AadLen(%arg2) # ctx_data.aad_length = aad_length
 252	xor %r11d, %r11d
 253	mov %r11, InLen(%arg2) # ctx_data.in_length = 0
 254	mov %r11, PBlockLen(%arg2) # ctx_data.partial_block_length = 0
 255	mov %r11, PBlockEncKey(%arg2) # ctx_data.partial_block_enc_key = 0
 256	mov \Iv, %rax
 257	movdqu (%rax), %xmm0
 258	movdqu %xmm0, OrigIV(%arg2) # ctx_data.orig_IV = iv
 259
 260	movdqa  SHUF_MASK(%rip), %xmm2
 261	pshufb %xmm2, %xmm0
 262	movdqu %xmm0, CurCount(%arg2) # ctx_data.current_counter = iv
 263
 264	PRECOMPUTE \SUBKEY, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7
 265	movdqu HashKey(%arg2), %xmm13
 266
 267	CALC_AAD_HASH %xmm13, \AAD, \AADLEN, %xmm0, %xmm1, %xmm2, %xmm3, \
 268	%xmm4, %xmm5, %xmm6
 269.endm
 270
 271# GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context
 272# struct has been initialized by GCM_INIT.
  273# Requires the input data to be at least 1 byte long because of READ_PARTIAL_BLOCK
 274# Clobbers rax, r10-r13, and xmm0-xmm15
 275.macro GCM_ENC_DEC operation
 276	movdqu AadHash(%arg2), %xmm8
 277	movdqu HashKey(%arg2), %xmm13
 278	add %arg5, InLen(%arg2)
 279
 280	xor %r11d, %r11d # initialise the data pointer offset as zero
 281	PARTIAL_BLOCK %arg3 %arg4 %arg5 %r11 %xmm8 \operation
 282
 283	sub %r11, %arg5		# sub partial block data used
 284	mov %arg5, %r13		# save the number of bytes
 285
 286	and $-16, %r13		# %r13 = %r13 - (%r13 mod 16)
 287	mov %r13, %r12
 288	# Encrypt/Decrypt first few blocks
 289
 290	and	$(3<<4), %r12
 291	jz	.L_initial_num_blocks_is_0_\@
 292	cmp	$(2<<4), %r12
 293	jb	.L_initial_num_blocks_is_1_\@
 294	je	.L_initial_num_blocks_is_2_\@
 295.L_initial_num_blocks_is_3_\@:
 296	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
 297%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, \operation
 298	sub	$48, %r13
 299	jmp	.L_initial_blocks_\@
 300.L_initial_num_blocks_is_2_\@:
 301	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
 302%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, \operation
 303	sub	$32, %r13
 304	jmp	.L_initial_blocks_\@
 305.L_initial_num_blocks_is_1_\@:
 306	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
 307%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, \operation
 308	sub	$16, %r13
 309	jmp	.L_initial_blocks_\@
 310.L_initial_num_blocks_is_0_\@:
 311	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
 312%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, \operation
 313.L_initial_blocks_\@:
 314
 315	# Main loop - Encrypt/Decrypt remaining blocks
 316
 317	test	%r13, %r13
 318	je	.L_zero_cipher_left_\@
 319	sub	$64, %r13
 320	je	.L_four_cipher_left_\@
 321.L_crypt_by_4_\@:
 322	GHASH_4_ENCRYPT_4_PARALLEL_\operation	%xmm9, %xmm10, %xmm11, %xmm12, \
 323	%xmm13, %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, \
 324	%xmm7, %xmm8, enc
 325	add	$64, %r11
 326	sub	$64, %r13
 327	jne	.L_crypt_by_4_\@
 328.L_four_cipher_left_\@:
 329	GHASH_LAST_4	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
 330%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
 331.L_zero_cipher_left_\@:
 332	movdqu %xmm8, AadHash(%arg2)
 333	movdqu %xmm0, CurCount(%arg2)
 334
 335	mov	%arg5, %r13
 336	and	$15, %r13			# %r13 = arg5 (mod 16)
 337	je	.L_multiple_of_16_bytes_\@
 338
 339	mov %r13, PBlockLen(%arg2)
 340
 341	# Handle the last <16 Byte block separately
 342	paddd ONE(%rip), %xmm0                # INCR CNT to get Yn
 343	movdqu %xmm0, CurCount(%arg2)
 344	movdqa SHUF_MASK(%rip), %xmm10
 345	pshufb %xmm10, %xmm0
 346
 347	ENCRYPT_SINGLE_BLOCK	%xmm0, %xmm1        # Encrypt(K, Yn)
 348	movdqu %xmm0, PBlockEncKey(%arg2)
 349
 350	cmp	$16, %arg5
 351	jge	.L_large_enough_update_\@
 352
 353	lea (%arg4,%r11,1), %r10
 354	mov %r13, %r12
 355	READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1
 356	jmp	.L_data_read_\@
 357
 358.L_large_enough_update_\@:
 359	sub	$16, %r11
 360	add	%r13, %r11
 361
 362	# receive the last <16 Byte block
 363	movdqu	(%arg4, %r11, 1), %xmm1
 364
 365	sub	%r13, %r11
 366	add	$16, %r11
 367
 368	lea	SHIFT_MASK+16(%rip), %r12
 369	# adjust the shuffle mask pointer to be able to shift 16-r13 bytes
 370	# (r13 is the number of bytes in plaintext mod 16)
 371	sub	%r13, %r12
 372	# get the appropriate shuffle mask
 373	movdqu	(%r12), %xmm2
 374	# shift right 16-r13 bytes
 375	pshufb  %xmm2, %xmm1
 376
 377.L_data_read_\@:
 378	lea ALL_F+16(%rip), %r12
 379	sub %r13, %r12
 380
 381.ifc \operation, dec
 382	movdqa  %xmm1, %xmm2
 383.endif
 384	pxor	%xmm1, %xmm0            # XOR Encrypt(K, Yn)
 385	movdqu	(%r12), %xmm1
 386	# get the appropriate mask to mask out top 16-r13 bytes of xmm0
 387	pand	%xmm1, %xmm0            # mask out top 16-r13 bytes of xmm0
 388.ifc \operation, dec
 389	pand    %xmm1, %xmm2
 390	movdqa SHUF_MASK(%rip), %xmm10
 391	pshufb %xmm10 ,%xmm2
 392
 393	pxor %xmm2, %xmm8
 394.else
 395	movdqa SHUF_MASK(%rip), %xmm10
 396	pshufb %xmm10,%xmm0
 397
 398	pxor	%xmm0, %xmm8
 399.endif
 400
 401	movdqu %xmm8, AadHash(%arg2)
 402.ifc \operation, enc
 403	# GHASH computation for the last <16 byte block
 404	movdqa SHUF_MASK(%rip), %xmm10
 405	# shuffle xmm0 back to output as ciphertext
 406	pshufb %xmm10, %xmm0
 407.endif
 408
 409	# Output %r13 bytes
 410	movq %xmm0, %rax
 411	cmp $8, %r13
 412	jle .L_less_than_8_bytes_left_\@
 413	mov %rax, (%arg3 , %r11, 1)
 414	add $8, %r11
 415	psrldq $8, %xmm0
 416	movq %xmm0, %rax
 417	sub $8, %r13
 418.L_less_than_8_bytes_left_\@:
 419	mov %al,  (%arg3, %r11, 1)
 420	add $1, %r11
 421	shr $8, %rax
 422	sub $1, %r13
 423	jne .L_less_than_8_bytes_left_\@
 424.L_multiple_of_16_bytes_\@:
 425.endm
 426
 427# GCM_COMPLETE Finishes update of tag of last partial block
  428# Output: Authentication Tag (AUTH_TAG)
 429# Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
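# The tag is T = MSB_auth_tag_len(GHASH(H, AAD, C) XOR E(K, Y0)); the final
# GHASH fold over len(A)||len(C) and the XOR with E(K, Y0) happen below.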
 430.macro GCM_COMPLETE AUTHTAG AUTHTAGLEN
 431	movdqu AadHash(%arg2), %xmm8
 432	movdqu HashKey(%arg2), %xmm13
 433
 434	mov PBlockLen(%arg2), %r12
 435
 436	test %r12, %r12
 437	je .L_partial_done\@
 438
 439	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
 440
 441.L_partial_done\@:
  442	mov AadLen(%arg2), %r12  # %r12 = aadLen (number of bytes)
 443	shl	$3, %r12		  # convert into number of bits
 444	movd	%r12d, %xmm15		  # len(A) in %xmm15
 445	mov InLen(%arg2), %r12
  446	shl     $3, %r12                  # len(C) in bits (*8)
 447	movq    %r12, %xmm1
 448
 449	pslldq	$8, %xmm15		  # %xmm15 = len(A)||0x0000000000000000
 450	pxor	%xmm1, %xmm15		  # %xmm15 = len(A)||len(C)
 451	pxor	%xmm15, %xmm8
 452	GHASH_MUL	%xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
 453	# final GHASH computation
 454	movdqa SHUF_MASK(%rip), %xmm10
 455	pshufb %xmm10, %xmm8
 456
 457	movdqu OrigIV(%arg2), %xmm0       # %xmm0 = Y0
 458	ENCRYPT_SINGLE_BLOCK	%xmm0,  %xmm1	  # E(K, Y0)
 459	pxor	%xmm8, %xmm0
 460.L_return_T_\@:
 461	mov	\AUTHTAG, %r10                     # %r10 = authTag
 462	mov	\AUTHTAGLEN, %r11                    # %r11 = auth_tag_len
 463	cmp	$16, %r11
 464	je	.L_T_16_\@
 465	cmp	$8, %r11
 466	jl	.L_T_4_\@
 467.L_T_8_\@:
 468	movq	%xmm0, %rax
 469	mov	%rax, (%r10)
 470	add	$8, %r10
 471	sub	$8, %r11
 472	psrldq	$8, %xmm0
 473	test	%r11, %r11
 474	je	.L_return_T_done_\@
 475.L_T_4_\@:
 476	movd	%xmm0, %eax
 477	mov	%eax, (%r10)
 478	add	$4, %r10
 479	sub	$4, %r11
 480	psrldq	$4, %xmm0
 481	test	%r11, %r11
 482	je	.L_return_T_done_\@
 483.L_T_123_\@:
 484	movd	%xmm0, %eax
 485	cmp	$2, %r11
 486	jl	.L_T_1_\@
 487	mov	%ax, (%r10)
 488	cmp	$2, %r11
 489	je	.L_return_T_done_\@
 490	add	$2, %r10
 491	sar	$16, %eax
 492.L_T_1_\@:
 493	mov	%al, (%r10)
 494	jmp	.L_return_T_done_\@
 495.L_T_16_\@:
 496	movdqu	%xmm0, (%r10)
 497.L_return_T_done_\@:
 498.endm
 499
 500#ifdef __x86_64__
 501/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
 502*
 503*
 504* Input: A and B (128-bits each, bit-reflected)
 505* Output: C = A*B*x mod poly, (i.e. >>1 )
 506* To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
 507* GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
 508*
 509*/
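#
# Karatsuba multiplication: with A = a1:a0 and B = b1:b0 (64-bit halves),
# only three carry-less multiplies are needed:
#     A*B = a1*b1*x^128 + ((a1+a0)*(b1+b0) - a1*b1 - a0*b0)*x^64 + a0*b0
# where + and - are both XOR; the two reduction phases below then fold the
# 256-bit product back to 128 bits modulo the polynomial above.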
 510.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
 511	movdqa	  \GH, \TMP1
 512	pshufd	  $78, \GH, \TMP2
 513	pshufd	  $78, \HK, \TMP3
 514	pxor	  \GH, \TMP2            # TMP2 = a1+a0
 515	pxor	  \HK, \TMP3            # TMP3 = b1+b0
 516	pclmulqdq $0x11, \HK, \TMP1     # TMP1 = a1*b1
 517	pclmulqdq $0x00, \HK, \GH       # GH = a0*b0
 518	pclmulqdq $0x00, \TMP3, \TMP2   # TMP2 = (a0+a1)*(b1+b0)
 519	pxor	  \GH, \TMP2
  520	pxor	  \TMP1, \TMP2          # TMP2 = (a1*b0)+(a0*b1)
 521	movdqa	  \TMP2, \TMP3
 522	pslldq	  $8, \TMP3             # left shift TMP3 2 DWs
 523	psrldq	  $8, \TMP2             # right shift TMP2 2 DWs
 524	pxor	  \TMP3, \GH
  525	pxor	  \TMP2, \TMP1          # TMP1:GH holds the result of GH*HK
 526
 527        # first phase of the reduction
 528
 529	movdqa    \GH, \TMP2
 530	movdqa    \GH, \TMP3
 531	movdqa    \GH, \TMP4            # copy GH into TMP2,TMP3 and TMP4
  532					# in order to perform
 533					# independent shifts
 534	pslld     $31, \TMP2            # packed right shift <<31
 535	pslld     $30, \TMP3            # packed right shift <<30
 536	pslld     $25, \TMP4            # packed right shift <<25
 537	pxor      \TMP3, \TMP2          # xor the shifted versions
 538	pxor      \TMP4, \TMP2
 539	movdqa    \TMP2, \TMP5
 540	psrldq    $4, \TMP5             # right shift TMP5 1 DW
 541	pslldq    $12, \TMP2            # left shift TMP2 3 DWs
 542	pxor      \TMP2, \GH
 543
 544        # second phase of the reduction
 545
 546	movdqa    \GH,\TMP2             # copy GH into TMP2,TMP3 and TMP4
  547					# in order to perform
 548					# independent shifts
 549	movdqa    \GH,\TMP3
 550	movdqa    \GH,\TMP4
 551	psrld     $1,\TMP2              # packed left shift >>1
 552	psrld     $2,\TMP3              # packed left shift >>2
 553	psrld     $7,\TMP4              # packed left shift >>7
 554	pxor      \TMP3,\TMP2		# xor the shifted versions
 555	pxor      \TMP4,\TMP2
 556	pxor      \TMP5, \TMP2
 557	pxor      \TMP2, \GH
  558	pxor      \TMP1, \GH            # result is in GH
 559.endm
 560
 561# Reads DLEN bytes starting at DPTR and stores in XMMDst
 562# where 0 < DLEN < 16
 563# Clobbers %rax, DLEN and XMM1
 564.macro READ_PARTIAL_BLOCK DPTR DLEN XMM1 XMMDst
 565        cmp $8, \DLEN
 566        jl .L_read_lt8_\@
 567        mov (\DPTR), %rax
 568        movq %rax, \XMMDst
 569        sub $8, \DLEN
 570        jz .L_done_read_partial_block_\@
 571	xor %eax, %eax
 572.L_read_next_byte_\@:
 573        shl $8, %rax
 574        mov 7(\DPTR, \DLEN, 1), %al
 575        dec \DLEN
 576        jnz .L_read_next_byte_\@
 577        movq %rax, \XMM1
 578	pslldq $8, \XMM1
 579        por \XMM1, \XMMDst
 580	jmp .L_done_read_partial_block_\@
 581.L_read_lt8_\@:
 582	xor %eax, %eax
 583.L_read_next_byte_lt8_\@:
 584        shl $8, %rax
 585        mov -1(\DPTR, \DLEN, 1), %al
 586        dec \DLEN
 587        jnz .L_read_next_byte_lt8_\@
 588        movq %rax, \XMMDst
 589.L_done_read_partial_block_\@:
 590.endm
 591
 592# CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
 593# clobbers r10-11, xmm14
 594.macro CALC_AAD_HASH HASHKEY AAD AADLEN TMP1 TMP2 TMP3 TMP4 TMP5 \
 595	TMP6 TMP7
 596	MOVADQ	   SHUF_MASK(%rip), %xmm14
 597	mov	   \AAD, %r10		# %r10 = AAD
 598	mov	   \AADLEN, %r11		# %r11 = aadLen
 599	pxor	   \TMP7, \TMP7
  600	pxor	   \TMP6, \TMP6
  601
 602	cmp	   $16, %r11
 603	jl	   .L_get_AAD_rest\@
 604.L_get_AAD_blocks\@:
 605	movdqu	   (%r10), \TMP7
 606	pshufb	   %xmm14, \TMP7 # byte-reflect the AAD data
 607	pxor	   \TMP7, \TMP6
 608	GHASH_MUL  \TMP6, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
 609	add	   $16, %r10
 610	sub	   $16, %r11
 611	cmp	   $16, %r11
  612	jge	   .L_get_AAD_blocks\@
 613
  614	movdqu	   \TMP6, \TMP7
 615
 616	/* read the last <16B of AAD */
 617.L_get_AAD_rest\@:
 618	test	   %r11, %r11
 619	je	   .L_get_AAD_done\@
 620
 621	READ_PARTIAL_BLOCK %r10, %r11, \TMP1, \TMP7
 622	pshufb	   %xmm14, \TMP7 # byte-reflect the AAD data
 623	pxor	   \TMP6, \TMP7
 624	GHASH_MUL  \TMP7, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
  625	movdqu \TMP7, \TMP6
 626
 627.L_get_AAD_done\@:
 628	movdqu \TMP6, AadHash(%arg2)
 629.endm
 630
 631# PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks
 632# between update calls.
  633# Requires the input data to be at least 1 byte long due to READ_PARTIAL_BLOCK
 634# Outputs encrypted bytes, and updates hash and partial info in gcm_data_context
 635# Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13
 636.macro PARTIAL_BLOCK CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \
 637	AAD_HASH operation
 638	mov 	PBlockLen(%arg2), %r13
 639	test	%r13, %r13
 640	je	.L_partial_block_done_\@	# Leave Macro if no partial blocks
 641	# Read in input data without over reading
 642	cmp	$16, \PLAIN_CYPH_LEN
 643	jl	.L_fewer_than_16_bytes_\@
 644	movups	(\PLAIN_CYPH_IN), %xmm1	# If more than 16 bytes, just fill xmm
 645	jmp	.L_data_read_\@
 646
 647.L_fewer_than_16_bytes_\@:
 648	lea	(\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10
 649	mov	\PLAIN_CYPH_LEN, %r12
 650	READ_PARTIAL_BLOCK %r10 %r12 %xmm0 %xmm1
 651
 652	mov PBlockLen(%arg2), %r13
 653
 654.L_data_read_\@:				# Finished reading in data
 655
 656	movdqu	PBlockEncKey(%arg2), %xmm9
 657	movdqu	HashKey(%arg2), %xmm13
 658
 659	lea	SHIFT_MASK(%rip), %r12
 660
 661	# adjust the shuffle mask pointer to be able to shift r13 bytes
  662	# (r13 is the number of bytes in plaintext mod 16)
 663	add	%r13, %r12
 664	movdqu	(%r12), %xmm2		# get the appropriate shuffle mask
 665	pshufb	%xmm2, %xmm9		# shift right r13 bytes
 666
 667.ifc \operation, dec
 668	movdqa	%xmm1, %xmm3
 669	pxor	%xmm1, %xmm9		# Ciphertext XOR E(K, Yn)
 670
 671	mov	\PLAIN_CYPH_LEN, %r10
 672	add	%r13, %r10
 673	# Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
 674	sub	$16, %r10
 675	# Determine if partial block is not being filled and
 676	# shift mask accordingly
 677	jge	.L_no_extra_mask_1_\@
 678	sub	%r10, %r12
 679.L_no_extra_mask_1_\@:
 680
 681	movdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
 682	# get the appropriate mask to mask out bottom r13 bytes of xmm9
  683	pand	%xmm1, %xmm9		# mask out bottom r13 bytes of xmm9
 684
 685	pand	%xmm1, %xmm3
 686	movdqa	SHUF_MASK(%rip), %xmm10
 687	pshufb	%xmm10, %xmm3
 688	pshufb	%xmm2, %xmm3
 689	pxor	%xmm3, \AAD_HASH
 690
 691	test	%r10, %r10
 692	jl	.L_partial_incomplete_1_\@
 693
 694	# GHASH computation for the last <16 Byte block
 695	GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
 696	xor	%eax, %eax
 697
 698	mov	%rax, PBlockLen(%arg2)
 699	jmp	.L_dec_done_\@
 700.L_partial_incomplete_1_\@:
 701	add	\PLAIN_CYPH_LEN, PBlockLen(%arg2)
 702.L_dec_done_\@:
 703	movdqu	\AAD_HASH, AadHash(%arg2)
 704.else
 705	pxor	%xmm1, %xmm9			# Plaintext XOR E(K, Yn)
 706
 707	mov	\PLAIN_CYPH_LEN, %r10
 708	add	%r13, %r10
 709	# Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
 710	sub	$16, %r10
 711	# Determine if partial block is not being filled and
 712	# shift mask accordingly
 713	jge	.L_no_extra_mask_2_\@
 714	sub	%r10, %r12
 715.L_no_extra_mask_2_\@:
 716
 717	movdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
 718	# get the appropriate mask to mask out bottom r13 bytes of xmm9
 719	pand	%xmm1, %xmm9
 720
 721	movdqa	SHUF_MASK(%rip), %xmm1
 722	pshufb	%xmm1, %xmm9
 723	pshufb	%xmm2, %xmm9
 724	pxor	%xmm9, \AAD_HASH
 725
 726	test	%r10, %r10
 727	jl	.L_partial_incomplete_2_\@
 728
 729	# GHASH computation for the last <16 Byte block
 730	GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
 731	xor	%eax, %eax
 732
 733	mov	%rax, PBlockLen(%arg2)
 734	jmp	.L_encode_done_\@
 735.L_partial_incomplete_2_\@:
 736	add	\PLAIN_CYPH_LEN, PBlockLen(%arg2)
 737.L_encode_done_\@:
 738	movdqu	\AAD_HASH, AadHash(%arg2)
 739
 740	movdqa	SHUF_MASK(%rip), %xmm10
 741	# shuffle xmm9 back to output as ciphertext
 742	pshufb	%xmm10, %xmm9
 743	pshufb	%xmm2, %xmm9
 744.endif
 745	# output encrypted Bytes
 746	test	%r10, %r10
 747	jl	.L_partial_fill_\@
 748	mov	%r13, %r12
 749	mov	$16, %r13
 750	# Set r13 to be the number of bytes to write out
 751	sub	%r12, %r13
 752	jmp	.L_count_set_\@
 753.L_partial_fill_\@:
 754	mov	\PLAIN_CYPH_LEN, %r13
 755.L_count_set_\@:
 756	movdqa	%xmm9, %xmm0
 757	movq	%xmm0, %rax
 758	cmp	$8, %r13
 759	jle	.L_less_than_8_bytes_left_\@
 760
 761	mov	%rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
 762	add	$8, \DATA_OFFSET
 763	psrldq	$8, %xmm0
 764	movq	%xmm0, %rax
 765	sub	$8, %r13
 766.L_less_than_8_bytes_left_\@:
 767	movb	%al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
 768	add	$1, \DATA_OFFSET
 769	shr	$8, %rax
 770	sub	$1, %r13
 771	jne	.L_less_than_8_bytes_left_\@
 772.L_partial_block_done_\@:
 773.endm # PARTIAL_BLOCK
 774
 775/*
 776* if a = number of total plaintext bytes
 777* b = floor(a/16)
 778* num_initial_blocks = b mod 4
 779* encrypt the initial num_initial_blocks blocks and apply ghash on
 780* the ciphertext
 781* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
 782* are clobbered
  783* arg1, %arg2, %arg3 are used as pointers only, not modified
 784*/
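# Example: a = 100 plaintext bytes -> b = floor(100/16) = 6 full blocks, so
# num_initial_blocks = 6 mod 4 = 2; the remaining four full blocks then go
# through the four-block main loop.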
 785
 786
 787.macro INITIAL_BLOCKS_ENC_DEC TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
 788	XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
 789	MOVADQ		SHUF_MASK(%rip), %xmm14
 790
 791	movdqu AadHash(%arg2), %xmm\i		    # XMM0 = Y0
 792
 793	# start AES for num_initial_blocks blocks
 794
 795	movdqu CurCount(%arg2), \XMM0                # XMM0 = Y0
 796
 797.if (\i == 5) || (\i == 6) || (\i == 7)
 798
 799	MOVADQ		ONE(%RIP),\TMP1
 800	MOVADQ		0(%arg1),\TMP2
 801.irpc index, \i_seq
 802	paddd		\TMP1, \XMM0                 # INCR Y0
 803.ifc \operation, dec
 804        movdqa     \XMM0, %xmm\index
 805.else
 806	MOVADQ		\XMM0, %xmm\index
 807.endif
 808	pshufb	%xmm14, %xmm\index      # perform a 16 byte swap
 809	pxor		\TMP2, %xmm\index
 810.endr
 811	lea	0x10(%arg1),%r10
 812	mov	keysize,%eax
 813	shr	$2,%eax				# 128->4, 192->6, 256->8
 814	add	$5,%eax			      # 128->9, 192->11, 256->13
 815
 816.Laes_loop_initial_\@:
 817	MOVADQ	(%r10),\TMP1
 818.irpc	index, \i_seq
 819	aesenc	\TMP1, %xmm\index
 820.endr
 821	add	$16,%r10
 822	sub	$1,%eax
 823	jnz	.Laes_loop_initial_\@
 824
 825	MOVADQ	(%r10), \TMP1
 826.irpc index, \i_seq
 827	aesenclast \TMP1, %xmm\index         # Last Round
 
 828.endr
 829.irpc index, \i_seq
 830	movdqu	   (%arg4 , %r11, 1), \TMP1
 831	pxor	   \TMP1, %xmm\index
 832	movdqu	   %xmm\index, (%arg3 , %r11, 1)
 833	# write back plaintext/ciphertext for num_initial_blocks
 834	add	   $16, %r11
 835
 836.ifc \operation, dec
 837	movdqa     \TMP1, %xmm\index
 838.endif
 839	pshufb	   %xmm14, %xmm\index
 840
 841		# prepare plaintext/ciphertext for GHASH computation
 842.endr
 843.endif
 844
 845        # apply GHASH on num_initial_blocks blocks
 846
 847.if \i == 5
 848        pxor       %xmm5, %xmm6
 849	GHASH_MUL  %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 850        pxor       %xmm6, %xmm7
 851	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 852        pxor       %xmm7, %xmm8
 853	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 854.elseif \i == 6
 855        pxor       %xmm6, %xmm7
 856	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 857        pxor       %xmm7, %xmm8
 858	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 859.elseif \i == 7
 860        pxor       %xmm7, %xmm8
 861	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 862.endif
 863	cmp	   $64, %r13
 864	jl	.L_initial_blocks_done\@
 865	# no need for precomputed values
 866/*
 867*
 868* Precomputations for HashKey parallel with encryption of first 4 blocks.
 869* HashKey_i_k holds the XORed values of the low and high parts of HashKey_i
 870*/
 871	MOVADQ	   ONE(%RIP),\TMP1
 872	paddd	   \TMP1, \XMM0              # INCR Y0
 873	MOVADQ	   \XMM0, \XMM1
 874	pshufb  %xmm14, \XMM1        # perform a 16 byte swap
 875
 876	paddd	   \TMP1, \XMM0              # INCR Y0
 877	MOVADQ	   \XMM0, \XMM2
 878	pshufb  %xmm14, \XMM2        # perform a 16 byte swap
 879
 880	paddd	   \TMP1, \XMM0              # INCR Y0
 881	MOVADQ	   \XMM0, \XMM3
 882	pshufb %xmm14, \XMM3        # perform a 16 byte swap
 883
 884	paddd	   \TMP1, \XMM0              # INCR Y0
 885	MOVADQ	   \XMM0, \XMM4
 886	pshufb %xmm14, \XMM4        # perform a 16 byte swap
 887
 888	MOVADQ	   0(%arg1),\TMP1
 889	pxor	   \TMP1, \XMM1
 890	pxor	   \TMP1, \XMM2
 891	pxor	   \TMP1, \XMM3
 892	pxor	   \TMP1, \XMM4
 893.irpc index, 1234 # do 4 rounds
 894	movaps 0x10*\index(%arg1), \TMP1
 895	aesenc	   \TMP1, \XMM1
 896	aesenc	   \TMP1, \XMM2
 897	aesenc	   \TMP1, \XMM3
 898	aesenc	   \TMP1, \XMM4
 899.endr
 900.irpc index, 56789 # do next 5 rounds
 901	movaps 0x10*\index(%arg1), \TMP1
 902	aesenc	   \TMP1, \XMM1
 903	aesenc	   \TMP1, \XMM2
 904	aesenc	   \TMP1, \XMM3
 905	aesenc	   \TMP1, \XMM4
 906.endr
 907	lea	   0xa0(%arg1),%r10
 908	mov	   keysize,%eax
 909	shr	   $2,%eax			# 128->4, 192->6, 256->8
 910	sub	   $4,%eax			# 128->0, 192->2, 256->4
 911	jz	   .Laes_loop_pre_done\@
 912
 913.Laes_loop_pre_\@:
 914	MOVADQ	   (%r10),\TMP2
 915.irpc	index, 1234
 916	aesenc	   \TMP2, %xmm\index
 917.endr
 918	add	   $16,%r10
 919	sub	   $1,%eax
 920	jnz	   .Laes_loop_pre_\@
 921
 922.Laes_loop_pre_done\@:
 923	MOVADQ	   (%r10), \TMP2
 924	aesenclast \TMP2, \XMM1
 925	aesenclast \TMP2, \XMM2
 926	aesenclast \TMP2, \XMM3
 927	aesenclast \TMP2, \XMM4
 928	movdqu	   16*0(%arg4 , %r11 , 1), \TMP1
 929	pxor	   \TMP1, \XMM1
 930.ifc \operation, dec
 931	movdqu     \XMM1, 16*0(%arg3 , %r11 , 1)
 932	movdqa     \TMP1, \XMM1
 933.endif
 934	movdqu	   16*1(%arg4 , %r11 , 1), \TMP1
 935	pxor	   \TMP1, \XMM2
 936.ifc \operation, dec
 937	movdqu     \XMM2, 16*1(%arg3 , %r11 , 1)
 938	movdqa     \TMP1, \XMM2
 939.endif
 940	movdqu	   16*2(%arg4 , %r11 , 1), \TMP1
 941	pxor	   \TMP1, \XMM3
 942.ifc \operation, dec
 943	movdqu     \XMM3, 16*2(%arg3 , %r11 , 1)
 944	movdqa     \TMP1, \XMM3
 945.endif
 946	movdqu	   16*3(%arg4 , %r11 , 1), \TMP1
 947	pxor	   \TMP1, \XMM4
 948.ifc \operation, dec
 949	movdqu     \XMM4, 16*3(%arg3 , %r11 , 1)
 950	movdqa     \TMP1, \XMM4
 951.else
 952	movdqu     \XMM1, 16*0(%arg3 , %r11 , 1)
 953	movdqu     \XMM2, 16*1(%arg3 , %r11 , 1)
 954	movdqu     \XMM3, 16*2(%arg3 , %r11 , 1)
 955	movdqu     \XMM4, 16*3(%arg3 , %r11 , 1)
 956.endif
 957
 958	add	   $64, %r11
 959	pshufb %xmm14, \XMM1 # perform a 16 byte swap
 
 960	pxor	   \XMMDst, \XMM1
 961# combine GHASHed value with the corresponding ciphertext
 962	pshufb %xmm14, \XMM2 # perform a 16 byte swap
 963	pshufb %xmm14, \XMM3 # perform a 16 byte swap
 964	pshufb %xmm14, \XMM4 # perform a 16 byte swap
 965
 966.L_initial_blocks_done\@:
 967
 968.endm
 969
 970/*
 971* encrypt 4 blocks at a time
 972* ghash the 4 previously encrypted ciphertext blocks
 973* arg1, %arg3, %arg4 are used as pointers only, not modified
 974* %r11 is the data offset value
 975*/
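#
# A note on the Karatsuba splitting used throughout this macro (this only
# restates what the a1*b1 / a0*b0 / (a1+a0)*(b1+b0) comments below already
# imply): each 128x128-bit carry-less multiply is built from three pclmulqdq
# operations, and the middle 64x64 cross terms are recovered as
#
#	a1*b0 + a0*b1 = (a1+a0)*(b1+b0) + a1*b1 + a0*b0	(addition = XOR)
#
# which is why the precomputed HashKey_i_k values are used for the
# $0x00 pclmulqdq of each pair.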
 976.macro GHASH_4_ENCRYPT_4_PARALLEL_enc TMP1 TMP2 TMP3 TMP4 TMP5 \
 977TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
 978
 979	movdqa	  \XMM1, \XMM5
 980	movdqa	  \XMM2, \XMM6
 981	movdqa	  \XMM3, \XMM7
 982	movdqa	  \XMM4, \XMM8
 983
 984        movdqa    SHUF_MASK(%rip), %xmm15
 985        # multiply TMP5 * HashKey using karatsuba
 986
 987	movdqa	  \XMM5, \TMP4
 988	pshufd	  $78, \XMM5, \TMP6
 989	pxor	  \XMM5, \TMP6
 990	paddd     ONE(%rip), \XMM0		# INCR CNT
 991	movdqu	  HashKey_4(%arg2), \TMP5
 992	pclmulqdq $0x11, \TMP5, \TMP4           # TMP4 = a1*b1
 993	movdqa    \XMM0, \XMM1
 994	paddd     ONE(%rip), \XMM0		# INCR CNT
 995	movdqa    \XMM0, \XMM2
 996	paddd     ONE(%rip), \XMM0		# INCR CNT
 997	movdqa    \XMM0, \XMM3
 998	paddd     ONE(%rip), \XMM0		# INCR CNT
 999	movdqa    \XMM0, \XMM4
1000	pshufb %xmm15, \XMM1	# perform a 16 byte swap
1001	pclmulqdq $0x00, \TMP5, \XMM5           # XMM5 = a0*b0
1002	pshufb %xmm15, \XMM2	# perform a 16 byte swap
1003	pshufb %xmm15, \XMM3	# perform a 16 byte swap
1004	pshufb %xmm15, \XMM4	# perform a 16 byte swap
1005
1006	pxor	  (%arg1), \XMM1
1007	pxor	  (%arg1), \XMM2
1008	pxor	  (%arg1), \XMM3
1009	pxor	  (%arg1), \XMM4
1010	movdqu	  HashKey_4_k(%arg2), \TMP5
1011	pclmulqdq $0x00, \TMP5, \TMP6       # TMP6 = (a1+a0)*(b1+b0)
1012	movaps 0x10(%arg1), \TMP1
1013	aesenc	  \TMP1, \XMM1              # Round 1
1014	aesenc	  \TMP1, \XMM2
1015	aesenc	  \TMP1, \XMM3
1016	aesenc	  \TMP1, \XMM4
1017	movaps 0x20(%arg1), \TMP1
1018	aesenc	  \TMP1, \XMM1              # Round 2
1019	aesenc	  \TMP1, \XMM2
1020	aesenc	  \TMP1, \XMM3
1021	aesenc	  \TMP1, \XMM4
1022	movdqa	  \XMM6, \TMP1
1023	pshufd	  $78, \XMM6, \TMP2
1024	pxor	  \XMM6, \TMP2
1025	movdqu	  HashKey_3(%arg2), \TMP5
1026	pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1 * b1
1027	movaps 0x30(%arg1), \TMP3
1028	aesenc    \TMP3, \XMM1              # Round 3
1029	aesenc    \TMP3, \XMM2
1030	aesenc    \TMP3, \XMM3
1031	aesenc    \TMP3, \XMM4
1032	pclmulqdq $0x00, \TMP5, \XMM6       # XMM6 = a0*b0
1033	movaps 0x40(%arg1), \TMP3
1034	aesenc	  \TMP3, \XMM1              # Round 4
1035	aesenc	  \TMP3, \XMM2
1036	aesenc	  \TMP3, \XMM3
1037	aesenc	  \TMP3, \XMM4
1038	movdqu	  HashKey_3_k(%arg2), \TMP5
1039	pclmulqdq $0x00, \TMP5, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1040	movaps 0x50(%arg1), \TMP3
1041	aesenc	  \TMP3, \XMM1              # Round 5
1042	aesenc	  \TMP3, \XMM2
1043	aesenc	  \TMP3, \XMM3
1044	aesenc	  \TMP3, \XMM4
1045	pxor	  \TMP1, \TMP4
1046# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1047	pxor	  \XMM6, \XMM5
1048	pxor	  \TMP2, \TMP6
1049	movdqa	  \XMM7, \TMP1
1050	pshufd	  $78, \XMM7, \TMP2
1051	pxor	  \XMM7, \TMP2
1052	movdqu	  HashKey_2(%arg2), \TMP5
1053
1054        # Multiply TMP5 * HashKey using karatsuba
1055
1056	pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1*b1
1057	movaps 0x60(%arg1), \TMP3
1058	aesenc	  \TMP3, \XMM1              # Round 6
1059	aesenc	  \TMP3, \XMM2
1060	aesenc	  \TMP3, \XMM3
1061	aesenc	  \TMP3, \XMM4
1062	pclmulqdq $0x00, \TMP5, \XMM7       # XMM7 = a0*b0
1063	movaps 0x70(%arg1), \TMP3
1064	aesenc	  \TMP3, \XMM1              # Round 7
1065	aesenc	  \TMP3, \XMM2
1066	aesenc	  \TMP3, \XMM3
1067	aesenc	  \TMP3, \XMM4
1068	movdqu	  HashKey_2_k(%arg2), \TMP5
1069	pclmulqdq $0x00, \TMP5, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1070	movaps 0x80(%arg1), \TMP3
1071	aesenc	  \TMP3, \XMM1              # Round 8
1072	aesenc	  \TMP3, \XMM2
1073	aesenc	  \TMP3, \XMM3
1074	aesenc	  \TMP3, \XMM4
1075	pxor	  \TMP1, \TMP4
1076# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1077	pxor	  \XMM7, \XMM5
1078	pxor	  \TMP2, \TMP6
1079
1080        # Multiply XMM8 * HashKey
1081        # XMM8 and TMP5 hold the values for the two operands
1082
1083	movdqa	  \XMM8, \TMP1
1084	pshufd	  $78, \XMM8, \TMP2
1085	pxor	  \XMM8, \TMP2
1086	movdqu	  HashKey(%arg2), \TMP5
1087	pclmulqdq $0x11, \TMP5, \TMP1      # TMP1 = a1*b1
1088	movaps 0x90(%arg1), \TMP3
1089	aesenc	  \TMP3, \XMM1             # Round 9
1090	aesenc	  \TMP3, \XMM2
1091	aesenc	  \TMP3, \XMM3
1092	aesenc	  \TMP3, \XMM4
1093	pclmulqdq $0x00, \TMP5, \XMM8      # XMM8 = a0*b0
1094	lea	  0xa0(%arg1),%r10
1095	mov	  keysize,%eax
1096	shr	  $2,%eax			# 128->4, 192->6, 256->8
1097	sub	  $4,%eax			# 128->0, 192->2, 256->4
1098	jz	  .Laes_loop_par_enc_done\@
1099
1100.Laes_loop_par_enc\@:
1101	MOVADQ	  (%r10),\TMP3
1102.irpc	index, 1234
1103	aesenc	  \TMP3, %xmm\index
1104.endr
1105	add	  $16,%r10
1106	sub	  $1,%eax
1107	jnz	  .Laes_loop_par_enc\@
1108
1109.Laes_loop_par_enc_done\@:
1110	MOVADQ	  (%r10), \TMP3
1111	aesenclast \TMP3, \XMM1           # last round
1112	aesenclast \TMP3, \XMM2
1113	aesenclast \TMP3, \XMM3
1114	aesenclast \TMP3, \XMM4
1115	movdqu    HashKey_k(%arg2), \TMP5
1116	pclmulqdq $0x00, \TMP5, \TMP2          # TMP2 = (a1+a0)*(b1+b0)
1117	movdqu	  (%arg4,%r11,1), \TMP3
1118	pxor	  \TMP3, \XMM1                 # Ciphertext/Plaintext XOR EK
1119	movdqu	  16(%arg4,%r11,1), \TMP3
1120	pxor	  \TMP3, \XMM2                 # Ciphertext/Plaintext XOR EK
1121	movdqu	  32(%arg4,%r11,1), \TMP3
1122	pxor	  \TMP3, \XMM3                 # Ciphertext/Plaintext XOR EK
1123	movdqu	  48(%arg4,%r11,1), \TMP3
1124	pxor	  \TMP3, \XMM4                 # Ciphertext/Plaintext XOR EK
1125        movdqu    \XMM1, (%arg3,%r11,1)        # Write to the ciphertext buffer
1126        movdqu    \XMM2, 16(%arg3,%r11,1)      # Write to the ciphertext buffer
1127        movdqu    \XMM3, 32(%arg3,%r11,1)      # Write to the ciphertext buffer
1128        movdqu    \XMM4, 48(%arg3,%r11,1)      # Write to the ciphertext buffer
1129	pshufb %xmm15, \XMM1        # perform a 16 byte swap
1130	pshufb %xmm15, \XMM2	# perform a 16 byte swap
1131	pshufb %xmm15, \XMM3	# perform a 16 byte swap
1132	pshufb %xmm15, \XMM4	# perform a 16 byte swap
1133
1134	pxor	  \TMP4, \TMP1
1135	pxor	  \XMM8, \XMM5
1136	pxor	  \TMP6, \TMP2
1137	pxor	  \TMP1, \TMP2
1138	pxor	  \XMM5, \TMP2
1139	movdqa	  \TMP2, \TMP3
1140	pslldq	  $8, \TMP3                    # left shift TMP3 2 DWs
1141	psrldq	  $8, \TMP2                    # right shift TMP2 2 DWs
1142	pxor	  \TMP3, \XMM5
1143	pxor	  \TMP2, \TMP1	  # accumulate the results in TMP1:XMM5
1144
1145        # first phase of reduction
1146
1147	movdqa    \XMM5, \TMP2
1148	movdqa    \XMM5, \TMP3
1149	movdqa    \XMM5, \TMP4
1150# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
1151	pslld     $31, \TMP2                   # packed left shift << 31
1152	pslld     $30, \TMP3                   # packed left shift << 30
1153	pslld     $25, \TMP4                   # packed left shift << 25
1154	pxor      \TMP3, \TMP2	               # xor the shifted versions
1155	pxor      \TMP4, \TMP2
1156	movdqa    \TMP2, \TMP5
1157	psrldq    $4, \TMP5                    # right shift T5 1 DW
1158	pslldq    $12, \TMP2                   # left shift T2 3 DWs
1159	pxor      \TMP2, \XMM5
1160
1161        # second phase of reduction
1162
1163	movdqa    \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
1164	movdqa    \XMM5,\TMP3
1165	movdqa    \XMM5,\TMP4
1166	psrld     $1, \TMP2                    # packed right shift >> 1
1167	psrld     $2, \TMP3                    # packed right shift >> 2
1168	psrld     $7, \TMP4                    # packed right shift >> 7
1169	pxor      \TMP3,\TMP2		       # xor the shifted versions
1170	pxor      \TMP4,\TMP2
1171	pxor      \TMP5, \TMP2
1172	pxor      \TMP2, \XMM5
1173	pxor      \TMP1, \XMM5                 # result is in XMM5
1174
1175	pxor	  \XMM5, \XMM1
1176.endm
1177
1178/*
1179* decrypt 4 blocks at a time
1180* ghash the 4 previously decrypted ciphertext blocks
1181* arg1, %arg3, %arg4 are used as pointers only, not modified
1182* %r11 is the data offset value
1183*/
1184.macro GHASH_4_ENCRYPT_4_PARALLEL_dec TMP1 TMP2 TMP3 TMP4 TMP5 \
1185TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
1186
1187	movdqa	  \XMM1, \XMM5
1188	movdqa	  \XMM2, \XMM6
1189	movdqa	  \XMM3, \XMM7
1190	movdqa	  \XMM4, \XMM8
1191
1192        movdqa    SHUF_MASK(%rip), %xmm15
1193        # multiply TMP5 * HashKey using karatsuba
1194
1195	movdqa	  \XMM5, \TMP4
1196	pshufd	  $78, \XMM5, \TMP6
1197	pxor	  \XMM5, \TMP6
1198	paddd     ONE(%rip), \XMM0		# INCR CNT
1199	movdqu	  HashKey_4(%arg2), \TMP5
1200	pclmulqdq $0x11, \TMP5, \TMP4           # TMP4 = a1*b1
1201	movdqa    \XMM0, \XMM1
1202	paddd     ONE(%rip), \XMM0		# INCR CNT
1203	movdqa    \XMM0, \XMM2
1204	paddd     ONE(%rip), \XMM0		# INCR CNT
1205	movdqa    \XMM0, \XMM3
1206	paddd     ONE(%rip), \XMM0		# INCR CNT
1207	movdqa    \XMM0, \XMM4
1208	pshufb %xmm15, \XMM1	# perform a 16 byte swap
1209	pclmulqdq $0x00, \TMP5, \XMM5           # XMM5 = a0*b0
1210	pshufb %xmm15, \XMM2	# perform a 16 byte swap
1211	pshufb %xmm15, \XMM3	# perform a 16 byte swap
1212	pshufb %xmm15, \XMM4	# perform a 16 byte swap
1213
1214	pxor	  (%arg1), \XMM1
1215	pxor	  (%arg1), \XMM2
1216	pxor	  (%arg1), \XMM3
1217	pxor	  (%arg1), \XMM4
1218	movdqu	  HashKey_4_k(%arg2), \TMP5
1219	pclmulqdq $0x00, \TMP5, \TMP6       # TMP6 = (a1+a0)*(b1+b0)
1220	movaps 0x10(%arg1), \TMP1
1221	aesenc	  \TMP1, \XMM1              # Round 1
1222	aesenc	  \TMP1, \XMM2
1223	aesenc	  \TMP1, \XMM3
1224	aesenc	  \TMP1, \XMM4
1225	movaps 0x20(%arg1), \TMP1
1226	aesenc	  \TMP1, \XMM1              # Round 2
1227	aesenc	  \TMP1, \XMM2
1228	aesenc	  \TMP1, \XMM3
1229	aesenc	  \TMP1, \XMM4
1230	movdqa	  \XMM6, \TMP1
1231	pshufd	  $78, \XMM6, \TMP2
1232	pxor	  \XMM6, \TMP2
1233	movdqu	  HashKey_3(%arg2), \TMP5
1234	pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1 * b1
1235	movaps 0x30(%arg1), \TMP3
1236	aesenc    \TMP3, \XMM1              # Round 3
1237	aesenc    \TMP3, \XMM2
1238	aesenc    \TMP3, \XMM3
1239	aesenc    \TMP3, \XMM4
1240	pclmulqdq $0x00, \TMP5, \XMM6       # XMM6 = a0*b0
1241	movaps 0x40(%arg1), \TMP3
1242	aesenc	  \TMP3, \XMM1              # Round 4
1243	aesenc	  \TMP3, \XMM2
1244	aesenc	  \TMP3, \XMM3
1245	aesenc	  \TMP3, \XMM4
1246	movdqu	  HashKey_3_k(%arg2), \TMP5
1247	pclmulqdq $0x00, \TMP5, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1248	movaps 0x50(%arg1), \TMP3
1249	aesenc	  \TMP3, \XMM1              # Round 5
1250	aesenc	  \TMP3, \XMM2
1251	aesenc	  \TMP3, \XMM3
1252	aesenc	  \TMP3, \XMM4
1253	pxor	  \TMP1, \TMP4
1254# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1255	pxor	  \XMM6, \XMM5
1256	pxor	  \TMP2, \TMP6
1257	movdqa	  \XMM7, \TMP1
1258	pshufd	  $78, \XMM7, \TMP2
1259	pxor	  \XMM7, \TMP2
1260	movdqu	  HashKey_2(%arg2), \TMP5
1261
1262        # Multiply TMP5 * HashKey using karatsuba
1263
1264	pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1*b1
1265	movaps 0x60(%arg1), \TMP3
1266	aesenc	  \TMP3, \XMM1              # Round 6
1267	aesenc	  \TMP3, \XMM2
1268	aesenc	  \TMP3, \XMM3
1269	aesenc	  \TMP3, \XMM4
1270	pclmulqdq $0x00, \TMP5, \XMM7       # XMM7 = a0*b0
1271	movaps 0x70(%arg1), \TMP3
1272	aesenc	  \TMP3, \XMM1              # Round 7
1273	aesenc	  \TMP3, \XMM2
1274	aesenc	  \TMP3, \XMM3
1275	aesenc	  \TMP3, \XMM4
1276	movdqu	  HashKey_2_k(%arg2), \TMP5
1277	pclmulqdq $0x00, \TMP5, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1278	movaps 0x80(%arg1), \TMP3
1279	aesenc	  \TMP3, \XMM1              # Round 8
1280	aesenc	  \TMP3, \XMM2
1281	aesenc	  \TMP3, \XMM3
1282	aesenc	  \TMP3, \XMM4
1283	pxor	  \TMP1, \TMP4
1284# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1285	pxor	  \XMM7, \XMM5
1286	pxor	  \TMP2, \TMP6
1287
1288        # Multiply XMM8 * HashKey
1289        # XMM8 and TMP5 hold the values for the two operands
1290
1291	movdqa	  \XMM8, \TMP1
1292	pshufd	  $78, \XMM8, \TMP2
1293	pxor	  \XMM8, \TMP2
1294	movdqu	  HashKey(%arg2), \TMP5
1295	pclmulqdq $0x11, \TMP5, \TMP1      # TMP1 = a1*b1
1296	movaps 0x90(%arg1), \TMP3
1297	aesenc	  \TMP3, \XMM1             # Round 9
1298	aesenc	  \TMP3, \XMM2
1299	aesenc	  \TMP3, \XMM3
1300	aesenc	  \TMP3, \XMM4
1301	pclmulqdq $0x00, \TMP5, \XMM8      # XMM8 = a0*b0
1302	lea	  0xa0(%arg1),%r10
1303	mov	  keysize,%eax
1304	shr	  $2,%eax		        # 128->4, 192->6, 256->8
1305	sub	  $4,%eax			# 128->0, 192->2, 256->4
1306	jz	  .Laes_loop_par_dec_done\@
1307
1308.Laes_loop_par_dec\@:
1309	MOVADQ	  (%r10),\TMP3
1310.irpc	index, 1234
1311	aesenc	  \TMP3, %xmm\index
1312.endr
1313	add	  $16,%r10
1314	sub	  $1,%eax
1315	jnz	  .Laes_loop_par_dec\@
1316
1317.Laes_loop_par_dec_done\@:
1318	MOVADQ	  (%r10), \TMP3
1319	aesenclast \TMP3, \XMM1           # last round
1320	aesenclast \TMP3, \XMM2
1321	aesenclast \TMP3, \XMM3
1322	aesenclast \TMP3, \XMM4
1323	movdqu    HashKey_k(%arg2), \TMP5
1324	pclmulqdq $0x00, \TMP5, \TMP2          # TMP2 = (a1+a0)*(b1+b0)
1325	movdqu	  (%arg4,%r11,1), \TMP3
1326	pxor	  \TMP3, \XMM1                 # Ciphertext/Plaintext XOR EK
1327	movdqu	  \XMM1, (%arg3,%r11,1)        # Write to plaintext buffer
1328	movdqa    \TMP3, \XMM1
1329	movdqu	  16(%arg4,%r11,1), \TMP3
1330	pxor	  \TMP3, \XMM2                 # Ciphertext/Plaintext XOR EK
1331	movdqu	  \XMM2, 16(%arg3,%r11,1)      # Write to plaintext buffer
1332	movdqa    \TMP3, \XMM2
1333	movdqu	  32(%arg4,%r11,1), \TMP3
1334	pxor	  \TMP3, \XMM3                 # Ciphertext/Plaintext XOR EK
1335	movdqu	  \XMM3, 32(%arg3,%r11,1)      # Write to plaintext buffer
1336	movdqa    \TMP3, \XMM3
1337	movdqu	  48(%arg4,%r11,1), \TMP3
1338	pxor	  \TMP3, \XMM4                 # Ciphertext/Plaintext XOR EK
1339	movdqu	  \XMM4, 48(%arg3,%r11,1)      # Write to plaintext buffer
1340	movdqa    \TMP3, \XMM4
1341	pshufb %xmm15, \XMM1        # perform a 16 byte swap
1342	pshufb %xmm15, \XMM2	# perform a 16 byte swap
1343	pshufb %xmm15, \XMM3	# perform a 16 byte swap
1344	pshufb %xmm15, \XMM4	# perform a 16 byte swap
1345
1346	pxor	  \TMP4, \TMP1
1347	pxor	  \XMM8, \XMM5
1348	pxor	  \TMP6, \TMP2
1349	pxor	  \TMP1, \TMP2
1350	pxor	  \XMM5, \TMP2
1351	movdqa	  \TMP2, \TMP3
1352	pslldq	  $8, \TMP3                    # left shift TMP3 2 DWs
1353	psrldq	  $8, \TMP2                    # right shift TMP2 2 DWs
1354	pxor	  \TMP3, \XMM5
1355	pxor	  \TMP2, \TMP1	  # accumulate the results in TMP1:XMM5
1356
1357        # first phase of reduction
1358
1359	movdqa    \XMM5, \TMP2
1360	movdqa    \XMM5, \TMP3
1361	movdqa    \XMM5, \TMP4
1362# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
1363	pslld     $31, \TMP2                   # packed left shift << 31
1364	pslld     $30, \TMP3                   # packed left shift << 30
1365	pslld     $25, \TMP4                   # packed left shift << 25
1366	pxor      \TMP3, \TMP2	               # xor the shifted versions
1367	pxor      \TMP4, \TMP2
1368	movdqa    \TMP2, \TMP5
1369	psrldq    $4, \TMP5                    # right shift T5 1 DW
1370	pslldq    $12, \TMP2                   # left shift T2 3 DWs
1371	pxor      \TMP2, \XMM5
1372
1373        # second phase of reduction
1374
1375	movdqa    \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
1376	movdqa    \XMM5,\TMP3
1377	movdqa    \XMM5,\TMP4
1378	psrld     $1, \TMP2                    # packed right shift >> 1
1379	psrld     $2, \TMP3                    # packed right shift >> 2
1380	psrld     $7, \TMP4                    # packed right shift >> 7
1381	pxor      \TMP3,\TMP2		       # xor the shifted versions
1382	pxor      \TMP4,\TMP2
1383	pxor      \TMP5, \TMP2
1384	pxor      \TMP2, \XMM5
1385	pxor      \TMP1, \XMM5                 # result is in XMM5
1386
1387	pxor	  \XMM5, \XMM1
1388.endm
1389
1390/* GHASH the last 4 ciphertext blocks. */
1391.macro	GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
1392TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
1393
1394        # Multiply TMP6 * HashKey (using Karatsuba)
1395
1396	movdqa	  \XMM1, \TMP6
1397	pshufd	  $78, \XMM1, \TMP2
1398	pxor	  \XMM1, \TMP2
1399	movdqu	  HashKey_4(%arg2), \TMP5
1400	pclmulqdq $0x11, \TMP5, \TMP6       # TMP6 = a1*b1
1401	pclmulqdq $0x00, \TMP5, \XMM1       # XMM1 = a0*b0
1402	movdqu	  HashKey_4_k(%arg2), \TMP4
1403	pclmulqdq $0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1404	movdqa	  \XMM1, \XMMDst
1405	movdqa	  \TMP2, \XMM1              # result in TMP6, XMMDst, XMM1
1406
1407        # Multiply TMP1 * HashKey (using Karatsuba)
1408
1409	movdqa	  \XMM2, \TMP1
1410	pshufd	  $78, \XMM2, \TMP2
1411	pxor	  \XMM2, \TMP2
1412	movdqu	  HashKey_3(%arg2), \TMP5
1413	pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1*b1
1414	pclmulqdq $0x00, \TMP5, \XMM2       # XMM2 = a0*b0
1415	movdqu	  HashKey_3_k(%arg2), \TMP4
1416	pclmulqdq $0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1417	pxor	  \TMP1, \TMP6
1418	pxor	  \XMM2, \XMMDst
1419	pxor	  \TMP2, \XMM1
1420# results accumulated in TMP6, XMMDst, XMM1
1421
1422        # Multiply TMP1 * HashKey (using Karatsuba)
1423
1424	movdqa	  \XMM3, \TMP1
1425	pshufd	  $78, \XMM3, \TMP2
1426	pxor	  \XMM3, \TMP2
1427	movdqu	  HashKey_2(%arg2), \TMP5
1428	pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1*b1
1429	pclmulqdq $0x00, \TMP5, \XMM3       # XMM3 = a0*b0
1430	movdqu	  HashKey_2_k(%arg2), \TMP4
1431	pclmulqdq $0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1432	pxor	  \TMP1, \TMP6
1433	pxor	  \XMM3, \XMMDst
1434	pxor	  \TMP2, \XMM1   # results accumulated in TMP6, XMMDst, XMM1
1435
1436        # Multiply TMP1 * HashKey (using Karatsuba)
1437	movdqa	  \XMM4, \TMP1
1438	pshufd	  $78, \XMM4, \TMP2
1439	pxor	  \XMM4, \TMP2
1440	movdqu	  HashKey(%arg2), \TMP5
1441	pclmulqdq $0x11, \TMP5, \TMP1	    # TMP1 = a1*b1
1442	pclmulqdq $0x00, \TMP5, \XMM4       # XMM4 = a0*b0
1443	movdqu	  HashKey_k(%arg2), \TMP4
1444	pclmulqdq $0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1445	pxor	  \TMP1, \TMP6
1446	pxor	  \XMM4, \XMMDst
1447	pxor	  \XMM1, \TMP2
1448	pxor	  \TMP6, \TMP2
1449	pxor	  \XMMDst, \TMP2
1450	# middle section of the temp results combined as in karatsuba algorithm
1451	movdqa	  \TMP2, \TMP4
1452	pslldq	  $8, \TMP4                 # left shift TMP4 2 DWs
1453	psrldq	  $8, \TMP2                 # right shift TMP2 2 DWs
1454	pxor	  \TMP4, \XMMDst
1455	pxor	  \TMP2, \TMP6
1456# TMP6:XMMDst holds the result of the accumulated carry-less multiplications
1457	# first phase of the reduction
1458	movdqa    \XMMDst, \TMP2
1459	movdqa    \XMMDst, \TMP3
1460	movdqa    \XMMDst, \TMP4
1461# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
1462	pslld     $31, \TMP2                # packed left shifting << 31
1463	pslld     $30, \TMP3                # packed left shifting << 30
1464	pslld     $25, \TMP4                # packed left shifting << 25
1465	pxor      \TMP3, \TMP2              # xor the shifted versions
1466	pxor      \TMP4, \TMP2
1467	movdqa    \TMP2, \TMP7
1468	psrldq    $4, \TMP7                 # right shift TMP7 1 DW
1469	pslldq    $12, \TMP2                # left shift TMP2 3 DWs
1470	pxor      \TMP2, \XMMDst
1471
1472        # second phase of the reduction
1473	movdqa    \XMMDst, \TMP2
1474	# make 3 copies of XMMDst for doing 3 shift operations
1475	movdqa    \XMMDst, \TMP3
1476	movdqa    \XMMDst, \TMP4
1477	psrld     $1, \TMP2                 # packed right shift >> 1
1478	psrld     $2, \TMP3                 # packed right shift >> 2
1479	psrld     $7, \TMP4                 # packed right shift >> 7
1480	pxor      \TMP3, \TMP2              # xor the shifted versions
1481	pxor      \TMP4, \TMP2
1482	pxor      \TMP7, \TMP2
1483	pxor      \TMP2, \XMMDst
1484	pxor      \TMP6, \XMMDst            # reduced result is in XMMDst
1485.endm
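#
# Roughly speaking, the two "phases of reduction" in the macros above fold
# the 256-bit carry-less product back to 128 bits modulo the GHASH
# polynomial quoted later in this file, poly = x^128 + x^127 + x^126 +
# x^121 + 1: the non-leading terms x^127, x^126 and x^121 give rise to the
# fixed shift counts 1/2/7, and their per-dword counterparts 31/30/25, used
# in the two phases.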
1486
1487
1488/* Encryption of a single block
1489* uses eax & r10
1490*/
1491
1492.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1
1493
1494	pxor		(%arg1), \XMM0
1495	mov		keysize,%eax
1496	shr		$2,%eax			# 128->4, 192->6, 256->8
1497	add		$5,%eax			# 128->9, 192->11, 256->13
1498	lea		16(%arg1), %r10	  # get first expanded key address
1499
1500_esb_loop_\@:
1501	MOVADQ		(%r10),\TMP1
1502	aesenc		\TMP1,\XMM0
1503	add		$16,%r10
1504	sub		$1,%eax
1505	jnz		_esb_loop_\@
1506
1507	MOVADQ		(%r10),\TMP1
1508	aesenclast	\TMP1,\XMM0
1509.endm
1510/*****************************************************************************
1511* void aesni_gcm_dec(void *aes_ctx,    // AES Key schedule. Starts on a 16 byte boundary.
1512*                   struct gcm_context_data *data
1513*                                      // Context data
1514*                   u8 *out,           // Plaintext output. Decrypt in-place is allowed.
1515*                   const u8 *in,      // Ciphertext input
1516*                   u64 plaintext_len, // Length of data in bytes for decryption.
1517*                   u8 *iv,            // Pre-counter block j0: 4 byte salt (from Security Association)
1518*                                      // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1519*                                      // concatenated with 0x00000001. 16-byte aligned pointer.
1520*                   u8 *hash_subkey,   // H, the Hash sub key input. Data starts on a 16-byte boundary.
1521*                   const u8 *aad,     // Additional Authentication Data (AAD)
1522*                   u64 aad_len,       // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1523*                   u8  *auth_tag,     // Authenticated Tag output. The driver will compare this to the
1524*                                      // given authentication tag and only return the plaintext if they match.
1525*                   u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
1526*                                      // (most likely), 12 or 8.
1527*
1528* Assumptions:
1529*
1530* keys:
1531*       keys are pre-expanded and aligned to 16 bytes. we are using the first
1532*       set of 11 keys in the data structure void *aes_ctx
1533*
1534* iv:
1535*       0                   1                   2                   3
1536*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1537*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1538*       |                             Salt  (From the SA)               |
1539*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1540*       |                     Initialization Vector                     |
1541*       |         (This is the sequence number from IPSec header)       |
1542*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1543*       |                              0x1                              |
1544*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1545*
1546*
1547*
1548* AAD:
1549*       AAD padded to 128 bits with 0
1550*       for example, assume AAD is a u32 vector
1551*
1552*       if AAD is 8 bytes:
1553*       AAD[3] = {A0, A1};
1554*       padded AAD in xmm register = {A1 A0 0 0}
1555*
1556*       0                   1                   2                   3
1557*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1558*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1559*       |                               SPI (A1)                        |
1560*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1561*       |                     32-bit Sequence Number (A0)               |
1562*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1563*       |                              0x0                              |
1564*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1565*
1566*                                       AAD Format with 32-bit Sequence Number
1567*
1568*       if AAD is 12 bytes:
1569*       AAD[3] = {A0, A1, A2};
1570*       padded AAD in xmm register = {A2 A1 A0 0}
1571*
1572*       0                   1                   2                   3
1573*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1574*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1577*       |                               SPI (A2)                        |
1578*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1579*       |                 64-bit Extended Sequence Number {A1,A0}       |
1580*       |                                                               |
1581*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1582*       |                              0x0                              |
1583*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1584*
1585*                        AAD Format with 64-bit Extended Sequence Number
1586*
1587* poly = x^128 + x^127 + x^126 + x^121 + 1
1588*
1589*****************************************************************************/
1590SYM_FUNC_START(aesni_gcm_dec)
1591	FUNC_SAVE
1592
1593	GCM_INIT %arg6, arg7, arg8, arg9
1594	GCM_ENC_DEC dec
1595	GCM_COMPLETE arg10, arg11
1596	FUNC_RESTORE
1597	RET
1598SYM_FUNC_END(aesni_gcm_dec)
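/*
* Illustrative call sequence for the one-shot decrypt entry point above,
* written against the prototype documented before aesni_gcm_dec(); the
* buffer names are hypothetical and FPU-context handling by the caller is
* omitted:
*
*	struct gcm_context_data data;
*
*	aesni_gcm_dec(aes_ctx, &data, plaintext_out, ciphertext_in,
*		      ciphertext_len, iv_j0, hash_subkey,
*		      aad, aad_len, computed_tag, 16);
*
* As the comment above notes, the driver then compares computed_tag with
* the received tag and only returns the plaintext if they match.
*/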
1599
1600
1601/*****************************************************************************
1602* void aesni_gcm_enc(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1603*                    struct gcm_context_data *data
1604*                                        // Context data
1605*                    u8 *out,            // Ciphertext output. Encrypt in-place is allowed.
1606*                    const u8 *in,       // Plaintext input
1607*                    u64 plaintext_len,  // Length of data in bytes for encryption.
1608*                    u8 *iv,             // Pre-counter block j0: 4 byte salt (from Security Association)
1609*                                        // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1610*                                        // concatenated with 0x00000001. 16-byte aligned pointer.
1611*                    u8 *hash_subkey,    // H, the Hash sub key input. Data starts on a 16-byte boundary.
1612*                    const u8 *aad,      // Additional Authentication Data (AAD)
1613*                    u64 aad_len,        // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1614*                    u8 *auth_tag,       // Authenticated Tag output.
1615*                    u64 auth_tag_len);  // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
1616*                                        // 12 or 8.
1617*
1618* Assumptions:
1619*
1620* keys:
1621*       keys are pre-expanded and aligned to 16 bytes. we are using the
1622*       first set of 11 keys in the data structure void *aes_ctx
1623*
1624*
1625* iv:
1626*       0                   1                   2                   3
1627*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1628*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1629*       |                             Salt  (From the SA)               |
1630*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1631*       |                     Initialization Vector                     |
1632*       |         (This is the sequence number from IPSec header)       |
1633*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1634*       |                              0x1                              |
1635*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1636*
1637*
1638*
1639* AAD:
1640*       AAD padded to 128 bits with 0
1641*       for example, assume AAD is a u32 vector
1642*
1643*       if AAD is 8 bytes:
1644*       AAD[3] = {A0, A1};
1645*       padded AAD in xmm register = {A1 A0 0 0}
1646*
1647*       0                   1                   2                   3
1648*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1649*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1650*       |                               SPI (A1)                        |
1651*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1652*       |                     32-bit Sequence Number (A0)               |
1653*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1654*       |                              0x0                              |
1655*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1656*
1657*                                 AAD Format with 32-bit Sequence Number
1658*
1659*       if AAD is 12 bytes:
1660*       AAD[3] = {A0, A1, A2};
1661*       padded AAD in xmm register = {A2 A1 A0 0}
1662*
1663*       0                   1                   2                   3
1664*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1665*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1666*       |                               SPI (A2)                        |
1667*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1668*       |                 64-bit Extended Sequence Number {A1,A0}       |
1669*       |                                                               |
1670*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1671*       |                              0x0                              |
1672*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1673*
1674*                         AAD Format with 64-bit Extended Sequence Number
1675*
1676* poly = x^128 + x^127 + x^126 + x^121 + 1
1677***************************************************************************/
1678SYM_FUNC_START(aesni_gcm_enc)
1679	FUNC_SAVE
1680
1681	GCM_INIT %arg6, arg7, arg8, arg9
1682	GCM_ENC_DEC enc
1683
1684	GCM_COMPLETE arg10, arg11
1685	FUNC_RESTORE
1686	RET
1687SYM_FUNC_END(aesni_gcm_enc)
 
1688
1689/*****************************************************************************
1690* void aesni_gcm_init(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1691*                     struct gcm_context_data *data,
1692*                                         // context data
1693*                     u8 *iv,             // Pre-counter block j0: 4 byte salt (from Security Association)
1694*                                         // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1695*                                         // concatenated with 0x00000001. 16-byte aligned pointer.
1696*                     u8 *hash_subkey,    // H, the Hash sub key input. Data starts on a 16-byte boundary.
1697*                     const u8 *aad,      // Additional Authentication Data (AAD)
1698*                     u64 aad_len)        // Length of AAD in bytes.
1699*/
1700SYM_FUNC_START(aesni_gcm_init)
1701	FUNC_SAVE
1702	GCM_INIT %arg3, %arg4,%arg5, %arg6
1703	FUNC_RESTORE
1704	RET
1705SYM_FUNC_END(aesni_gcm_init)
1706
1707/*****************************************************************************
1708* void aesni_gcm_enc_update(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1709*                    struct gcm_context_data *data,
1710*                                        // context data
1711*                    u8 *out,            // Ciphertext output. Encrypt in-place is allowed.
1712*                    const u8 *in,       // Plaintext input
1713*                    u64 plaintext_len,  // Length of data in bytes for encryption.
1714*/
1715SYM_FUNC_START(aesni_gcm_enc_update)
1716	FUNC_SAVE
1717	GCM_ENC_DEC enc
1718	FUNC_RESTORE
1719	RET
1720SYM_FUNC_END(aesni_gcm_enc_update)
1721
1722/*****************************************************************************
1723* void aesni_gcm_dec_update(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1724*                    struct gcm_context_data *data,
1725*                                        // context data
1726*                    u8 *out,            // Plaintext output. Decrypt in-place is allowed.
1727*                    const u8 *in,       // Ciphertext input
1728*                    u64 plaintext_len,  // Length of data in bytes for decryption.
1729*/
1730SYM_FUNC_START(aesni_gcm_dec_update)
1731	FUNC_SAVE
1732	GCM_ENC_DEC dec
1733	FUNC_RESTORE
1734	RET
1735SYM_FUNC_END(aesni_gcm_dec_update)
1736
1737/*****************************************************************************
1738* void aesni_gcm_finalize(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1739*                    struct gcm_context_data *data,
1740*                                        // context data
1741*                    u8 *auth_tag,       // Authenticated Tag output.
1742*                    u64 auth_tag_len);  // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
1743*                                        // 12 or 8.
1744*/
1745SYM_FUNC_START(aesni_gcm_finalize)
1746	FUNC_SAVE
1747	GCM_COMPLETE %arg3 %arg4
1748	FUNC_RESTORE
1749	RET
1750SYM_FUNC_END(aesni_gcm_finalize)
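/*
* Illustrative sketch of the incremental interface formed by
* aesni_gcm_init(), aesni_gcm_enc_update()/aesni_gcm_dec_update() and
* aesni_gcm_finalize() above (chunk sizes and buffer names are
* hypothetical; FPU-context handling by the caller is omitted):
*
*	struct gcm_context_data data;
*
*	aesni_gcm_init(aes_ctx, &data, iv_j0, hash_subkey, aad, aad_len);
*	aesni_gcm_enc_update(aes_ctx, &data, out, in, len0);
*	aesni_gcm_enc_update(aes_ctx, &data, out + len0, in + len0, len1);
*	aesni_gcm_finalize(aes_ctx, &data, auth_tag, 16);
*
* The running state (AadHash, CurCount, PBlockLen, ...) is carried between
* calls in struct gcm_context_data, which is what allows each update call
* to start and stop on an arbitrary byte boundary.
*/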
1751
1752#endif
1753
1754SYM_FUNC_START_LOCAL(_key_expansion_256a)
1755	pshufd $0b11111111, %xmm1, %xmm1
1756	shufps $0b00010000, %xmm0, %xmm4
1757	pxor %xmm4, %xmm0
1758	shufps $0b10001100, %xmm0, %xmm4
1759	pxor %xmm4, %xmm0
1760	pxor %xmm1, %xmm0
1761	movaps %xmm0, (TKEYP)
1762	add $0x10, TKEYP
1763	RET
1764SYM_FUNC_END(_key_expansion_256a)
1765SYM_FUNC_ALIAS_LOCAL(_key_expansion_128, _key_expansion_256a)
1766
1767SYM_FUNC_START_LOCAL(_key_expansion_192a)
 
1768	pshufd $0b01010101, %xmm1, %xmm1
1769	shufps $0b00010000, %xmm0, %xmm4
1770	pxor %xmm4, %xmm0
1771	shufps $0b10001100, %xmm0, %xmm4
1772	pxor %xmm4, %xmm0
1773	pxor %xmm1, %xmm0
1774
1775	movaps %xmm2, %xmm5
1776	movaps %xmm2, %xmm6
1777	pslldq $4, %xmm5
1778	pshufd $0b11111111, %xmm0, %xmm3
1779	pxor %xmm3, %xmm2
1780	pxor %xmm5, %xmm2
1781
1782	movaps %xmm0, %xmm1
1783	shufps $0b01000100, %xmm0, %xmm6
1784	movaps %xmm6, (TKEYP)
1785	shufps $0b01001110, %xmm2, %xmm1
1786	movaps %xmm1, 0x10(TKEYP)
1787	add $0x20, TKEYP
1788	RET
1789SYM_FUNC_END(_key_expansion_192a)
1790
1791SYM_FUNC_START_LOCAL(_key_expansion_192b)
 
1792	pshufd $0b01010101, %xmm1, %xmm1
1793	shufps $0b00010000, %xmm0, %xmm4
1794	pxor %xmm4, %xmm0
1795	shufps $0b10001100, %xmm0, %xmm4
1796	pxor %xmm4, %xmm0
1797	pxor %xmm1, %xmm0
1798
1799	movaps %xmm2, %xmm5
1800	pslldq $4, %xmm5
1801	pshufd $0b11111111, %xmm0, %xmm3
1802	pxor %xmm3, %xmm2
1803	pxor %xmm5, %xmm2
1804
1805	movaps %xmm0, (TKEYP)
1806	add $0x10, TKEYP
1807	RET
1808SYM_FUNC_END(_key_expansion_192b)
1809
1810SYM_FUNC_START_LOCAL(_key_expansion_256b)
 
1811	pshufd $0b10101010, %xmm1, %xmm1
1812	shufps $0b00010000, %xmm2, %xmm4
1813	pxor %xmm4, %xmm2
1814	shufps $0b10001100, %xmm2, %xmm4
1815	pxor %xmm4, %xmm2
1816	pxor %xmm1, %xmm2
1817	movaps %xmm2, (TKEYP)
1818	add $0x10, TKEYP
1819	RET
1820SYM_FUNC_END(_key_expansion_256b)
1821
1822/*
1823 * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
1824 *                   unsigned int key_len)
1825 */
1826SYM_FUNC_START(aesni_set_key)
1827	FRAME_BEGIN
1828#ifndef __x86_64__
1829	pushl KEYP
1830	movl (FRAME_OFFSET+8)(%esp), KEYP	# ctx
1831	movl (FRAME_OFFSET+12)(%esp), UKEYP	# in_key
1832	movl (FRAME_OFFSET+16)(%esp), %edx	# key_len
1833#endif
1834	movups (UKEYP), %xmm0		# user key (first 16 bytes)
1835	movaps %xmm0, (KEYP)
1836	lea 0x10(KEYP), TKEYP		# key addr
1837	movl %edx, 480(KEYP)
1838	pxor %xmm4, %xmm4		# xmm4 is assumed 0 in _key_expansion_x
1839	cmp $24, %dl
1840	jb .Lenc_key128
1841	je .Lenc_key192
1842	movups 0x10(UKEYP), %xmm2	# other user key
1843	movaps %xmm2, (TKEYP)
1844	add $0x10, TKEYP
1845	aeskeygenassist $0x1, %xmm2, %xmm1	# round 1
1846	call _key_expansion_256a
1847	aeskeygenassist $0x1, %xmm0, %xmm1
1848	call _key_expansion_256b
1849	aeskeygenassist $0x2, %xmm2, %xmm1	# round 2
1850	call _key_expansion_256a
1851	aeskeygenassist $0x2, %xmm0, %xmm1
1852	call _key_expansion_256b
1853	aeskeygenassist $0x4, %xmm2, %xmm1	# round 3
1854	call _key_expansion_256a
1855	aeskeygenassist $0x4, %xmm0, %xmm1
1856	call _key_expansion_256b
1857	aeskeygenassist $0x8, %xmm2, %xmm1	# round 4
1858	call _key_expansion_256a
1859	aeskeygenassist $0x8, %xmm0, %xmm1
1860	call _key_expansion_256b
1861	aeskeygenassist $0x10, %xmm2, %xmm1	# round 5
1862	call _key_expansion_256a
1863	aeskeygenassist $0x10, %xmm0, %xmm1
1864	call _key_expansion_256b
1865	aeskeygenassist $0x20, %xmm2, %xmm1	# round 6
1866	call _key_expansion_256a
1867	aeskeygenassist $0x20, %xmm0, %xmm1
1868	call _key_expansion_256b
1869	aeskeygenassist $0x40, %xmm2, %xmm1	# round 7
1870	call _key_expansion_256a
1871	jmp .Ldec_key
1872.Lenc_key192:
1873	movq 0x10(UKEYP), %xmm2		# other user key
1874	aeskeygenassist $0x1, %xmm2, %xmm1	# round 1
1875	call _key_expansion_192a
1876	aeskeygenassist $0x2, %xmm2, %xmm1	# round 2
1877	call _key_expansion_192b
1878	aeskeygenassist $0x4, %xmm2, %xmm1	# round 3
1879	call _key_expansion_192a
1880	aeskeygenassist $0x8, %xmm2, %xmm1	# round 4
1881	call _key_expansion_192b
1882	aeskeygenassist $0x10, %xmm2, %xmm1	# round 5
1883	call _key_expansion_192a
1884	aeskeygenassist $0x20, %xmm2, %xmm1	# round 6
1885	call _key_expansion_192b
1886	aeskeygenassist $0x40, %xmm2, %xmm1	# round 7
1887	call _key_expansion_192a
1888	aeskeygenassist $0x80, %xmm2, %xmm1	# round 8
1889	call _key_expansion_192b
1890	jmp .Ldec_key
1891.Lenc_key128:
1892	aeskeygenassist $0x1, %xmm0, %xmm1	# round 1
1893	call _key_expansion_128
1894	aeskeygenassist $0x2, %xmm0, %xmm1	# round 2
1895	call _key_expansion_128
1896	aeskeygenassist $0x4, %xmm0, %xmm1	# round 3
1897	call _key_expansion_128
1898	aeskeygenassist $0x8, %xmm0, %xmm1	# round 4
1899	call _key_expansion_128
1900	aeskeygenassist $0x10, %xmm0, %xmm1	# round 5
1901	call _key_expansion_128
1902	aeskeygenassist $0x20, %xmm0, %xmm1	# round 6
1903	call _key_expansion_128
1904	aeskeygenassist $0x40, %xmm0, %xmm1	# round 7
1905	call _key_expansion_128
1906	aeskeygenassist $0x80, %xmm0, %xmm1	# round 8
1907	call _key_expansion_128
1908	aeskeygenassist $0x1b, %xmm0, %xmm1	# round 9
1909	call _key_expansion_128
1910	aeskeygenassist $0x36, %xmm0, %xmm1	# round 10
1911	call _key_expansion_128
1912.Ldec_key:
1913	sub $0x10, TKEYP
1914	movaps (KEYP), %xmm0
1915	movaps (TKEYP), %xmm1
1916	movaps %xmm0, 240(TKEYP)
1917	movaps %xmm1, 240(KEYP)
1918	add $0x10, KEYP
1919	lea 240-16(TKEYP), UKEYP
1920.align 4
1921.Ldec_key_loop:
1922	movaps (KEYP), %xmm0
1923	aesimc %xmm0, %xmm1
1924	movaps %xmm1, (UKEYP)
1925	add $0x10, KEYP
1926	sub $0x10, UKEYP
1927	cmp TKEYP, KEYP
1928	jb .Ldec_key_loop
1929	xor AREG, AREG
1930#ifndef __x86_64__
1931	popl KEYP
1932#endif
1933	FRAME_END
1934	RET
1935SYM_FUNC_END(aesni_set_key)
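/*
* Layout produced by aesni_set_key() above, as can be read off the offsets
* used in this file: the expanded encryption round keys start at offset 0 of
* the context, the aesimc-transformed decryption round keys at offset 240,
* and the key length (16, 24 or 32) is stored at offset 480, which is what
* aesni_enc()/aesni_dec() below load into KLEN. Illustrative use only
* (hypothetical buffer names, error handling omitted):
*
*	aesni_set_key(ctx, user_key, 32);	// expand a 256-bit key
*	aesni_enc(ctx, dst_block, src_block);	// encrypt one 16-byte block
*/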
1936
1937/*
1938 * void aesni_enc(const void *ctx, u8 *dst, const u8 *src)
1939 */
1940SYM_FUNC_START(aesni_enc)
1941	FRAME_BEGIN
1942#ifndef __x86_64__
1943	pushl KEYP
1944	pushl KLEN
1945	movl (FRAME_OFFSET+12)(%esp), KEYP	# ctx
1946	movl (FRAME_OFFSET+16)(%esp), OUTP	# dst
1947	movl (FRAME_OFFSET+20)(%esp), INP	# src
1948#endif
1949	movl 480(KEYP), KLEN		# key length
1950	movups (INP), STATE		# input
1951	call _aesni_enc1
1952	movups STATE, (OUTP)		# output
1953#ifndef __x86_64__
1954	popl KLEN
1955	popl KEYP
1956#endif
1957	FRAME_END
1958	RET
1959SYM_FUNC_END(aesni_enc)
1960
1961/*
1962 * _aesni_enc1:		internal ABI
1963 * input:
1964 *	KEYP:		key struct pointer
1965 *	KLEN:		key length
1966 *	STATE:		initial state (input)
1967 * output:
1968 *	STATE:		final state (output)
1969 * changed:
1970 *	KEY
1971 *	TKEYP (T1)
1972 */
1973SYM_FUNC_START_LOCAL(_aesni_enc1)
 
1974	movaps (KEYP), KEY		# key
1975	mov KEYP, TKEYP
1976	pxor KEY, STATE		# round 0
1977	add $0x30, TKEYP
1978	cmp $24, KLEN
1979	jb .Lenc128
1980	lea 0x20(TKEYP), TKEYP
1981	je .Lenc192
1982	add $0x20, TKEYP
1983	movaps -0x60(TKEYP), KEY
1984	aesenc KEY, STATE
1985	movaps -0x50(TKEYP), KEY
1986	aesenc KEY, STATE
1987.align 4
1988.Lenc192:
1989	movaps -0x40(TKEYP), KEY
1990	aesenc KEY, STATE
1991	movaps -0x30(TKEYP), KEY
1992	aesenc KEY, STATE
1993.align 4
1994.Lenc128:
1995	movaps -0x20(TKEYP), KEY
1996	aesenc KEY, STATE
1997	movaps -0x10(TKEYP), KEY
1998	aesenc KEY, STATE
1999	movaps (TKEYP), KEY
2000	aesenc KEY, STATE
2001	movaps 0x10(TKEYP), KEY
2002	aesenc KEY, STATE
2003	movaps 0x20(TKEYP), KEY
2004	aesenc KEY, STATE
2005	movaps 0x30(TKEYP), KEY
2006	aesenc KEY, STATE
2007	movaps 0x40(TKEYP), KEY
2008	aesenc KEY, STATE
2009	movaps 0x50(TKEYP), KEY
2010	aesenc KEY, STATE
2011	movaps 0x60(TKEYP), KEY
2012	aesenc KEY, STATE
2013	movaps 0x70(TKEYP), KEY
2014	aesenclast KEY, STATE
2015	RET
2016SYM_FUNC_END(_aesni_enc1)
2017
2018/*
2019 * _aesni_enc4:	internal ABI
2020 * input:
2021 *	KEYP:		key struct pointer
2022 *	KLEN:		key length
2023 *	STATE1:		initial state (input)
2024 *	STATE2
2025 *	STATE3
2026 *	STATE4
2027 * output:
2028 *	STATE1:		final state (output)
2029 *	STATE2
2030 *	STATE3
2031 *	STATE4
2032 * changed:
2033 *	KEY
2034 *	TKEYP (T1)
2035 */
2036SYM_FUNC_START_LOCAL(_aesni_enc4)
 
2037	movaps (KEYP), KEY		# key
2038	mov KEYP, TKEYP
2039	pxor KEY, STATE1		# round 0
2040	pxor KEY, STATE2
2041	pxor KEY, STATE3
2042	pxor KEY, STATE4
2043	add $0x30, TKEYP
2044	cmp $24, KLEN
2045	jb .L4enc128
2046	lea 0x20(TKEYP), TKEYP
2047	je .L4enc192
2048	add $0x20, TKEYP
2049	movaps -0x60(TKEYP), KEY
2050	aesenc KEY, STATE1
2051	aesenc KEY, STATE2
2052	aesenc KEY, STATE3
2053	aesenc KEY, STATE4
2054	movaps -0x50(TKEYP), KEY
2055	aesenc KEY, STATE1
2056	aesenc KEY, STATE2
2057	aesenc KEY, STATE3
2058	aesenc KEY, STATE4
2059#.align 4
2060.L4enc192:
2061	movaps -0x40(TKEYP), KEY
2062	aesenc KEY, STATE1
2063	aesenc KEY, STATE2
2064	aesenc KEY, STATE3
2065	aesenc KEY, STATE4
2066	movaps -0x30(TKEYP), KEY
2067	aesenc KEY, STATE1
2068	aesenc KEY, STATE2
2069	aesenc KEY, STATE3
2070	aesenc KEY, STATE4
2071#.align 4
2072.L4enc128:
2073	movaps -0x20(TKEYP), KEY
2074	aesenc KEY, STATE1
2075	aesenc KEY, STATE2
2076	aesenc KEY, STATE3
2077	aesenc KEY, STATE4
2078	movaps -0x10(TKEYP), KEY
2079	aesenc KEY, STATE1
2080	aesenc KEY, STATE2
2081	aesenc KEY, STATE3
2082	aesenc KEY, STATE4
2083	movaps (TKEYP), KEY
2084	aesenc KEY, STATE1
2085	aesenc KEY, STATE2
2086	aesenc KEY, STATE3
2087	aesenc KEY, STATE4
2088	movaps 0x10(TKEYP), KEY
2089	aesenc KEY, STATE1
2090	aesenc KEY, STATE2
2091	aesenc KEY, STATE3
2092	aesenc KEY, STATE4
2093	movaps 0x20(TKEYP), KEY
2094	aesenc KEY, STATE1
2095	aesenc KEY, STATE2
2096	aesenc KEY, STATE3
2097	aesenc KEY, STATE4
2098	movaps 0x30(TKEYP), KEY
2099	aesenc KEY, STATE1
2100	aesenc KEY, STATE2
2101	aesenc KEY, STATE3
2102	aesenc KEY, STATE4
2103	movaps 0x40(TKEYP), KEY
2104	aesenc KEY, STATE1
2105	aesenc KEY, STATE2
2106	aesenc KEY, STATE3
2107	aesenc KEY, STATE4
2108	movaps 0x50(TKEYP), KEY
2109	aesenc KEY, STATE1
2110	aesenc KEY, STATE2
2111	aesenc KEY, STATE3
2112	aesenc KEY, STATE4
2113	movaps 0x60(TKEYP), KEY
2114	aesenc KEY, STATE1
2115	aesenc KEY, STATE2
2116	aesenc KEY, STATE3
2117	aesenc KEY, STATE4
2118	movaps 0x70(TKEYP), KEY
2119	aesenclast KEY, STATE1		# last round
2120	aesenclast KEY, STATE2
2121	aesenclast KEY, STATE3
2122	aesenclast KEY, STATE4
2123	RET
2124SYM_FUNC_END(_aesni_enc4)
2125
2126/*
2127 * void aesni_dec (const void *ctx, u8 *dst, const u8 *src)
2128 */
2129SYM_FUNC_START(aesni_dec)
2130	FRAME_BEGIN
2131#ifndef __x86_64__
2132	pushl KEYP
2133	pushl KLEN
2134	movl (FRAME_OFFSET+12)(%esp), KEYP	# ctx
2135	movl (FRAME_OFFSET+16)(%esp), OUTP	# dst
2136	movl (FRAME_OFFSET+20)(%esp), INP	# src
2137#endif
2138	mov 480(KEYP), KLEN		# key length
2139	add $240, KEYP
2140	movups (INP), STATE		# input
2141	call _aesni_dec1
2142	movups STATE, (OUTP)		#output
2143#ifndef __x86_64__
2144	popl KLEN
2145	popl KEYP
2146#endif
2147	FRAME_END
2148	RET
2149SYM_FUNC_END(aesni_dec)
2150
2151/*
2152 * _aesni_dec1:		internal ABI
2153 * input:
2154 *	KEYP:		key struct pointer
2155 *	KLEN:		key length
2156 *	STATE:		initial state (input)
2157 * output:
2158 *	STATE:		final state (output)
2159 * changed:
2160 *	KEY
2161 *	TKEYP (T1)
2162 */
2163SYM_FUNC_START_LOCAL(_aesni_dec1)
 
2164	movaps (KEYP), KEY		# key
2165	mov KEYP, TKEYP
2166	pxor KEY, STATE		# round 0
2167	add $0x30, TKEYP
2168	cmp $24, KLEN
2169	jb .Ldec128
2170	lea 0x20(TKEYP), TKEYP
2171	je .Ldec192
2172	add $0x20, TKEYP
2173	movaps -0x60(TKEYP), KEY
2174	aesdec KEY, STATE
2175	movaps -0x50(TKEYP), KEY
2176	aesdec KEY, STATE
2177.align 4
2178.Ldec192:
2179	movaps -0x40(TKEYP), KEY
2180	aesdec KEY, STATE
2181	movaps -0x30(TKEYP), KEY
2182	aesdec KEY, STATE
2183.align 4
2184.Ldec128:
2185	movaps -0x20(TKEYP), KEY
2186	aesdec KEY, STATE
2187	movaps -0x10(TKEYP), KEY
2188	aesdec KEY, STATE
2189	movaps (TKEYP), KEY
2190	aesdec KEY, STATE
2191	movaps 0x10(TKEYP), KEY
2192	aesdec KEY, STATE
2193	movaps 0x20(TKEYP), KEY
2194	aesdec KEY, STATE
2195	movaps 0x30(TKEYP), KEY
2196	aesdec KEY, STATE
2197	movaps 0x40(TKEYP), KEY
2198	aesdec KEY, STATE
2199	movaps 0x50(TKEYP), KEY
2200	aesdec KEY, STATE
2201	movaps 0x60(TKEYP), KEY
2202	aesdec KEY, STATE
2203	movaps 0x70(TKEYP), KEY
2204	aesdeclast KEY, STATE
2205	RET
2206SYM_FUNC_END(_aesni_dec1)
2207
2208/*
2209 * _aesni_dec4:	internal ABI
2210 * input:
2211 *	KEYP:		key struct pointer
2212 *	KLEN:		key length
2213 *	STATE1:		initial state (input)
2214 *	STATE2
2215 *	STATE3
2216 *	STATE4
2217 * output:
2218 *	STATE1:		final state (output)
2219 *	STATE2
2220 *	STATE3
2221 *	STATE4
2222 * changed:
2223 *	KEY
2224 *	TKEYP (T1)
2225 */
2226SYM_FUNC_START_LOCAL(_aesni_dec4)
 
2227	movaps (KEYP), KEY		# key
2228	mov KEYP, TKEYP
2229	pxor KEY, STATE1		# round 0
2230	pxor KEY, STATE2
2231	pxor KEY, STATE3
2232	pxor KEY, STATE4
2233	add $0x30, TKEYP
2234	cmp $24, KLEN
2235	jb .L4dec128
2236	lea 0x20(TKEYP), TKEYP
2237	je .L4dec192
2238	add $0x20, TKEYP
2239	movaps -0x60(TKEYP), KEY
2240	aesdec KEY, STATE1
2241	aesdec KEY, STATE2
2242	aesdec KEY, STATE3
2243	aesdec KEY, STATE4
2244	movaps -0x50(TKEYP), KEY
2245	aesdec KEY, STATE1
2246	aesdec KEY, STATE2
2247	aesdec KEY, STATE3
2248	aesdec KEY, STATE4
2249.align 4
2250.L4dec192:
2251	movaps -0x40(TKEYP), KEY
2252	aesdec KEY, STATE1
2253	aesdec KEY, STATE2
2254	aesdec KEY, STATE3
2255	aesdec KEY, STATE4
2256	movaps -0x30(TKEYP), KEY
2257	aesdec KEY, STATE1
2258	aesdec KEY, STATE2
2259	aesdec KEY, STATE3
2260	aesdec KEY, STATE4
2261.align 4
2262.L4dec128:
2263	movaps -0x20(TKEYP), KEY
2264	aesdec KEY, STATE1
2265	aesdec KEY, STATE2
2266	aesdec KEY, STATE3
2267	aesdec KEY, STATE4
2268	movaps -0x10(TKEYP), KEY
2269	aesdec KEY, STATE1
2270	aesdec KEY, STATE2
2271	aesdec KEY, STATE3
2272	aesdec KEY, STATE4
2273	movaps (TKEYP), KEY
2274	aesdec KEY, STATE1
2275	aesdec KEY, STATE2
2276	aesdec KEY, STATE3
2277	aesdec KEY, STATE4
2278	movaps 0x10(TKEYP), KEY
2279	aesdec KEY, STATE1
2280	aesdec KEY, STATE2
2281	aesdec KEY, STATE3
2282	aesdec KEY, STATE4
2283	movaps 0x20(TKEYP), KEY
2284	aesdec KEY, STATE1
2285	aesdec KEY, STATE2
2286	aesdec KEY, STATE3
2287	aesdec KEY, STATE4
2288	movaps 0x30(TKEYP), KEY
2289	aesdec KEY, STATE1
2290	aesdec KEY, STATE2
2291	aesdec KEY, STATE3
2292	aesdec KEY, STATE4
2293	movaps 0x40(TKEYP), KEY
2294	aesdec KEY, STATE1
2295	aesdec KEY, STATE2
2296	aesdec KEY, STATE3
2297	aesdec KEY, STATE4
2298	movaps 0x50(TKEYP), KEY
2299	aesdec KEY, STATE1
2300	aesdec KEY, STATE2
2301	aesdec KEY, STATE3
2302	aesdec KEY, STATE4
2303	movaps 0x60(TKEYP), KEY
2304	aesdec KEY, STATE1
2305	aesdec KEY, STATE2
2306	aesdec KEY, STATE3
2307	aesdec KEY, STATE4
2308	movaps 0x70(TKEYP), KEY
2309	aesdeclast KEY, STATE1		# last round
2310	aesdeclast KEY, STATE2
2311	aesdeclast KEY, STATE3
2312	aesdeclast KEY, STATE4
2313	RET
2314SYM_FUNC_END(_aesni_dec4)
2315
2316/*
2317 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2318 *		      size_t len)
2319 */
2320SYM_FUNC_START(aesni_ecb_enc)
2321	FRAME_BEGIN
2322#ifndef __x86_64__
2323	pushl LEN
2324	pushl KEYP
2325	pushl KLEN
2326	movl (FRAME_OFFSET+16)(%esp), KEYP	# ctx
2327	movl (FRAME_OFFSET+20)(%esp), OUTP	# dst
2328	movl (FRAME_OFFSET+24)(%esp), INP	# src
2329	movl (FRAME_OFFSET+28)(%esp), LEN	# len
2330#endif
2331	test LEN, LEN		# check length
2332	jz .Lecb_enc_ret
2333	mov 480(KEYP), KLEN
2334	cmp $16, LEN
2335	jb .Lecb_enc_ret
2336	cmp $64, LEN
2337	jb .Lecb_enc_loop1
2338.align 4
2339.Lecb_enc_loop4:
2340	movups (INP), STATE1
2341	movups 0x10(INP), STATE2
2342	movups 0x20(INP), STATE3
2343	movups 0x30(INP), STATE4
2344	call _aesni_enc4
2345	movups STATE1, (OUTP)
2346	movups STATE2, 0x10(OUTP)
2347	movups STATE3, 0x20(OUTP)
2348	movups STATE4, 0x30(OUTP)
2349	sub $64, LEN
2350	add $64, INP
2351	add $64, OUTP
2352	cmp $64, LEN
2353	jge .Lecb_enc_loop4
2354	cmp $16, LEN
2355	jb .Lecb_enc_ret
2356.align 4
2357.Lecb_enc_loop1:
2358	movups (INP), STATE1
2359	call _aesni_enc1
2360	movups STATE1, (OUTP)
2361	sub $16, LEN
2362	add $16, INP
2363	add $16, OUTP
2364	cmp $16, LEN
2365	jge .Lecb_enc_loop1
2366.Lecb_enc_ret:
2367#ifndef __x86_64__
2368	popl KLEN
2369	popl KEYP
2370	popl LEN
2371#endif
2372	FRAME_END
2373	RET
2374SYM_FUNC_END(aesni_ecb_enc)
2375
2376/*
2377 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2378 *		      size_t len);
2379 */
2380SYM_FUNC_START(aesni_ecb_dec)
2381	FRAME_BEGIN
2382#ifndef __x86_64__
2383	pushl LEN
2384	pushl KEYP
2385	pushl KLEN
2386	movl (FRAME_OFFSET+16)(%esp), KEYP	# ctx
2387	movl (FRAME_OFFSET+20)(%esp), OUTP	# dst
2388	movl (FRAME_OFFSET+24)(%esp), INP	# src
2389	movl (FRAME_OFFSET+28)(%esp), LEN	# len
2390#endif
2391	test LEN, LEN
2392	jz .Lecb_dec_ret
2393	mov 480(KEYP), KLEN
2394	add $240, KEYP
2395	cmp $16, LEN
2396	jb .Lecb_dec_ret
2397	cmp $64, LEN
2398	jb .Lecb_dec_loop1
2399.align 4
2400.Lecb_dec_loop4:
2401	movups (INP), STATE1
2402	movups 0x10(INP), STATE2
2403	movups 0x20(INP), STATE3
2404	movups 0x30(INP), STATE4
2405	call _aesni_dec4
2406	movups STATE1, (OUTP)
2407	movups STATE2, 0x10(OUTP)
2408	movups STATE3, 0x20(OUTP)
2409	movups STATE4, 0x30(OUTP)
2410	sub $64, LEN
2411	add $64, INP
2412	add $64, OUTP
2413	cmp $64, LEN
2414	jge .Lecb_dec_loop4
2415	cmp $16, LEN
2416	jb .Lecb_dec_ret
2417.align 4
2418.Lecb_dec_loop1:
2419	movups (INP), STATE1
2420	call _aesni_dec1
2421	movups STATE1, (OUTP)
2422	sub $16, LEN
2423	add $16, INP
2424	add $16, OUTP
2425	cmp $16, LEN
2426	jge .Lecb_dec_loop1
2427.Lecb_dec_ret:
2428#ifndef __x86_64__
2429	popl KLEN
2430	popl KEYP
2431	popl LEN
2432#endif
2433	FRAME_END
2434	RET
2435SYM_FUNC_END(aesni_ecb_dec)
2436
2437/*
2438 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2439 *		      size_t len, u8 *iv)
2440 */
2441SYM_FUNC_START(aesni_cbc_enc)
2442	FRAME_BEGIN
2443#ifndef __x86_64__
2444	pushl IVP
2445	pushl LEN
2446	pushl KEYP
2447	pushl KLEN
2448	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
2449	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
2450	movl (FRAME_OFFSET+28)(%esp), INP	# src
2451	movl (FRAME_OFFSET+32)(%esp), LEN	# len
2452	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
2453#endif
2454	cmp $16, LEN
2455	jb .Lcbc_enc_ret
2456	mov 480(KEYP), KLEN
2457	movups (IVP), STATE	# load iv as initial state
2458.align 4
2459.Lcbc_enc_loop:
2460	movups (INP), IN	# load input
2461	pxor IN, STATE
2462	call _aesni_enc1
2463	movups STATE, (OUTP)	# store output
2464	sub $16, LEN
2465	add $16, INP
2466	add $16, OUTP
2467	cmp $16, LEN
2468	jge .Lcbc_enc_loop
2469	movups STATE, (IVP)
2470.Lcbc_enc_ret:
2471#ifndef __x86_64__
2472	popl KLEN
2473	popl KEYP
2474	popl LEN
2475	popl IVP
2476#endif
2477	FRAME_END
2478	RET
2479SYM_FUNC_END(aesni_cbc_enc)
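/*
 * CBC encryption is inherently serial: each ciphertext block feeds
 * the next XOR, so there is no 4-way path here.  Roughly equivalent C
 * (illustration only; aes_encrypt_block() and xor_block() are
 * hypothetical helpers):
 *
 *	if (len < 16)
 *		return;
 *	memcpy(state, iv, 16);
 *	do {
 *		xor_block(state, src);		// state ^= plaintext
 *		aes_encrypt_block(ctx, state, state);
 *		memcpy(dst, state, 16);
 *		src += 16; dst += 16; len -= 16;
 *	} while (len >= 16);
 *	memcpy(iv, state, 16);			// hand back the next IV
 */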
2480
2481/*
2482 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2483 *		      size_t len, u8 *iv)
2484 */
2485SYM_FUNC_START(aesni_cbc_dec)
2486	FRAME_BEGIN
2487#ifndef __x86_64__
2488	pushl IVP
2489	pushl LEN
2490	pushl KEYP
2491	pushl KLEN
2492	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
2493	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
2494	movl (FRAME_OFFSET+28)(%esp), INP	# src
2495	movl (FRAME_OFFSET+32)(%esp), LEN	# len
2496	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
2497#endif
2498	cmp $16, LEN
2499	jb .Lcbc_dec_just_ret
2500	mov 480(KEYP), KLEN
2501	add $240, KEYP
2502	movups (IVP), IV
2503	cmp $64, LEN
2504	jb .Lcbc_dec_loop1
2505.align 4
2506.Lcbc_dec_loop4:
2507	movups (INP), IN1
2508	movaps IN1, STATE1
2509	movups 0x10(INP), IN2
2510	movaps IN2, STATE2
2511#ifdef __x86_64__
2512	movups 0x20(INP), IN3
2513	movaps IN3, STATE3
2514	movups 0x30(INP), IN4
2515	movaps IN4, STATE4
2516#else
2517	movups 0x20(INP), IN1
2518	movaps IN1, STATE3
2519	movups 0x30(INP), IN2
2520	movaps IN2, STATE4
2521#endif
2522	call _aesni_dec4
2523	pxor IV, STATE1
2524#ifdef __x86_64__
2525	pxor IN1, STATE2
2526	pxor IN2, STATE3
2527	pxor IN3, STATE4
2528	movaps IN4, IV
2529#else
2530	pxor IN1, STATE4
2531	movaps IN2, IV
2532	movups (INP), IN1
2533	pxor IN1, STATE2
2534	movups 0x10(INP), IN2
2535	pxor IN2, STATE3
2536#endif
2537	movups STATE1, (OUTP)
2538	movups STATE2, 0x10(OUTP)
2539	movups STATE3, 0x20(OUTP)
2540	movups STATE4, 0x30(OUTP)
2541	sub $64, LEN
2542	add $64, INP
2543	add $64, OUTP
2544	cmp $64, LEN
2545	jge .Lcbc_dec_loop4
2546	cmp $16, LEN
2547	jb .Lcbc_dec_ret
2548.align 4
2549.Lcbc_dec_loop1:
2550	movups (INP), IN
2551	movaps IN, STATE
2552	call _aesni_dec1
2553	pxor IV, STATE
2554	movups STATE, (OUTP)
2555	movaps IN, IV
2556	sub $16, LEN
2557	add $16, INP
2558	add $16, OUTP
2559	cmp $16, LEN
2560	jge .Lcbc_dec_loop1
2561.Lcbc_dec_ret:
2562	movups IV, (IVP)
2563.Lcbc_dec_just_ret:
2564#ifndef __x86_64__
2565	popl KLEN
2566	popl KEYP
2567	popl LEN
2568	popl IVP
2569#endif
2570	FRAME_END
2571	RET
2572SYM_FUNC_END(aesni_cbc_dec)
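/*
 * CBC decryption has no such serial dependency: every plaintext block
 * only needs the previous *ciphertext* block, so the loop above can
 * run four aesdec chains in parallel via _aesni_dec4.  Per block, in
 * rough C (illustration only; aes_decrypt_block() and xor_block() are
 * hypothetical helpers):
 *
 *	// dst[i] = D(key, ct[i]) ^ ct[i-1],  with ct[-1] = iv
 *	aes_decrypt_block(ctx, tmp, ct);
 *	xor_block(tmp, prev_ct);
 *	memcpy(dst, tmp, 16);
 */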
2573
2574/*
2575 * void aesni_cts_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2576 *			  size_t len, u8 *iv)
2577 */
2578SYM_FUNC_START(aesni_cts_cbc_enc)
2579	FRAME_BEGIN
2580#ifndef __x86_64__
2581	pushl IVP
2582	pushl LEN
2583	pushl KEYP
2584	pushl KLEN
2585	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
2586	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
2587	movl (FRAME_OFFSET+28)(%esp), INP	# src
2588	movl (FRAME_OFFSET+32)(%esp), LEN	# len
2589	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
2590	lea .Lcts_permute_table, T1
2591#else
2592	lea .Lcts_permute_table(%rip), T1
2593#endif
2594	mov 480(KEYP), KLEN
2595	movups (IVP), STATE
2596	sub $16, LEN
2597	mov T1, IVP
2598	add $32, IVP
2599	add LEN, T1
2600	sub LEN, IVP
2601	movups (T1), %xmm4
2602	movups (IVP), %xmm5
2603
2604	movups (INP), IN1
2605	add LEN, INP
2606	movups (INP), IN2
2607
2608	pxor IN1, STATE
2609	call _aesni_enc1
2610
2611	pshufb %xmm5, IN2
2612	pxor STATE, IN2
2613	pshufb %xmm4, STATE
2614	add OUTP, LEN
2615	movups STATE, (LEN)
2616
2617	movaps IN2, STATE
2618	call _aesni_enc1
2619	movups STATE, (OUTP)
2620
2621#ifndef __x86_64__
2622	popl KLEN
2623	popl KEYP
2624	popl LEN
2625	popl IVP
2626#endif
2627	FRAME_END
2628	RET
2629SYM_FUNC_END(aesni_cts_cbc_enc)
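/*
 * The two cts routines handle exactly the last two blocks of a CBC
 * message whose length is not a multiple of 16.  LEN arrives as
 * (total - 16), i.e. the size of the trailing partial block; earlier
 * blocks are expected to have been handled by the plain CBC code.
 * For encryption, the next-to-last plaintext block is CBC-encrypted
 * first, the partial plaintext is realigned and zero-padded with a
 * pshufb mask taken from .Lcts_permute_table and XORed into that
 * ciphertext, and the second _aesni_enc1 result is stored at OUTP
 * while only the leading LEN bytes of the first ciphertext survive at
 * OUTP + 16: the usual "ciphertext stealing" block swap.  This
 * reading is inferred from the shuffle offsets rather than documented
 * in the file.
 */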
2630
2631/*
2632 * void aesni_cts_cbc_dec(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
2633 *			  size_t len, u8 *iv)
2634 */
2635SYM_FUNC_START(aesni_cts_cbc_dec)
2636	FRAME_BEGIN
2637#ifndef __x86_64__
2638	pushl IVP
2639	pushl LEN
2640	pushl KEYP
2641	pushl KLEN
2642	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
2643	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
2644	movl (FRAME_OFFSET+28)(%esp), INP	# src
2645	movl (FRAME_OFFSET+32)(%esp), LEN	# len
2646	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
2647	lea .Lcts_permute_table, T1
2648#else
2649	lea .Lcts_permute_table(%rip), T1
2650#endif
2651	mov 480(KEYP), KLEN
2652	add $240, KEYP
2653	movups (IVP), IV
2654	sub $16, LEN
2655	mov T1, IVP
2656	add $32, IVP
2657	add LEN, T1
2658	sub LEN, IVP
2659	movups (T1), %xmm4
2660
2661	movups (INP), STATE
2662	add LEN, INP
2663	movups (INP), IN1
2664
2665	call _aesni_dec1
2666	movaps STATE, IN2
2667	pshufb %xmm4, STATE
2668	pxor IN1, STATE
2669
2670	add OUTP, LEN
2671	movups STATE, (LEN)
2672
2673	movups (IVP), %xmm0
2674	pshufb %xmm0, IN1
2675	pblendvb IN2, IN1
2676	movaps IN1, STATE
2677	call _aesni_dec1
2678
2679	pxor IV, STATE
2680	movups STATE, (OUTP)
2681
2682#ifndef __x86_64__
2683	popl KLEN
2684	popl KEYP
2685	popl LEN
2686	popl IVP
2687#endif
2688	FRAME_END
2689	RET
2690SYM_FUNC_END(aesni_cts_cbc_dec)
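/*
 * On the decrypt side the order is inverted: the next-to-last
 * ciphertext block is decrypted first, its leading bytes become the
 * trailing partial plaintext, and pblendvb (which in this non-AVX
 * form implicitly uses %xmm0 as the byte selector) splices the
 * partial ciphertext back together with the tail of that decrypted
 * block before the second _aesni_dec1 and the final XOR with IV.
 */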
2691
2692.pushsection .rodata
2693.align 16
2694.Lcts_permute_table:
2695	.byte		0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
2696	.byte		0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
2697	.byte		0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
2698	.byte		0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
2699	.byte		0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
2700	.byte		0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
2701#ifdef __x86_64__
2702.Lbswap_mask:
2703	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
2704#endif
2705.popsection
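/*
 * .Lcts_permute_table is 16 bytes of 0x80, the identity permutation
 * 0x00..0x0f, then 16 more bytes of 0x80.  pshufb writes a zero byte
 * wherever the mask byte has its top bit set, so a 16-byte window
 * taken at offset LEN or (32 - LEN) acts as a combined byte shift and
 * zero pad, moving a block by (16 - LEN) bytes toward the high or low
 * end.  The CTS code above and the XTS tail handling below both rely
 * on this.
 */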
2706
2707#ifdef __x86_64__
2708/*
2709 * _aesni_inc_init:	internal ABI
2710 *	setup registers used by _aesni_inc
2711 * input:
2712 *	IV
2713 * output:
2714 *	CTR:	== IV, in little endian
2715 *	TCTR_LOW: == lower qword of CTR
2716 *	INC:	== 1, in little endian
2717 *	BSWAP_MASK == endian swapping mask
2718 */
2719SYM_FUNC_START_LOCAL(_aesni_inc_init)
2720	movaps .Lbswap_mask(%rip), BSWAP_MASK
2721	movaps IV, CTR
2722	pshufb BSWAP_MASK, CTR
2723	mov $1, TCTR_LOW
2724	movq TCTR_LOW, INC
2725	movq CTR, TCTR_LOW
2726	RET
2727SYM_FUNC_END(_aesni_inc_init)
2728
2729/*
2730 * _aesni_inc:		internal ABI
2731 *	Increase IV by 1, IV is in big endian
2732 * input:
2733 *	IV
2734 *	CTR:	== IV, in little endian
2735 *	TCTR_LOW: == lower qword of CTR
2736 *	INC:	== 1, in little endian
2737 *	BSWAP_MASK == endian swapping mask
2738 * output:
2739 *	IV:	Increased by 1
2740 * changed:
2741 *	CTR:	== output IV, in little endian
2742 *	TCTR_LOW: == lower qword of CTR
2743 */
2744SYM_FUNC_START_LOCAL(_aesni_inc)
2745	paddq INC, CTR
2746	add $1, TCTR_LOW
2747	jnc .Linc_low
2748	pslldq $8, INC
2749	paddq INC, CTR
2750	psrldq $8, INC
2751.Linc_low:
2752	movaps CTR, IV
2753	pshufb BSWAP_MASK, IV
2754	RET
2755SYM_FUNC_END(_aesni_inc)
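/*
 * Counter handling in rough C terms (illustration only): the IV is a
 * 128-bit big-endian counter, but paddq only adds within 64-bit
 * lanes, so a byte-swapped copy (CTR) plus a general-purpose mirror
 * of its low qword (TCTR_LOW) are kept to detect carries cheaply:
 *
 *	ctr.lo64 += 1;			// paddq INC, CTR   (INC == 1)
 *	tctr_low += 1;			// add $1, TCTR_LOW
 *	if (tctr_low == 0)		// the add carried out
 *		ctr.hi64 += 1;		// INC shifted into the high lane
 *	iv = byteswap128(ctr);		// pshufb BSWAP_MASK, back to BE
 *
 * byteswap128() is shorthand for the .Lbswap_mask shuffle, not a real
 * helper in this file.
 */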
2756
2757/*
2758 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
2759 *		      size_t len, u8 *iv)
2760 */
2761SYM_FUNC_START(aesni_ctr_enc)
2762	FRAME_BEGIN
2763	cmp $16, LEN
2764	jb .Lctr_enc_just_ret
2765	mov 480(KEYP), KLEN
2766	movups (IVP), IV
2767	call _aesni_inc_init
2768	cmp $64, LEN
2769	jb .Lctr_enc_loop1
2770.align 4
2771.Lctr_enc_loop4:
2772	movaps IV, STATE1
2773	call _aesni_inc
2774	movups (INP), IN1
2775	movaps IV, STATE2
2776	call _aesni_inc
2777	movups 0x10(INP), IN2
2778	movaps IV, STATE3
2779	call _aesni_inc
2780	movups 0x20(INP), IN3
2781	movaps IV, STATE4
2782	call _aesni_inc
2783	movups 0x30(INP), IN4
2784	call _aesni_enc4
2785	pxor IN1, STATE1
2786	movups STATE1, (OUTP)
2787	pxor IN2, STATE2
2788	movups STATE2, 0x10(OUTP)
2789	pxor IN3, STATE3
2790	movups STATE3, 0x20(OUTP)
2791	pxor IN4, STATE4
2792	movups STATE4, 0x30(OUTP)
2793	sub $64, LEN
2794	add $64, INP
2795	add $64, OUTP
2796	cmp $64, LEN
2797	jge .Lctr_enc_loop4
2798	cmp $16, LEN
2799	jb .Lctr_enc_ret
2800.align 4
2801.Lctr_enc_loop1:
2802	movaps IV, STATE
2803	call _aesni_inc
2804	movups (INP), IN
2805	call _aesni_enc1
2806	pxor IN, STATE
2807	movups STATE, (OUTP)
2808	sub $16, LEN
2809	add $16, INP
2810	add $16, OUTP
2811	cmp $16, LEN
2812	jge .Lctr_enc_loop1
2813.Lctr_enc_ret:
2814	movups IV, (IVP)
2815.Lctr_enc_just_ret:
2816	FRAME_END
2817	RET
2818SYM_FUNC_END(aesni_ctr_enc)
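/*
 * CTR mode only ever encrypts the counter: both directions XOR the
 * data with E(key, counter), which is presumably why only an _enc
 * entry point exists.  Rough C for the one-block tail loop
 * (illustration only; aes_encrypt_block(), xor_block() and
 * increment_be128() are hypothetical helpers):
 *
 *	while (len >= 16) {
 *		aes_encrypt_block(ctx, keystream, counter_be);
 *		xor_block(keystream, src);
 *		memcpy(dst, keystream, 16);
 *		increment_be128(counter_be);
 *		src += 16; dst += 16; len -= 16;
 *	}
 *	memcpy(iv, counter_be, 16);	// hand back the next counter
 *
 * As in the other modes, a tail shorter than 16 bytes is left for the
 * caller.
 */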
2819
2820#endif
2821
2822.section	.rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16
2823.align 16
2824.Lgf128mul_x_ble_mask:
2825	.octa 0x00000000000000010000000000000087
2826.previous
2827
2828/*
2829 * _aesni_gf128mul_x_ble:		internal ABI
2830 *	Multiply in GF(2^128) for XTS IVs
2831 * input:
2832 *	IV:	current IV
2833 *	GF128MUL_MASK == mask with 0x87 and 0x01
2834 * output:
2835 *	IV:	next IV
2836 * changed:
2837 *	KEY:	== temporary value
2838 */
2839#define _aesni_gf128mul_x_ble() \
2840	pshufd $0x13, IV, KEY; \
2841	paddq IV, IV; \
2842	psrad $31, KEY; \
2843	pand GF128MUL_MASK, KEY; \
2844	pxor KEY, IV;
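/*
 * In C terms the macro doubles the tweak in GF(2^128) using the
 * little-endian ("ble") convention XTS expects: shift the 128-bit
 * value left by one bit and, if a bit fell out of the top, XOR 0x87
 * into the low byte.  SSE has no 128-bit shift, so paddq doubles each
 * 64-bit lane separately while pshufd/psrad broadcast the sign bits
 * of the original lanes into masks that are ANDed with
 * .Lgf128mul_x_ble_mask (0x87 for the reduction, 0x01 to carry into
 * the high lane).  Rough sketch with a hypothetical lo/hi pair:
 *
 *	carry  = iv.lo >> 63;			// bit moving between lanes
 *	reduce = (iv.hi >> 63) ? 0x87 : 0;	// bit shifted out of bit 127
 *	iv.lo = (iv.lo << 1) ^ reduce;
 *	iv.hi = (iv.hi << 1) ^ carry;
 */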
2845
2846/*
2847 * void aesni_xts_encrypt(const struct crypto_aes_ctx *ctx, u8 *dst,
2848 *			  const u8 *src, unsigned int len, le128 *iv)
2849 */
2850SYM_FUNC_START(aesni_xts_encrypt)
2851	FRAME_BEGIN
2852#ifndef __x86_64__
2853	pushl IVP
2854	pushl LEN
2855	pushl KEYP
2856	pushl KLEN
2857	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
2858	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
2859	movl (FRAME_OFFSET+28)(%esp), INP	# src
2860	movl (FRAME_OFFSET+32)(%esp), LEN	# len
2861	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
2862	movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
2863#else
2864	movdqa .Lgf128mul_x_ble_mask(%rip), GF128MUL_MASK
2865#endif
2866	movups (IVP), IV
2867
2868	mov 480(KEYP), KLEN
2869
2870.Lxts_enc_loop4:
2871	sub $64, LEN
2872	jl .Lxts_enc_1x
2873
2874	movdqa IV, STATE1
2875	movdqu 0x00(INP), IN
2876	pxor IN, STATE1
2877	movdqu IV, 0x00(OUTP)
2878
2879	_aesni_gf128mul_x_ble()
2880	movdqa IV, STATE2
2881	movdqu 0x10(INP), IN
2882	pxor IN, STATE2
2883	movdqu IV, 0x10(OUTP)
2884
2885	_aesni_gf128mul_x_ble()
2886	movdqa IV, STATE3
2887	movdqu 0x20(INP), IN
2888	pxor IN, STATE3
2889	movdqu IV, 0x20(OUTP)
2890
2891	_aesni_gf128mul_x_ble()
2892	movdqa IV, STATE4
2893	movdqu 0x30(INP), IN
2894	pxor IN, STATE4
2895	movdqu IV, 0x30(OUTP)
2896
2897	call _aesni_enc4
2898
2899	movdqu 0x00(OUTP), IN
2900	pxor IN, STATE1
2901	movdqu STATE1, 0x00(OUTP)
2902
2903	movdqu 0x10(OUTP), IN
2904	pxor IN, STATE2
2905	movdqu STATE2, 0x10(OUTP)
2906
2907	movdqu 0x20(OUTP), IN
2908	pxor IN, STATE3
2909	movdqu STATE3, 0x20(OUTP)
2910
2911	movdqu 0x30(OUTP), IN
2912	pxor IN, STATE4
2913	movdqu STATE4, 0x30(OUTP)
2914
2915	_aesni_gf128mul_x_ble()
2916
2917	add $64, INP
2918	add $64, OUTP
2919	test LEN, LEN
2920	jnz .Lxts_enc_loop4
2921
2922.Lxts_enc_ret_iv:
2923	movups IV, (IVP)
2924
2925.Lxts_enc_ret:
2926#ifndef __x86_64__
2927	popl KLEN
2928	popl KEYP
2929	popl LEN
2930	popl IVP
2931#endif
2932	FRAME_END
2933	RET
2934
2935.Lxts_enc_1x:
2936	add $64, LEN
2937	jz .Lxts_enc_ret_iv
2938	sub $16, LEN
2939	jl .Lxts_enc_cts4
2940
2941.Lxts_enc_loop1:
2942	movdqu (INP), STATE
2943	pxor IV, STATE
2944	call _aesni_enc1
2945	pxor IV, STATE
2946	_aesni_gf128mul_x_ble()
2947
2948	test LEN, LEN
2949	jz .Lxts_enc_out
2950
2951	add $16, INP
2952	sub $16, LEN
2953	jl .Lxts_enc_cts1
2954
2955	movdqu STATE, (OUTP)
2956	add $16, OUTP
2957	jmp .Lxts_enc_loop1
2958
2959.Lxts_enc_out:
2960	movdqu STATE, (OUTP)
2961	jmp .Lxts_enc_ret_iv
2962
2963.Lxts_enc_cts4:
2964	movdqa STATE4, STATE
2965	sub $16, OUTP
2966
2967.Lxts_enc_cts1:
2968#ifndef __x86_64__
2969	lea .Lcts_permute_table, T1
2970#else
2971	lea .Lcts_permute_table(%rip), T1
2972#endif
2973	add LEN, INP		/* rewind input pointer */
2974	add $16, LEN		/* # bytes in final block */
2975	movups (INP), IN1
2976
2977	mov T1, IVP
2978	add $32, IVP
2979	add LEN, T1
2980	sub LEN, IVP
2981	add OUTP, LEN
2982
2983	movups (T1), %xmm4
2984	movaps STATE, IN2
2985	pshufb %xmm4, STATE
2986	movups STATE, (LEN)
2987
2988	movups (IVP), %xmm0
2989	pshufb %xmm0, IN1
2990	pblendvb IN2, IN1
2991	movaps IN1, STATE
2992
2993	pxor IV, STATE
2994	call _aesni_enc1
2995	pxor IV, STATE
2996
2997	movups STATE, (OUTP)
2998	jmp .Lxts_enc_ret
2999SYM_FUNC_END(aesni_xts_encrypt)
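/*
 * XTS per-block processing as implemented above, in rough C
 * (illustration only; aes_encrypt_block(), xor_block() and
 * gf128mul_x_ble() stand in for _aesni_enc1/_aesni_enc4 and the macro
 * above):
 *
 *	for (each full 16-byte block) {
 *		xor_block(block, tweak);	// pxor IV, STATE
 *		aes_encrypt_block(ctx, block, block);
 *		xor_block(block, tweak);	// pxor IV, STATE
 *		gf128mul_x_ble(tweak);		// advance the tweak
 *	}
 *
 * In the 4-way path the four tweaks are parked in the output buffer
 * (movdqu IV, 0x00..0x30(OUTP)) while _aesni_enc4 runs and are read
 * back for the second XOR, which avoids tying up four extra XMM
 * registers.
 */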
3000
3001/*
3002 * void aesni_xts_decrypt(const struct crypto_aes_ctx *ctx, u8 *dst,
3003 *			  const u8 *src, unsigned int len, le128 *iv)
3004 */
3005SYM_FUNC_START(aesni_xts_decrypt)
3006	FRAME_BEGIN
3007#ifndef __x86_64__
3008	pushl IVP
3009	pushl LEN
3010	pushl KEYP
3011	pushl KLEN
3012	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
3013	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
3014	movl (FRAME_OFFSET+28)(%esp), INP	# src
3015	movl (FRAME_OFFSET+32)(%esp), LEN	# len
3016	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
3017	movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
3018#else
3019	movdqa .Lgf128mul_x_ble_mask(%rip), GF128MUL_MASK
3020#endif
3021	movups (IVP), IV
3022
3023	mov 480(KEYP), KLEN
3024	add $240, KEYP
3025
3026	test $15, LEN
3027	jz .Lxts_dec_loop4
3028	sub $16, LEN
3029
3030.Lxts_dec_loop4:
3031	sub $64, LEN
3032	jl .Lxts_dec_1x
3033
3034	movdqa IV, STATE1
3035	movdqu 0x00(INP), IN
3036	pxor IN, STATE1
3037	movdqu IV, 0x00(OUTP)
3038
3039	_aesni_gf128mul_x_ble()
3040	movdqa IV, STATE2
3041	movdqu 0x10(INP), IN
3042	pxor IN, STATE2
3043	movdqu IV, 0x10(OUTP)
3044
3045	_aesni_gf128mul_x_ble()
3046	movdqa IV, STATE3
3047	movdqu 0x20(INP), IN
3048	pxor IN, STATE3
3049	movdqu IV, 0x20(OUTP)
3050
3051	_aesni_gf128mul_x_ble()
3052	movdqa IV, STATE4
3053	movdqu 0x30(INP), IN
3054	pxor IN, STATE4
3055	movdqu IV, 0x30(OUTP)
3056
3057	call _aesni_dec4
3058
3059	movdqu 0x00(OUTP), IN
3060	pxor IN, STATE1
3061	movdqu STATE1, 0x00(OUTP)
3062
3063	movdqu 0x10(OUTP), IN
3064	pxor IN, STATE2
3065	movdqu STATE2, 0x10(OUTP)
3066
3067	movdqu 0x20(OUTP), IN
3068	pxor IN, STATE3
3069	movdqu STATE3, 0x20(OUTP)
3070
3071	movdqu 0x30(OUTP), IN
3072	pxor IN, STATE4
3073	movdqu STATE4, 0x30(OUTP)
3074
3075	_aesni_gf128mul_x_ble()
3076
3077	add $64, INP
3078	add $64, OUTP
3079	test LEN, LEN
3080	jnz .Lxts_dec_loop4
3081
3082.Lxts_dec_ret_iv:
3083	movups IV, (IVP)
3084
3085.Lxts_dec_ret:
3086#ifndef __x86_64__
3087	popl KLEN
3088	popl KEYP
3089	popl LEN
3090	popl IVP
3091#endif
3092	FRAME_END
3093	RET
3094
3095.Lxts_dec_1x:
3096	add $64, LEN
3097	jz .Lxts_dec_ret_iv
3098
3099.Lxts_dec_loop1:
3100	movdqu (INP), STATE
3101
3102	add $16, INP
3103	sub $16, LEN
3104	jl .Lxts_dec_cts1
3105
3106	pxor IV, STATE
3107	call _aesni_dec1
3108	pxor IV, STATE
3109	_aesni_gf128mul_x_ble()
3110
3111	test LEN, LEN
3112	jz .Lxts_dec_out
3113
3114	movdqu STATE, (OUTP)
3115	add $16, OUTP
3116	jmp .Lxts_dec_loop1
3117
3118.Lxts_dec_out:
3119	movdqu STATE, (OUTP)
3120	jmp .Lxts_dec_ret_iv
3121
3122.Lxts_dec_cts1:
3123	movdqa IV, STATE4
3124	_aesni_gf128mul_x_ble()
3125
3126	pxor IV, STATE
3127	call _aesni_dec1
3128	pxor IV, STATE
3129
3130#ifndef __x86_64__
3131	lea .Lcts_permute_table, T1
3132#else
3133	lea .Lcts_permute_table(%rip), T1
3134#endif
3135	add LEN, INP		/* rewind input pointer */
3136	add $16, LEN		/* # bytes in final block */
3137	movups (INP), IN1
3138
3139	mov T1, IVP
3140	add $32, IVP
3141	add LEN, T1
3142	sub LEN, IVP
3143	add OUTP, LEN
3144
3145	movups (T1), %xmm4
3146	movaps STATE, IN2
3147	pshufb %xmm4, STATE
3148	movups STATE, (LEN)
3149
3150	movups (IVP), %xmm0
3151	pshufb %xmm0, IN1
3152	pblendvb IN2, IN1
3153	movaps IN1, STATE
3154
3155	pxor STATE4, STATE
3156	call _aesni_dec1
3157	pxor STATE4, STATE
3158
3159	movups STATE, (OUTP)
3160	jmp .Lxts_dec_ret
3161SYM_FUNC_END(aesni_xts_decrypt)
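/*
 * The decrypt-side ciphertext stealing differs from the encrypt side
 * in that the last two tweaks are used in swapped order: in
 * .Lxts_dec_cts1 the current tweak is saved in STATE4 before
 * _aesni_gf128mul_x_ble() advances IV, the last full ciphertext block
 * is decrypted with the advanced tweak, and the reassembled final
 * block with the saved one, matching the usual XTS-CTS ordering for
 * decryption.  The "test $15, LEN / sub $16, LEN" prologue holds one
 * block back whenever the length is not block-aligned so that the
 * main loop stops before the block that takes part in the stealing.
 */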