   1/*
   2 * Implement AES algorithm in Intel AES-NI instructions.
   3 *
   4 * The white paper of AES-NI instructions can be downloaded from:
   5 *   http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
   6 *
   7 * Copyright (C) 2008, Intel Corp.
   8 *    Author: Huang Ying <ying.huang@intel.com>
   9 *            Vinodh Gopal <vinodh.gopal@intel.com>
  10 *            Kahraman Akdemir
  11 *
  12 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
  13 * interface for 64-bit kernels.
  14 *    Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
  15 *             Aidan O'Mahony (aidan.o.mahony@intel.com)
  16 *             Adrian Hoban <adrian.hoban@intel.com>
  17 *             James Guilford (james.guilford@intel.com)
  18 *             Gabriele Paoloni <gabriele.paoloni@intel.com>
  19 *             Tadeusz Struk (tadeusz.struk@intel.com)
  20 *             Wajdi Feghali (wajdi.k.feghali@intel.com)
  21 *    Copyright (c) 2010, Intel Corporation.
  22 *
  23 * Ported x86_64 version to x86:
  24 *    Author: Mathias Krause <minipli@googlemail.com>
  25 *
  26 * This program is free software; you can redistribute it and/or modify
  27 * it under the terms of the GNU General Public License as published by
  28 * the Free Software Foundation; either version 2 of the License, or
  29 * (at your option) any later version.
  30 */
  31
  32#include <linux/linkage.h>
  33#include <asm/inst.h>
  34
  35#ifdef __x86_64__
  36.data
  37POLY:   .octa 0xC2000000000000000000000000000001
  38TWOONE: .octa 0x00000001000000000000000000000001
  39
  40# order of these constants should not change.
  41# more specifically, ALL_F should follow SHIFT_MASK,
  42# and ZERO should follow ALL_F
  43
  44SHUF_MASK:  .octa 0x000102030405060708090A0B0C0D0E0F
  45MASK1:      .octa 0x0000000000000000ffffffffffffffff
  46MASK2:      .octa 0xffffffffffffffff0000000000000000
  47SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
  48ALL_F:      .octa 0xffffffffffffffffffffffffffffffff
  49ZERO:       .octa 0x00000000000000000000000000000000
  50ONE:        .octa 0x00000000000000000000000000000001
  51F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
  52dec:        .octa 0x1
  53enc:        .octa 0x2
  54
  55
  56.text
  57
  58
  59#define	STACK_OFFSET    8*3
  60#define	HashKey		16*0	// store HashKey <<1 mod poly here
  61#define	HashKey_2	16*1	// store HashKey^2 <<1 mod poly here
  62#define	HashKey_3	16*2	// store HashKey^3 <<1 mod poly here
  63#define	HashKey_4	16*3	// store HashKey^4 <<1 mod poly here
  64#define	HashKey_k	16*4	// store XOR of High 64 bits and Low 64
  65				// bits of  HashKey <<1 mod poly here
  66				//(for Karatsuba purposes)
  67#define	HashKey_2_k	16*5	// store XOR of High 64 bits and Low 64
  68				// bits of  HashKey^2 <<1 mod poly here
  69				// (for Karatsuba purposes)
  70#define	HashKey_3_k	16*6	// store XOR of High 64 bits and Low 64
  71				// bits of  HashKey^3 <<1 mod poly here
  72				// (for Karatsuba purposes)
  73#define	HashKey_4_k	16*7	// store XOR of High 64 bits and Low 64
  74				// bits of  HashKey^4 <<1 mod poly here
  75				// (for Karatsuba purposes)
  76#define	VARIABLE_OFFSET	16*8
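# For reference (layout implied by the defines above, offsets relative to the
# aligned %rsp after the "sub $VARIABLE_OFFSET, %rsp" in aesni_gcm_enc/dec):
#
#   %rsp + 16*0  HashKey      (HashKey   << 1 mod poly)
#   %rsp + 16*1  HashKey_2    (HashKey^2 << 1 mod poly)
#   %rsp + 16*2  HashKey_3    (HashKey^3 << 1 mod poly)
#   %rsp + 16*3  HashKey_4    (HashKey^4 << 1 mod poly)
#   %rsp + 16*4  HashKey_k    (XOR of high/low halves, for Karatsuba)
#   %rsp + 16*5  HashKey_2_k
#   %rsp + 16*6  HashKey_3_k
#   %rsp + 16*7  HashKey_4_k
#   VARIABLE_OFFSET = 16*8 = 128 bytes of scratch space in total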
  77
  78#define arg1 rdi
  79#define arg2 rsi
  80#define arg3 rdx
  81#define arg4 rcx
  82#define arg5 r8
  83#define arg6 r9
  84#define arg7 STACK_OFFSET+8(%r14)
  85#define arg8 STACK_OFFSET+16(%r14)
  86#define arg9 STACK_OFFSET+24(%r14)
  87#define arg10 STACK_OFFSET+32(%r14)
  88#endif
  89
  90
  91#define STATE1	%xmm0
  92#define STATE2	%xmm4
  93#define STATE3	%xmm5
  94#define STATE4	%xmm6
  95#define STATE	STATE1
  96#define IN1	%xmm1
  97#define IN2	%xmm7
  98#define IN3	%xmm8
  99#define IN4	%xmm9
 100#define IN	IN1
 101#define KEY	%xmm2
 102#define IV	%xmm3
 103
 104#define BSWAP_MASK %xmm10
 105#define CTR	%xmm11
 106#define INC	%xmm12
 107
 108#ifdef __x86_64__
 109#define AREG	%rax
 110#define KEYP	%rdi
 111#define OUTP	%rsi
 112#define UKEYP	OUTP
 113#define INP	%rdx
 114#define LEN	%rcx
 115#define IVP	%r8
 116#define KLEN	%r9d
 117#define T1	%r10
 118#define TKEYP	T1
 119#define T2	%r11
 120#define TCTR_LOW T2
 121#else
 122#define AREG	%eax
 123#define KEYP	%edi
 124#define OUTP	AREG
 125#define UKEYP	OUTP
 126#define INP	%edx
 127#define LEN	%esi
 128#define IVP	%ebp
 129#define KLEN	%ebx
 130#define T1	%ecx
 131#define TKEYP	T1
 132#endif
 133
 134
 135#ifdef __x86_64__
 136/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
 137*
 138*
 139* Input: A and B (128-bits each, bit-reflected)
 140* Output: C = A*B*x mod poly, (i.e. >>1 )
 141* To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
 142* GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
 143*
 144*/
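#
# A worked sketch of the Karatsuba split used below: with the 128-bit
# operands divided into 64-bit halves, A = a1:a0 and B = b1:b0, the
# carry-less product is
#     A*B = a1*b1*x^128 + (a1*b0 + a0*b1)*x^64 + a0*b0
# and the middle term is obtained from a single extra multiplication as
#     a1*b0 + a0*b1 = (a1+a0)*(b1+b0) + a1*b1 + a0*b0
# (addition is XOR in GF(2)), which is why the macro issues exactly three
# PCLMULQDQ instructions before the reduction.
#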
 145.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
 146	movdqa	  \GH, \TMP1
 147	pshufd	  $78, \GH, \TMP2
 148	pshufd	  $78, \HK, \TMP3
 149	pxor	  \GH, \TMP2            # TMP2 = a1+a0
 150	pxor	  \HK, \TMP3            # TMP3 = b1+b0
 151	PCLMULQDQ 0x11, \HK, \TMP1     # TMP1 = a1*b1
 152	PCLMULQDQ 0x00, \HK, \GH       # GH = a0*b0
 153	PCLMULQDQ 0x00, \TMP3, \TMP2   # TMP2 = (a0+a1)*(b1+b0)
 154	pxor	  \GH, \TMP2
  155	pxor	  \TMP1, \TMP2          # TMP2 = a1*b0 + a0*b1 (middle term)
 156	movdqa	  \TMP2, \TMP3
 157	pslldq	  $8, \TMP3             # left shift TMP3 2 DWs
 158	psrldq	  $8, \TMP2             # right shift TMP2 2 DWs
 159	pxor	  \TMP3, \GH
  160	pxor	  \TMP2, \TMP1          # TMP1:GH holds the result of GH*HK
 161
 162        # first phase of the reduction
 163
 164	movdqa    \GH, \TMP2
 165	movdqa    \GH, \TMP3
 166	movdqa    \GH, \TMP4            # copy GH into TMP2,TMP3 and TMP4
  167					# in order to perform
 168					# independent shifts
  169	pslld     $31, \TMP2            # packed left shift <<31
  170	pslld     $30, \TMP3            # packed left shift <<30
  171	pslld     $25, \TMP4            # packed left shift <<25
 172	pxor      \TMP3, \TMP2          # xor the shifted versions
 173	pxor      \TMP4, \TMP2
 174	movdqa    \TMP2, \TMP5
 175	psrldq    $4, \TMP5             # right shift TMP5 1 DW
 176	pslldq    $12, \TMP2            # left shift TMP2 3 DWs
 177	pxor      \TMP2, \GH
 178
 179        # second phase of the reduction
 180
 181	movdqa    \GH,\TMP2             # copy GH into TMP2,TMP3 and TMP4
  182					# in order to perform
 183					# independent shifts
 184	movdqa    \GH,\TMP3
 185	movdqa    \GH,\TMP4
  186	psrld     $1,\TMP2              # packed right shift >>1
  187	psrld     $2,\TMP3              # packed right shift >>2
  188	psrld     $7,\TMP4              # packed right shift >>7
 189	pxor      \TMP3,\TMP2		# xor the shifted versions
 190	pxor      \TMP4,\TMP2
 191	pxor      \TMP5, \TMP2
 192	pxor      \TMP2, \GH
  193	pxor      \TMP1, \GH            # result is in GH
 194.endm
 195
 196/*
 197* if a = number of total plaintext bytes
 198* b = floor(a/16)
 199* num_initial_blocks = b mod 4
  200* decrypt the initial num_initial_blocks blocks and apply GHASH on
  201* the ciphertext
 202* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
 203* are clobbered
  204* arg1, %arg2, %arg3, %r14 are used as pointers only, not modified
 205*/
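#
# Example (numbers purely illustrative): for a = 100 bytes of text,
# b = floor(100/16) = 6 full blocks and num_initial_blocks = 6 mod 4 = 2,
# so two blocks are handled by this macro, the remaining four go through
# the four-block parallel loop, and the 4 leftover bytes take the
# partial-block path at the end of the function.
#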
 206
 207
 208.macro INITIAL_BLOCKS_DEC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
 209XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
 210	mov	   arg7, %r10           # %r10 = AAD
 211	mov	   arg8, %r12           # %r12 = aadLen
 212	mov	   %r12, %r11
 213	pxor	   %xmm\i, %xmm\i
 214_get_AAD_loop\num_initial_blocks\operation:
 215	movd	   (%r10), \TMP1
 216	pslldq	   $12, \TMP1
 217	psrldq	   $4, %xmm\i
 218	pxor	   \TMP1, %xmm\i
 219	add	   $4, %r10
 220	sub	   $4, %r12
 221	jne	   _get_AAD_loop\num_initial_blocks\operation
 222	cmp	   $16, %r11
 223	je	   _get_AAD_loop2_done\num_initial_blocks\operation
 224	mov	   $16, %r12
 225_get_AAD_loop2\num_initial_blocks\operation:
 226	psrldq	   $4, %xmm\i
 227	sub	   $4, %r12
 228	cmp	   %r11, %r12
 229	jne	   _get_AAD_loop2\num_initial_blocks\operation
 230_get_AAD_loop2_done\num_initial_blocks\operation:
 231        movdqa     SHUF_MASK(%rip), %xmm14
 232	PSHUFB_XMM   %xmm14, %xmm\i # byte-reflect the AAD data
 233
 234	xor	   %r11, %r11 # initialise the data pointer offset as zero
 235
 236        # start AES for num_initial_blocks blocks
 237
 238	mov	   %arg5, %rax                      # %rax = *Y0
 239	movdqu	   (%rax), \XMM0                    # XMM0 = Y0
 240        movdqa     SHUF_MASK(%rip), %xmm14
 241	PSHUFB_XMM   %xmm14, \XMM0
 242
 243.if (\i == 5) || (\i == 6) || (\i == 7)
 244.irpc index, \i_seq
 245	paddd	   ONE(%rip), \XMM0                 # INCR Y0
 246	movdqa	   \XMM0, %xmm\index
 247        movdqa     SHUF_MASK(%rip), %xmm14
 248	PSHUFB_XMM   %xmm14, %xmm\index      # perform a 16 byte swap
 249
 250.endr
 251.irpc index, \i_seq
 252	pxor	   16*0(%arg1), %xmm\index
 253.endr
 254.irpc index, \i_seq
 255	movaps 0x10(%rdi), \TMP1
 256	AESENC     \TMP1, %xmm\index          # Round 1
 257.endr
 258.irpc index, \i_seq
 259	movaps 0x20(%arg1), \TMP1
 260	AESENC     \TMP1, %xmm\index          # Round 2
 261.endr
 262.irpc index, \i_seq
 263	movaps 0x30(%arg1), \TMP1
  264	AESENC     \TMP1, %xmm\index          # Round 3
 265.endr
 266.irpc index, \i_seq
 267	movaps 0x40(%arg1), \TMP1
  268	AESENC     \TMP1, %xmm\index          # Round 4
 269.endr
 270.irpc index, \i_seq
 271	movaps 0x50(%arg1), \TMP1
  272	AESENC     \TMP1, %xmm\index          # Round 5
 273.endr
 274.irpc index, \i_seq
 275	movaps 0x60(%arg1), \TMP1
  276	AESENC     \TMP1, %xmm\index          # Round 6
 277.endr
 278.irpc index, \i_seq
 279	movaps 0x70(%arg1), \TMP1
  280	AESENC     \TMP1, %xmm\index          # Round 7
 281.endr
 282.irpc index, \i_seq
 283	movaps 0x80(%arg1), \TMP1
  284	AESENC     \TMP1, %xmm\index          # Round 8
 285.endr
 286.irpc index, \i_seq
 287	movaps 0x90(%arg1), \TMP1
  288	AESENC     \TMP1, %xmm\index          # Round 9
 289.endr
 290.irpc index, \i_seq
 291	movaps 0xa0(%arg1), \TMP1
 292	AESENCLAST \TMP1, %xmm\index         # Round 10
 293.endr
 294.irpc index, \i_seq
 295	movdqu	   (%arg3 , %r11, 1), \TMP1
 296	pxor	   \TMP1, %xmm\index
 297	movdqu	   %xmm\index, (%arg2 , %r11, 1)
 298	# write back plaintext/ciphertext for num_initial_blocks
 299	add	   $16, %r11
 300
 301	movdqa     \TMP1, %xmm\index
 302        movdqa     SHUF_MASK(%rip), %xmm14
 303	PSHUFB_XMM	   %xmm14, %xmm\index
 304
 305		# prepare plaintext/ciphertext for GHASH computation
 306.endr
 307.endif
 308	GHASH_MUL  %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 309        # apply GHASH on num_initial_blocks blocks
 310
 311.if \i == 5
 312        pxor       %xmm5, %xmm6
 313	GHASH_MUL  %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 314        pxor       %xmm6, %xmm7
 315	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 316        pxor       %xmm7, %xmm8
 317	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 318.elseif \i == 6
 319        pxor       %xmm6, %xmm7
 320	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 321        pxor       %xmm7, %xmm8
 322	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 323.elseif \i == 7
 324        pxor       %xmm7, %xmm8
 325	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 326.endif
 327	cmp	   $64, %r13
 328	jl	_initial_blocks_done\num_initial_blocks\operation
 329	# no need for precomputed values
 330/*
 331*
 332* Precomputations for HashKey parallel with encryption of first 4 blocks.
  333* HashKey_i_k holds XORed values of the low and high parts of HashKey_i
 334*/
 335	paddd	   ONE(%rip), \XMM0              # INCR Y0
 336	movdqa	   \XMM0, \XMM1
 337        movdqa     SHUF_MASK(%rip), %xmm14
 338	PSHUFB_XMM  %xmm14, \XMM1        # perform a 16 byte swap
 339
 340	paddd	   ONE(%rip), \XMM0              # INCR Y0
 341	movdqa	   \XMM0, \XMM2
 342        movdqa     SHUF_MASK(%rip), %xmm14
 343	PSHUFB_XMM  %xmm14, \XMM2        # perform a 16 byte swap
 344
 345	paddd	   ONE(%rip), \XMM0              # INCR Y0
 346	movdqa	   \XMM0, \XMM3
 347        movdqa     SHUF_MASK(%rip), %xmm14
 348	PSHUFB_XMM %xmm14, \XMM3        # perform a 16 byte swap
 349
 350	paddd	   ONE(%rip), \XMM0              # INCR Y0
 351	movdqa	   \XMM0, \XMM4
 352        movdqa     SHUF_MASK(%rip), %xmm14
 353	PSHUFB_XMM %xmm14, \XMM4        # perform a 16 byte swap
 354
 355	pxor	   16*0(%arg1), \XMM1
 356	pxor	   16*0(%arg1), \XMM2
 357	pxor	   16*0(%arg1), \XMM3
 358	pxor	   16*0(%arg1), \XMM4
 359	movdqa	   \TMP3, \TMP5
 360	pshufd	   $78, \TMP3, \TMP1
 361	pxor	   \TMP3, \TMP1
 362	movdqa	   \TMP1, HashKey_k(%rsp)
 363	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
 364# TMP5 = HashKey^2<<1 (mod poly)
 365	movdqa	   \TMP5, HashKey_2(%rsp)
 366# HashKey_2 = HashKey^2<<1 (mod poly)
 367	pshufd	   $78, \TMP5, \TMP1
 368	pxor	   \TMP5, \TMP1
 369	movdqa	   \TMP1, HashKey_2_k(%rsp)
 370.irpc index, 1234 # do 4 rounds
 371	movaps 0x10*\index(%arg1), \TMP1
 372	AESENC	   \TMP1, \XMM1
 373	AESENC	   \TMP1, \XMM2
 374	AESENC	   \TMP1, \XMM3
 375	AESENC	   \TMP1, \XMM4
 376.endr
 377	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
 378# TMP5 = HashKey^3<<1 (mod poly)
 379	movdqa	   \TMP5, HashKey_3(%rsp)
 380	pshufd	   $78, \TMP5, \TMP1
 381	pxor	   \TMP5, \TMP1
 382	movdqa	   \TMP1, HashKey_3_k(%rsp)
 383.irpc index, 56789 # do next 5 rounds
 384	movaps 0x10*\index(%arg1), \TMP1
 385	AESENC	   \TMP1, \XMM1
 386	AESENC	   \TMP1, \XMM2
 387	AESENC	   \TMP1, \XMM3
 388	AESENC	   \TMP1, \XMM4
 389.endr
 390	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
  391# TMP5 = HashKey^4<<1 (mod poly)
 392	movdqa	   \TMP5, HashKey_4(%rsp)
 393	pshufd	   $78, \TMP5, \TMP1
 394	pxor	   \TMP5, \TMP1
 395	movdqa	   \TMP1, HashKey_4_k(%rsp)
 396	movaps 0xa0(%arg1), \TMP2
 397	AESENCLAST \TMP2, \XMM1
 398	AESENCLAST \TMP2, \XMM2
 399	AESENCLAST \TMP2, \XMM3
 400	AESENCLAST \TMP2, \XMM4
 401	movdqu	   16*0(%arg3 , %r11 , 1), \TMP1
 402	pxor	   \TMP1, \XMM1
 403	movdqu	   \XMM1, 16*0(%arg2 , %r11 , 1)
 404	movdqa     \TMP1, \XMM1
 405	movdqu	   16*1(%arg3 , %r11 , 1), \TMP1
 406	pxor	   \TMP1, \XMM2
 407	movdqu	   \XMM2, 16*1(%arg2 , %r11 , 1)
 408	movdqa     \TMP1, \XMM2
 409	movdqu	   16*2(%arg3 , %r11 , 1), \TMP1
 410	pxor	   \TMP1, \XMM3
 411	movdqu	   \XMM3, 16*2(%arg2 , %r11 , 1)
 412	movdqa     \TMP1, \XMM3
 413	movdqu	   16*3(%arg3 , %r11 , 1), \TMP1
 414	pxor	   \TMP1, \XMM4
 415	movdqu	   \XMM4, 16*3(%arg2 , %r11 , 1)
 416	movdqa     \TMP1, \XMM4
 417	add	   $64, %r11
 418        movdqa     SHUF_MASK(%rip), %xmm14
 419	PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
 420	pxor	   \XMMDst, \XMM1
 421# combine GHASHed value with the corresponding ciphertext
 422        movdqa     SHUF_MASK(%rip), %xmm14
 423	PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
 424        movdqa     SHUF_MASK(%rip), %xmm14
 425	PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
 426        movdqa     SHUF_MASK(%rip), %xmm14
 427	PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
 428
 429_initial_blocks_done\num_initial_blocks\operation:
 430
 431.endm
 432
 433
 434/*
 435* if a = number of total plaintext bytes
 436* b = floor(a/16)
 437* num_initial_blocks = b mod 4
 438* encrypt the initial num_initial_blocks blocks and apply ghash on
 439* the ciphertext
 440* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
 441* are clobbered
  442* arg1, %arg2, %arg3, %r14 are used as pointers only, not modified
 443*/
 444
 445
 446.macro INITIAL_BLOCKS_ENC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
 447XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
 448	mov	   arg7, %r10           # %r10 = AAD
 449	mov	   arg8, %r12           # %r12 = aadLen
 450	mov	   %r12, %r11
 451	pxor	   %xmm\i, %xmm\i
 452_get_AAD_loop\num_initial_blocks\operation:
 453	movd	   (%r10), \TMP1
 454	pslldq	   $12, \TMP1
 455	psrldq	   $4, %xmm\i
 456	pxor	   \TMP1, %xmm\i
 457	add	   $4, %r10
 458	sub	   $4, %r12
 459	jne	   _get_AAD_loop\num_initial_blocks\operation
 460	cmp	   $16, %r11
 461	je	   _get_AAD_loop2_done\num_initial_blocks\operation
 462	mov	   $16, %r12
 463_get_AAD_loop2\num_initial_blocks\operation:
 464	psrldq	   $4, %xmm\i
 465	sub	   $4, %r12
 466	cmp	   %r11, %r12
 467	jne	   _get_AAD_loop2\num_initial_blocks\operation
 468_get_AAD_loop2_done\num_initial_blocks\operation:
 469        movdqa     SHUF_MASK(%rip), %xmm14
 470	PSHUFB_XMM   %xmm14, %xmm\i # byte-reflect the AAD data
 471
 472	xor	   %r11, %r11 # initialise the data pointer offset as zero
 473
 474        # start AES for num_initial_blocks blocks
 475
 476	mov	   %arg5, %rax                      # %rax = *Y0
 477	movdqu	   (%rax), \XMM0                    # XMM0 = Y0
 478        movdqa     SHUF_MASK(%rip), %xmm14
 479	PSHUFB_XMM   %xmm14, \XMM0
 480
 481.if (\i == 5) || (\i == 6) || (\i == 7)
 482.irpc index, \i_seq
 483	paddd	   ONE(%rip), \XMM0                 # INCR Y0
 484	movdqa	   \XMM0, %xmm\index
 485        movdqa     SHUF_MASK(%rip), %xmm14
 486	PSHUFB_XMM   %xmm14, %xmm\index      # perform a 16 byte swap
 487
 488.endr
 489.irpc index, \i_seq
 490	pxor	   16*0(%arg1), %xmm\index
 491.endr
 492.irpc index, \i_seq
 493	movaps 0x10(%rdi), \TMP1
 494	AESENC     \TMP1, %xmm\index          # Round 1
 495.endr
 496.irpc index, \i_seq
 497	movaps 0x20(%arg1), \TMP1
 498	AESENC     \TMP1, %xmm\index          # Round 2
 499.endr
 500.irpc index, \i_seq
 501	movaps 0x30(%arg1), \TMP1
  502	AESENC     \TMP1, %xmm\index          # Round 3
 503.endr
 504.irpc index, \i_seq
 505	movaps 0x40(%arg1), \TMP1
  506	AESENC     \TMP1, %xmm\index          # Round 4
 507.endr
 508.irpc index, \i_seq
 509	movaps 0x50(%arg1), \TMP1
  510	AESENC     \TMP1, %xmm\index          # Round 5
 511.endr
 512.irpc index, \i_seq
 513	movaps 0x60(%arg1), \TMP1
  514	AESENC     \TMP1, %xmm\index          # Round 6
 515.endr
 516.irpc index, \i_seq
 517	movaps 0x70(%arg1), \TMP1
  518	AESENC     \TMP1, %xmm\index          # Round 7
 519.endr
 520.irpc index, \i_seq
 521	movaps 0x80(%arg1), \TMP1
  522	AESENC     \TMP1, %xmm\index          # Round 8
 523.endr
 524.irpc index, \i_seq
 525	movaps 0x90(%arg1), \TMP1
  526	AESENC     \TMP1, %xmm\index          # Round 9
 527.endr
 528.irpc index, \i_seq
 529	movaps 0xa0(%arg1), \TMP1
 530	AESENCLAST \TMP1, %xmm\index         # Round 10
 531.endr
 532.irpc index, \i_seq
 533	movdqu	   (%arg3 , %r11, 1), \TMP1
 534	pxor	   \TMP1, %xmm\index
 535	movdqu	   %xmm\index, (%arg2 , %r11, 1)
 536	# write back plaintext/ciphertext for num_initial_blocks
 537	add	   $16, %r11
 538
 539        movdqa     SHUF_MASK(%rip), %xmm14
 540	PSHUFB_XMM	   %xmm14, %xmm\index
 541
 542		# prepare plaintext/ciphertext for GHASH computation
 543.endr
 544.endif
 545	GHASH_MUL  %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 546        # apply GHASH on num_initial_blocks blocks
 547
 548.if \i == 5
 549        pxor       %xmm5, %xmm6
 550	GHASH_MUL  %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 551        pxor       %xmm6, %xmm7
 552	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 553        pxor       %xmm7, %xmm8
 554	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 555.elseif \i == 6
 556        pxor       %xmm6, %xmm7
 557	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 558        pxor       %xmm7, %xmm8
 559	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 560.elseif \i == 7
 561        pxor       %xmm7, %xmm8
 562	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 563.endif
 564	cmp	   $64, %r13
 565	jl	_initial_blocks_done\num_initial_blocks\operation
 566	# no need for precomputed values
 567/*
 568*
 569* Precomputations for HashKey parallel with encryption of first 4 blocks.
  570* HashKey_i_k holds XORed values of the low and high parts of HashKey_i
 571*/
 572	paddd	   ONE(%rip), \XMM0              # INCR Y0
 573	movdqa	   \XMM0, \XMM1
 574        movdqa     SHUF_MASK(%rip), %xmm14
 575	PSHUFB_XMM  %xmm14, \XMM1        # perform a 16 byte swap
 576
 577	paddd	   ONE(%rip), \XMM0              # INCR Y0
 578	movdqa	   \XMM0, \XMM2
 579        movdqa     SHUF_MASK(%rip), %xmm14
 580	PSHUFB_XMM  %xmm14, \XMM2        # perform a 16 byte swap
 581
 582	paddd	   ONE(%rip), \XMM0              # INCR Y0
 583	movdqa	   \XMM0, \XMM3
 584        movdqa     SHUF_MASK(%rip), %xmm14
 585	PSHUFB_XMM %xmm14, \XMM3        # perform a 16 byte swap
 586
 587	paddd	   ONE(%rip), \XMM0              # INCR Y0
 588	movdqa	   \XMM0, \XMM4
 589        movdqa     SHUF_MASK(%rip), %xmm14
 590	PSHUFB_XMM %xmm14, \XMM4        # perform a 16 byte swap
 591
 592	pxor	   16*0(%arg1), \XMM1
 593	pxor	   16*0(%arg1), \XMM2
 594	pxor	   16*0(%arg1), \XMM3
 595	pxor	   16*0(%arg1), \XMM4
 596	movdqa	   \TMP3, \TMP5
 597	pshufd	   $78, \TMP3, \TMP1
 598	pxor	   \TMP3, \TMP1
 599	movdqa	   \TMP1, HashKey_k(%rsp)
 600	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
 601# TMP5 = HashKey^2<<1 (mod poly)
 602	movdqa	   \TMP5, HashKey_2(%rsp)
 603# HashKey_2 = HashKey^2<<1 (mod poly)
 604	pshufd	   $78, \TMP5, \TMP1
 605	pxor	   \TMP5, \TMP1
 606	movdqa	   \TMP1, HashKey_2_k(%rsp)
 607.irpc index, 1234 # do 4 rounds
 608	movaps 0x10*\index(%arg1), \TMP1
 609	AESENC	   \TMP1, \XMM1
 610	AESENC	   \TMP1, \XMM2
 611	AESENC	   \TMP1, \XMM3
 612	AESENC	   \TMP1, \XMM4
 613.endr
 614	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
 615# TMP5 = HashKey^3<<1 (mod poly)
 616	movdqa	   \TMP5, HashKey_3(%rsp)
 617	pshufd	   $78, \TMP5, \TMP1
 618	pxor	   \TMP5, \TMP1
 619	movdqa	   \TMP1, HashKey_3_k(%rsp)
 620.irpc index, 56789 # do next 5 rounds
 621	movaps 0x10*\index(%arg1), \TMP1
 622	AESENC	   \TMP1, \XMM1
 623	AESENC	   \TMP1, \XMM2
 624	AESENC	   \TMP1, \XMM3
 625	AESENC	   \TMP1, \XMM4
 626.endr
 627	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
  628# TMP5 = HashKey^4<<1 (mod poly)
 629	movdqa	   \TMP5, HashKey_4(%rsp)
 630	pshufd	   $78, \TMP5, \TMP1
 631	pxor	   \TMP5, \TMP1
 632	movdqa	   \TMP1, HashKey_4_k(%rsp)
 633	movaps 0xa0(%arg1), \TMP2
 634	AESENCLAST \TMP2, \XMM1
 635	AESENCLAST \TMP2, \XMM2
 636	AESENCLAST \TMP2, \XMM3
 637	AESENCLAST \TMP2, \XMM4
 638	movdqu	   16*0(%arg3 , %r11 , 1), \TMP1
 639	pxor	   \TMP1, \XMM1
 640	movdqu	   16*1(%arg3 , %r11 , 1), \TMP1
 641	pxor	   \TMP1, \XMM2
 642	movdqu	   16*2(%arg3 , %r11 , 1), \TMP1
 643	pxor	   \TMP1, \XMM3
 644	movdqu	   16*3(%arg3 , %r11 , 1), \TMP1
 645	pxor	   \TMP1, \XMM4
 646	movdqu     \XMM1, 16*0(%arg2 , %r11 , 1)
 647	movdqu     \XMM2, 16*1(%arg2 , %r11 , 1)
 648	movdqu     \XMM3, 16*2(%arg2 , %r11 , 1)
 649	movdqu     \XMM4, 16*3(%arg2 , %r11 , 1)
 650
 651	add	   $64, %r11
 652        movdqa     SHUF_MASK(%rip), %xmm14
 653	PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
 654	pxor	   \XMMDst, \XMM1
 655# combine GHASHed value with the corresponding ciphertext
 656        movdqa     SHUF_MASK(%rip), %xmm14
 657	PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
 658        movdqa     SHUF_MASK(%rip), %xmm14
 659	PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
 660        movdqa     SHUF_MASK(%rip), %xmm14
 661	PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
 662
 663_initial_blocks_done\num_initial_blocks\operation:
 664
 665.endm
 666
 667/*
 668* encrypt 4 blocks at a time
 669* ghash the 4 previously encrypted ciphertext blocks
 670* arg1, %arg2, %arg3 are used as pointers only, not modified
 671* %r11 is the data offset value
 672*/
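#
# Why HashKey^1..HashKey^4 are all precomputed: folding four ciphertext
# blocks C1..C4 into the running hash Y in one pass uses the identity
#     Y' = ((((Y+C1)*H + C2)*H + C3)*H + C4)*H
#        = (Y+C1)*H^4 + C2*H^3 + C3*H^2 + C4*H
# so the four Karatsuba multiplies below are independent of each other and
# can be interleaved with the AES rounds for the next four counter blocks.
#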
 673.macro GHASH_4_ENCRYPT_4_PARALLEL_ENC TMP1 TMP2 TMP3 TMP4 TMP5 \
 674TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
 675
 676	movdqa	  \XMM1, \XMM5
 677	movdqa	  \XMM2, \XMM6
 678	movdqa	  \XMM3, \XMM7
 679	movdqa	  \XMM4, \XMM8
 680
 681        movdqa    SHUF_MASK(%rip), %xmm15
  682        # multiply XMM5 * HashKey^4 (loaded into TMP5) using Karatsuba
 683
 684	movdqa	  \XMM5, \TMP4
 685	pshufd	  $78, \XMM5, \TMP6
 686	pxor	  \XMM5, \TMP6
 687	paddd     ONE(%rip), \XMM0		# INCR CNT
 688	movdqa	  HashKey_4(%rsp), \TMP5
 689	PCLMULQDQ 0x11, \TMP5, \TMP4           # TMP4 = a1*b1
 690	movdqa    \XMM0, \XMM1
 691	paddd     ONE(%rip), \XMM0		# INCR CNT
 692	movdqa    \XMM0, \XMM2
 693	paddd     ONE(%rip), \XMM0		# INCR CNT
 694	movdqa    \XMM0, \XMM3
 695	paddd     ONE(%rip), \XMM0		# INCR CNT
 696	movdqa    \XMM0, \XMM4
 697	PSHUFB_XMM %xmm15, \XMM1	# perform a 16 byte swap
 698	PCLMULQDQ 0x00, \TMP5, \XMM5           # XMM5 = a0*b0
 699	PSHUFB_XMM %xmm15, \XMM2	# perform a 16 byte swap
 700	PSHUFB_XMM %xmm15, \XMM3	# perform a 16 byte swap
 701	PSHUFB_XMM %xmm15, \XMM4	# perform a 16 byte swap
 702
 703	pxor	  (%arg1), \XMM1
 704	pxor	  (%arg1), \XMM2
 705	pxor	  (%arg1), \XMM3
 706	pxor	  (%arg1), \XMM4
 707	movdqa	  HashKey_4_k(%rsp), \TMP5
 708	PCLMULQDQ 0x00, \TMP5, \TMP6           # TMP6 = (a1+a0)*(b1+b0)
 709	movaps 0x10(%arg1), \TMP1
 710	AESENC	  \TMP1, \XMM1              # Round 1
 711	AESENC	  \TMP1, \XMM2
 712	AESENC	  \TMP1, \XMM3
 713	AESENC	  \TMP1, \XMM4
 714	movaps 0x20(%arg1), \TMP1
 715	AESENC	  \TMP1, \XMM1              # Round 2
 716	AESENC	  \TMP1, \XMM2
 717	AESENC	  \TMP1, \XMM3
 718	AESENC	  \TMP1, \XMM4
 719	movdqa	  \XMM6, \TMP1
 720	pshufd	  $78, \XMM6, \TMP2
 721	pxor	  \XMM6, \TMP2
 722	movdqa	  HashKey_3(%rsp), \TMP5
 723	PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1 * b1
 724	movaps 0x30(%arg1), \TMP3
 725	AESENC    \TMP3, \XMM1              # Round 3
 726	AESENC    \TMP3, \XMM2
 727	AESENC    \TMP3, \XMM3
 728	AESENC    \TMP3, \XMM4
 729	PCLMULQDQ 0x00, \TMP5, \XMM6           # XMM6 = a0*b0
 730	movaps 0x40(%arg1), \TMP3
 731	AESENC	  \TMP3, \XMM1              # Round 4
 732	AESENC	  \TMP3, \XMM2
 733	AESENC	  \TMP3, \XMM3
 734	AESENC	  \TMP3, \XMM4
 735	movdqa	  HashKey_3_k(%rsp), \TMP5
 736	PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
 737	movaps 0x50(%arg1), \TMP3
 738	AESENC	  \TMP3, \XMM1              # Round 5
 739	AESENC	  \TMP3, \XMM2
 740	AESENC	  \TMP3, \XMM3
 741	AESENC	  \TMP3, \XMM4
 742	pxor	  \TMP1, \TMP4
 743# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
 744	pxor	  \XMM6, \XMM5
 745	pxor	  \TMP2, \TMP6
 746	movdqa	  \XMM7, \TMP1
 747	pshufd	  $78, \XMM7, \TMP2
 748	pxor	  \XMM7, \TMP2
 749	movdqa	  HashKey_2(%rsp ), \TMP5
 750
  751        # Multiply XMM7 * HashKey^2 (in TMP5) using Karatsuba
 752
 753	PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1*b1
 754	movaps 0x60(%arg1), \TMP3
 755	AESENC	  \TMP3, \XMM1              # Round 6
 756	AESENC	  \TMP3, \XMM2
 757	AESENC	  \TMP3, \XMM3
 758	AESENC	  \TMP3, \XMM4
 759	PCLMULQDQ 0x00, \TMP5, \XMM7           # XMM7 = a0*b0
 760	movaps 0x70(%arg1), \TMP3
 761	AESENC	  \TMP3, \XMM1             # Round 7
 762	AESENC	  \TMP3, \XMM2
 763	AESENC	  \TMP3, \XMM3
 764	AESENC	  \TMP3, \XMM4
 765	movdqa	  HashKey_2_k(%rsp), \TMP5
 766	PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
 767	movaps 0x80(%arg1), \TMP3
 768	AESENC	  \TMP3, \XMM1             # Round 8
 769	AESENC	  \TMP3, \XMM2
 770	AESENC	  \TMP3, \XMM3
 771	AESENC	  \TMP3, \XMM4
 772	pxor	  \TMP1, \TMP4
 773# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
 774	pxor	  \XMM7, \XMM5
 775	pxor	  \TMP2, \TMP6
 776
 777        # Multiply XMM8 * HashKey
 778        # XMM8 and TMP5 hold the values for the two operands
 779
 780	movdqa	  \XMM8, \TMP1
 781	pshufd	  $78, \XMM8, \TMP2
 782	pxor	  \XMM8, \TMP2
 783	movdqa	  HashKey(%rsp), \TMP5
 784	PCLMULQDQ 0x11, \TMP5, \TMP1          # TMP1 = a1*b1
 785	movaps 0x90(%arg1), \TMP3
 786	AESENC	  \TMP3, \XMM1            # Round 9
 787	AESENC	  \TMP3, \XMM2
 788	AESENC	  \TMP3, \XMM3
 789	AESENC	  \TMP3, \XMM4
 790	PCLMULQDQ 0x00, \TMP5, \XMM8          # XMM8 = a0*b0
 791	movaps 0xa0(%arg1), \TMP3
 792	AESENCLAST \TMP3, \XMM1           # Round 10
 793	AESENCLAST \TMP3, \XMM2
 794	AESENCLAST \TMP3, \XMM3
 795	AESENCLAST \TMP3, \XMM4
 796	movdqa    HashKey_k(%rsp), \TMP5
 797	PCLMULQDQ 0x00, \TMP5, \TMP2          # TMP2 = (a1+a0)*(b1+b0)
 798	movdqu	  (%arg3,%r11,1), \TMP3
 799	pxor	  \TMP3, \XMM1                 # Ciphertext/Plaintext XOR EK
 800	movdqu	  16(%arg3,%r11,1), \TMP3
 801	pxor	  \TMP3, \XMM2                 # Ciphertext/Plaintext XOR EK
 802	movdqu	  32(%arg3,%r11,1), \TMP3
 803	pxor	  \TMP3, \XMM3                 # Ciphertext/Plaintext XOR EK
 804	movdqu	  48(%arg3,%r11,1), \TMP3
 805	pxor	  \TMP3, \XMM4                 # Ciphertext/Plaintext XOR EK
 806        movdqu    \XMM1, (%arg2,%r11,1)        # Write to the ciphertext buffer
 807        movdqu    \XMM2, 16(%arg2,%r11,1)      # Write to the ciphertext buffer
 808        movdqu    \XMM3, 32(%arg2,%r11,1)      # Write to the ciphertext buffer
 809        movdqu    \XMM4, 48(%arg2,%r11,1)      # Write to the ciphertext buffer
 810	PSHUFB_XMM %xmm15, \XMM1        # perform a 16 byte swap
 811	PSHUFB_XMM %xmm15, \XMM2	# perform a 16 byte swap
 812	PSHUFB_XMM %xmm15, \XMM3	# perform a 16 byte swap
 813	PSHUFB_XMM %xmm15, \XMM4	# perform a 16 byte swap
 814
 815	pxor	  \TMP4, \TMP1
 816	pxor	  \XMM8, \XMM5
 817	pxor	  \TMP6, \TMP2
 818	pxor	  \TMP1, \TMP2
 819	pxor	  \XMM5, \TMP2
 820	movdqa	  \TMP2, \TMP3
 821	pslldq	  $8, \TMP3                    # left shift TMP3 2 DWs
 822	psrldq	  $8, \TMP2                    # right shift TMP2 2 DWs
 823	pxor	  \TMP3, \XMM5
 824	pxor	  \TMP2, \TMP1	  # accumulate the results in TMP1:XMM5
 825
 826        # first phase of reduction
 827
 828	movdqa    \XMM5, \TMP2
 829	movdqa    \XMM5, \TMP3
 830	movdqa    \XMM5, \TMP4
 831# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
  832	pslld     $31, \TMP2                   # packed left shift << 31
  833	pslld     $30, \TMP3                   # packed left shift << 30
  834	pslld     $25, \TMP4                   # packed left shift << 25
 835	pxor      \TMP3, \TMP2	               # xor the shifted versions
 836	pxor      \TMP4, \TMP2
 837	movdqa    \TMP2, \TMP5
 838	psrldq    $4, \TMP5                    # right shift T5 1 DW
 839	pslldq    $12, \TMP2                   # left shift T2 3 DWs
 840	pxor      \TMP2, \XMM5
 841
 842        # second phase of reduction
 843
 844	movdqa    \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
 845	movdqa    \XMM5,\TMP3
 846	movdqa    \XMM5,\TMP4
  847	psrld     $1, \TMP2                    # packed right shift >>1
  848	psrld     $2, \TMP3                    # packed right shift >>2
  849	psrld     $7, \TMP4                    # packed right shift >>7
 850	pxor      \TMP3,\TMP2		       # xor the shifted versions
 851	pxor      \TMP4,\TMP2
 852	pxor      \TMP5, \TMP2
 853	pxor      \TMP2, \XMM5
  854	pxor      \TMP1, \XMM5                 # result is in XMM5
 855
 856	pxor	  \XMM5, \XMM1
 857.endm
 858
 859/*
 860* decrypt 4 blocks at a time
 861* ghash the 4 previously decrypted ciphertext blocks
 862* arg1, %arg2, %arg3 are used as pointers only, not modified
 863* %r11 is the data offset value
 864*/
 865.macro GHASH_4_ENCRYPT_4_PARALLEL_DEC TMP1 TMP2 TMP3 TMP4 TMP5 \
 866TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
 867
 868	movdqa	  \XMM1, \XMM5
 869	movdqa	  \XMM2, \XMM6
 870	movdqa	  \XMM3, \XMM7
 871	movdqa	  \XMM4, \XMM8
 872
 873        movdqa    SHUF_MASK(%rip), %xmm15
  874        # multiply XMM5 * HashKey^4 (loaded into TMP5) using Karatsuba
 875
 876	movdqa	  \XMM5, \TMP4
 877	pshufd	  $78, \XMM5, \TMP6
 878	pxor	  \XMM5, \TMP6
 879	paddd     ONE(%rip), \XMM0		# INCR CNT
 880	movdqa	  HashKey_4(%rsp), \TMP5
 881	PCLMULQDQ 0x11, \TMP5, \TMP4           # TMP4 = a1*b1
 882	movdqa    \XMM0, \XMM1
 883	paddd     ONE(%rip), \XMM0		# INCR CNT
 884	movdqa    \XMM0, \XMM2
 885	paddd     ONE(%rip), \XMM0		# INCR CNT
 886	movdqa    \XMM0, \XMM3
 887	paddd     ONE(%rip), \XMM0		# INCR CNT
 888	movdqa    \XMM0, \XMM4
 889	PSHUFB_XMM %xmm15, \XMM1	# perform a 16 byte swap
 890	PCLMULQDQ 0x00, \TMP5, \XMM5           # XMM5 = a0*b0
 891	PSHUFB_XMM %xmm15, \XMM2	# perform a 16 byte swap
 892	PSHUFB_XMM %xmm15, \XMM3	# perform a 16 byte swap
 893	PSHUFB_XMM %xmm15, \XMM4	# perform a 16 byte swap
 894
 895	pxor	  (%arg1), \XMM1
 896	pxor	  (%arg1), \XMM2
 897	pxor	  (%arg1), \XMM3
 898	pxor	  (%arg1), \XMM4
 899	movdqa	  HashKey_4_k(%rsp), \TMP5
 900	PCLMULQDQ 0x00, \TMP5, \TMP6           # TMP6 = (a1+a0)*(b1+b0)
 901	movaps 0x10(%arg1), \TMP1
 902	AESENC	  \TMP1, \XMM1              # Round 1
 903	AESENC	  \TMP1, \XMM2
 904	AESENC	  \TMP1, \XMM3
 905	AESENC	  \TMP1, \XMM4
 906	movaps 0x20(%arg1), \TMP1
 907	AESENC	  \TMP1, \XMM1              # Round 2
 908	AESENC	  \TMP1, \XMM2
 909	AESENC	  \TMP1, \XMM3
 910	AESENC	  \TMP1, \XMM4
 911	movdqa	  \XMM6, \TMP1
 912	pshufd	  $78, \XMM6, \TMP2
 913	pxor	  \XMM6, \TMP2
 914	movdqa	  HashKey_3(%rsp), \TMP5
 915	PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1 * b1
 916	movaps 0x30(%arg1), \TMP3
 917	AESENC    \TMP3, \XMM1              # Round 3
 918	AESENC    \TMP3, \XMM2
 919	AESENC    \TMP3, \XMM3
 920	AESENC    \TMP3, \XMM4
 921	PCLMULQDQ 0x00, \TMP5, \XMM6           # XMM6 = a0*b0
 922	movaps 0x40(%arg1), \TMP3
 923	AESENC	  \TMP3, \XMM1              # Round 4
 924	AESENC	  \TMP3, \XMM2
 925	AESENC	  \TMP3, \XMM3
 926	AESENC	  \TMP3, \XMM4
 927	movdqa	  HashKey_3_k(%rsp), \TMP5
 928	PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
 929	movaps 0x50(%arg1), \TMP3
 930	AESENC	  \TMP3, \XMM1              # Round 5
 931	AESENC	  \TMP3, \XMM2
 932	AESENC	  \TMP3, \XMM3
 933	AESENC	  \TMP3, \XMM4
 934	pxor	  \TMP1, \TMP4
 935# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
 936	pxor	  \XMM6, \XMM5
 937	pxor	  \TMP2, \TMP6
 938	movdqa	  \XMM7, \TMP1
 939	pshufd	  $78, \XMM7, \TMP2
 940	pxor	  \XMM7, \TMP2
 941	movdqa	  HashKey_2(%rsp ), \TMP5
 942
  943        # Multiply XMM7 * HashKey^2 (in TMP5) using Karatsuba
 944
 945	PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1*b1
 946	movaps 0x60(%arg1), \TMP3
 947	AESENC	  \TMP3, \XMM1              # Round 6
 948	AESENC	  \TMP3, \XMM2
 949	AESENC	  \TMP3, \XMM3
 950	AESENC	  \TMP3, \XMM4
 951	PCLMULQDQ 0x00, \TMP5, \XMM7           # XMM7 = a0*b0
 952	movaps 0x70(%arg1), \TMP3
 953	AESENC	  \TMP3, \XMM1             # Round 7
 954	AESENC	  \TMP3, \XMM2
 955	AESENC	  \TMP3, \XMM3
 956	AESENC	  \TMP3, \XMM4
 957	movdqa	  HashKey_2_k(%rsp), \TMP5
 958	PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
 959	movaps 0x80(%arg1), \TMP3
 960	AESENC	  \TMP3, \XMM1             # Round 8
 961	AESENC	  \TMP3, \XMM2
 962	AESENC	  \TMP3, \XMM3
 963	AESENC	  \TMP3, \XMM4
 964	pxor	  \TMP1, \TMP4
 965# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
 966	pxor	  \XMM7, \XMM5
 967	pxor	  \TMP2, \TMP6
 968
 969        # Multiply XMM8 * HashKey
 970        # XMM8 and TMP5 hold the values for the two operands
 971
 972	movdqa	  \XMM8, \TMP1
 973	pshufd	  $78, \XMM8, \TMP2
 974	pxor	  \XMM8, \TMP2
 975	movdqa	  HashKey(%rsp), \TMP5
 976	PCLMULQDQ 0x11, \TMP5, \TMP1          # TMP1 = a1*b1
 977	movaps 0x90(%arg1), \TMP3
 978	AESENC	  \TMP3, \XMM1            # Round 9
 979	AESENC	  \TMP3, \XMM2
 980	AESENC	  \TMP3, \XMM3
 981	AESENC	  \TMP3, \XMM4
 982	PCLMULQDQ 0x00, \TMP5, \XMM8          # XMM8 = a0*b0
 983	movaps 0xa0(%arg1), \TMP3
 984	AESENCLAST \TMP3, \XMM1           # Round 10
 985	AESENCLAST \TMP3, \XMM2
 986	AESENCLAST \TMP3, \XMM3
 987	AESENCLAST \TMP3, \XMM4
 988	movdqa    HashKey_k(%rsp), \TMP5
 989	PCLMULQDQ 0x00, \TMP5, \TMP2          # TMP2 = (a1+a0)*(b1+b0)
 990	movdqu	  (%arg3,%r11,1), \TMP3
 991	pxor	  \TMP3, \XMM1                 # Ciphertext/Plaintext XOR EK
 992	movdqu	  \XMM1, (%arg2,%r11,1)        # Write to plaintext buffer
 993	movdqa    \TMP3, \XMM1
 994	movdqu	  16(%arg3,%r11,1), \TMP3
 995	pxor	  \TMP3, \XMM2                 # Ciphertext/Plaintext XOR EK
 996	movdqu	  \XMM2, 16(%arg2,%r11,1)      # Write to plaintext buffer
 997	movdqa    \TMP3, \XMM2
 998	movdqu	  32(%arg3,%r11,1), \TMP3
 999	pxor	  \TMP3, \XMM3                 # Ciphertext/Plaintext XOR EK
1000	movdqu	  \XMM3, 32(%arg2,%r11,1)      # Write to plaintext buffer
1001	movdqa    \TMP3, \XMM3
1002	movdqu	  48(%arg3,%r11,1), \TMP3
1003	pxor	  \TMP3, \XMM4                 # Ciphertext/Plaintext XOR EK
1004	movdqu	  \XMM4, 48(%arg2,%r11,1)      # Write to plaintext buffer
1005	movdqa    \TMP3, \XMM4
1006	PSHUFB_XMM %xmm15, \XMM1        # perform a 16 byte swap
1007	PSHUFB_XMM %xmm15, \XMM2	# perform a 16 byte swap
1008	PSHUFB_XMM %xmm15, \XMM3	# perform a 16 byte swap
1009	PSHUFB_XMM %xmm15, \XMM4	# perform a 16 byte swap
1010
1011	pxor	  \TMP4, \TMP1
1012	pxor	  \XMM8, \XMM5
1013	pxor	  \TMP6, \TMP2
1014	pxor	  \TMP1, \TMP2
1015	pxor	  \XMM5, \TMP2
1016	movdqa	  \TMP2, \TMP3
1017	pslldq	  $8, \TMP3                    # left shift TMP3 2 DWs
1018	psrldq	  $8, \TMP2                    # right shift TMP2 2 DWs
1019	pxor	  \TMP3, \XMM5
1020	pxor	  \TMP2, \TMP1	  # accumulate the results in TMP1:XMM5
1021
1022        # first phase of reduction
1023
1024	movdqa    \XMM5, \TMP2
1025	movdqa    \XMM5, \TMP3
1026	movdqa    \XMM5, \TMP4
1027# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
 1028	pslld     $31, \TMP2                   # packed left shift << 31
 1029	pslld     $30, \TMP3                   # packed left shift << 30
 1030	pslld     $25, \TMP4                   # packed left shift << 25
1031	pxor      \TMP3, \TMP2	               # xor the shifted versions
1032	pxor      \TMP4, \TMP2
1033	movdqa    \TMP2, \TMP5
1034	psrldq    $4, \TMP5                    # right shift T5 1 DW
1035	pslldq    $12, \TMP2                   # left shift T2 3 DWs
1036	pxor      \TMP2, \XMM5
1037
1038        # second phase of reduction
1039
1040	movdqa    \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
1041	movdqa    \XMM5,\TMP3
1042	movdqa    \XMM5,\TMP4
 1043	psrld     $1, \TMP2                    # packed right shift >>1
 1044	psrld     $2, \TMP3                    # packed right shift >>2
 1045	psrld     $7, \TMP4                    # packed right shift >>7
1046	pxor      \TMP3,\TMP2		       # xor the shifted versions
1047	pxor      \TMP4,\TMP2
1048	pxor      \TMP5, \TMP2
1049	pxor      \TMP2, \XMM5
 1050	pxor      \TMP1, \XMM5                 # result is in XMM5
1051
1052	pxor	  \XMM5, \XMM1
1053.endm
1054
1055/* GHASH the last 4 ciphertext blocks. */
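#
# i.e. XMMDst = XMM1*H^4 + XMM2*H^3 + XMM3*H^2 + XMM4*H (all carry-less),
# followed by one reduction mod x^128 + x^127 + x^126 + x^121 + 1, reusing
# the Karatsuba split and the precomputed HashKey_i / HashKey_i_k values
# from above.
#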
1056.macro	GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
1057TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
1058
1059        # Multiply TMP6 * HashKey (using Karatsuba)
1060
1061	movdqa	  \XMM1, \TMP6
1062	pshufd	  $78, \XMM1, \TMP2
1063	pxor	  \XMM1, \TMP2
1064	movdqa	  HashKey_4(%rsp), \TMP5
1065	PCLMULQDQ 0x11, \TMP5, \TMP6       # TMP6 = a1*b1
1066	PCLMULQDQ 0x00, \TMP5, \XMM1       # XMM1 = a0*b0
1067	movdqa	  HashKey_4_k(%rsp), \TMP4
1068	PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1069	movdqa	  \XMM1, \XMMDst
1070	movdqa	  \TMP2, \XMM1              # result in TMP6, XMMDst, XMM1
1071
1072        # Multiply TMP1 * HashKey (using Karatsuba)
1073
1074	movdqa	  \XMM2, \TMP1
1075	pshufd	  $78, \XMM2, \TMP2
1076	pxor	  \XMM2, \TMP2
1077	movdqa	  HashKey_3(%rsp), \TMP5
1078	PCLMULQDQ 0x11, \TMP5, \TMP1       # TMP1 = a1*b1
1079	PCLMULQDQ 0x00, \TMP5, \XMM2       # XMM2 = a0*b0
1080	movdqa	  HashKey_3_k(%rsp), \TMP4
1081	PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1082	pxor	  \TMP1, \TMP6
1083	pxor	  \XMM2, \XMMDst
1084	pxor	  \TMP2, \XMM1
1085# results accumulated in TMP6, XMMDst, XMM1
1086
1087        # Multiply TMP1 * HashKey (using Karatsuba)
1088
1089	movdqa	  \XMM3, \TMP1
1090	pshufd	  $78, \XMM3, \TMP2
1091	pxor	  \XMM3, \TMP2
1092	movdqa	  HashKey_2(%rsp), \TMP5
1093	PCLMULQDQ 0x11, \TMP5, \TMP1       # TMP1 = a1*b1
1094	PCLMULQDQ 0x00, \TMP5, \XMM3       # XMM3 = a0*b0
1095	movdqa	  HashKey_2_k(%rsp), \TMP4
1096	PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1097	pxor	  \TMP1, \TMP6
1098	pxor	  \XMM3, \XMMDst
1099	pxor	  \TMP2, \XMM1   # results accumulated in TMP6, XMMDst, XMM1
1100
1101        # Multiply TMP1 * HashKey (using Karatsuba)
1102	movdqa	  \XMM4, \TMP1
1103	pshufd	  $78, \XMM4, \TMP2
1104	pxor	  \XMM4, \TMP2
1105	movdqa	  HashKey(%rsp), \TMP5
1106	PCLMULQDQ 0x11, \TMP5, \TMP1	    # TMP1 = a1*b1
1107	PCLMULQDQ 0x00, \TMP5, \XMM4       # XMM4 = a0*b0
1108	movdqa	  HashKey_k(%rsp), \TMP4
1109	PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1110	pxor	  \TMP1, \TMP6
1111	pxor	  \XMM4, \XMMDst
1112	pxor	  \XMM1, \TMP2
1113	pxor	  \TMP6, \TMP2
1114	pxor	  \XMMDst, \TMP2
1115	# middle section of the temp results combined as in karatsuba algorithm
1116	movdqa	  \TMP2, \TMP4
1117	pslldq	  $8, \TMP4                 # left shift TMP4 2 DWs
1118	psrldq	  $8, \TMP2                 # right shift TMP2 2 DWs
1119	pxor	  \TMP4, \XMMDst
1120	pxor	  \TMP2, \TMP6
1121# TMP6:XMMDst holds the result of the accumulated carry-less multiplications
1122	# first phase of the reduction
1123	movdqa    \XMMDst, \TMP2
1124	movdqa    \XMMDst, \TMP3
1125	movdqa    \XMMDst, \TMP4
1126# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
 1127	pslld     $31, \TMP2                # packed left shifting << 31
 1128	pslld     $30, \TMP3                # packed left shifting << 30
 1129	pslld     $25, \TMP4                # packed left shifting << 25
1130	pxor      \TMP3, \TMP2              # xor the shifted versions
1131	pxor      \TMP4, \TMP2
1132	movdqa    \TMP2, \TMP7
1133	psrldq    $4, \TMP7                 # right shift TMP7 1 DW
1134	pslldq    $12, \TMP2                # left shift TMP2 3 DWs
1135	pxor      \TMP2, \XMMDst
1136
1137        # second phase of the reduction
1138	movdqa    \XMMDst, \TMP2
1139	# make 3 copies of XMMDst for doing 3 shift operations
1140	movdqa    \XMMDst, \TMP3
1141	movdqa    \XMMDst, \TMP4
 1142	psrld     $1, \TMP2                 # packed right shift >> 1
 1143	psrld     $2, \TMP3                 # packed right shift >> 2
 1144	psrld     $7, \TMP4                 # packed right shift >> 7
1145	pxor      \TMP3, \TMP2              # xor the shifted versions
1146	pxor      \TMP4, \TMP2
1147	pxor      \TMP7, \TMP2
1148	pxor      \TMP2, \XMMDst
1149	pxor      \TMP6, \XMMDst            # reduced result is in XMMDst
1150.endm
1151
 1152/* Encrypt a single block with the expanded AES-128 key schedule (10 rounds). */
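#
# Used for the counter-mode keystream blocks: the authentication tag is the
# GHASH output XORed with E(K, Y0), and the final partial block is handled
# as C = P XOR E(K, Yn) (see the _zero_cipher_left_* paths below).
#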
1153.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1
1154
1155	pxor	(%arg1), \XMM0
1156        movaps 16(%arg1), \TMP1
1157	AESENC	\TMP1, \XMM0
1158        movaps 32(%arg1), \TMP1
1159	AESENC	\TMP1, \XMM0
1160        movaps 48(%arg1), \TMP1
1161	AESENC	\TMP1, \XMM0
1162        movaps 64(%arg1), \TMP1
1163	AESENC	\TMP1, \XMM0
1164        movaps 80(%arg1), \TMP1
1165	AESENC	\TMP1, \XMM0
1166        movaps 96(%arg1), \TMP1
1167	AESENC	\TMP1, \XMM0
1168        movaps 112(%arg1), \TMP1
1169	AESENC	\TMP1, \XMM0
1170        movaps 128(%arg1), \TMP1
1171	AESENC	\TMP1, \XMM0
1172        movaps 144(%arg1), \TMP1
1173	AESENC	\TMP1, \XMM0
1174        movaps 160(%arg1), \TMP1
1175	AESENCLAST	\TMP1, \XMM0
1176.endm
1177
1178
1179/*****************************************************************************
1180* void aesni_gcm_dec(void *aes_ctx,    // AES Key schedule. Starts on a 16 byte boundary.
1181*                   u8 *out,           // Plaintext output. Encrypt in-place is allowed.
1182*                   const u8 *in,      // Ciphertext input
1183*                   u64 plaintext_len, // Length of data in bytes for decryption.
1184*                   u8 *iv,            // Pre-counter block j0: 4 byte salt (from Security Association)
1185*                                      // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1186*                                      // concatenated with 0x00000001. 16-byte aligned pointer.
1187*                   u8 *hash_subkey,   // H, the Hash sub key input. Data starts on a 16-byte boundary.
1188*                   const u8 *aad,     // Additional Authentication Data (AAD)
1189*                   u64 aad_len,       // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1190*                   u8  *auth_tag,     // Authenticated Tag output. The driver will compare this to the
1191*                                      // given authentication tag and only return the plaintext if they match.
1192*                   u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
1193*                                      // (most likely), 12 or 8.
1194*
1195* Assumptions:
1196*
1197* keys:
1198*       keys are pre-expanded and aligned to 16 bytes. we are using the first
1199*       set of 11 keys in the data structure void *aes_ctx
1200*
1201* iv:
1202*       0                   1                   2                   3
1203*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1204*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1205*       |                             Salt  (From the SA)               |
1206*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1207*       |                     Initialization Vector                     |
1208*       |         (This is the sequence number from IPSec header)       |
1209*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1210*       |                              0x1                              |
1211*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1212*
1213*
1214*
1215* AAD:
1216*       AAD padded to 128 bits with 0
1217*       for example, assume AAD is a u32 vector
1218*
1219*       if AAD is 8 bytes:
1220*       AAD[3] = {A0, A1};
1221*       padded AAD in xmm register = {A1 A0 0 0}
1222*
1223*       0                   1                   2                   3
1224*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1225*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1226*       |                               SPI (A1)                        |
1227*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1228*       |                     32-bit Sequence Number (A0)               |
1229*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1230*       |                              0x0                              |
1231*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1232*
1233*                                       AAD Format with 32-bit Sequence Number
1234*
1235*       if AAD is 12 bytes:
1236*       AAD[3] = {A0, A1, A2};
1237*       padded AAD in xmm register = {A2 A1 A0 0}
1238*
1239*       0                   1                   2                   3
1240*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1241*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1244*       |                               SPI (A2)                        |
1245*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1246*       |                 64-bit Extended Sequence Number {A1,A0}       |
1247*       |                                                               |
1248*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1249*       |                              0x0                              |
1250*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1251*
1252*                        AAD Format with 64-bit Extended Sequence Number
1253*
1254* aadLen:
1255*       from the definition of the spec, aadLen can only be 8 or 12 bytes.
1256*       The code supports 16 too but for other sizes, the code will fail.
1257*
1258* TLen:
1259*       from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
1260*       For other sizes, the code will fail.
1261*
1262* poly = x^128 + x^127 + x^126 + x^121 + 1
1263*
1264*****************************************************************************/
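#
# Example layout of the pre-counter block j0 passed in via *iv (byte values
# purely illustrative): salt = 00 11 22 33 from the SA and
# IV = 01 02 03 04 05 06 07 08 from the ESP payload give
#     j0 = 00 11 22 33 | 01 02 03 04 05 06 07 08 | 00 00 00 01
# i.e. 4-byte salt, 8-byte IV, then the 32-bit block counter set to 1.
#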
1265
1266ENTRY(aesni_gcm_dec)
1267	push	%r12
1268	push	%r13
1269	push	%r14
1270	mov	%rsp, %r14
1271/*
1272* states of %xmm registers %xmm6:%xmm15 not saved
1273* all %xmm registers are clobbered
1274*/
1275	sub	$VARIABLE_OFFSET, %rsp
1276	and	$~63, %rsp                        # align rsp to 64 bytes
1277	mov	%arg6, %r12
1278	movdqu	(%r12), %xmm13			  # %xmm13 = HashKey
1279        movdqa  SHUF_MASK(%rip), %xmm2
1280	PSHUFB_XMM %xmm2, %xmm13
1281
1282
1283# Precompute HashKey<<1 (mod poly) from the hash key (required for GHASH)
1284
1285	movdqa	%xmm13, %xmm2
1286	psllq	$1, %xmm13
1287	psrlq	$63, %xmm2
1288	movdqa	%xmm2, %xmm1
1289	pslldq	$8, %xmm2
1290	psrldq	$8, %xmm1
1291	por	%xmm2, %xmm13
1292
1293        # Reduction
1294
1295	pshufd	$0x24, %xmm1, %xmm2
1296	pcmpeqd TWOONE(%rip), %xmm2
1297	pand	POLY(%rip), %xmm2
1298	pxor	%xmm2, %xmm13     # %xmm13 holds the HashKey<<1 (mod poly)
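
# In other words: HashKey<<1 with a conditional reduction. If the bit
# shifted out of the top of H was 1, the POLY constant (the reduction
# polynomial in this bit-reflected representation) is XORed back in; the
# pshufd/pcmpeqd/pand sequence above builds that all-ones-or-all-zeros
# mask without a branch.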
1299
1300
1301        # Decrypt first few blocks
1302
1303	movdqa %xmm13, HashKey(%rsp)           # store HashKey<<1 (mod poly)
1304	mov %arg4, %r13    # save the number of bytes of plaintext/ciphertext
1305	and $-16, %r13                      # %r13 = %r13 - (%r13 mod 16)
1306	mov %r13, %r12
1307	and $(3<<4), %r12
1308	jz _initial_num_blocks_is_0_decrypt
1309	cmp $(2<<4), %r12
1310	jb _initial_num_blocks_is_1_decrypt
1311	je _initial_num_blocks_is_2_decrypt
1312_initial_num_blocks_is_3_decrypt:
1313	INITIAL_BLOCKS_DEC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1314%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, dec
1315	sub	$48, %r13
1316	jmp	_initial_blocks_decrypted
1317_initial_num_blocks_is_2_decrypt:
1318	INITIAL_BLOCKS_DEC	2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1319%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, dec
1320	sub	$32, %r13
1321	jmp	_initial_blocks_decrypted
1322_initial_num_blocks_is_1_decrypt:
1323	INITIAL_BLOCKS_DEC	1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1324%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, dec
1325	sub	$16, %r13
1326	jmp	_initial_blocks_decrypted
1327_initial_num_blocks_is_0_decrypt:
1328	INITIAL_BLOCKS_DEC	0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1329%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, dec
1330_initial_blocks_decrypted:
1331	cmp	$0, %r13
1332	je	_zero_cipher_left_decrypt
1333	sub	$64, %r13
1334	je	_four_cipher_left_decrypt
1335_decrypt_by_4:
1336	GHASH_4_ENCRYPT_4_PARALLEL_DEC	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
1337%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, dec
1338	add	$64, %r11
1339	sub	$64, %r13
1340	jne	_decrypt_by_4
1341_four_cipher_left_decrypt:
1342	GHASH_LAST_4	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
1343%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
1344_zero_cipher_left_decrypt:
1345	mov	%arg4, %r13
1346	and	$15, %r13				# %r13 = arg4 (mod 16)
1347	je	_multiple_of_16_bytes_decrypt
1348
1349        # Handle the last <16 byte block separately
1350
1351	paddd ONE(%rip), %xmm0         # increment CNT to get Yn
1352        movdqa SHUF_MASK(%rip), %xmm10
1353	PSHUFB_XMM %xmm10, %xmm0
1354
1355	ENCRYPT_SINGLE_BLOCK  %xmm0, %xmm1    # E(K, Yn)
1356	sub $16, %r11
1357	add %r13, %r11
1358	movdqu (%arg3,%r11,1), %xmm1   # receive the last <16 byte block
1359	lea SHIFT_MASK+16(%rip), %r12
1360	sub %r13, %r12
1361# adjust the shuffle mask pointer to be able to shift 16-%r13 bytes
1362# (%r13 is the number of bytes in plaintext mod 16)
1363	movdqu (%r12), %xmm2           # get the appropriate shuffle mask
 1364	PSHUFB_XMM %xmm2, %xmm1            # right shift 16-%r13 bytes
1365
1366	movdqa  %xmm1, %xmm2
1367	pxor %xmm1, %xmm0            # Ciphertext XOR E(K, Yn)
1368	movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
1369	# get the appropriate mask to mask out top 16-%r13 bytes of %xmm0
1370	pand %xmm1, %xmm0            # mask out top 16-%r13 bytes of %xmm0
1371	pand    %xmm1, %xmm2
1372        movdqa SHUF_MASK(%rip), %xmm10
1373	PSHUFB_XMM %xmm10 ,%xmm2
1374
1375	pxor %xmm2, %xmm8
1376	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1377	          # GHASH computation for the last <16 byte block
1378	sub %r13, %r11
1379	add $16, %r11
1380
1381        # output %r13 bytes
1382	MOVQ_R64_XMM	%xmm0, %rax
1383	cmp	$8, %r13
1384	jle	_less_than_8_bytes_left_decrypt
1385	mov	%rax, (%arg2 , %r11, 1)
1386	add	$8, %r11
1387	psrldq	$8, %xmm0
1388	MOVQ_R64_XMM	%xmm0, %rax
1389	sub	$8, %r13
1390_less_than_8_bytes_left_decrypt:
1391	mov	%al,  (%arg2, %r11, 1)
1392	add	$1, %r11
1393	shr	$8, %rax
1394	sub	$1, %r13
1395	jne	_less_than_8_bytes_left_decrypt
1396_multiple_of_16_bytes_decrypt:
 1397	mov	arg8, %r12		  # %r12 = aadLen (number of bytes)
1398	shl	$3, %r12		  # convert into number of bits
1399	movd	%r12d, %xmm15		  # len(A) in %xmm15
 1400	shl	$3, %arg4		  # len(C) in bits (*8)
1401	MOVQ_R64_XMM	%arg4, %xmm1
1402	pslldq	$8, %xmm15		  # %xmm15 = len(A)||0x0000000000000000
1403	pxor	%xmm1, %xmm15		  # %xmm15 = len(A)||len(C)
1404	pxor	%xmm15, %xmm8
1405	GHASH_MUL	%xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1406	         # final GHASH computation
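	# Example (illustrative numbers): for aad_len = 8 bytes and 64 bytes
	# of ciphertext, len(A) = 64 = 0x40 bits and len(C) = 512 = 0x200
	# bits, so the final GHASH block is
	# 0x0000000000000040 || 0x0000000000000200.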
1407        movdqa SHUF_MASK(%rip), %xmm10
1408	PSHUFB_XMM %xmm10, %xmm8
1409
1410	mov	%arg5, %rax		  # %rax = *Y0
1411	movdqu	(%rax), %xmm0		  # %xmm0 = Y0
1412	ENCRYPT_SINGLE_BLOCK	%xmm0,  %xmm1	  # E(K, Y0)
1413	pxor	%xmm8, %xmm0
1414_return_T_decrypt:
1415	mov	arg9, %r10                # %r10 = authTag
1416	mov	arg10, %r11               # %r11 = auth_tag_len
1417	cmp	$16, %r11
1418	je	_T_16_decrypt
1419	cmp	$12, %r11
1420	je	_T_12_decrypt
1421_T_8_decrypt:
1422	MOVQ_R64_XMM	%xmm0, %rax
1423	mov	%rax, (%r10)
1424	jmp	_return_T_done_decrypt
1425_T_12_decrypt:
1426	MOVQ_R64_XMM	%xmm0, %rax
1427	mov	%rax, (%r10)
1428	psrldq	$8, %xmm0
1429	movd	%xmm0, %eax
1430	mov	%eax, 8(%r10)
1431	jmp	_return_T_done_decrypt
1432_T_16_decrypt:
1433	movdqu	%xmm0, (%r10)
1434_return_T_done_decrypt:
1435	mov	%r14, %rsp
1436	pop	%r14
1437	pop	%r13
1438	pop	%r12
1439	ret
1440
1441
1442/*****************************************************************************
1443* void aesni_gcm_enc(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1444*                    u8 *out,            // Ciphertext output. Encrypt in-place is allowed.
1445*                    const u8 *in,       // Plaintext input
1446*                    u64 plaintext_len,  // Length of data in bytes for encryption.
1447*                    u8 *iv,             // Pre-counter block j0: 4 byte salt (from Security Association)
1448*                                        // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1449*                                        // concatenated with 0x00000001. 16-byte aligned pointer.
1450*                    u8 *hash_subkey,    // H, the Hash sub key input. Data starts on a 16-byte boundary.
1451*                    const u8 *aad,      // Additional Authentication Data (AAD)
1452*                    u64 aad_len,        // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1453*                    u8 *auth_tag,       // Authenticated Tag output.
1454*                    u64 auth_tag_len);  // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
1455*                                        // 12 or 8.
1456*
1457* Assumptions:
1458*
1459* keys:
1460*       keys are pre-expanded and aligned to 16 bytes. we are using the
1461*       first set of 11 keys in the data structure void *aes_ctx
1462*
1463*
1464* iv:
1465*       0                   1                   2                   3
1466*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1467*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1468*       |                             Salt  (From the SA)               |
1469*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1470*       |                     Initialization Vector                     |
1471*       |         (This is the sequence number from IPSec header)       |
1472*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1473*       |                              0x1                              |
1474*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1475*
1476*
1477*
1478* AAD:
1479*       AAD padded to 128 bits with 0
1480*       for example, assume AAD is a u32 vector
1481*
1482*       if AAD is 8 bytes:
1483*       AAD[3] = {A0, A1};
1484*       padded AAD in xmm register = {A1 A0 0 0}
1485*
1486*       0                   1                   2                   3
1487*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1488*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1489*       |                               SPI (A1)                        |
1490*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1491*       |                     32-bit Sequence Number (A0)               |
1492*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1493*       |                              0x0                              |
1494*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1495*
1496*                                 AAD Format with 32-bit Sequence Number
1497*
1498*       if AAD is 12 bytes:
1499*       AAD[3] = {A0, A1, A2};
1500*       padded AAD in xmm register = {A2 A1 A0 0}
1501*
1502*       0                   1                   2                   3
1503*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1504*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1505*       |                               SPI (A2)                        |
1506*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1507*       |                 64-bit Extended Sequence Number {A1,A0}       |
1508*       |                                                               |
1509*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1510*       |                              0x0                              |
1511*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1512*
1513*                         AAD Format with 64-bit Extended Sequence Number
1514*
1515* aadLen:
1516*       from the definition of the spec, aadLen can only be 8 or 12 bytes.
1517*       The code also supports an aadLen of 16 bytes, but for any other size the code will fail.
1518*
1519* TLen:
1520*       from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
1521*       For other sizes, the code will fail.
1522*
1523* poly = x^128 + x^127 + x^126 + x^121 + 1
1524***************************************************************************/
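/*
* Illustrative sketch (not part of the original file): one way a C caller
* might assemble the pre-counter block j0 described above and then invoke
* aesni_gcm_enc().  The buffer names (salt, esp_iv, aes_ctx, ...) are
* hypothetical; the real call sites live in the glue code.
*
*	u8 j0[16] __attribute__((aligned(16)));
*
*	memcpy(j0, salt, 4);               // 4 byte salt from the SA
*	memcpy(j0 + 4, esp_iv, 8);         // 8 byte IV from the ESP payload
*	j0[12] = 0x00; j0[13] = 0x00;
*	j0[14] = 0x00; j0[15] = 0x01;      // trailing 0x00000001
*
*	aesni_gcm_enc(aes_ctx, dst, src, src_len, j0, hash_subkey,
*		      aad, aad_len, auth_tag, 16);
*/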
1525ENTRY(aesni_gcm_enc)
1526	push	%r12
1527	push	%r13
1528	push	%r14
1529	mov	%rsp, %r14
1530#
1531# states of %xmm registers %xmm6:%xmm15 not saved
1532# all %xmm registers are clobbered
1533#
1534	sub	$VARIABLE_OFFSET, %rsp
1535	and	$~63, %rsp
1536	mov	%arg6, %r12
1537	movdqu	(%r12), %xmm13
1538        movdqa  SHUF_MASK(%rip), %xmm2
1539	PSHUFB_XMM %xmm2, %xmm13
1540
1541
1542# precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
1543
1544	movdqa	%xmm13, %xmm2
1545	psllq	$1, %xmm13
1546	psrlq	$63, %xmm2
1547	movdqa	%xmm2, %xmm1
1548	pslldq	$8, %xmm2
1549	psrldq	$8, %xmm1
1550	por	%xmm2, %xmm13
1551
1552        # reduce HashKey<<1
1553
1554	pshufd	$0x24, %xmm1, %xmm2
1555	pcmpeqd TWOONE(%rip), %xmm2
1556	pand	POLY(%rip), %xmm2
1557	pxor	%xmm2, %xmm13
1558	movdqa	%xmm13, HashKey(%rsp)
1559	mov	%arg4, %r13            # %xmm13 holds HashKey<<1 (mod poly)
1560	and	$-16, %r13
1561	mov	%r13, %r12
1562
1563        # Encrypt first few blocks
 
1564
1565	and	$(3<<4), %r12
1566	jz	_initial_num_blocks_is_0_encrypt
1567	cmp	$(2<<4), %r12
1568	jb	_initial_num_blocks_is_1_encrypt
1569	je	_initial_num_blocks_is_2_encrypt
1570_initial_num_blocks_is_3_encrypt:
1571	INITIAL_BLOCKS_ENC	3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1572%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, enc
1573	sub	$48, %r13
1574	jmp	_initial_blocks_encrypted
1575_initial_num_blocks_is_2_encrypt:
1576	INITIAL_BLOCKS_ENC	2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1577%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, enc
1578	sub	$32, %r13
1579	jmp	_initial_blocks_encrypted
1580_initial_num_blocks_is_1_encrypt:
1581	INITIAL_BLOCKS_ENC	1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1582%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, enc
1583	sub	$16, %r13
1584	jmp	_initial_blocks_encrypted
1585_initial_num_blocks_is_0_encrypt:
1586	INITIAL_BLOCKS_ENC	0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1587%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, enc
1588_initial_blocks_encrypted:
1589
1590        # Main loop - Encrypt remaining blocks
1591
1592	cmp	$0, %r13
1593	je	_zero_cipher_left_encrypt
1594	sub	$64, %r13
1595	je	_four_cipher_left_encrypt
1596_encrypt_by_4_encrypt:
1597	GHASH_4_ENCRYPT_4_PARALLEL_ENC	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
1598%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, enc
1599	add	$64, %r11
1600	sub	$64, %r13
1601	jne	_encrypt_by_4_encrypt
1602_four_cipher_left_encrypt:
1603	GHASH_LAST_4	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
1604%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
1605_zero_cipher_left_encrypt:
1606	mov	%arg4, %r13
1607	and	$15, %r13			# %r13 = arg4 (mod 16)
1608	je	_multiple_of_16_bytes_encrypt
1609
1610         # Handle the last <16 Byte block separately
1611	paddd ONE(%rip), %xmm0                # INCR CNT to get Yn
1612        movdqa SHUF_MASK(%rip), %xmm10
1613	PSHUFB_XMM %xmm10, %xmm0
1614
1615
1616	ENCRYPT_SINGLE_BLOCK	%xmm0, %xmm1        # Encrypt(K, Yn)
1617	sub $16, %r11
1618	add %r13, %r11
1619	movdqu (%arg3,%r11,1), %xmm1     # receive the last <16 byte blocks
1620	lea SHIFT_MASK+16(%rip), %r12
1621	sub %r13, %r12
1622	# adjust the shuffle mask pointer to be able to shift 16-r13 bytes
1623	# (%r13 is the number of bytes in plaintext mod 16)
1624	movdqu	(%r12), %xmm2           # get the appropriate shuffle mask
1625	PSHUFB_XMM	%xmm2, %xmm1            # shift right 16-r13 byte
1626	pxor	%xmm1, %xmm0            # Plaintext XOR Encrypt(K, Yn)
1627	movdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
1628	# get the appropriate mask to mask out top 16-r13 bytes of xmm0
1629	pand	%xmm1, %xmm0            # mask out top 16-r13 bytes of xmm0
1630        movdqa SHUF_MASK(%rip), %xmm10
1631	PSHUFB_XMM %xmm10,%xmm0
1632
1633	pxor	%xmm0, %xmm8
1634	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1635	# GHASH computation for the last <16 byte block
1636	sub	%r13, %r11
1637	add	$16, %r11
1638
1639	movdqa SHUF_MASK(%rip), %xmm10
1640	PSHUFB_XMM %xmm10, %xmm0
1641
1642	# shuffle xmm0 back to output as ciphertext
1643
1644        # Output %r13 bytes
1645	MOVQ_R64_XMM %xmm0, %rax
1646	cmp $8, %r13
1647	jle _less_than_8_bytes_left_encrypt
1648	mov %rax, (%arg2 , %r11, 1)
1649	add $8, %r11
1650	psrldq $8, %xmm0
1651	MOVQ_R64_XMM %xmm0, %rax
1652	sub $8, %r13
1653_less_than_8_bytes_left_encrypt:
1654	mov %al,  (%arg2, %r11, 1)
1655	add $1, %r11
1656	shr $8, %rax
1657	sub $1, %r13
1658	jne _less_than_8_bytes_left_encrypt
1659_multiple_of_16_bytes_encrypt:
1660	mov	arg8, %r12    # %r12 = aadLen (number of bytes)
1661	shl	$3, %r12
1662	movd	%r12d, %xmm15       # len(A) in %xmm15
1663	shl	$3, %arg4               # len(C) in bits (*8)
1664	MOVQ_R64_XMM	%arg4, %xmm1
1665	pslldq	$8, %xmm15          # %xmm15 = len(A)||0x0000000000000000
1666	pxor	%xmm1, %xmm15       # %xmm15 = len(A)||len(C)
1667	pxor	%xmm15, %xmm8
1668	GHASH_MUL	%xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1669	# final GHASH computation
1670        movdqa SHUF_MASK(%rip), %xmm10
1671	PSHUFB_XMM %xmm10, %xmm8         # perform a 16 byte swap
1672
1673	mov	%arg5, %rax		       # %rax  = *Y0
1674	movdqu	(%rax), %xmm0		       # %xmm0 = Y0
1675	ENCRYPT_SINGLE_BLOCK	%xmm0, %xmm15         # Encrypt(K, Y0)
1676	pxor	%xmm8, %xmm0
1677_return_T_encrypt:
1678	mov	arg9, %r10                     # %r10 = authTag
1679	mov	arg10, %r11                    # %r11 = auth_tag_len
1680	cmp	$16, %r11
1681	je	_T_16_encrypt
1682	cmp	$12, %r11
1683	je	_T_12_encrypt
1684_T_8_encrypt:
1685	MOVQ_R64_XMM	%xmm0, %rax
1686	mov	%rax, (%r10)
1687	jmp	_return_T_done_encrypt
1688_T_12_encrypt:
1689	MOVQ_R64_XMM	%xmm0, %rax
1690	mov	%rax, (%r10)
1691	psrldq	$8, %xmm0
1692	movd	%xmm0, %eax
1693	mov	%eax, 8(%r10)
1694	jmp	_return_T_done_encrypt
1695_T_16_encrypt:
1696	movdqu	%xmm0, (%r10)
1697_return_T_done_encrypt:
1698	mov	%r14, %rsp
1699	pop	%r14
1700	pop	%r13
1701	pop	%r12
1702	ret
 
1703
1704#endif
1705
1706
 
1707_key_expansion_128:
1708_key_expansion_256a:
1709	pshufd $0b11111111, %xmm1, %xmm1
1710	shufps $0b00010000, %xmm0, %xmm4
1711	pxor %xmm4, %xmm0
1712	shufps $0b10001100, %xmm0, %xmm4
1713	pxor %xmm4, %xmm0
1714	pxor %xmm1, %xmm0
1715	movaps %xmm0, (TKEYP)
1716	add $0x10, TKEYP
1717	ret
 
 
1718
1719.align 4
1720_key_expansion_192a:
1721	pshufd $0b01010101, %xmm1, %xmm1
1722	shufps $0b00010000, %xmm0, %xmm4
1723	pxor %xmm4, %xmm0
1724	shufps $0b10001100, %xmm0, %xmm4
1725	pxor %xmm4, %xmm0
1726	pxor %xmm1, %xmm0
1727
1728	movaps %xmm2, %xmm5
1729	movaps %xmm2, %xmm6
1730	pslldq $4, %xmm5
1731	pshufd $0b11111111, %xmm0, %xmm3
1732	pxor %xmm3, %xmm2
1733	pxor %xmm5, %xmm2
1734
1735	movaps %xmm0, %xmm1
1736	shufps $0b01000100, %xmm0, %xmm6
1737	movaps %xmm6, (TKEYP)
1738	shufps $0b01001110, %xmm2, %xmm1
1739	movaps %xmm1, 0x10(TKEYP)
1740	add $0x20, TKEYP
1741	ret
 
1742
1743.align 4
1744_key_expansion_192b:
1745	pshufd $0b01010101, %xmm1, %xmm1
1746	shufps $0b00010000, %xmm0, %xmm4
1747	pxor %xmm4, %xmm0
1748	shufps $0b10001100, %xmm0, %xmm4
1749	pxor %xmm4, %xmm0
1750	pxor %xmm1, %xmm0
1751
1752	movaps %xmm2, %xmm5
1753	pslldq $4, %xmm5
1754	pshufd $0b11111111, %xmm0, %xmm3
1755	pxor %xmm3, %xmm2
1756	pxor %xmm5, %xmm2
1757
1758	movaps %xmm0, (TKEYP)
1759	add $0x10, TKEYP
1760	ret
 
1761
1762.align 4
1763_key_expansion_256b:
1764	pshufd $0b10101010, %xmm1, %xmm1
1765	shufps $0b00010000, %xmm2, %xmm4
1766	pxor %xmm4, %xmm2
1767	shufps $0b10001100, %xmm2, %xmm4
1768	pxor %xmm4, %xmm2
1769	pxor %xmm1, %xmm2
1770	movaps %xmm2, (TKEYP)
1771	add $0x10, TKEYP
1772	ret
 
1773
1774/*
1775 * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
1776 *                   unsigned int key_len)
1777 */
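/*
 * Hedged usage sketch (illustrative only): the user key is expanded into the
 * encryption schedule, the decryption schedule is then derived from it with
 * AESIMC below, and the key length is stored at offset 480 of the context.
 * key_len validation is assumed to happen in the caller.
 *
 *	struct crypto_aes_ctx ctx;
 *	u8 user_key[16] = { 0 };		// 16, 24 or 32 byte key
 *
 *	if (aesni_set_key(&ctx, user_key, sizeof(user_key)))
 *		return -EINVAL;
 */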
1778ENTRY(aesni_set_key)
 
1779#ifndef __x86_64__
1780	pushl KEYP
1781	movl 8(%esp), KEYP		# ctx
1782	movl 12(%esp), UKEYP		# in_key
1783	movl 16(%esp), %edx		# key_len
1784#endif
1785	movups (UKEYP), %xmm0		# user key (first 16 bytes)
1786	movaps %xmm0, (KEYP)
1787	lea 0x10(KEYP), TKEYP		# key addr
1788	movl %edx, 480(KEYP)
1789	pxor %xmm4, %xmm4		# xmm4 is assumed 0 in _key_expansion_x
1790	cmp $24, %dl
1791	jb .Lenc_key128
1792	je .Lenc_key192
1793	movups 0x10(UKEYP), %xmm2	# other user key
1794	movaps %xmm2, (TKEYP)
1795	add $0x10, TKEYP
1796	AESKEYGENASSIST 0x1 %xmm2 %xmm1		# round 1
1797	call _key_expansion_256a
1798	AESKEYGENASSIST 0x1 %xmm0 %xmm1
1799	call _key_expansion_256b
1800	AESKEYGENASSIST 0x2 %xmm2 %xmm1		# round 2
1801	call _key_expansion_256a
1802	AESKEYGENASSIST 0x2 %xmm0 %xmm1
1803	call _key_expansion_256b
1804	AESKEYGENASSIST 0x4 %xmm2 %xmm1		# round 3
1805	call _key_expansion_256a
1806	AESKEYGENASSIST 0x4 %xmm0 %xmm1
1807	call _key_expansion_256b
1808	AESKEYGENASSIST 0x8 %xmm2 %xmm1		# round 4
1809	call _key_expansion_256a
1810	AESKEYGENASSIST 0x8 %xmm0 %xmm1
1811	call _key_expansion_256b
1812	AESKEYGENASSIST 0x10 %xmm2 %xmm1	# round 5
1813	call _key_expansion_256a
1814	AESKEYGENASSIST 0x10 %xmm0 %xmm1
1815	call _key_expansion_256b
1816	AESKEYGENASSIST 0x20 %xmm2 %xmm1	# round 6
1817	call _key_expansion_256a
1818	AESKEYGENASSIST 0x20 %xmm0 %xmm1
1819	call _key_expansion_256b
1820	AESKEYGENASSIST 0x40 %xmm2 %xmm1	# round 7
1821	call _key_expansion_256a
1822	jmp .Ldec_key
1823.Lenc_key192:
1824	movq 0x10(UKEYP), %xmm2		# other user key
1825	AESKEYGENASSIST 0x1 %xmm2 %xmm1		# round 1
1826	call _key_expansion_192a
1827	AESKEYGENASSIST 0x2 %xmm2 %xmm1		# round 2
1828	call _key_expansion_192b
1829	AESKEYGENASSIST 0x4 %xmm2 %xmm1		# round 3
1830	call _key_expansion_192a
1831	AESKEYGENASSIST 0x8 %xmm2 %xmm1		# round 4
1832	call _key_expansion_192b
1833	AESKEYGENASSIST 0x10 %xmm2 %xmm1	# round 5
1834	call _key_expansion_192a
1835	AESKEYGENASSIST 0x20 %xmm2 %xmm1	# round 6
1836	call _key_expansion_192b
1837	AESKEYGENASSIST 0x40 %xmm2 %xmm1	# round 7
1838	call _key_expansion_192a
1839	AESKEYGENASSIST 0x80 %xmm2 %xmm1	# round 8
1840	call _key_expansion_192b
1841	jmp .Ldec_key
1842.Lenc_key128:
1843	AESKEYGENASSIST 0x1 %xmm0 %xmm1		# round 1
1844	call _key_expansion_128
1845	AESKEYGENASSIST 0x2 %xmm0 %xmm1		# round 2
1846	call _key_expansion_128
1847	AESKEYGENASSIST 0x4 %xmm0 %xmm1		# round 3
1848	call _key_expansion_128
1849	AESKEYGENASSIST 0x8 %xmm0 %xmm1		# round 4
1850	call _key_expansion_128
1851	AESKEYGENASSIST 0x10 %xmm0 %xmm1	# round 5
1852	call _key_expansion_128
1853	AESKEYGENASSIST 0x20 %xmm0 %xmm1	# round 6
1854	call _key_expansion_128
1855	AESKEYGENASSIST 0x40 %xmm0 %xmm1	# round 7
1856	call _key_expansion_128
1857	AESKEYGENASSIST 0x80 %xmm0 %xmm1	# round 8
1858	call _key_expansion_128
1859	AESKEYGENASSIST 0x1b %xmm0 %xmm1	# round 9
1860	call _key_expansion_128
1861	AESKEYGENASSIST 0x36 %xmm0 %xmm1	# round 10
1862	call _key_expansion_128
1863.Ldec_key:
1864	sub $0x10, TKEYP
1865	movaps (KEYP), %xmm0
1866	movaps (TKEYP), %xmm1
1867	movaps %xmm0, 240(TKEYP)
1868	movaps %xmm1, 240(KEYP)
1869	add $0x10, KEYP
1870	lea 240-16(TKEYP), UKEYP
1871.align 4
1872.Ldec_key_loop:
1873	movaps (KEYP), %xmm0
1874	AESIMC %xmm0 %xmm1
1875	movaps %xmm1, (UKEYP)
1876	add $0x10, KEYP
1877	sub $0x10, UKEYP
1878	cmp TKEYP, KEYP
1879	jb .Ldec_key_loop
1880	xor AREG, AREG
1881#ifndef __x86_64__
1882	popl KEYP
1883#endif
 
1884	ret
 
1885
1886/*
1887 * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
1888 */
1889ENTRY(aesni_enc)
 
1890#ifndef __x86_64__
1891	pushl KEYP
1892	pushl KLEN
1893	movl 12(%esp), KEYP
1894	movl 16(%esp), OUTP
1895	movl 20(%esp), INP
1896#endif
1897	movl 480(KEYP), KLEN		# key length
1898	movups (INP), STATE		# input
1899	call _aesni_enc1
1900	movups STATE, (OUTP)		# output
1901#ifndef __x86_64__
1902	popl KLEN
1903	popl KEYP
1904#endif
 
1905	ret
 
1906
1907/*
1908 * _aesni_enc1:		internal ABI
1909 * input:
1910 *	KEYP:		key struct pointer
1911 *	KLEN:		round count
1912 *	STATE:		initial state (input)
1913 * output:
1914 *	STATE:		final state (output)
1915 * changed:
1916 *	KEY
1917 *	TKEYP (T1)
1918 */
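/*
 * Hedged C sketch of the round dispatch below: KLEN holds the key length in
 * bytes, so 32 byte keys run 14 rounds, 24 byte keys 12 and 16 byte keys 10;
 * jumping into .Lenc192/.Lenc128 simply skips the extra leading rounds.
 * aes_enc_round()/aes_enc_last() are illustrative helpers, not kernel API.
 *
 *	int nrounds = (klen == 32) ? 14 : (klen == 24) ? 12 : 10;
 *
 *	state ^= rk[0];					// round 0 (key whitening)
 *	for (int i = 1; i < nrounds; i++)
 *		state = aes_enc_round(state, rk[i]);	// AESENC
 *	state = aes_enc_last(state, rk[nrounds]);	// AESENCLAST
 */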
1919.align 4
1920_aesni_enc1:
1921	movaps (KEYP), KEY		# key
1922	mov KEYP, TKEYP
1923	pxor KEY, STATE		# round 0
1924	add $0x30, TKEYP
1925	cmp $24, KLEN
1926	jb .Lenc128
1927	lea 0x20(TKEYP), TKEYP
1928	je .Lenc192
1929	add $0x20, TKEYP
1930	movaps -0x60(TKEYP), KEY
1931	AESENC KEY STATE
1932	movaps -0x50(TKEYP), KEY
1933	AESENC KEY STATE
1934.align 4
1935.Lenc192:
1936	movaps -0x40(TKEYP), KEY
1937	AESENC KEY STATE
1938	movaps -0x30(TKEYP), KEY
1939	AESENC KEY STATE
1940.align 4
1941.Lenc128:
1942	movaps -0x20(TKEYP), KEY
1943	AESENC KEY STATE
1944	movaps -0x10(TKEYP), KEY
1945	AESENC KEY STATE
1946	movaps (TKEYP), KEY
1947	AESENC KEY STATE
1948	movaps 0x10(TKEYP), KEY
1949	AESENC KEY STATE
1950	movaps 0x20(TKEYP), KEY
1951	AESENC KEY STATE
1952	movaps 0x30(TKEYP), KEY
1953	AESENC KEY STATE
1954	movaps 0x40(TKEYP), KEY
1955	AESENC KEY STATE
1956	movaps 0x50(TKEYP), KEY
1957	AESENC KEY STATE
1958	movaps 0x60(TKEYP), KEY
1959	AESENC KEY STATE
1960	movaps 0x70(TKEYP), KEY
1961	AESENCLAST KEY STATE
1962	ret
 
1963
1964/*
1965 * _aesni_enc4:	internal ABI
1966 * input:
1967 *	KEYP:		key struct pointer
1968 *	KLEN:		round count
1969 *	STATE1:		initial state (input)
1970 *	STATE2
1971 *	STATE3
1972 *	STATE4
1973 * output:
1974 *	STATE1:		final state (output)
1975 *	STATE2
1976 *	STATE3
1977 *	STATE4
1978 * changed:
1979 *	KEY
1980 *	TKEYP (T1)
1981 */
1982.align 4
1983_aesni_enc4:
1984	movaps (KEYP), KEY		# key
1985	mov KEYP, TKEYP
1986	pxor KEY, STATE1		# round 0
1987	pxor KEY, STATE2
1988	pxor KEY, STATE3
1989	pxor KEY, STATE4
1990	add $0x30, TKEYP
1991	cmp $24, KLEN
1992	jb .L4enc128
1993	lea 0x20(TKEYP), TKEYP
1994	je .L4enc192
1995	add $0x20, TKEYP
1996	movaps -0x60(TKEYP), KEY
1997	AESENC KEY STATE1
1998	AESENC KEY STATE2
1999	AESENC KEY STATE3
2000	AESENC KEY STATE4
2001	movaps -0x50(TKEYP), KEY
2002	AESENC KEY STATE1
2003	AESENC KEY STATE2
2004	AESENC KEY STATE3
2005	AESENC KEY STATE4
2006#.align 4
2007.L4enc192:
2008	movaps -0x40(TKEYP), KEY
2009	AESENC KEY STATE1
2010	AESENC KEY STATE2
2011	AESENC KEY STATE3
2012	AESENC KEY STATE4
2013	movaps -0x30(TKEYP), KEY
2014	AESENC KEY STATE1
2015	AESENC KEY STATE2
2016	AESENC KEY STATE3
2017	AESENC KEY STATE4
2018#.align 4
2019.L4enc128:
2020	movaps -0x20(TKEYP), KEY
2021	AESENC KEY STATE1
2022	AESENC KEY STATE2
2023	AESENC KEY STATE3
2024	AESENC KEY STATE4
2025	movaps -0x10(TKEYP), KEY
2026	AESENC KEY STATE1
2027	AESENC KEY STATE2
2028	AESENC KEY STATE3
2029	AESENC KEY STATE4
2030	movaps (TKEYP), KEY
2031	AESENC KEY STATE1
2032	AESENC KEY STATE2
2033	AESENC KEY STATE3
2034	AESENC KEY STATE4
2035	movaps 0x10(TKEYP), KEY
2036	AESENC KEY STATE1
2037	AESENC KEY STATE2
2038	AESENC KEY STATE3
2039	AESENC KEY STATE4
2040	movaps 0x20(TKEYP), KEY
2041	AESENC KEY STATE1
2042	AESENC KEY STATE2
2043	AESENC KEY STATE3
2044	AESENC KEY STATE4
2045	movaps 0x30(TKEYP), KEY
2046	AESENC KEY STATE1
2047	AESENC KEY STATE2
2048	AESENC KEY STATE3
2049	AESENC KEY STATE4
2050	movaps 0x40(TKEYP), KEY
2051	AESENC KEY STATE1
2052	AESENC KEY STATE2
2053	AESENC KEY STATE3
2054	AESENC KEY STATE4
2055	movaps 0x50(TKEYP), KEY
2056	AESENC KEY STATE1
2057	AESENC KEY STATE2
2058	AESENC KEY STATE3
2059	AESENC KEY STATE4
2060	movaps 0x60(TKEYP), KEY
2061	AESENC KEY STATE1
2062	AESENC KEY STATE2
2063	AESENC KEY STATE3
2064	AESENC KEY STATE4
2065	movaps 0x70(TKEYP), KEY
2066	AESENCLAST KEY STATE1		# last round
2067	AESENCLAST KEY STATE2
2068	AESENCLAST KEY STATE3
2069	AESENCLAST KEY STATE4
2070	ret
 
2071
2072/*
2073 * void aesni_dec (struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
2074 */
2075ENTRY(aesni_dec)
 
2076#ifndef __x86_64__
2077	pushl KEYP
2078	pushl KLEN
2079	movl 12(%esp), KEYP
2080	movl 16(%esp), OUTP
2081	movl 20(%esp), INP
2082#endif
2083	mov 480(KEYP), KLEN		# key length
2084	add $240, KEYP
2085	movups (INP), STATE		# input
2086	call _aesni_dec1
2087	movups STATE, (OUTP)		#output
2088#ifndef __x86_64__
2089	popl KLEN
2090	popl KEYP
2091#endif
 
2092	ret
 
2093
2094/*
2095 * _aesni_dec1:		internal ABI
2096 * input:
2097 *	KEYP:		key struct pointer
2098 *	KLEN:		key length
2099 *	STATE:		initial state (input)
2100 * output:
2101 *	STATE:		final state (output)
2102 * changed:
2103 *	KEY
2104 *	TKEYP (T1)
2105 */
2106.align 4
2107_aesni_dec1:
2108	movaps (KEYP), KEY		# key
2109	mov KEYP, TKEYP
2110	pxor KEY, STATE		# round 0
2111	add $0x30, TKEYP
2112	cmp $24, KLEN
2113	jb .Ldec128
2114	lea 0x20(TKEYP), TKEYP
2115	je .Ldec192
2116	add $0x20, TKEYP
2117	movaps -0x60(TKEYP), KEY
2118	AESDEC KEY STATE
2119	movaps -0x50(TKEYP), KEY
2120	AESDEC KEY STATE
2121.align 4
2122.Ldec192:
2123	movaps -0x40(TKEYP), KEY
2124	AESDEC KEY STATE
2125	movaps -0x30(TKEYP), KEY
2126	AESDEC KEY STATE
2127.align 4
2128.Ldec128:
2129	movaps -0x20(TKEYP), KEY
2130	AESDEC KEY STATE
2131	movaps -0x10(TKEYP), KEY
2132	AESDEC KEY STATE
2133	movaps (TKEYP), KEY
2134	AESDEC KEY STATE
2135	movaps 0x10(TKEYP), KEY
2136	AESDEC KEY STATE
2137	movaps 0x20(TKEYP), KEY
2138	AESDEC KEY STATE
2139	movaps 0x30(TKEYP), KEY
2140	AESDEC KEY STATE
2141	movaps 0x40(TKEYP), KEY
2142	AESDEC KEY STATE
2143	movaps 0x50(TKEYP), KEY
2144	AESDEC KEY STATE
2145	movaps 0x60(TKEYP), KEY
2146	AESDEC KEY STATE
2147	movaps 0x70(TKEYP), KEY
2148	AESDECLAST KEY STATE
2149	ret
 
2150
2151/*
2152 * _aesni_dec4:	internal ABI
2153 * input:
2154 *	KEYP:		key struct pointer
2155 *	KLEN:		key length
2156 *	STATE1:		initial state (input)
2157 *	STATE2
2158 *	STATE3
2159 *	STATE4
2160 * output:
2161 *	STATE1:		final state (output)
2162 *	STATE2
2163 *	STATE3
2164 *	STATE4
2165 * changed:
2166 *	KEY
2167 *	TKEYP (T1)
2168 */
2169.align 4
2170_aesni_dec4:
2171	movaps (KEYP), KEY		# key
2172	mov KEYP, TKEYP
2173	pxor KEY, STATE1		# round 0
2174	pxor KEY, STATE2
2175	pxor KEY, STATE3
2176	pxor KEY, STATE4
2177	add $0x30, TKEYP
2178	cmp $24, KLEN
2179	jb .L4dec128
2180	lea 0x20(TKEYP), TKEYP
2181	je .L4dec192
2182	add $0x20, TKEYP
2183	movaps -0x60(TKEYP), KEY
2184	AESDEC KEY STATE1
2185	AESDEC KEY STATE2
2186	AESDEC KEY STATE3
2187	AESDEC KEY STATE4
2188	movaps -0x50(TKEYP), KEY
2189	AESDEC KEY STATE1
2190	AESDEC KEY STATE2
2191	AESDEC KEY STATE3
2192	AESDEC KEY STATE4
2193.align 4
2194.L4dec192:
2195	movaps -0x40(TKEYP), KEY
2196	AESDEC KEY STATE1
2197	AESDEC KEY STATE2
2198	AESDEC KEY STATE3
2199	AESDEC KEY STATE4
2200	movaps -0x30(TKEYP), KEY
2201	AESDEC KEY STATE1
2202	AESDEC KEY STATE2
2203	AESDEC KEY STATE3
2204	AESDEC KEY STATE4
2205.align 4
2206.L4dec128:
2207	movaps -0x20(TKEYP), KEY
2208	AESDEC KEY STATE1
2209	AESDEC KEY STATE2
2210	AESDEC KEY STATE3
2211	AESDEC KEY STATE4
2212	movaps -0x10(TKEYP), KEY
2213	AESDEC KEY STATE1
2214	AESDEC KEY STATE2
2215	AESDEC KEY STATE3
2216	AESDEC KEY STATE4
2217	movaps (TKEYP), KEY
2218	AESDEC KEY STATE1
2219	AESDEC KEY STATE2
2220	AESDEC KEY STATE3
2221	AESDEC KEY STATE4
2222	movaps 0x10(TKEYP), KEY
2223	AESDEC KEY STATE1
2224	AESDEC KEY STATE2
2225	AESDEC KEY STATE3
2226	AESDEC KEY STATE4
2227	movaps 0x20(TKEYP), KEY
2228	AESDEC KEY STATE1
2229	AESDEC KEY STATE2
2230	AESDEC KEY STATE3
2231	AESDEC KEY STATE4
2232	movaps 0x30(TKEYP), KEY
2233	AESDEC KEY STATE1
2234	AESDEC KEY STATE2
2235	AESDEC KEY STATE3
2236	AESDEC KEY STATE4
2237	movaps 0x40(TKEYP), KEY
2238	AESDEC KEY STATE1
2239	AESDEC KEY STATE2
2240	AESDEC KEY STATE3
2241	AESDEC KEY STATE4
2242	movaps 0x50(TKEYP), KEY
2243	AESDEC KEY STATE1
2244	AESDEC KEY STATE2
2245	AESDEC KEY STATE3
2246	AESDEC KEY STATE4
2247	movaps 0x60(TKEYP), KEY
2248	AESDEC KEY STATE1
2249	AESDEC KEY STATE2
2250	AESDEC KEY STATE3
2251	AESDEC KEY STATE4
2252	movaps 0x70(TKEYP), KEY
2253	AESDECLAST KEY STATE1		# last round
2254	AESDECLAST KEY STATE2
2255	AESDECLAST KEY STATE3
2256	AESDECLAST KEY STATE4
2257	ret
 
2258
2259/*
2260 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2261 *		      size_t len)
2262 */
2263ENTRY(aesni_ecb_enc)
 
2264#ifndef __x86_64__
2265	pushl LEN
2266	pushl KEYP
2267	pushl KLEN
2268	movl 16(%esp), KEYP
2269	movl 20(%esp), OUTP
2270	movl 24(%esp), INP
2271	movl 28(%esp), LEN
2272#endif
2273	test LEN, LEN		# check length
2274	jz .Lecb_enc_ret
2275	mov 480(KEYP), KLEN
2276	cmp $16, LEN
2277	jb .Lecb_enc_ret
2278	cmp $64, LEN
2279	jb .Lecb_enc_loop1
2280.align 4
2281.Lecb_enc_loop4:
2282	movups (INP), STATE1
2283	movups 0x10(INP), STATE2
2284	movups 0x20(INP), STATE3
2285	movups 0x30(INP), STATE4
2286	call _aesni_enc4
2287	movups STATE1, (OUTP)
2288	movups STATE2, 0x10(OUTP)
2289	movups STATE3, 0x20(OUTP)
2290	movups STATE4, 0x30(OUTP)
2291	sub $64, LEN
2292	add $64, INP
2293	add $64, OUTP
2294	cmp $64, LEN
2295	jge .Lecb_enc_loop4
2296	cmp $16, LEN
2297	jb .Lecb_enc_ret
2298.align 4
2299.Lecb_enc_loop1:
2300	movups (INP), STATE1
2301	call _aesni_enc1
2302	movups STATE1, (OUTP)
2303	sub $16, LEN
2304	add $16, INP
2305	add $16, OUTP
2306	cmp $16, LEN
2307	jge .Lecb_enc_loop1
2308.Lecb_enc_ret:
2309#ifndef __x86_64__
2310	popl KLEN
2311	popl KEYP
2312	popl LEN
2313#endif
 
2314	ret
 
2315
2316/*
2317 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2318 *		      size_t len);
2319 */
2320ENTRY(aesni_ecb_dec)
 
2321#ifndef __x86_64__
2322	pushl LEN
2323	pushl KEYP
2324	pushl KLEN
2325	movl 16(%esp), KEYP
2326	movl 20(%esp), OUTP
2327	movl 24(%esp), INP
2328	movl 28(%esp), LEN
2329#endif
2330	test LEN, LEN
2331	jz .Lecb_dec_ret
2332	mov 480(KEYP), KLEN
2333	add $240, KEYP
2334	cmp $16, LEN
2335	jb .Lecb_dec_ret
2336	cmp $64, LEN
2337	jb .Lecb_dec_loop1
2338.align 4
2339.Lecb_dec_loop4:
2340	movups (INP), STATE1
2341	movups 0x10(INP), STATE2
2342	movups 0x20(INP), STATE3
2343	movups 0x30(INP), STATE4
2344	call _aesni_dec4
2345	movups STATE1, (OUTP)
2346	movups STATE2, 0x10(OUTP)
2347	movups STATE3, 0x20(OUTP)
2348	movups STATE4, 0x30(OUTP)
2349	sub $64, LEN
2350	add $64, INP
2351	add $64, OUTP
2352	cmp $64, LEN
2353	jge .Lecb_dec_loop4
2354	cmp $16, LEN
2355	jb .Lecb_dec_ret
2356.align 4
2357.Lecb_dec_loop1:
2358	movups (INP), STATE1
2359	call _aesni_dec1
2360	movups STATE1, (OUTP)
2361	sub $16, LEN
2362	add $16, INP
2363	add $16, OUTP
2364	cmp $16, LEN
2365	jge .Lecb_dec_loop1
2366.Lecb_dec_ret:
2367#ifndef __x86_64__
2368	popl KLEN
2369	popl KEYP
2370	popl LEN
2371#endif
 
2372	ret
 
2373
2374/*
2375 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2376 *		      size_t len, u8 *iv)
2377 */
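/*
 * Hedged C sketch of the CBC chaining implemented below (aes_encrypt_block()
 * is an illustrative helper, not a real kernel symbol):
 *
 *	u8 prev[16];
 *
 *	memcpy(prev, iv, 16);
 *	for (size_t off = 0; off + 16 <= len; off += 16) {
 *		u8 blk[16];
 *		for (int i = 0; i < 16; i++)
 *			blk[i] = src[off + i] ^ prev[i];  // pxor IN, STATE
 *		aes_encrypt_block(ctx, blk);              // _aesni_enc1
 *		memcpy(dst + off, blk, 16);
 *		memcpy(prev, blk, 16);
 *	}
 *	memcpy(iv, prev, 16);		// chaining value written back, as below
 */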
2378ENTRY(aesni_cbc_enc)
 
2379#ifndef __x86_64__
2380	pushl IVP
2381	pushl LEN
2382	pushl KEYP
2383	pushl KLEN
2384	movl 20(%esp), KEYP
2385	movl 24(%esp), OUTP
2386	movl 28(%esp), INP
2387	movl 32(%esp), LEN
2388	movl 36(%esp), IVP
2389#endif
2390	cmp $16, LEN
2391	jb .Lcbc_enc_ret
2392	mov 480(KEYP), KLEN
2393	movups (IVP), STATE	# load iv as initial state
2394.align 4
2395.Lcbc_enc_loop:
2396	movups (INP), IN	# load input
2397	pxor IN, STATE
2398	call _aesni_enc1
2399	movups STATE, (OUTP)	# store output
2400	sub $16, LEN
2401	add $16, INP
2402	add $16, OUTP
2403	cmp $16, LEN
2404	jge .Lcbc_enc_loop
2405	movups STATE, (IVP)
2406.Lcbc_enc_ret:
2407#ifndef __x86_64__
2408	popl KLEN
2409	popl KEYP
2410	popl LEN
2411	popl IVP
2412#endif
 
2413	ret
 
2414
2415/*
2416 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2417 *		      size_t len, u8 *iv)
2418 */
2419ENTRY(aesni_cbc_dec)
 
2420#ifndef __x86_64__
2421	pushl IVP
2422	pushl LEN
2423	pushl KEYP
2424	pushl KLEN
2425	movl 20(%esp), KEYP
2426	movl 24(%esp), OUTP
2427	movl 28(%esp), INP
2428	movl 32(%esp), LEN
2429	movl 36(%esp), IVP
2430#endif
2431	cmp $16, LEN
2432	jb .Lcbc_dec_just_ret
2433	mov 480(KEYP), KLEN
2434	add $240, KEYP
2435	movups (IVP), IV
2436	cmp $64, LEN
2437	jb .Lcbc_dec_loop1
2438.align 4
2439.Lcbc_dec_loop4:
2440	movups (INP), IN1
2441	movaps IN1, STATE1
2442	movups 0x10(INP), IN2
2443	movaps IN2, STATE2
2444#ifdef __x86_64__
2445	movups 0x20(INP), IN3
2446	movaps IN3, STATE3
2447	movups 0x30(INP), IN4
2448	movaps IN4, STATE4
2449#else
2450	movups 0x20(INP), IN1
2451	movaps IN1, STATE3
2452	movups 0x30(INP), IN2
2453	movaps IN2, STATE4
2454#endif
2455	call _aesni_dec4
2456	pxor IV, STATE1
2457#ifdef __x86_64__
2458	pxor IN1, STATE2
2459	pxor IN2, STATE3
2460	pxor IN3, STATE4
2461	movaps IN4, IV
2462#else
2463	pxor (INP), STATE2
2464	pxor 0x10(INP), STATE3
2465	pxor IN1, STATE4
2466	movaps IN2, IV
 
 
 
 
2467#endif
2468	movups STATE1, (OUTP)
2469	movups STATE2, 0x10(OUTP)
2470	movups STATE3, 0x20(OUTP)
2471	movups STATE4, 0x30(OUTP)
2472	sub $64, LEN
2473	add $64, INP
2474	add $64, OUTP
2475	cmp $64, LEN
2476	jge .Lcbc_dec_loop4
2477	cmp $16, LEN
2478	jb .Lcbc_dec_ret
2479.align 4
2480.Lcbc_dec_loop1:
2481	movups (INP), IN
2482	movaps IN, STATE
2483	call _aesni_dec1
2484	pxor IV, STATE
2485	movups STATE, (OUTP)
2486	movaps IN, IV
2487	sub $16, LEN
2488	add $16, INP
2489	add $16, OUTP
2490	cmp $16, LEN
2491	jge .Lcbc_dec_loop1
2492.Lcbc_dec_ret:
2493	movups IV, (IVP)
2494.Lcbc_dec_just_ret:
2495#ifndef __x86_64__
2496	popl KLEN
2497	popl KEYP
2498	popl LEN
2499	popl IVP
2500#endif
 
2501	ret
 
2502
2503#ifdef __x86_64__
 
2504.align 16
2505.Lbswap_mask:
2506	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
 
2507
2508/*
2509 * _aesni_inc_init:	internal ABI
2510 *	setup registers used by _aesni_inc
2511 * input:
2512 *	IV
2513 * output:
2514 *	CTR:	== IV, in little endian
2515 *	TCTR_LOW: == lower qword of CTR
2516 *	INC:	== 1, in little endian
2517 *	BSWAP_MASK == endian swapping mask
2518 */
2519.align 4
2520_aesni_inc_init:
2521	movaps .Lbswap_mask, BSWAP_MASK
2522	movaps IV, CTR
2523	PSHUFB_XMM BSWAP_MASK CTR
2524	mov $1, TCTR_LOW
2525	MOVQ_R64_XMM TCTR_LOW INC
2526	MOVQ_R64_XMM CTR TCTR_LOW
2527	ret
 
2528
2529/*
2530 * _aesni_inc:		internal ABI
2531 *	Increase IV by 1, IV is in big endian
2532 * input:
2533 *	IV
2534 *	CTR:	== IV, in little endian
2535 *	TCTR_LOW: == lower qword of CTR
2536 *	INC:	== 1, in little endian
2537 *	BSWAP_MASK == endian swapping mask
2538 * output:
2539 *	IV:	increased by 1
2540 * changed:
2541 *	CTR:	== output IV, in little endian
2542 *	TCTR_LOW: == lower qword of CTR
2543 */
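/*
 * Hedged C view of the increment below: the counter is kept little endian in
 * CTR, so bumping the low 64-bit lane is enough unless it wraps, in which
 * case the carry is propagated into the high lane (the pslldq/paddq pair).
 *
 *	struct { u64 lo, hi; } ctr;	// little-endian view of the counter
 *
 *	if (++ctr.lo == 0)		// add $1, TCTR_LOW; jnc skips the carry
 *		ctr.hi++;		// pslldq $8, INC; paddq INC, CTR
 *	// PSHUFB with .Lbswap_mask then converts back to the big-endian IV
 */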
2544.align 4
2545_aesni_inc:
2546	paddq INC, CTR
2547	add $1, TCTR_LOW
2548	jnc .Linc_low
2549	pslldq $8, INC
2550	paddq INC, CTR
2551	psrldq $8, INC
2552.Linc_low:
2553	movaps CTR, IV
2554	PSHUFB_XMM BSWAP_MASK IV
2555	ret
 
2556
2557/*
2558 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2559 *		      size_t len, u8 *iv)
2560 */
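/*
 * Hedged C sketch of the CTR loop below (increment_be128() and
 * aes_encrypt_block() are illustrative helpers, not kernel API):
 *
 *	u8 ctr[16], ks[16];
 *
 *	memcpy(ctr, iv, 16);
 *	for (size_t off = 0; off + 16 <= len; off += 16) {
 *		increment_be128(ctr);			// _aesni_inc
 *		aes_encrypt_block(ctx, ctr, ks);	// _aesni_enc1/_aesni_enc4
 *		for (int i = 0; i < 16; i++)
 *			dst[off + i] = src[off + i] ^ ks[i];
 *	}
 *	memcpy(iv, ctr, 16);		// final counter written back
 */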
2561ENTRY(aesni_ctr_enc)
 
2562	cmp $16, LEN
2563	jb .Lctr_enc_just_ret
2564	mov 480(KEYP), KLEN
2565	movups (IVP), IV
2566	call _aesni_inc_init
2567	cmp $64, LEN
2568	jb .Lctr_enc_loop1
2569.align 4
2570.Lctr_enc_loop4:
2571	movaps IV, STATE1
2572	call _aesni_inc
2573	movups (INP), IN1
2574	movaps IV, STATE2
2575	call _aesni_inc
2576	movups 0x10(INP), IN2
2577	movaps IV, STATE3
2578	call _aesni_inc
2579	movups 0x20(INP), IN3
2580	movaps IV, STATE4
2581	call _aesni_inc
2582	movups 0x30(INP), IN4
2583	call _aesni_enc4
2584	pxor IN1, STATE1
2585	movups STATE1, (OUTP)
2586	pxor IN2, STATE2
2587	movups STATE2, 0x10(OUTP)
2588	pxor IN3, STATE3
2589	movups STATE3, 0x20(OUTP)
2590	pxor IN4, STATE4
2591	movups STATE4, 0x30(OUTP)
2592	sub $64, LEN
2593	add $64, INP
2594	add $64, OUTP
2595	cmp $64, LEN
2596	jge .Lctr_enc_loop4
2597	cmp $16, LEN
2598	jb .Lctr_enc_ret
2599.align 4
2600.Lctr_enc_loop1:
2601	movaps IV, STATE
2602	call _aesni_inc
2603	movups (INP), IN
2604	call _aesni_enc1
2605	pxor IN, STATE
2606	movups STATE, (OUTP)
2607	sub $16, LEN
2608	add $16, INP
2609	add $16, OUTP
2610	cmp $16, LEN
2611	jge .Lctr_enc_loop1
2612.Lctr_enc_ret:
2613	movups IV, (IVP)
2614.Lctr_enc_just_ret:
2615	ret
 
 
2616#endif
v4.17
  31
  32#include <linux/linkage.h>
  33#include <asm/inst.h>
  34#include <asm/frame.h>
  35#include <asm/nospec-branch.h>
  36
  37/*
  38 * The following macros are used to move an (un)aligned 16 byte value to/from
  39 * an XMM register.  This can be done for either FP or integer values: for FP use
  40 * movaps (move aligned packed single), for integer use movdqa (move double quad
  41 * aligned).  It doesn't make a performance difference which instruction is used
  42 * since Nehalem (original Core i7) was released.  However, the movaps is a byte
  43 * shorter, so that is the one we'll use for now. (same for unaligned).
  44 */
  45#define MOVADQ	movaps
  46#define MOVUDQ	movups
  47
  48#ifdef __x86_64__
  49
  50# constants in mergeable sections, linker can reorder and merge
  51.section	.rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16
  52.align 16
  53.Lgf128mul_x_ble_mask:
  54	.octa 0x00000000000000010000000000000087
  55.section	.rodata.cst16.POLY, "aM", @progbits, 16
  56.align 16
  57POLY:   .octa 0xC2000000000000000000000000000001
  58.section	.rodata.cst16.TWOONE, "aM", @progbits, 16
  59.align 16
  60TWOONE: .octa 0x00000001000000000000000000000001
  61
  62.section	.rodata.cst16.SHUF_MASK, "aM", @progbits, 16
  63.align 16
 
 
  64SHUF_MASK:  .octa 0x000102030405060708090A0B0C0D0E0F
  65.section	.rodata.cst16.MASK1, "aM", @progbits, 16
  66.align 16
  67MASK1:      .octa 0x0000000000000000ffffffffffffffff
  68.section	.rodata.cst16.MASK2, "aM", @progbits, 16
  69.align 16
  70MASK2:      .octa 0xffffffffffffffff0000000000000000
  71.section	.rodata.cst16.ONE, "aM", @progbits, 16
  72.align 16
 
  73ONE:        .octa 0x00000000000000000000000000000001
  74.section	.rodata.cst16.F_MIN_MASK, "aM", @progbits, 16
  75.align 16
  76F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
  77.section	.rodata.cst16.dec, "aM", @progbits, 16
  78.align 16
  79dec:        .octa 0x1
  80.section	.rodata.cst16.enc, "aM", @progbits, 16
  81.align 16
  82enc:        .octa 0x2
  83
  84# order of these constants should not change.
  85# more specifically, ALL_F should follow SHIFT_MASK,
  86# and zero should follow ALL_F
  87.section	.rodata, "a", @progbits
  88.align 16
  89SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
  90ALL_F:      .octa 0xffffffffffffffffffffffffffffffff
  91            .octa 0x00000000000000000000000000000000
  92
  93.text
  94
  95
  96#define	STACK_OFFSET    8*3
  97
  98#define AadHash 16*0
  99#define AadLen 16*1
 100#define InLen (16*1)+8
 101#define PBlockEncKey 16*2
 102#define OrigIV 16*3
 103#define CurCount 16*4
 104#define PBlockLen 16*5
 105#define	HashKey		16*6	// store HashKey <<1 mod poly here
 106#define	HashKey_2	16*7	// store HashKey^2 <<1 mod poly here
 107#define	HashKey_3	16*8	// store HashKey^3 <<1 mod poly here
 108#define	HashKey_4	16*9	// store HashKey^4 <<1 mod poly here
 109#define	HashKey_k	16*10	// store XOR of High 64 bits and Low 64
 110				// bits of  HashKey <<1 mod poly here
 111				//(for Karatsuba purposes)
 112#define	HashKey_2_k	16*11	// store XOR of High 64 bits and Low 64
 113				// bits of  HashKey^2 <<1 mod poly here
 114				// (for Karatsuba purposes)
 115#define	HashKey_3_k	16*12	// store XOR of High 64 bits and Low 64
 116				// bits of  HashKey^3 <<1 mod poly here
 117				// (for Karatsuba purposes)
 118#define	HashKey_4_k	16*13	// store XOR of High 64 bits and Low 64
 119				// bits of  HashKey^4 <<1 mod poly here
 120				// (for Karatsuba purposes)
 
 121
 122#define arg1 rdi
 123#define arg2 rsi
 124#define arg3 rdx
 125#define arg4 rcx
 126#define arg5 r8
 127#define arg6 r9
 128#define arg7 STACK_OFFSET+8(%rsp)
 129#define arg8 STACK_OFFSET+16(%rsp)
 130#define arg9 STACK_OFFSET+24(%rsp)
 131#define arg10 STACK_OFFSET+32(%rsp)
 132#define arg11 STACK_OFFSET+40(%rsp)
 133#define keysize 2*15*16(%arg1)
 134#endif
 135
 136
 137#define STATE1	%xmm0
 138#define STATE2	%xmm4
 139#define STATE3	%xmm5
 140#define STATE4	%xmm6
 141#define STATE	STATE1
 142#define IN1	%xmm1
 143#define IN2	%xmm7
 144#define IN3	%xmm8
 145#define IN4	%xmm9
 146#define IN	IN1
 147#define KEY	%xmm2
 148#define IV	%xmm3
 149
 150#define BSWAP_MASK %xmm10
 151#define CTR	%xmm11
 152#define INC	%xmm12
 153
 154#define GF128MUL_MASK %xmm10
 155
 156#ifdef __x86_64__
 157#define AREG	%rax
 158#define KEYP	%rdi
 159#define OUTP	%rsi
 160#define UKEYP	OUTP
 161#define INP	%rdx
 162#define LEN	%rcx
 163#define IVP	%r8
 164#define KLEN	%r9d
 165#define T1	%r10
 166#define TKEYP	T1
 167#define T2	%r11
 168#define TCTR_LOW T2
 169#else
 170#define AREG	%eax
 171#define KEYP	%edi
 172#define OUTP	AREG
 173#define UKEYP	OUTP
 174#define INP	%edx
 175#define LEN	%esi
 176#define IVP	%ebp
 177#define KLEN	%ebx
 178#define T1	%ecx
 179#define TKEYP	T1
 180#endif
 181
 182.macro FUNC_SAVE
 183	push	%r12
 184	push	%r13
 185	push	%r14
 186#
 187# states of %xmm registers %xmm6:%xmm15 not saved
 188# all %xmm registers are clobbered
 189#
 190.endm
 191
 192
 193.macro FUNC_RESTORE
 194	pop	%r14
 195	pop	%r13
 196	pop	%r12
 197.endm
 198
 199# Precompute hashkeys.
 200# Input: Hash subkey.
 201# Output: HashKeys stored in gcm_context_data.  Only needs to be called
 202# once per key.
 203# clobbers r12, and tmp xmm registers.
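# Hedged sketch of what gets cached: with H the byte-reflected hash subkey in
# its "<<1 mod poly" form, the macro stores H, H^2, H^3 and H^4 plus the XOR
# of each value's 64-bit halves, so the 4-way GHASH loop can reuse them
# without recomputation.  hash_keys[] and gf128_mul() are illustrative names.
#
#	ctx->hash_keys[0] = h;
#	for (int i = 1; i < 4; i++)
#		ctx->hash_keys[i] = gf128_mul(ctx->hash_keys[i - 1], h);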
 204.macro PRECOMPUTE SUBKEY TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 TMP7
 205	mov	\SUBKEY, %r12
 206	movdqu	(%r12), \TMP3
 207	movdqa	SHUF_MASK(%rip), \TMP2
 208	PSHUFB_XMM \TMP2, \TMP3
 209
 210	# precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
 211
 212	movdqa	\TMP3, \TMP2
 213	psllq	$1, \TMP3
 214	psrlq	$63, \TMP2
 215	movdqa	\TMP2, \TMP1
 216	pslldq	$8, \TMP2
 217	psrldq	$8, \TMP1
 218	por	\TMP2, \TMP3
 219
 220	# reduce HashKey<<1
 221
 222	pshufd	$0x24, \TMP1, \TMP2
 223	pcmpeqd TWOONE(%rip), \TMP2
 224	pand	POLY(%rip), \TMP2
 225	pxor	\TMP2, \TMP3
 226	movdqa	\TMP3, HashKey(%arg2)
 227
 228	movdqa	   \TMP3, \TMP5
 229	pshufd	   $78, \TMP3, \TMP1
 230	pxor	   \TMP3, \TMP1
 231	movdqa	   \TMP1, HashKey_k(%arg2)
 232
 233	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
 234# TMP5 = HashKey^2<<1 (mod poly)
 235	movdqa	   \TMP5, HashKey_2(%arg2)
 236# HashKey_2 = HashKey^2<<1 (mod poly)
 237	pshufd	   $78, \TMP5, \TMP1
 238	pxor	   \TMP5, \TMP1
 239	movdqa	   \TMP1, HashKey_2_k(%arg2)
 240
 241	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
 242# TMP5 = HashKey^3<<1 (mod poly)
 243	movdqa	   \TMP5, HashKey_3(%arg2)
 244	pshufd	   $78, \TMP5, \TMP1
 245	pxor	   \TMP5, \TMP1
 246	movdqa	   \TMP1, HashKey_3_k(%arg2)
 247
 248	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
  249# TMP5 = HashKey^4<<1 (mod poly)
 250	movdqa	   \TMP5, HashKey_4(%arg2)
 251	pshufd	   $78, \TMP5, \TMP1
 252	pxor	   \TMP5, \TMP1
 253	movdqa	   \TMP1, HashKey_4_k(%arg2)
 254.endm
 255
 256# GCM_INIT initializes a gcm_context struct to prepare for encoding/decoding.
 257# Clobbers rax, r10-r13 and xmm0-xmm6, %xmm13
 258.macro GCM_INIT Iv SUBKEY AAD AADLEN
 259	mov \AADLEN, %r11
 260	mov %r11, AadLen(%arg2) # ctx_data.aad_length = aad_length
 261	xor %r11, %r11
 262	mov %r11, InLen(%arg2) # ctx_data.in_length = 0
 263	mov %r11, PBlockLen(%arg2) # ctx_data.partial_block_length = 0
 264	mov %r11, PBlockEncKey(%arg2) # ctx_data.partial_block_enc_key = 0
 265	mov \Iv, %rax
 266	movdqu (%rax), %xmm0
 267	movdqu %xmm0, OrigIV(%arg2) # ctx_data.orig_IV = iv
 268
 269	movdqa  SHUF_MASK(%rip), %xmm2
 270	PSHUFB_XMM %xmm2, %xmm0
 271	movdqu %xmm0, CurCount(%arg2) # ctx_data.current_counter = iv
 272
 273	PRECOMPUTE \SUBKEY, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
 274	movdqa HashKey(%arg2), %xmm13
 275
 276	CALC_AAD_HASH %xmm13, \AAD, \AADLEN, %xmm0, %xmm1, %xmm2, %xmm3, \
 277	%xmm4, %xmm5, %xmm6
 278.endm
 279
 280# GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context
 281# struct has been initialized by GCM_INIT.
 282# Requires the input data be at least 1 byte long because of READ_PARTIAL_BLOCK
 283# Clobbers rax, r10-r13, and xmm0-xmm15
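# Hedged C view of the block scheduling below (names illustrative): after the
# leftover partial block is handled, 0-3 blocks are encrypted/decrypted first
# so the remainder is a multiple of four and can run through the 4-way loop.
#
#	size_t full_bytes = len & ~(size_t)15;		// and $-16, %r13
#	unsigned initial  = (full_bytes >> 4) & 3;	// full blocks mod 4
#	// 'initial' blocks via INITIAL_BLOCKS_ENC_DEC, the rest 4 at a time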
 284.macro GCM_ENC_DEC operation
 285	movdqu AadHash(%arg2), %xmm8
 286	movdqu HashKey(%arg2), %xmm13
 287	add %arg5, InLen(%arg2)
 288
 289	xor %r11, %r11 # initialise the data pointer offset as zero
 290	PARTIAL_BLOCK %arg3 %arg4 %arg5 %r11 %xmm8 \operation
 291
 292	sub %r11, %arg5		# sub partial block data used
 293	mov %arg5, %r13		# save the number of bytes
 294
 295	and $-16, %r13		# %r13 = %r13 - (%r13 mod 16)
 296	mov %r13, %r12
 297	# Encrypt/Decrypt first few blocks
 298
 299	and	$(3<<4), %r12
 300	jz	_initial_num_blocks_is_0_\@
 301	cmp	$(2<<4), %r12
 302	jb	_initial_num_blocks_is_1_\@
 303	je	_initial_num_blocks_is_2_\@
 304_initial_num_blocks_is_3_\@:
 305	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
 306%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, \operation
 307	sub	$48, %r13
 308	jmp	_initial_blocks_\@
 309_initial_num_blocks_is_2_\@:
 310	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
 311%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, \operation
 312	sub	$32, %r13
 313	jmp	_initial_blocks_\@
 314_initial_num_blocks_is_1_\@:
 315	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
 316%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, \operation
 317	sub	$16, %r13
 318	jmp	_initial_blocks_\@
 319_initial_num_blocks_is_0_\@:
 320	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
 321%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, \operation
 322_initial_blocks_\@:
 323
 324	# Main loop - Encrypt/Decrypt remaining blocks
 325
 326	cmp	$0, %r13
 327	je	_zero_cipher_left_\@
 328	sub	$64, %r13
 329	je	_four_cipher_left_\@
 330_crypt_by_4_\@:
 331	GHASH_4_ENCRYPT_4_PARALLEL_\operation	%xmm9, %xmm10, %xmm11, %xmm12, \
 332	%xmm13, %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, \
 333	%xmm7, %xmm8, enc
 334	add	$64, %r11
 335	sub	$64, %r13
 336	jne	_crypt_by_4_\@
 337_four_cipher_left_\@:
 338	GHASH_LAST_4	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
 339%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
 340_zero_cipher_left_\@:
 341	movdqu %xmm8, AadHash(%arg2)
 342	movdqu %xmm0, CurCount(%arg2)
 343
 344	mov	%arg5, %r13
 345	and	$15, %r13			# %r13 = arg5 (mod 16)
 346	je	_multiple_of_16_bytes_\@
 347
 348	mov %r13, PBlockLen(%arg2)
 349
 350	# Handle the last <16 Byte block separately
 351	paddd ONE(%rip), %xmm0                # INCR CNT to get Yn
 352	movdqu %xmm0, CurCount(%arg2)
 353	movdqa SHUF_MASK(%rip), %xmm10
 354	PSHUFB_XMM %xmm10, %xmm0
 355
 356	ENCRYPT_SINGLE_BLOCK	%xmm0, %xmm1        # Encrypt(K, Yn)
 357	movdqu %xmm0, PBlockEncKey(%arg2)
 358
 359	cmp	$16, %arg5
 360	jge _large_enough_update_\@
 361
 362	lea (%arg4,%r11,1), %r10
 363	mov %r13, %r12
 364	READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1
 365	jmp _data_read_\@
 366
 367_large_enough_update_\@:
 368	sub	$16, %r11
 369	add	%r13, %r11
 370
 371	# receive the last <16 Byte block
 372	movdqu	(%arg4, %r11, 1), %xmm1
 373
 374	sub	%r13, %r11
 375	add	$16, %r11
 376
 377	lea	SHIFT_MASK+16(%rip), %r12
 378	# adjust the shuffle mask pointer to be able to shift 16-r13 bytes
 379	# (r13 is the number of bytes in plaintext mod 16)
 380	sub	%r13, %r12
 381	# get the appropriate shuffle mask
 382	movdqu	(%r12), %xmm2
 383	# shift right 16-r13 bytes
 384	PSHUFB_XMM  %xmm2, %xmm1
 385
 386_data_read_\@:
 387	lea ALL_F+16(%rip), %r12
 388	sub %r13, %r12
 389
 390.ifc \operation, dec
 391	movdqa  %xmm1, %xmm2
 392.endif
 393	pxor	%xmm1, %xmm0            # XOR Encrypt(K, Yn)
 394	movdqu	(%r12), %xmm1
 395	# get the appropriate mask to mask out top 16-r13 bytes of xmm0
 396	pand	%xmm1, %xmm0            # mask out top 16-r13 bytes of xmm0
 397.ifc \operation, dec
 398	pand    %xmm1, %xmm2
 399	movdqa SHUF_MASK(%rip), %xmm10
 400	PSHUFB_XMM %xmm10 ,%xmm2
 401
 402	pxor %xmm2, %xmm8
 403.else
 404	movdqa SHUF_MASK(%rip), %xmm10
 405	PSHUFB_XMM %xmm10,%xmm0
 406
 407	pxor	%xmm0, %xmm8
 408.endif
 409
 410	movdqu %xmm8, AadHash(%arg2)
 411.ifc \operation, enc
 412	# GHASH computation for the last <16 byte block
 413	movdqa SHUF_MASK(%rip), %xmm10
 414	# shuffle xmm0 back to output as ciphertext
 415	PSHUFB_XMM %xmm10, %xmm0
 416.endif
 417
 418	# Output %r13 bytes
 419	MOVQ_R64_XMM %xmm0, %rax
 420	cmp $8, %r13
 421	jle _less_than_8_bytes_left_\@
 422	mov %rax, (%arg3 , %r11, 1)
 423	add $8, %r11
 424	psrldq $8, %xmm0
 425	MOVQ_R64_XMM %xmm0, %rax
 426	sub $8, %r13
 427_less_than_8_bytes_left_\@:
 428	mov %al,  (%arg3, %r11, 1)
 429	add $1, %r11
 430	shr $8, %rax
 431	sub $1, %r13
 432	jne _less_than_8_bytes_left_\@
 433_multiple_of_16_bytes_\@:
 434.endm
 435
 436# GCM_COMPLETE Finishes update of tag of last partial block
  437# Output: Authentication Tag (AUTH_TAG)
 438# Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
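# Hedged pseudocode sketch of the tag computation below:
#
#	S = GHASH(H, AAD, C, len(A) || len(C));	// final GHASH_MUL
#	T = E(K, Y0) ^ S;			// ENCRYPT_SINGLE_BLOCK + pxor
#	memcpy(auth_tag, &T, auth_tag_len);	// any tag length up to 16 bytes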
 439.macro GCM_COMPLETE AUTHTAG AUTHTAGLEN
 440	movdqu AadHash(%arg2), %xmm8
 441	movdqu HashKey(%arg2), %xmm13
 442
 443	mov PBlockLen(%arg2), %r12
 444
 445	cmp $0, %r12
 446	je _partial_done\@
 447
 448	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
 449
 450_partial_done\@:
  451	mov AadLen(%arg2), %r12  # %r12 = aadLen (number of bytes)
 452	shl	$3, %r12		  # convert into number of bits
 453	movd	%r12d, %xmm15		  # len(A) in %xmm15
 454	mov InLen(%arg2), %r12
  455	shl     $3, %r12                  # len(C) in bits (*8)
 456	MOVQ_R64_XMM    %r12, %xmm1
 457
 458	pslldq	$8, %xmm15		  # %xmm15 = len(A)||0x0000000000000000
 459	pxor	%xmm1, %xmm15		  # %xmm15 = len(A)||len(C)
 460	pxor	%xmm15, %xmm8
 461	GHASH_MUL	%xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
 462	# final GHASH computation
 463	movdqa SHUF_MASK(%rip), %xmm10
 464	PSHUFB_XMM %xmm10, %xmm8
 465
 466	movdqu OrigIV(%arg2), %xmm0       # %xmm0 = Y0
 467	ENCRYPT_SINGLE_BLOCK	%xmm0,  %xmm1	  # E(K, Y0)
 468	pxor	%xmm8, %xmm0
 469_return_T_\@:
 470	mov	\AUTHTAG, %r10                     # %r10 = authTag
 471	mov	\AUTHTAGLEN, %r11                    # %r11 = auth_tag_len
 472	cmp	$16, %r11
 473	je	_T_16_\@
 474	cmp	$8, %r11
 475	jl	_T_4_\@
 476_T_8_\@:
 477	MOVQ_R64_XMM	%xmm0, %rax
 478	mov	%rax, (%r10)
 479	add	$8, %r10
 480	sub	$8, %r11
 481	psrldq	$8, %xmm0
 482	cmp	$0, %r11
 483	je	_return_T_done_\@
 484_T_4_\@:
 485	movd	%xmm0, %eax
 486	mov	%eax, (%r10)
 487	add	$4, %r10
 488	sub	$4, %r11
 489	psrldq	$4, %xmm0
 490	cmp	$0, %r11
 491	je	_return_T_done_\@
 492_T_123_\@:
 493	movd	%xmm0, %eax
 494	cmp	$2, %r11
 495	jl	_T_1_\@
 496	mov	%ax, (%r10)
 497	cmp	$2, %r11
 498	je	_return_T_done_\@
 499	add	$2, %r10
 500	sar	$16, %eax
 501_T_1_\@:
 502	mov	%al, (%r10)
 503	jmp	_return_T_done_\@
 504_T_16_\@:
 505	movdqu	%xmm0, (%r10)
 506_return_T_done_\@:
 507.endm
 508
 509#ifdef __x86_64__
 510/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
 511*
 512*
 513* Input: A and B (128-bits each, bit-reflected)
 514* Output: C = A*B*x mod poly, (i.e. >>1 )
 515* To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
 516* GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
 517*
 518*/
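/*
* Hedged sketch of the Karatsuba split used below: with A = a1:a0 and
* B = b1:b0 (64-bit halves), the 128x128 carry-less product needs only three
* PCLMULQDQs instead of four:
*
*	hi  = clmul(a1, b1)                       // PCLMULQDQ 0x11
*	lo  = clmul(a0, b0)                       // PCLMULQDQ 0x00
*	mid = clmul(a1 ^ a0, b1 ^ b0) ^ hi ^ lo   // cross terms a1*b0 + a0*b1
*	product = (hi << 128) ^ (mid << 64) ^ lo
*
* The two reduction phases then fold this 256-bit value modulo
* x^128 + x^127 + x^126 + x^121 + 1 using only shifts and XORs.
*/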
 519.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
 520	movdqa	  \GH, \TMP1
 521	pshufd	  $78, \GH, \TMP2
 522	pshufd	  $78, \HK, \TMP3
 523	pxor	  \GH, \TMP2            # TMP2 = a1+a0
 524	pxor	  \HK, \TMP3            # TMP3 = b1+b0
 525	PCLMULQDQ 0x11, \HK, \TMP1     # TMP1 = a1*b1
 526	PCLMULQDQ 0x00, \HK, \GH       # GH = a0*b0
 527	PCLMULQDQ 0x00, \TMP3, \TMP2   # TMP2 = (a0+a1)*(b1+b0)
 528	pxor	  \GH, \TMP2
  529	pxor	  \TMP1, \TMP2          # TMP2 = (a1*b0)+(a0*b1)
 530	movdqa	  \TMP2, \TMP3
 531	pslldq	  $8, \TMP3             # left shift TMP3 2 DWs
 532	psrldq	  $8, \TMP2             # right shift TMP2 2 DWs
 533	pxor	  \TMP3, \GH
  534	pxor	  \TMP2, \TMP1          # TMP1:GH holds the result of GH*HK
 535
 536        # first phase of the reduction
 537
 538	movdqa    \GH, \TMP2
 539	movdqa    \GH, \TMP3
 540	movdqa    \GH, \TMP4            # copy GH into TMP2,TMP3 and TMP4
  541					# in order to perform
 542					# independent shifts
  543	pslld     $31, \TMP2            # packed left shift <<31
  544	pslld     $30, \TMP3            # packed left shift <<30
  545	pslld     $25, \TMP4            # packed left shift <<25
 546	pxor      \TMP3, \TMP2          # xor the shifted versions
 547	pxor      \TMP4, \TMP2
 548	movdqa    \TMP2, \TMP5
 549	psrldq    $4, \TMP5             # right shift TMP5 1 DW
 550	pslldq    $12, \TMP2            # left shift TMP2 3 DWs
 551	pxor      \TMP2, \GH
 552
 553        # second phase of the reduction
 554
 555	movdqa    \GH,\TMP2             # copy GH into TMP2,TMP3 and TMP4
  556					# in order to perform
 557					# independent shifts
 558	movdqa    \GH,\TMP3
 559	movdqa    \GH,\TMP4
  560	psrld     $1,\TMP2              # packed right shift >>1
  561	psrld     $2,\TMP3              # packed right shift >>2
  562	psrld     $7,\TMP4              # packed right shift >>7
 563	pxor      \TMP3,\TMP2		# xor the shifted versions
 564	pxor      \TMP4,\TMP2
 565	pxor      \TMP5, \TMP2
 566	pxor      \TMP2, \GH
  567	pxor      \TMP1, \GH            # result is in GH
 568.endm
 569
 570# Reads DLEN bytes starting at DPTR and stores in XMMDst
 571# where 0 < DLEN < 16
 572# Clobbers %rax, DLEN and XMM1
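# Hedged C sketch of the tail read below: bytes past DPTR+DLEN are never
# touched; the first qword (if present) is copied directly and the remaining
# bytes are assembled one at a time from the end, exactly like the
# _read_next_byte loops.
#
#	u64 lo = 0, hi = 0;
#	if (dlen >= 8) { memcpy(&lo, p, 8); p += 8; dlen -= 8; }
#	for (unsigned i = dlen; i; i--)
#		hi = (hi << 8) | p[i - 1];	// last byte read first
#	// lo fills the low qword of XMMDst, hi the high qword (pslldq $8)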
 573.macro READ_PARTIAL_BLOCK DPTR DLEN XMM1 XMMDst
 574        cmp $8, \DLEN
 575        jl _read_lt8_\@
 576        mov (\DPTR), %rax
 577        MOVQ_R64_XMM %rax, \XMMDst
 578        sub $8, \DLEN
 579        jz _done_read_partial_block_\@
 580	xor %eax, %eax
 581_read_next_byte_\@:
 582        shl $8, %rax
 583        mov 7(\DPTR, \DLEN, 1), %al
 584        dec \DLEN
 585        jnz _read_next_byte_\@
 586        MOVQ_R64_XMM %rax, \XMM1
 587	pslldq $8, \XMM1
 588        por \XMM1, \XMMDst
 589	jmp _done_read_partial_block_\@
 590_read_lt8_\@:
 591	xor %eax, %eax
 592_read_next_byte_lt8_\@:
 593        shl $8, %rax
 594        mov -1(\DPTR, \DLEN, 1), %al
 595        dec \DLEN
 596        jnz _read_next_byte_lt8_\@
 597        MOVQ_R64_XMM %rax, \XMMDst
 598_done_read_partial_block_\@:
 599.endm
 600
 601# CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
 602# clobbers r10-11, xmm14
 603.macro CALC_AAD_HASH HASHKEY AAD AADLEN TMP1 TMP2 TMP3 TMP4 TMP5 \
 604	TMP6 TMP7
 605	MOVADQ	   SHUF_MASK(%rip), %xmm14
 606	mov	   \AAD, %r10		# %r10 = AAD
 607	mov	   \AADLEN, %r11		# %r11 = aadLen
 608	pxor	   \TMP7, \TMP7
 609	pxor	   \TMP6, \TMP6
 610
 611	cmp	   $16, %r11
 612	jl	   _get_AAD_rest\@
 613_get_AAD_blocks\@:
 614	movdqu	   (%r10), \TMP7
 615	PSHUFB_XMM   %xmm14, \TMP7 # byte-reflect the AAD data
 616	pxor	   \TMP7, \TMP6
 617	GHASH_MUL  \TMP6, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
 618	add	   $16, %r10
 619	sub	   $16, %r11
 620	cmp	   $16, %r11
 621	jge	   _get_AAD_blocks\@
 622
 623	movdqu	   \TMP6, \TMP7
 
 
 624
 625	/* read the last <16B of AAD */
 626_get_AAD_rest\@:
 627	cmp	   $0, %r11
 628	je	   _get_AAD_done\@
 629
 630	READ_PARTIAL_BLOCK %r10, %r11, \TMP1, \TMP7
 631	PSHUFB_XMM   %xmm14, \TMP7 # byte-reflect the AAD data
 632	pxor	   \TMP6, \TMP7
 633	GHASH_MUL  \TMP7, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
 634	movdqu \TMP7, \TMP6
 635
 636_get_AAD_done\@:
 637	movdqu \TMP6, AadHash(%arg2)
 638.endm
 639
 640# PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks
 641# between update calls.
 642# Requires the input data be at least 1 byte long due to READ_PARTIAL_BLOCK
  643# Outputs encrypted bytes, and updates hash and partial info in gcm_context_data
 644# Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13
 645.macro PARTIAL_BLOCK CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \
 646	AAD_HASH operation
 647	mov 	PBlockLen(%arg2), %r13
 648	cmp	$0, %r13
 649	je	_partial_block_done_\@	# Leave Macro if no partial blocks
 650	# Read in input data without over reading
 651	cmp	$16, \PLAIN_CYPH_LEN
 652	jl	_fewer_than_16_bytes_\@
 653	movups	(\PLAIN_CYPH_IN), %xmm1	# If more than 16 bytes, just fill xmm
 654	jmp	_data_read_\@
 655
 656_fewer_than_16_bytes_\@:
 657	lea	(\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10
 658	mov	\PLAIN_CYPH_LEN, %r12
 659	READ_PARTIAL_BLOCK %r10 %r12 %xmm0 %xmm1
 660
 661	mov PBlockLen(%arg2), %r13
 662
 663_data_read_\@:				# Finished reading in data
 664
 665	movdqu	PBlockEncKey(%arg2), %xmm9
 666	movdqu	HashKey(%arg2), %xmm13
 667
 668	lea	SHIFT_MASK(%rip), %r12
 669
 670	# adjust the shuffle mask pointer to be able to shift r13 bytes
  671	# (r13 is the number of bytes in plaintext mod 16)
 672	add	%r13, %r12
 673	movdqu	(%r12), %xmm2		# get the appropriate shuffle mask
 674	PSHUFB_XMM %xmm2, %xmm9		# shift right r13 bytes
 675
 676.ifc \operation, dec
 677	movdqa	%xmm1, %xmm3
 678	pxor	%xmm1, %xmm9		# Cyphertext XOR E(K, Yn)
 679
 680	mov	\PLAIN_CYPH_LEN, %r10
 681	add	%r13, %r10
 682	# Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
 683	sub	$16, %r10
  684	# Determine if partial block is not being filled and
 685	# shift mask accordingly
 686	jge	_no_extra_mask_1_\@
 687	sub	%r10, %r12
 688_no_extra_mask_1_\@:
 689
 690	movdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
 691	# get the appropriate mask to mask out bottom r13 bytes of xmm9
 692	pand	%xmm1, %xmm9		# mask out bottom r13 bytes of xmm9
 
 693
 694	pand	%xmm1, %xmm3
 695	movdqa	SHUF_MASK(%rip), %xmm10
 696	PSHUFB_XMM	%xmm10, %xmm3
 697	PSHUFB_XMM	%xmm2, %xmm3
 698	pxor	%xmm3, \AAD_HASH
 699
 700	cmp	$0, %r10
 701	jl	_partial_incomplete_1_\@
 702
 703	# GHASH computation for the last <16 Byte block
 704	GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
 705	xor	%rax,%rax
 706
 707	mov	%rax, PBlockLen(%arg2)
 708	jmp	_dec_done_\@
 709_partial_incomplete_1_\@:
 710	add	\PLAIN_CYPH_LEN, PBlockLen(%arg2)
 711_dec_done_\@:
 712	movdqu	\AAD_HASH, AadHash(%arg2)
 713.else
 714	pxor	%xmm1, %xmm9			# Plaintext XOR E(K, Yn)
 715
 716	mov	\PLAIN_CYPH_LEN, %r10
 717	add	%r13, %r10
 718	# Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
 719	sub	$16, %r10
  720	# Determine if partial block is not being filled and
 721	# shift mask accordingly
 722	jge	_no_extra_mask_2_\@
 723	sub	%r10, %r12
 724_no_extra_mask_2_\@:
 725
 726	movdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
 727	# get the appropriate mask to mask out bottom r13 bytes of xmm9
 728	pand	%xmm1, %xmm9
 729
 730	movdqa	SHUF_MASK(%rip), %xmm1
 731	PSHUFB_XMM %xmm1, %xmm9
 732	PSHUFB_XMM %xmm2, %xmm9
 733	pxor	%xmm9, \AAD_HASH
 734
 735	cmp	$0, %r10
 736	jl	_partial_incomplete_2_\@
 737
 738	# GHASH computation for the last <16 Byte block
 739	GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
 740	xor	%rax,%rax
 741
 742	mov	%rax, PBlockLen(%arg2)
 743	jmp	_encode_done_\@
 744_partial_incomplete_2_\@:
 745	add	\PLAIN_CYPH_LEN, PBlockLen(%arg2)
 746_encode_done_\@:
 747	movdqu	\AAD_HASH, AadHash(%arg2)
 748
 749	movdqa	SHUF_MASK(%rip), %xmm10
 750	# shuffle xmm9 back to output as ciphertext
 751	PSHUFB_XMM	%xmm10, %xmm9
 752	PSHUFB_XMM	%xmm2, %xmm9
 753.endif
 754	# output encrypted Bytes
 755	cmp	$0, %r10
 756	jl	_partial_fill_\@
 757	mov	%r13, %r12
 758	mov	$16, %r13
 759	# Set r13 to be the number of bytes to write out
 760	sub	%r12, %r13
 761	jmp	_count_set_\@
 762_partial_fill_\@:
 763	mov	\PLAIN_CYPH_LEN, %r13
 764_count_set_\@:
 765	movdqa	%xmm9, %xmm0
 766	MOVQ_R64_XMM	%xmm0, %rax
 767	cmp	$8, %r13
 768	jle	_less_than_8_bytes_left_\@
 769
 770	mov	%rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
 771	add	$8, \DATA_OFFSET
 772	psrldq	$8, %xmm0
 773	MOVQ_R64_XMM	%xmm0, %rax
 774	sub	$8, %r13
 775_less_than_8_bytes_left_\@:
 776	movb	%al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
 777	add	$1, \DATA_OFFSET
 778	shr	$8, %rax
 779	sub	$1, %r13
 780	jne	_less_than_8_bytes_left_\@
 781_partial_block_done_\@:
 782.endm # PARTIAL_BLOCK
 783
 784/*
 785* if a = number of total plaintext bytes
 786* b = floor(a/16)
 787* num_initial_blocks = b mod 4
 788* encrypt the initial num_initial_blocks blocks and apply ghash on
 789* the ciphertext
 790* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
 791* are clobbered
 792* arg1, %arg2, %arg3 are used as a pointer only, not modified
 793*/
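/*
* Worked example (hedged): for a = 100 plaintext bytes, b = floor(100/16) = 6
* full blocks, so num_initial_blocks = 6 mod 4 = 2; two blocks are handled by
* this macro and the remaining four run through the 4-way parallel main loop
* (the final 4 bytes are the partial-block tail).
*/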
 794
 795
 796.macro INITIAL_BLOCKS_ENC_DEC TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
 797	XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
 798	MOVADQ		SHUF_MASK(%rip), %xmm14
 799
  800	movdqu AadHash(%arg2), %xmm\i		    # xmm\i = AadHash
 801
 802	# start AES for num_initial_blocks blocks
 803
 804	movdqu CurCount(%arg2), \XMM0                # XMM0 = Y0
 805
 806.if (\i == 5) || (\i == 6) || (\i == 7)
 807
 808	MOVADQ		ONE(%RIP),\TMP1
 809	MOVADQ		0(%arg1),\TMP2
 810.irpc index, \i_seq
 811	paddd		\TMP1, \XMM0                 # INCR Y0
 812.ifc \operation, dec
 813        movdqa     \XMM0, %xmm\index
 814.else
 815	MOVADQ		\XMM0, %xmm\index
 816.endif
 817	PSHUFB_XMM	%xmm14, %xmm\index      # perform a 16 byte swap
 818	pxor		\TMP2, %xmm\index
 819.endr
 820	lea	0x10(%arg1),%r10
 821	mov	keysize,%eax
 822	shr	$2,%eax				# 128->4, 192->6, 256->8
 823	add	$5,%eax			      # 128->9, 192->11, 256->13
 824
 825aes_loop_initial_\@:
 826	MOVADQ	(%r10),\TMP1
 827.irpc	index, \i_seq
 828	AESENC	\TMP1, %xmm\index
 829.endr
 830	add	$16,%r10
 831	sub	$1,%eax
 832	jnz	aes_loop_initial_\@
 833
 834	MOVADQ	(%r10), \TMP1
 835.irpc index, \i_seq
 836	AESENCLAST \TMP1, %xmm\index         # Last Round
 837.endr
 838.irpc index, \i_seq
 839	movdqu	   (%arg4 , %r11, 1), \TMP1
 840	pxor	   \TMP1, %xmm\index
 841	movdqu	   %xmm\index, (%arg3 , %r11, 1)
 842	# write back plaintext/ciphertext for num_initial_blocks
 843	add	   $16, %r11
 844
 845.ifc \operation, dec
 846	movdqa     \TMP1, %xmm\index
 847.endif
 848	PSHUFB_XMM	   %xmm14, %xmm\index
 849
 850		# prepare plaintext/ciphertext for GHASH computation
 851.endr
 852.endif
 853
 854        # apply GHASH on num_initial_blocks blocks
 855
 856.if \i == 5
 857        pxor       %xmm5, %xmm6
 858	GHASH_MUL  %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 859        pxor       %xmm6, %xmm7
 860	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 861        pxor       %xmm7, %xmm8
 862	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 863.elseif \i == 6
 864        pxor       %xmm6, %xmm7
 865	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 866        pxor       %xmm7, %xmm8
 867	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 868.elseif \i == 7
 869        pxor       %xmm7, %xmm8
 870	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 871.endif
 872	cmp	   $64, %r13
 873	jl	_initial_blocks_done\@
 874	# no need for precomputed values
 875/*
 876*
 877* Precomputations for HashKey parallel with encryption of first 4 blocks.
 878* HashKey_i_k holds XORed values of the low and high parts of HashKey_i
 879*/
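/*
* Added note on the Karatsuba caching above: splitting 128-bit operands
* into 64-bit halves a = a1:a0 and b = b1:b0, the carry-less product is
*
*	a*b = (a1*b1)<<128 xor a0*b0
*	      xor (((a1 xor a0)*(b1 xor b0)) xor a1*b1 xor a0*b0)<<64
*
* so each HashKey_i_k slot caches the (b1 xor b0) half-sum for HashKey_i
* and the middle product needs only a single PCLMULQDQ per block.
*/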
 880	MOVADQ	   ONE(%RIP),\TMP1
 881	paddd	   \TMP1, \XMM0              # INCR Y0
 882	MOVADQ	   \XMM0, \XMM1
 883	PSHUFB_XMM  %xmm14, \XMM1        # perform a 16 byte swap
 884
 885	paddd	   \TMP1, \XMM0              # INCR Y0
 886	MOVADQ	   \XMM0, \XMM2
 887	PSHUFB_XMM  %xmm14, \XMM2        # perform a 16 byte swap
 888
 889	paddd	   \TMP1, \XMM0              # INCR Y0
 890	MOVADQ	   \XMM0, \XMM3
 891	PSHUFB_XMM %xmm14, \XMM3        # perform a 16 byte swap
 892
 893	paddd	   \TMP1, \XMM0              # INCR Y0
 894	MOVADQ	   \XMM0, \XMM4
 895	PSHUFB_XMM %xmm14, \XMM4        # perform a 16 byte swap
 896
 897	MOVADQ	   0(%arg1),\TMP1
 898	pxor	   \TMP1, \XMM1
 899	pxor	   \TMP1, \XMM2
 900	pxor	   \TMP1, \XMM3
 901	pxor	   \TMP1, \XMM4
 902.irpc index, 1234 # do 4 rounds
 903	movaps 0x10*\index(%arg1), \TMP1
 904	AESENC	   \TMP1, \XMM1
 905	AESENC	   \TMP1, \XMM2
 906	AESENC	   \TMP1, \XMM3
 907	AESENC	   \TMP1, \XMM4
 908.endr
 909.irpc index, 56789 # do next 5 rounds
 910	movaps 0x10*\index(%arg1), \TMP1
 911	AESENC	   \TMP1, \XMM1
 912	AESENC	   \TMP1, \XMM2
 913	AESENC	   \TMP1, \XMM3
 914	AESENC	   \TMP1, \XMM4
 915.endr
 916	lea	   0xa0(%arg1),%r10
 917	mov	   keysize,%eax
 918	shr	   $2,%eax			# 128->4, 192->6, 256->8
 919	sub	   $4,%eax			# 128->0, 192->2, 256->4
 920	jz	   aes_loop_pre_done\@
 921
 922aes_loop_pre_\@:
 923	MOVADQ	   (%r10),\TMP2
 924.irpc	index, 1234
 925	AESENC	   \TMP2, %xmm\index
 926.endr
 927	add	   $16,%r10
 928	sub	   $1,%eax
 929	jnz	   aes_loop_pre_\@
 930
 931aes_loop_pre_done\@:
 932	MOVADQ	   (%r10), \TMP2
 933	AESENCLAST \TMP2, \XMM1
 934	AESENCLAST \TMP2, \XMM2
 935	AESENCLAST \TMP2, \XMM3
 936	AESENCLAST \TMP2, \XMM4
 937	movdqu	   16*0(%arg4 , %r11 , 1), \TMP1
 938	pxor	   \TMP1, \XMM1
 939.ifc \operation, dec
 940	movdqu     \XMM1, 16*0(%arg3 , %r11 , 1)
 941	movdqa     \TMP1, \XMM1
 942.endif
 943	movdqu	   16*1(%arg4 , %r11 , 1), \TMP1
 944	pxor	   \TMP1, \XMM2
 945.ifc \operation, dec
 946	movdqu     \XMM2, 16*1(%arg3 , %r11 , 1)
 947	movdqa     \TMP1, \XMM2
 948.endif
 949	movdqu	   16*2(%arg4 , %r11 , 1), \TMP1
 950	pxor	   \TMP1, \XMM3
 951.ifc \operation, dec
 952	movdqu     \XMM3, 16*2(%arg3 , %r11 , 1)
 953	movdqa     \TMP1, \XMM3
 954.endif
 955	movdqu	   16*3(%arg4 , %r11 , 1), \TMP1
 956	pxor	   \TMP1, \XMM4
 957.ifc \operation, dec
 958	movdqu     \XMM4, 16*3(%arg3 , %r11 , 1)
 959	movdqa     \TMP1, \XMM4
 960.else
 961	movdqu     \XMM1, 16*0(%arg3 , %r11 , 1)
 962	movdqu     \XMM2, 16*1(%arg3 , %r11 , 1)
 963	movdqu     \XMM3, 16*2(%arg3 , %r11 , 1)
 964	movdqu     \XMM4, 16*3(%arg3 , %r11 , 1)
 965.endif
 966
 967	add	   $64, %r11
 968	PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
 969	pxor	   \XMMDst, \XMM1
 970# combine GHASHed value with the corresponding ciphertext
 971	PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
 972	PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
 973	PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
 974
 975_initial_blocks_done\@:
 976
 977.endm
 978
 979/*
 980* encrypt 4 blocks at a time
 981* ghash the 4 previously encrypted ciphertext blocks
 982* arg1, %arg3, %arg4 are used as pointers only, not modified
 983* %r11 is the data offset value
 984*/
 985.macro GHASH_4_ENCRYPT_4_PARALLEL_ENC TMP1 TMP2 TMP3 TMP4 TMP5 \
 986TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
 987
 988	movdqa	  \XMM1, \XMM5
 989	movdqa	  \XMM2, \XMM6
 990	movdqa	  \XMM3, \XMM7
 991	movdqa	  \XMM4, \XMM8
 992
 993        movdqa    SHUF_MASK(%rip), %xmm15
 994        # multiply TMP5 * HashKey using karatsuba
 995
 996	movdqa	  \XMM5, \TMP4
 997	pshufd	  $78, \XMM5, \TMP6
 998	pxor	  \XMM5, \TMP6
 999	paddd     ONE(%rip), \XMM0		# INCR CNT
1000	movdqa	  HashKey_4(%arg2), \TMP5
1001	PCLMULQDQ 0x11, \TMP5, \TMP4           # TMP4 = a1*b1
1002	movdqa    \XMM0, \XMM1
1003	paddd     ONE(%rip), \XMM0		# INCR CNT
1004	movdqa    \XMM0, \XMM2
1005	paddd     ONE(%rip), \XMM0		# INCR CNT
1006	movdqa    \XMM0, \XMM3
1007	paddd     ONE(%rip), \XMM0		# INCR CNT
1008	movdqa    \XMM0, \XMM4
1009	PSHUFB_XMM %xmm15, \XMM1	# perform a 16 byte swap
1010	PCLMULQDQ 0x00, \TMP5, \XMM5           # XMM5 = a0*b0
1011	PSHUFB_XMM %xmm15, \XMM2	# perform a 16 byte swap
1012	PSHUFB_XMM %xmm15, \XMM3	# perform a 16 byte swap
1013	PSHUFB_XMM %xmm15, \XMM4	# perform a 16 byte swap
1014
1015	pxor	  (%arg1), \XMM1
1016	pxor	  (%arg1), \XMM2
1017	pxor	  (%arg1), \XMM3
1018	pxor	  (%arg1), \XMM4
1019	movdqa	  HashKey_4_k(%arg2), \TMP5
1020	PCLMULQDQ 0x00, \TMP5, \TMP6           # TMP6 = (a1+a0)*(b1+b0)
1021	movaps 0x10(%arg1), \TMP1
1022	AESENC	  \TMP1, \XMM1              # Round 1
1023	AESENC	  \TMP1, \XMM2
1024	AESENC	  \TMP1, \XMM3
1025	AESENC	  \TMP1, \XMM4
1026	movaps 0x20(%arg1), \TMP1
1027	AESENC	  \TMP1, \XMM1              # Round 2
1028	AESENC	  \TMP1, \XMM2
1029	AESENC	  \TMP1, \XMM3
1030	AESENC	  \TMP1, \XMM4
1031	movdqa	  \XMM6, \TMP1
1032	pshufd	  $78, \XMM6, \TMP2
1033	pxor	  \XMM6, \TMP2
1034	movdqa	  HashKey_3(%arg2), \TMP5
1035	PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1 * b1
1036	movaps 0x30(%arg1), \TMP3
1037	AESENC    \TMP3, \XMM1              # Round 3
1038	AESENC    \TMP3, \XMM2
1039	AESENC    \TMP3, \XMM3
1040	AESENC    \TMP3, \XMM4
1041	PCLMULQDQ 0x00, \TMP5, \XMM6           # XMM6 = a0*b0
1042	movaps 0x40(%arg1), \TMP3
1043	AESENC	  \TMP3, \XMM1              # Round 4
1044	AESENC	  \TMP3, \XMM2
1045	AESENC	  \TMP3, \XMM3
1046	AESENC	  \TMP3, \XMM4
1047	movdqa	  HashKey_3_k(%arg2), \TMP5
1048	PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
1049	movaps 0x50(%arg1), \TMP3
1050	AESENC	  \TMP3, \XMM1              # Round 5
1051	AESENC	  \TMP3, \XMM2
1052	AESENC	  \TMP3, \XMM3
1053	AESENC	  \TMP3, \XMM4
1054	pxor	  \TMP1, \TMP4
1055# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1056	pxor	  \XMM6, \XMM5
1057	pxor	  \TMP2, \TMP6
1058	movdqa	  \XMM7, \TMP1
1059	pshufd	  $78, \XMM7, \TMP2
1060	pxor	  \XMM7, \TMP2
1061	movdqa	  HashKey_2(%arg2), \TMP5
1062
1063        # Multiply TMP5 * HashKey using karatsuba
1064
1065	PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1*b1
1066	movaps 0x60(%arg1), \TMP3
1067	AESENC	  \TMP3, \XMM1              # Round 6
1068	AESENC	  \TMP3, \XMM2
1069	AESENC	  \TMP3, \XMM3
1070	AESENC	  \TMP3, \XMM4
1071	PCLMULQDQ 0x00, \TMP5, \XMM7           # XMM7 = a0*b0
1072	movaps 0x70(%arg1), \TMP3
1073	AESENC	  \TMP3, \XMM1             # Round 7
1074	AESENC	  \TMP3, \XMM2
1075	AESENC	  \TMP3, \XMM3
1076	AESENC	  \TMP3, \XMM4
1077	movdqa	  HashKey_2_k(%arg2), \TMP5
1078	PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
1079	movaps 0x80(%arg1), \TMP3
1080	AESENC	  \TMP3, \XMM1             # Round 8
1081	AESENC	  \TMP3, \XMM2
1082	AESENC	  \TMP3, \XMM3
1083	AESENC	  \TMP3, \XMM4
1084	pxor	  \TMP1, \TMP4
1085# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1086	pxor	  \XMM7, \XMM5
1087	pxor	  \TMP2, \TMP6
1088
1089        # Multiply XMM8 * HashKey
1090        # XMM8 and TMP5 hold the values for the two operands
1091
1092	movdqa	  \XMM8, \TMP1
1093	pshufd	  $78, \XMM8, \TMP2
1094	pxor	  \XMM8, \TMP2
1095	movdqa	  HashKey(%arg2), \TMP5
1096	PCLMULQDQ 0x11, \TMP5, \TMP1          # TMP1 = a1*b1
1097	movaps 0x90(%arg1), \TMP3
1098	AESENC	  \TMP3, \XMM1            # Round 9
1099	AESENC	  \TMP3, \XMM2
1100	AESENC	  \TMP3, \XMM3
1101	AESENC	  \TMP3, \XMM4
1102	PCLMULQDQ 0x00, \TMP5, \XMM8          # XMM8 = a0*b0
1103	lea	  0xa0(%arg1),%r10
1104	mov	  keysize,%eax
1105	shr	  $2,%eax			# 128->4, 192->6, 256->8
1106	sub	  $4,%eax			# 128->0, 192->2, 256->4
1107	jz	  aes_loop_par_enc_done\@
1108
1109aes_loop_par_enc\@:
1110	MOVADQ	  (%r10),\TMP3
1111.irpc	index, 1234
1112	AESENC	  \TMP3, %xmm\index
1113.endr
1114	add	  $16,%r10
1115	sub	  $1,%eax
1116	jnz	  aes_loop_par_enc\@
1117
1118aes_loop_par_enc_done\@:
1119	MOVADQ	  (%r10), \TMP3
1120	AESENCLAST \TMP3, \XMM1           # Round 10
1121	AESENCLAST \TMP3, \XMM2
1122	AESENCLAST \TMP3, \XMM3
1123	AESENCLAST \TMP3, \XMM4
1124	movdqa    HashKey_k(%arg2), \TMP5
1125	PCLMULQDQ 0x00, \TMP5, \TMP2          # TMP2 = (a1+a0)*(b1+b0)
1126	movdqu	  (%arg4,%r11,1), \TMP3
1127	pxor	  \TMP3, \XMM1                 # Ciphertext/Plaintext XOR EK
1128	movdqu	  16(%arg4,%r11,1), \TMP3
1129	pxor	  \TMP3, \XMM2                 # Ciphertext/Plaintext XOR EK
1130	movdqu	  32(%arg4,%r11,1), \TMP3
1131	pxor	  \TMP3, \XMM3                 # Ciphertext/Plaintext XOR EK
1132	movdqu	  48(%arg4,%r11,1), \TMP3
1133	pxor	  \TMP3, \XMM4                 # Ciphertext/Plaintext XOR EK
1134        movdqu    \XMM1, (%arg3,%r11,1)        # Write to the ciphertext buffer
1135        movdqu    \XMM2, 16(%arg3,%r11,1)      # Write to the ciphertext buffer
1136        movdqu    \XMM3, 32(%arg3,%r11,1)      # Write to the ciphertext buffer
1137        movdqu    \XMM4, 48(%arg3,%r11,1)      # Write to the ciphertext buffer
1138	PSHUFB_XMM %xmm15, \XMM1        # perform a 16 byte swap
1139	PSHUFB_XMM %xmm15, \XMM2	# perform a 16 byte swap
1140	PSHUFB_XMM %xmm15, \XMM3	# perform a 16 byte swap
1141	PSHUFB_XMM %xmm15, \XMM4	# perform a 16 byte swap
1142
1143	pxor	  \TMP4, \TMP1
1144	pxor	  \XMM8, \XMM5
1145	pxor	  \TMP6, \TMP2
1146	pxor	  \TMP1, \TMP2
1147	pxor	  \XMM5, \TMP2
1148	movdqa	  \TMP2, \TMP3
1149	pslldq	  $8, \TMP3                    # left shift TMP3 2 DWs
1150	psrldq	  $8, \TMP2                    # right shift TMP2 2 DWs
1151	pxor	  \TMP3, \XMM5
1152	pxor	  \TMP2, \TMP1	  # accumulate the results in TMP1:XMM5
1153
1154        # first phase of reduction
1155
1156	movdqa    \XMM5, \TMP2
1157	movdqa    \XMM5, \TMP3
1158	movdqa    \XMM5, \TMP4
1159# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
1160	pslld     $31, \TMP2                   # packed left shift << 31
1161	pslld     $30, \TMP3                   # packed left shift << 30
1162	pslld     $25, \TMP4                   # packed left shift << 25
1163	pxor      \TMP3, \TMP2	               # xor the shifted versions
1164	pxor      \TMP4, \TMP2
1165	movdqa    \TMP2, \TMP5
1166	psrldq    $4, \TMP5                    # right shift T5 1 DW
1167	pslldq    $12, \TMP2                   # left shift T2 3 DWs
1168	pxor      \TMP2, \XMM5
1169
1170        # second phase of reduction
1171
1172	movdqa    \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
1173	movdqa    \XMM5,\TMP3
1174	movdqa    \XMM5,\TMP4
1175	psrld     $1, \TMP2                    # packed right shift >> 1
1176	psrld     $2, \TMP3                    # packed right shift >> 2
1177	psrld     $7, \TMP4                    # packed right shift >> 7
1178	pxor      \TMP3,\TMP2		       # xor the shifted versions
1179	pxor      \TMP4,\TMP2
1180	pxor      \TMP5, \TMP2
1181	pxor      \TMP2, \XMM5
1182	pxor      \TMP1, \XMM5                 # result is in XMM5
1183
1184	pxor	  \XMM5, \XMM1
1185.endm
1186
1187/*
1188* decrypt 4 blocks at a time
1189* ghash the 4 previously decrypted ciphertext blocks
1190* arg1, %arg3, %arg4 are used as pointers only, not modified
1191* %r11 is the data offset value
1192*/
1193.macro GHASH_4_ENCRYPT_4_PARALLEL_DEC TMP1 TMP2 TMP3 TMP4 TMP5 \
1194TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
1195
1196	movdqa	  \XMM1, \XMM5
1197	movdqa	  \XMM2, \XMM6
1198	movdqa	  \XMM3, \XMM7
1199	movdqa	  \XMM4, \XMM8
1200
1201        movdqa    SHUF_MASK(%rip), %xmm15
1202        # multiply TMP5 * HashKey using karatsuba
1203
1204	movdqa	  \XMM5, \TMP4
1205	pshufd	  $78, \XMM5, \TMP6
1206	pxor	  \XMM5, \TMP6
1207	paddd     ONE(%rip), \XMM0		# INCR CNT
1208	movdqa	  HashKey_4(%arg2), \TMP5
1209	PCLMULQDQ 0x11, \TMP5, \TMP4           # TMP4 = a1*b1
1210	movdqa    \XMM0, \XMM1
1211	paddd     ONE(%rip), \XMM0		# INCR CNT
1212	movdqa    \XMM0, \XMM2
1213	paddd     ONE(%rip), \XMM0		# INCR CNT
1214	movdqa    \XMM0, \XMM3
1215	paddd     ONE(%rip), \XMM0		# INCR CNT
1216	movdqa    \XMM0, \XMM4
1217	PSHUFB_XMM %xmm15, \XMM1	# perform a 16 byte swap
1218	PCLMULQDQ 0x00, \TMP5, \XMM5           # XMM5 = a0*b0
1219	PSHUFB_XMM %xmm15, \XMM2	# perform a 16 byte swap
1220	PSHUFB_XMM %xmm15, \XMM3	# perform a 16 byte swap
1221	PSHUFB_XMM %xmm15, \XMM4	# perform a 16 byte swap
1222
1223	pxor	  (%arg1), \XMM1
1224	pxor	  (%arg1), \XMM2
1225	pxor	  (%arg1), \XMM3
1226	pxor	  (%arg1), \XMM4
1227	movdqa	  HashKey_4_k(%arg2), \TMP5
1228	PCLMULQDQ 0x00, \TMP5, \TMP6           # TMP6 = (a1+a0)*(b1+b0)
1229	movaps 0x10(%arg1), \TMP1
1230	AESENC	  \TMP1, \XMM1              # Round 1
1231	AESENC	  \TMP1, \XMM2
1232	AESENC	  \TMP1, \XMM3
1233	AESENC	  \TMP1, \XMM4
1234	movaps 0x20(%arg1), \TMP1
1235	AESENC	  \TMP1, \XMM1              # Round 2
1236	AESENC	  \TMP1, \XMM2
1237	AESENC	  \TMP1, \XMM3
1238	AESENC	  \TMP1, \XMM4
1239	movdqa	  \XMM6, \TMP1
1240	pshufd	  $78, \XMM6, \TMP2
1241	pxor	  \XMM6, \TMP2
1242	movdqa	  HashKey_3(%arg2), \TMP5
1243	PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1 * b1
1244	movaps 0x30(%arg1), \TMP3
1245	AESENC    \TMP3, \XMM1              # Round 3
1246	AESENC    \TMP3, \XMM2
1247	AESENC    \TMP3, \XMM3
1248	AESENC    \TMP3, \XMM4
1249	PCLMULQDQ 0x00, \TMP5, \XMM6           # XMM6 = a0*b0
1250	movaps 0x40(%arg1), \TMP3
1251	AESENC	  \TMP3, \XMM1              # Round 4
1252	AESENC	  \TMP3, \XMM2
1253	AESENC	  \TMP3, \XMM3
1254	AESENC	  \TMP3, \XMM4
1255	movdqa	  HashKey_3_k(%arg2), \TMP5
1256	PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
1257	movaps 0x50(%arg1), \TMP3
1258	AESENC	  \TMP3, \XMM1              # Round 5
1259	AESENC	  \TMP3, \XMM2
1260	AESENC	  \TMP3, \XMM3
1261	AESENC	  \TMP3, \XMM4
1262	pxor	  \TMP1, \TMP4
1263# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1264	pxor	  \XMM6, \XMM5
1265	pxor	  \TMP2, \TMP6
1266	movdqa	  \XMM7, \TMP1
1267	pshufd	  $78, \XMM7, \TMP2
1268	pxor	  \XMM7, \TMP2
1269	movdqa	  HashKey_2(%arg2), \TMP5
1270
1271        # Multiply TMP5 * HashKey using karatsuba
1272
1273	PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1*b1
1274	movaps 0x60(%arg1), \TMP3
1275	AESENC	  \TMP3, \XMM1              # Round 6
1276	AESENC	  \TMP3, \XMM2
1277	AESENC	  \TMP3, \XMM3
1278	AESENC	  \TMP3, \XMM4
1279	PCLMULQDQ 0x00, \TMP5, \XMM7           # XMM7 = a0*b0
1280	movaps 0x70(%arg1), \TMP3
1281	AESENC	  \TMP3, \XMM1             # Round 7
1282	AESENC	  \TMP3, \XMM2
1283	AESENC	  \TMP3, \XMM3
1284	AESENC	  \TMP3, \XMM4
1285	movdqa	  HashKey_2_k(%arg2), \TMP5
1286	PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
1287	movaps 0x80(%arg1), \TMP3
1288	AESENC	  \TMP3, \XMM1             # Round 8
1289	AESENC	  \TMP3, \XMM2
1290	AESENC	  \TMP3, \XMM3
1291	AESENC	  \TMP3, \XMM4
1292	pxor	  \TMP1, \TMP4
1293# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1294	pxor	  \XMM7, \XMM5
1295	pxor	  \TMP2, \TMP6
1296
1297        # Multiply XMM8 * HashKey
1298        # XMM8 and TMP5 hold the values for the two operands
1299
1300	movdqa	  \XMM8, \TMP1
1301	pshufd	  $78, \XMM8, \TMP2
1302	pxor	  \XMM8, \TMP2
1303	movdqa	  HashKey(%arg2), \TMP5
1304	PCLMULQDQ 0x11, \TMP5, \TMP1          # TMP1 = a1*b1
1305	movaps 0x90(%arg1), \TMP3
1306	AESENC	  \TMP3, \XMM1            # Round 9
1307	AESENC	  \TMP3, \XMM2
1308	AESENC	  \TMP3, \XMM3
1309	AESENC	  \TMP3, \XMM4
1310	PCLMULQDQ 0x00, \TMP5, \XMM8          # XMM8 = a0*b0
1311	lea	  0xa0(%arg1),%r10
1312	mov	  keysize,%eax
1313	shr	  $2,%eax		        # 128->4, 192->6, 256->8
1314	sub	  $4,%eax			# 128->0, 192->2, 256->4
1315	jz	  aes_loop_par_dec_done\@
1316
1317aes_loop_par_dec\@:
1318	MOVADQ	  (%r10),\TMP3
1319.irpc	index, 1234
1320	AESENC	  \TMP3, %xmm\index
1321.endr
1322	add	  $16,%r10
1323	sub	  $1,%eax
1324	jnz	  aes_loop_par_dec\@
1325
1326aes_loop_par_dec_done\@:
1327	MOVADQ	  (%r10), \TMP3
1328	AESENCLAST \TMP3, \XMM1           # last round
1329	AESENCLAST \TMP3, \XMM2
1330	AESENCLAST \TMP3, \XMM3
1331	AESENCLAST \TMP3, \XMM4
1332	movdqa    HashKey_k(%arg2), \TMP5
1333	PCLMULQDQ 0x00, \TMP5, \TMP2          # TMP2 = (a1+a0)*(b1+b0)
1334	movdqu	  (%arg4,%r11,1), \TMP3
1335	pxor	  \TMP3, \XMM1                 # Ciphertext/Plaintext XOR EK
1336	movdqu	  \XMM1, (%arg3,%r11,1)        # Write to plaintext buffer
1337	movdqa    \TMP3, \XMM1
1338	movdqu	  16(%arg4,%r11,1), \TMP3
1339	pxor	  \TMP3, \XMM2                 # Ciphertext/Plaintext XOR EK
1340	movdqu	  \XMM2, 16(%arg3,%r11,1)      # Write to plaintext buffer
1341	movdqa    \TMP3, \XMM2
1342	movdqu	  32(%arg4,%r11,1), \TMP3
1343	pxor	  \TMP3, \XMM3                 # Ciphertext/Plaintext XOR EK
1344	movdqu	  \XMM3, 32(%arg3,%r11,1)      # Write to plaintext buffer
1345	movdqa    \TMP3, \XMM3
1346	movdqu	  48(%arg4,%r11,1), \TMP3
1347	pxor	  \TMP3, \XMM4                 # Ciphertext/Plaintext XOR EK
1348	movdqu	  \XMM4, 48(%arg3,%r11,1)      # Write to plaintext buffer
1349	movdqa    \TMP3, \XMM4
1350	PSHUFB_XMM %xmm15, \XMM1        # perform a 16 byte swap
1351	PSHUFB_XMM %xmm15, \XMM2	# perform a 16 byte swap
1352	PSHUFB_XMM %xmm15, \XMM3	# perform a 16 byte swap
1353	PSHUFB_XMM %xmm15, \XMM4	# perform a 16 byte swap
1354
1355	pxor	  \TMP4, \TMP1
1356	pxor	  \XMM8, \XMM5
1357	pxor	  \TMP6, \TMP2
1358	pxor	  \TMP1, \TMP2
1359	pxor	  \XMM5, \TMP2
1360	movdqa	  \TMP2, \TMP3
1361	pslldq	  $8, \TMP3                    # left shift TMP3 2 DWs
1362	psrldq	  $8, \TMP2                    # right shift TMP2 2 DWs
1363	pxor	  \TMP3, \XMM5
1364	pxor	  \TMP2, \TMP1	  # accumulate the results in TMP1:XMM5
1365
1366        # first phase of reduction
1367
1368	movdqa    \XMM5, \TMP2
1369	movdqa    \XMM5, \TMP3
1370	movdqa    \XMM5, \TMP4
1371# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
1372	pslld     $31, \TMP2                   # packed left shift << 31
1373	pslld     $30, \TMP3                   # packed left shift << 30
1374	pslld     $25, \TMP4                   # packed left shift << 25
1375	pxor      \TMP3, \TMP2	               # xor the shifted versions
1376	pxor      \TMP4, \TMP2
1377	movdqa    \TMP2, \TMP5
1378	psrldq    $4, \TMP5                    # right shift T5 1 DW
1379	pslldq    $12, \TMP2                   # left shift T2 3 DWs
1380	pxor      \TMP2, \XMM5
1381
1382        # second phase of reduction
1383
1384	movdqa    \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
1385	movdqa    \XMM5,\TMP3
1386	movdqa    \XMM5,\TMP4
1387	psrld     $1, \TMP2                    # packed right shift >> 1
1388	psrld     $2, \TMP3                    # packed right shift >> 2
1389	psrld     $7, \TMP4                    # packed right shift >> 7
1390	pxor      \TMP3,\TMP2		       # xor the shifted versions
1391	pxor      \TMP4,\TMP2
1392	pxor      \TMP5, \TMP2
1393	pxor      \TMP2, \XMM5
1394	pxor      \TMP1, \XMM5                 # result is in XMM5
1395
1396	pxor	  \XMM5, \XMM1
1397.endm
1398
1399/* GHASH the last 4 ciphertext blocks. */
1400.macro	GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
1401TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
1402
1403        # Multiply TMP6 * HashKey (using Karatsuba)
1404
1405	movdqa	  \XMM1, \TMP6
1406	pshufd	  $78, \XMM1, \TMP2
1407	pxor	  \XMM1, \TMP2
1408	movdqa	  HashKey_4(%arg2), \TMP5
1409	PCLMULQDQ 0x11, \TMP5, \TMP6       # TMP6 = a1*b1
1410	PCLMULQDQ 0x00, \TMP5, \XMM1       # XMM1 = a0*b0
1411	movdqa	  HashKey_4_k(%arg2), \TMP4
1412	PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1413	movdqa	  \XMM1, \XMMDst
1414	movdqa	  \TMP2, \XMM1              # result in TMP6, XMMDst, XMM1
1415
1416        # Multiply TMP1 * HashKey (using Karatsuba)
1417
1418	movdqa	  \XMM2, \TMP1
1419	pshufd	  $78, \XMM2, \TMP2
1420	pxor	  \XMM2, \TMP2
1421	movdqa	  HashKey_3(%arg2), \TMP5
1422	PCLMULQDQ 0x11, \TMP5, \TMP1       # TMP1 = a1*b1
1423	PCLMULQDQ 0x00, \TMP5, \XMM2       # XMM2 = a0*b0
1424	movdqa	  HashKey_3_k(%arg2), \TMP4
1425	PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1426	pxor	  \TMP1, \TMP6
1427	pxor	  \XMM2, \XMMDst
1428	pxor	  \TMP2, \XMM1
1429# results accumulated in TMP6, XMMDst, XMM1
1430
1431        # Multiply TMP1 * HashKey (using Karatsuba)
1432
1433	movdqa	  \XMM3, \TMP1
1434	pshufd	  $78, \XMM3, \TMP2
1435	pxor	  \XMM3, \TMP2
1436	movdqa	  HashKey_2(%arg2), \TMP5
1437	PCLMULQDQ 0x11, \TMP5, \TMP1       # TMP1 = a1*b1
1438	PCLMULQDQ 0x00, \TMP5, \XMM3       # XMM3 = a0*b0
1439	movdqa	  HashKey_2_k(%arg2), \TMP4
1440	PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1441	pxor	  \TMP1, \TMP6
1442	pxor	  \XMM3, \XMMDst
1443	pxor	  \TMP2, \XMM1   # results accumulated in TMP6, XMMDst, XMM1
1444
1445        # Multiply TMP1 * HashKey (using Karatsuba)
1446	movdqa	  \XMM4, \TMP1
1447	pshufd	  $78, \XMM4, \TMP2
1448	pxor	  \XMM4, \TMP2
1449	movdqa	  HashKey(%arg2), \TMP5
1450	PCLMULQDQ 0x11, \TMP5, \TMP1	    # TMP1 = a1*b1
1451	PCLMULQDQ 0x00, \TMP5, \XMM4       # XMM4 = a0*b0
1452	movdqa	  HashKey_k(%arg2), \TMP4
1453	PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1454	pxor	  \TMP1, \TMP6
1455	pxor	  \XMM4, \XMMDst
1456	pxor	  \XMM1, \TMP2
1457	pxor	  \TMP6, \TMP2
1458	pxor	  \XMMDst, \TMP2
1459	# middle section of the temp results combined as in karatsuba algorithm
1460	movdqa	  \TMP2, \TMP4
1461	pslldq	  $8, \TMP4                 # left shift TMP4 2 DWs
1462	psrldq	  $8, \TMP2                 # right shift TMP2 2 DWs
1463	pxor	  \TMP4, \XMMDst
1464	pxor	  \TMP2, \TMP6
1465# TMP6:XMMDst holds the result of the accumulated carry-less multiplications
1466	# first phase of the reduction
1467	movdqa    \XMMDst, \TMP2
1468	movdqa    \XMMDst, \TMP3
1469	movdqa    \XMMDst, \TMP4
1470# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
1471	pslld     $31, \TMP2                # packed left shifting << 31
1472	pslld     $30, \TMP3                # packed left shifting << 30
1473	pslld     $25, \TMP4                # packed left shifting << 25
1474	pxor      \TMP3, \TMP2              # xor the shifted versions
1475	pxor      \TMP4, \TMP2
1476	movdqa    \TMP2, \TMP7
1477	psrldq    $4, \TMP7                 # right shift TMP7 1 DW
1478	pslldq    $12, \TMP2                # left shift TMP2 3 DWs
1479	pxor      \TMP2, \XMMDst
1480
1481        # second phase of the reduction
1482	movdqa    \XMMDst, \TMP2
1483	# make 3 copies of XMMDst for doing 3 shift operations
1484	movdqa    \XMMDst, \TMP3
1485	movdqa    \XMMDst, \TMP4
1486	psrld     $1, \TMP2                 # packed right shift >> 1
1487	psrld     $2, \TMP3                 # packed right shift >> 2
1488	psrld     $7, \TMP4                 # packed right shift >> 7
1489	pxor      \TMP3, \TMP2              # xor the shifted versions
1490	pxor      \TMP4, \TMP2
1491	pxor      \TMP7, \TMP2
1492	pxor      \TMP2, \XMMDst
1493	pxor      \TMP6, \XMMDst            # reduced result is in XMMDst
1494.endm
1495
1496
1497/* Encryption of a single block
1498* uses eax & r10
1499*/
1500
1501.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1
1502
1503	pxor		(%arg1), \XMM0
1504	mov		keysize,%eax
1505	shr		$2,%eax			# 128->4, 192->6, 256->8
1506	add		$5,%eax			# 128->9, 192->11, 256->13
1507	lea		16(%arg1), %r10	  # get first expanded key address
1508
1509_esb_loop_\@:
1510	MOVADQ		(%r10),\TMP1
1511	AESENC		\TMP1,\XMM0
1512	add		$16,%r10
1513	sub		$1,%eax
1514	jnz		_esb_loop_\@
1515
1516	MOVADQ		(%r10),\TMP1
1517	AESENCLAST	\TMP1,\XMM0
1518.endm
1519/*****************************************************************************
1520* void aesni_gcm_dec(void *aes_ctx,    // AES Key schedule. Starts on a 16 byte boundary.
1521*                   struct gcm_context_data *data
1522*                                      // Context data
1523*                   u8 *out,           // Plaintext output. Encrypt in-place is allowed.
1524*                   const u8 *in,      // Ciphertext input
1525*                   u64 plaintext_len, // Length of data in bytes for decryption.
1526*                   u8 *iv,            // Pre-counter block j0: 4 byte salt (from Security Association)
1527*                                      // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1528*                                      // concatenated with 0x00000001. 16-byte aligned pointer.
1529*                   u8 *hash_subkey,   // H, the Hash sub key input. Data starts on a 16-byte boundary.
1530*                   const u8 *aad,     // Additional Authentication Data (AAD)
1531*                   u64 aad_len,       // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1532*                   u8  *auth_tag,     // Authenticated Tag output. The driver will compare this to the
1533*                                      // given authentication tag and only return the plaintext if they match.
1534*                   u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
1535*                                      // (most likely), 12 or 8.
1536*
1537* Assumptions:
1538*
1539* keys:
1540*       keys are pre-expanded and aligned to 16 bytes. we are using the first
1541*       set of 11 keys in the data structure void *aes_ctx
1542*
1543* iv:
1544*       0                   1                   2                   3
1545*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1546*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1547*       |                             Salt  (From the SA)               |
1548*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1549*       |                     Initialization Vector                     |
1550*       |         (This is the sequence number from IPSec header)       |
1551*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1552*       |                              0x1                              |
1553*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1554*
1555*
1556*
1557* AAD:
1558*       AAD padded to 128 bits with 0
1559*       for example, assume AAD is a u32 vector
1560*
1561*       if AAD is 8 bytes:
1562*       AAD[3] = {A0, A1};
1563*       padded AAD in xmm register = {A1 A0 0 0}
1564*
1565*       0                   1                   2                   3
1566*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1567*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1568*       |                               SPI (A1)                        |
1569*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1570*       |                     32-bit Sequence Number (A0)               |
1571*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1572*       |                              0x0                              |
1573*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1574*
1575*                                       AAD Format with 32-bit Sequence Number
1576*
1577*       if AAD is 12 bytes:
1578*       AAD[3] = {A0, A1, A2};
1579*       padded AAD in xmm register = {A2 A1 A0 0}
1580*
1581*       0                   1                   2                   3
1582*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1583*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1586*       |                               SPI (A2)                        |
1587*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1588*       |                 64-bit Extended Sequence Number {A1,A0}       |
1589*       |                                                               |
1590*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1591*       |                              0x0                              |
1592*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1593*
1594*                        AAD Format with 64-bit Extended Sequence Number
1595*
1596* poly = x^128 + x^127 + x^126 + x^121 + 1
1597*
1598*****************************************************************************/
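/*
* Illustrative caller sketch (added, not part of the original file): a C
* caller along the lines of the kernel glue code would wrap the call in
* kernel_fpu_begin()/kernel_fpu_end() and compare the computed tag itself.
* Buffer names below are hypothetical:
*
*	struct gcm_context_data data;
*	u8 tag[16];
*
*	kernel_fpu_begin();
*	aesni_gcm_dec(aes_ctx, &data, plaintext, ciphertext, ciphertext_len,
*		      iv, hash_subkey, aad, aad_len, tag, 16);
*	kernel_fpu_end();
*	if (crypto_memneq(tag, received_tag, 16))
*		return -EBADMSG;	// authentication failed; discard the plaintext
*/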
1599ENTRY(aesni_gcm_dec)
1600	FUNC_SAVE
1601
1602	GCM_INIT %arg6, arg7, arg8, arg9
1603	GCM_ENC_DEC dec
1604	GCM_COMPLETE arg10, arg11
1605	FUNC_RESTORE
1606	ret
1607ENDPROC(aesni_gcm_dec)
1608
1609
1610/*****************************************************************************
1611* void aesni_gcm_enc(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1612*                    struct gcm_context_data *data
1613*                                        // Context data
1614*                    u8 *out,            // Ciphertext output. Encrypt in-place is allowed.
1615*                    const u8 *in,       // Plaintext input
1616*                    u64 plaintext_len,  // Length of data in bytes for encryption.
1617*                    u8 *iv,             // Pre-counter block j0: 4 byte salt (from Security Association)
1618*                                        // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1619*                                        // concatenated with 0x00000001. 16-byte aligned pointer.
1620*                    u8 *hash_subkey,    // H, the Hash sub key input. Data starts on a 16-byte boundary.
1621*                    const u8 *aad,      // Additional Authentication Data (AAD)
1622*                    u64 aad_len,        // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1623*                    u8 *auth_tag,       // Authenticated Tag output.
1624*                    u64 auth_tag_len);  // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
1625*                                        // 12 or 8.
1626*
1627* Assumptions:
1628*
1629* keys:
1630*       keys are pre-expanded and aligned to 16 bytes. we are using the
1631*       first set of 11 keys in the data structure void *aes_ctx
1632*
1633*
1634* iv:
1635*       0                   1                   2                   3
1636*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1637*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1638*       |                             Salt  (From the SA)               |
1639*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1640*       |                     Initialization Vector                     |
1641*       |         (This is the sequence number from IPSec header)       |
1642*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1643*       |                              0x1                              |
1644*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1645*
1646*
1647*
1648* AAD:
1649*       AAD padded to 128 bits with 0
1650*       for example, assume AAD is a u32 vector
1651*
1652*       if AAD is 8 bytes:
1653*       AAD[3] = {A0, A1};
1654*       padded AAD in xmm register = {A1 A0 0 0}
1655*
1656*       0                   1                   2                   3
1657*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1658*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1659*       |                               SPI (A1)                        |
1660*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1661*       |                     32-bit Sequence Number (A0)               |
1662*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1663*       |                              0x0                              |
1664*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1665*
1666*                                 AAD Format with 32-bit Sequence Number
1667*
1668*       if AAD is 12 bytes:
1669*       AAD[3] = {A0, A1, A2};
1670*       padded AAD in xmm register = {A2 A1 A0 0}
1671*
1672*       0                   1                   2                   3
1673*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1674*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1675*       |                               SPI (A2)                        |
1676*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1677*       |                 64-bit Extended Sequence Number {A1,A0}       |
1678*       |                                                               |
1679*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1680*       |                              0x0                              |
1681*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1682*
1683*                         AAD Format with 64-bit Extended Sequence Number
1684*
1685* poly = x^128 + x^127 + x^126 + x^121 + 1
1686***************************************************************************/
1687ENTRY(aesni_gcm_enc)
1688	FUNC_SAVE
1689
1690	GCM_INIT %arg6, arg7, arg8, arg9
1691	GCM_ENC_DEC enc
1692
1693	GCM_COMPLETE arg10, arg11
1694	FUNC_RESTORE
1695	ret
1696ENDPROC(aesni_gcm_enc)
1697
1698/*****************************************************************************
1699* void aesni_gcm_init(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1700*                     struct gcm_context_data *data,
1701*                                         // context data
1702*                     u8 *iv,             // Pre-counter block j0: 4 byte salt (from Security Association)
1703*                                         // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1704*                                         // concatenated with 0x00000001. 16-byte aligned pointer.
1705*                     u8 *hash_subkey,    // H, the Hash sub key input. Data starts on a 16-byte boundary.
1706*                     const u8 *aad,      // Additional Authentication Data (AAD)
1707*                     u64 aad_len)        // Length of AAD in bytes.
1708*/
1709ENTRY(aesni_gcm_init)
1710	FUNC_SAVE
1711	GCM_INIT %arg3, %arg4,%arg5, %arg6
1712	FUNC_RESTORE
1713	ret
1714ENDPROC(aesni_gcm_init)
1715
1716/*****************************************************************************
1717* void aesni_gcm_enc_update(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1718*                    struct gcm_context_data *data,
1719*                                        // context data
1720*                    u8 *out,            // Ciphertext output. Encrypt in-place is allowed.
1721*                    const u8 *in,       // Plaintext input
1722*                    u64 plaintext_len,  // Length of data in bytes for encryption.
1723*/
1724ENTRY(aesni_gcm_enc_update)
1725	FUNC_SAVE
1726	GCM_ENC_DEC enc
1727	FUNC_RESTORE
1728	ret
1729ENDPROC(aesni_gcm_enc_update)
1730
1731/*****************************************************************************
1732* void aesni_gcm_dec_update(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1733*                    struct gcm_context_data *data,
1734*                                        // context data
1735*                    u8 *out,            // Ciphertext output. Encrypt in-place is allowed.
1736*                    const u8 *in,       // Plaintext input
1737*                    u64 plaintext_len,  // Length of data in bytes for encryption.
1738*/
1739ENTRY(aesni_gcm_dec_update)
1740	FUNC_SAVE
1741	GCM_ENC_DEC dec
1742	FUNC_RESTORE
1743	ret
1744ENDPROC(aesni_gcm_dec_update)
1745
1746/*****************************************************************************
1747* void aesni_gcm_finalize(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1748*                    struct gcm_context_data *data,
1749*                                        // context data
1750*                    u8 *auth_tag,       // Authenticated Tag output.
1751*                    u64 auth_tag_len);  // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
1752*                                        // 12 or 8.
1753*/
1754ENTRY(aesni_gcm_finalize)
1755	FUNC_SAVE
1756	GCM_COMPLETE %arg3 %arg4
1757	FUNC_RESTORE
1758	ret
1759ENDPROC(aesni_gcm_finalize)
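/*
* Illustrative sketch (added): the init/update/finalize entry points above
* support scatter-gather processing, with struct gcm_context_data carrying
* the running state between calls. A hypothetical encrypt-side caller:
*
*	struct gcm_context_data data;
*	u8 tag[16];
*
*	kernel_fpu_begin();
*	aesni_gcm_init(aes_ctx, &data, iv, hash_subkey, aad, aad_len);
*	while (another segment src/dst of seg_len bytes remains)
*		aesni_gcm_enc_update(aes_ctx, &data, dst, src, seg_len);
*	aesni_gcm_finalize(aes_ctx, &data, tag, 16);
*	kernel_fpu_end();
*/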
1760
1761#endif
1762
1763
1764.align 4
1765_key_expansion_128:
1766_key_expansion_256a:
1767	pshufd $0b11111111, %xmm1, %xmm1
1768	shufps $0b00010000, %xmm0, %xmm4
1769	pxor %xmm4, %xmm0
1770	shufps $0b10001100, %xmm0, %xmm4
1771	pxor %xmm4, %xmm0
1772	pxor %xmm1, %xmm0
1773	movaps %xmm0, (TKEYP)
1774	add $0x10, TKEYP
1775	ret
1776ENDPROC(_key_expansion_128)
1777ENDPROC(_key_expansion_256a)
1778
1779.align 4
1780_key_expansion_192a:
1781	pshufd $0b01010101, %xmm1, %xmm1
1782	shufps $0b00010000, %xmm0, %xmm4
1783	pxor %xmm4, %xmm0
1784	shufps $0b10001100, %xmm0, %xmm4
1785	pxor %xmm4, %xmm0
1786	pxor %xmm1, %xmm0
1787
1788	movaps %xmm2, %xmm5
1789	movaps %xmm2, %xmm6
1790	pslldq $4, %xmm5
1791	pshufd $0b11111111, %xmm0, %xmm3
1792	pxor %xmm3, %xmm2
1793	pxor %xmm5, %xmm2
1794
1795	movaps %xmm0, %xmm1
1796	shufps $0b01000100, %xmm0, %xmm6
1797	movaps %xmm6, (TKEYP)
1798	shufps $0b01001110, %xmm2, %xmm1
1799	movaps %xmm1, 0x10(TKEYP)
1800	add $0x20, TKEYP
1801	ret
1802ENDPROC(_key_expansion_192a)
1803
1804.align 4
1805_key_expansion_192b:
1806	pshufd $0b01010101, %xmm1, %xmm1
1807	shufps $0b00010000, %xmm0, %xmm4
1808	pxor %xmm4, %xmm0
1809	shufps $0b10001100, %xmm0, %xmm4
1810	pxor %xmm4, %xmm0
1811	pxor %xmm1, %xmm0
1812
1813	movaps %xmm2, %xmm5
1814	pslldq $4, %xmm5
1815	pshufd $0b11111111, %xmm0, %xmm3
1816	pxor %xmm3, %xmm2
1817	pxor %xmm5, %xmm2
1818
1819	movaps %xmm0, (TKEYP)
1820	add $0x10, TKEYP
1821	ret
1822ENDPROC(_key_expansion_192b)
1823
1824.align 4
1825_key_expansion_256b:
1826	pshufd $0b10101010, %xmm1, %xmm1
1827	shufps $0b00010000, %xmm2, %xmm4
1828	pxor %xmm4, %xmm2
1829	shufps $0b10001100, %xmm2, %xmm4
1830	pxor %xmm4, %xmm2
1831	pxor %xmm1, %xmm2
1832	movaps %xmm2, (TKEYP)
1833	add $0x10, TKEYP
1834	ret
1835ENDPROC(_key_expansion_256b)
1836
1837/*
1838 * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
1839 *                   unsigned int key_len)
1840 */
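/*
* Added note: the fixed offsets used below match a crypto_aes_ctx layout
* of roughly (sketch only; the crypto headers are authoritative):
*
*	struct crypto_aes_ctx {
*		u32 key_enc[60];	// expanded encryption schedule, offset 0
*		u32 key_dec[60];	// expanded decryption schedule, offset 240
*		u32 key_length;		// 16, 24 or 32, offset 480
*	};
*
* which is why key_len is stored at 480(KEYP) and the decryption schedule
* is assembled 240 bytes past the encryption one.
*/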
1841ENTRY(aesni_set_key)
1842	FRAME_BEGIN
1843#ifndef __x86_64__
1844	pushl KEYP
1845	movl (FRAME_OFFSET+8)(%esp), KEYP	# ctx
1846	movl (FRAME_OFFSET+12)(%esp), UKEYP	# in_key
1847	movl (FRAME_OFFSET+16)(%esp), %edx	# key_len
1848#endif
1849	movups (UKEYP), %xmm0		# user key (first 16 bytes)
1850	movaps %xmm0, (KEYP)
1851	lea 0x10(KEYP), TKEYP		# key addr
1852	movl %edx, 480(KEYP)
1853	pxor %xmm4, %xmm4		# xmm4 is assumed 0 in _key_expansion_x
1854	cmp $24, %dl
1855	jb .Lenc_key128
1856	je .Lenc_key192
1857	movups 0x10(UKEYP), %xmm2	# other user key
1858	movaps %xmm2, (TKEYP)
1859	add $0x10, TKEYP
1860	AESKEYGENASSIST 0x1 %xmm2 %xmm1		# round 1
1861	call _key_expansion_256a
1862	AESKEYGENASSIST 0x1 %xmm0 %xmm1
1863	call _key_expansion_256b
1864	AESKEYGENASSIST 0x2 %xmm2 %xmm1		# round 2
1865	call _key_expansion_256a
1866	AESKEYGENASSIST 0x2 %xmm0 %xmm1
1867	call _key_expansion_256b
1868	AESKEYGENASSIST 0x4 %xmm2 %xmm1		# round 3
1869	call _key_expansion_256a
1870	AESKEYGENASSIST 0x4 %xmm0 %xmm1
1871	call _key_expansion_256b
1872	AESKEYGENASSIST 0x8 %xmm2 %xmm1		# round 4
1873	call _key_expansion_256a
1874	AESKEYGENASSIST 0x8 %xmm0 %xmm1
1875	call _key_expansion_256b
1876	AESKEYGENASSIST 0x10 %xmm2 %xmm1	# round 5
1877	call _key_expansion_256a
1878	AESKEYGENASSIST 0x10 %xmm0 %xmm1
1879	call _key_expansion_256b
1880	AESKEYGENASSIST 0x20 %xmm2 %xmm1	# round 6
1881	call _key_expansion_256a
1882	AESKEYGENASSIST 0x20 %xmm0 %xmm1
1883	call _key_expansion_256b
1884	AESKEYGENASSIST 0x40 %xmm2 %xmm1	# round 7
1885	call _key_expansion_256a
1886	jmp .Ldec_key
1887.Lenc_key192:
1888	movq 0x10(UKEYP), %xmm2		# other user key
1889	AESKEYGENASSIST 0x1 %xmm2 %xmm1		# round 1
1890	call _key_expansion_192a
1891	AESKEYGENASSIST 0x2 %xmm2 %xmm1		# round 2
1892	call _key_expansion_192b
1893	AESKEYGENASSIST 0x4 %xmm2 %xmm1		# round 3
1894	call _key_expansion_192a
1895	AESKEYGENASSIST 0x8 %xmm2 %xmm1		# round 4
1896	call _key_expansion_192b
1897	AESKEYGENASSIST 0x10 %xmm2 %xmm1	# round 5
1898	call _key_expansion_192a
1899	AESKEYGENASSIST 0x20 %xmm2 %xmm1	# round 6
1900	call _key_expansion_192b
1901	AESKEYGENASSIST 0x40 %xmm2 %xmm1	# round 7
1902	call _key_expansion_192a
1903	AESKEYGENASSIST 0x80 %xmm2 %xmm1	# round 8
1904	call _key_expansion_192b
1905	jmp .Ldec_key
1906.Lenc_key128:
1907	AESKEYGENASSIST 0x1 %xmm0 %xmm1		# round 1
1908	call _key_expansion_128
1909	AESKEYGENASSIST 0x2 %xmm0 %xmm1		# round 2
1910	call _key_expansion_128
1911	AESKEYGENASSIST 0x4 %xmm0 %xmm1		# round 3
1912	call _key_expansion_128
1913	AESKEYGENASSIST 0x8 %xmm0 %xmm1		# round 4
1914	call _key_expansion_128
1915	AESKEYGENASSIST 0x10 %xmm0 %xmm1	# round 5
1916	call _key_expansion_128
1917	AESKEYGENASSIST 0x20 %xmm0 %xmm1	# round 6
1918	call _key_expansion_128
1919	AESKEYGENASSIST 0x40 %xmm0 %xmm1	# round 7
1920	call _key_expansion_128
1921	AESKEYGENASSIST 0x80 %xmm0 %xmm1	# round 8
1922	call _key_expansion_128
1923	AESKEYGENASSIST 0x1b %xmm0 %xmm1	# round 9
1924	call _key_expansion_128
1925	AESKEYGENASSIST 0x36 %xmm0 %xmm1	# round 10
1926	call _key_expansion_128
1927.Ldec_key:
1928	sub $0x10, TKEYP
1929	movaps (KEYP), %xmm0
1930	movaps (TKEYP), %xmm1
1931	movaps %xmm0, 240(TKEYP)
1932	movaps %xmm1, 240(KEYP)
1933	add $0x10, KEYP
1934	lea 240-16(TKEYP), UKEYP
1935.align 4
1936.Ldec_key_loop:
1937	movaps (KEYP), %xmm0
1938	AESIMC %xmm0 %xmm1
1939	movaps %xmm1, (UKEYP)
1940	add $0x10, KEYP
1941	sub $0x10, UKEYP
1942	cmp TKEYP, KEYP
1943	jb .Ldec_key_loop
1944	xor AREG, AREG
1945#ifndef __x86_64__
1946	popl KEYP
1947#endif
1948	FRAME_END
1949	ret
1950ENDPROC(aesni_set_key)
1951
1952/*
1953 * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
1954 */
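/*
* Illustrative single-block usage (added, hypothetical caller; the context
* must start on a 16 byte boundary because the schedule is stored with
* movaps):
*
*	struct crypto_aes_ctx *ctx = aligned_aes_ctx;	// 16-byte aligned
*	u8 block[16];
*
*	kernel_fpu_begin();
*	aesni_set_key(ctx, user_key, 16);	// 16, 24 or 32 byte key
*	aesni_enc(ctx, block, block);		// encrypt one block in place
*	kernel_fpu_end();
*/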
1955ENTRY(aesni_enc)
1956	FRAME_BEGIN
1957#ifndef __x86_64__
1958	pushl KEYP
1959	pushl KLEN
1960	movl (FRAME_OFFSET+12)(%esp), KEYP	# ctx
1961	movl (FRAME_OFFSET+16)(%esp), OUTP	# dst
1962	movl (FRAME_OFFSET+20)(%esp), INP	# src
1963#endif
1964	movl 480(KEYP), KLEN		# key length
1965	movups (INP), STATE		# input
1966	call _aesni_enc1
1967	movups STATE, (OUTP)		# output
1968#ifndef __x86_64__
1969	popl KLEN
1970	popl KEYP
1971#endif
1972	FRAME_END
1973	ret
1974ENDPROC(aesni_enc)
1975
1976/*
1977 * _aesni_enc1:		internal ABI
1978 * input:
1979 *	KEYP:		key struct pointer
1980 *	KLEN:		key length
1981 *	STATE:		initial state (input)
1982 * output:
1983 *	STATE:		final state (output)
1984 * changed:
1985 *	KEY
1986 *	TKEYP (T1)
1987 */
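/*
* Added note on the addressing below: after round 0, TKEYP is advanced to
* KEYP + 0x30 (AES-128), + 0x50 (AES-192) or + 0x70 (AES-256) so that one
* shared instruction tail serves every key size. For AES-128 the ten round
* keys at 0x10(KEYP)..0xa0(KEYP) are then reached as -0x20(TKEYP) through
* 0x70(TKEYP); the larger key sizes simply enter earlier, at -0x40 or
* -0x60, and run two or four extra rounds through the same tail.
*/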
1988.align 4
1989_aesni_enc1:
1990	movaps (KEYP), KEY		# key
1991	mov KEYP, TKEYP
1992	pxor KEY, STATE		# round 0
1993	add $0x30, TKEYP
1994	cmp $24, KLEN
1995	jb .Lenc128
1996	lea 0x20(TKEYP), TKEYP
1997	je .Lenc192
1998	add $0x20, TKEYP
1999	movaps -0x60(TKEYP), KEY
2000	AESENC KEY STATE
2001	movaps -0x50(TKEYP), KEY
2002	AESENC KEY STATE
2003.align 4
2004.Lenc192:
2005	movaps -0x40(TKEYP), KEY
2006	AESENC KEY STATE
2007	movaps -0x30(TKEYP), KEY
2008	AESENC KEY STATE
2009.align 4
2010.Lenc128:
2011	movaps -0x20(TKEYP), KEY
2012	AESENC KEY STATE
2013	movaps -0x10(TKEYP), KEY
2014	AESENC KEY STATE
2015	movaps (TKEYP), KEY
2016	AESENC KEY STATE
2017	movaps 0x10(TKEYP), KEY
2018	AESENC KEY STATE
2019	movaps 0x20(TKEYP), KEY
2020	AESENC KEY STATE
2021	movaps 0x30(TKEYP), KEY
2022	AESENC KEY STATE
2023	movaps 0x40(TKEYP), KEY
2024	AESENC KEY STATE
2025	movaps 0x50(TKEYP), KEY
2026	AESENC KEY STATE
2027	movaps 0x60(TKEYP), KEY
2028	AESENC KEY STATE
2029	movaps 0x70(TKEYP), KEY
2030	AESENCLAST KEY STATE
2031	ret
2032ENDPROC(_aesni_enc1)
2033
2034/*
2035 * _aesni_enc4:	internal ABI
2036 * input:
2037 *	KEYP:		key struct pointer
2038 *	KLEN:		key length
2039 *	STATE1:		initial state (input)
2040 *	STATE2
2041 *	STATE3
2042 *	STATE4
2043 * output:
2044 *	STATE1:		final state (output)
2045 *	STATE2
2046 *	STATE3
2047 *	STATE4
2048 * changed:
2049 *	KEY
2050 *	TKEYP (T1)
2051 */
2052.align 4
2053_aesni_enc4:
2054	movaps (KEYP), KEY		# key
2055	mov KEYP, TKEYP
2056	pxor KEY, STATE1		# round 0
2057	pxor KEY, STATE2
2058	pxor KEY, STATE3
2059	pxor KEY, STATE4
2060	add $0x30, TKEYP
2061	cmp $24, KLEN
2062	jb .L4enc128
2063	lea 0x20(TKEYP), TKEYP
2064	je .L4enc192
2065	add $0x20, TKEYP
2066	movaps -0x60(TKEYP), KEY
2067	AESENC KEY STATE1
2068	AESENC KEY STATE2
2069	AESENC KEY STATE3
2070	AESENC KEY STATE4
2071	movaps -0x50(TKEYP), KEY
2072	AESENC KEY STATE1
2073	AESENC KEY STATE2
2074	AESENC KEY STATE3
2075	AESENC KEY STATE4
2076#.align 4
2077.L4enc192:
2078	movaps -0x40(TKEYP), KEY
2079	AESENC KEY STATE1
2080	AESENC KEY STATE2
2081	AESENC KEY STATE3
2082	AESENC KEY STATE4
2083	movaps -0x30(TKEYP), KEY
2084	AESENC KEY STATE1
2085	AESENC KEY STATE2
2086	AESENC KEY STATE3
2087	AESENC KEY STATE4
2088#.align 4
2089.L4enc128:
2090	movaps -0x20(TKEYP), KEY
2091	AESENC KEY STATE1
2092	AESENC KEY STATE2
2093	AESENC KEY STATE3
2094	AESENC KEY STATE4
2095	movaps -0x10(TKEYP), KEY
2096	AESENC KEY STATE1
2097	AESENC KEY STATE2
2098	AESENC KEY STATE3
2099	AESENC KEY STATE4
2100	movaps (TKEYP), KEY
2101	AESENC KEY STATE1
2102	AESENC KEY STATE2
2103	AESENC KEY STATE3
2104	AESENC KEY STATE4
2105	movaps 0x10(TKEYP), KEY
2106	AESENC KEY STATE1
2107	AESENC KEY STATE2
2108	AESENC KEY STATE3
2109	AESENC KEY STATE4
2110	movaps 0x20(TKEYP), KEY
2111	AESENC KEY STATE1
2112	AESENC KEY STATE2
2113	AESENC KEY STATE3
2114	AESENC KEY STATE4
2115	movaps 0x30(TKEYP), KEY
2116	AESENC KEY STATE1
2117	AESENC KEY STATE2
2118	AESENC KEY STATE3
2119	AESENC KEY STATE4
2120	movaps 0x40(TKEYP), KEY
2121	AESENC KEY STATE1
2122	AESENC KEY STATE2
2123	AESENC KEY STATE3
2124	AESENC KEY STATE4
2125	movaps 0x50(TKEYP), KEY
2126	AESENC KEY STATE1
2127	AESENC KEY STATE2
2128	AESENC KEY STATE3
2129	AESENC KEY STATE4
2130	movaps 0x60(TKEYP), KEY
2131	AESENC KEY STATE1
2132	AESENC KEY STATE2
2133	AESENC KEY STATE3
2134	AESENC KEY STATE4
2135	movaps 0x70(TKEYP), KEY
2136	AESENCLAST KEY STATE1		# last round
2137	AESENCLAST KEY STATE2
2138	AESENCLAST KEY STATE3
2139	AESENCLAST KEY STATE4
2140	ret
2141ENDPROC(_aesni_enc4)
2142
2143/*
2144 * void aesni_dec (struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
2145 */
2146ENTRY(aesni_dec)
2147	FRAME_BEGIN
2148#ifndef __x86_64__
2149	pushl KEYP
2150	pushl KLEN
2151	movl (FRAME_OFFSET+12)(%esp), KEYP	# ctx
2152	movl (FRAME_OFFSET+16)(%esp), OUTP	# dst
2153	movl (FRAME_OFFSET+20)(%esp), INP	# src
2154#endif
2155	mov 480(KEYP), KLEN		# key length
2156	add $240, KEYP
2157	movups (INP), STATE		# input
2158	call _aesni_dec1
2159	movups STATE, (OUTP)		# output
2160#ifndef __x86_64__
2161	popl KLEN
2162	popl KEYP
2163#endif
2164	FRAME_END
2165	ret
2166ENDPROC(aesni_dec)
2167
2168/*
2169 * _aesni_dec1:		internal ABI
2170 * input:
2171 *	KEYP:		key struct pointer
2172 *	KLEN:		key length
2173 *	STATE:		initial state (input)
2174 * output:
2175 *	STATE:		final state (output)
2176 * changed:
2177 *	KEY
2178 *	TKEYP (T1)
2179 */
2180.align 4
2181_aesni_dec1:
2182	movaps (KEYP), KEY		# key
2183	mov KEYP, TKEYP
2184	pxor KEY, STATE		# round 0
2185	add $0x30, TKEYP
2186	cmp $24, KLEN
2187	jb .Ldec128
2188	lea 0x20(TKEYP), TKEYP
2189	je .Ldec192
2190	add $0x20, TKEYP
2191	movaps -0x60(TKEYP), KEY
2192	AESDEC KEY STATE
2193	movaps -0x50(TKEYP), KEY
2194	AESDEC KEY STATE
2195.align 4
2196.Ldec192:
2197	movaps -0x40(TKEYP), KEY
2198	AESDEC KEY STATE
2199	movaps -0x30(TKEYP), KEY
2200	AESDEC KEY STATE
2201.align 4
2202.Ldec128:
2203	movaps -0x20(TKEYP), KEY
2204	AESDEC KEY STATE
2205	movaps -0x10(TKEYP), KEY
2206	AESDEC KEY STATE
2207	movaps (TKEYP), KEY
2208	AESDEC KEY STATE
2209	movaps 0x10(TKEYP), KEY
2210	AESDEC KEY STATE
2211	movaps 0x20(TKEYP), KEY
2212	AESDEC KEY STATE
2213	movaps 0x30(TKEYP), KEY
2214	AESDEC KEY STATE
2215	movaps 0x40(TKEYP), KEY
2216	AESDEC KEY STATE
2217	movaps 0x50(TKEYP), KEY
2218	AESDEC KEY STATE
2219	movaps 0x60(TKEYP), KEY
2220	AESDEC KEY STATE
2221	movaps 0x70(TKEYP), KEY
2222	AESDECLAST KEY STATE
2223	ret
2224ENDPROC(_aesni_dec1)
2225
2226/*
2227 * _aesni_dec4:	internal ABI
2228 * input:
2229 *	KEYP:		key struct pointer
2230 *	KLEN:		key length
2231 *	STATE1:		initial state (input)
2232 *	STATE2
2233 *	STATE3
2234 *	STATE4
2235 * output:
2236 *	STATE1:		final state (output)
2237 *	STATE2
2238 *	STATE3
2239 *	STATE4
2240 * changed:
2241 *	KEY
2242 *	TKEYP (T1)
2243 */
2244.align 4
2245_aesni_dec4:
2246	movaps (KEYP), KEY		# key
2247	mov KEYP, TKEYP
2248	pxor KEY, STATE1		# round 0
2249	pxor KEY, STATE2
2250	pxor KEY, STATE3
2251	pxor KEY, STATE4
2252	add $0x30, TKEYP
2253	cmp $24, KLEN
2254	jb .L4dec128
2255	lea 0x20(TKEYP), TKEYP
2256	je .L4dec192
2257	add $0x20, TKEYP
2258	movaps -0x60(TKEYP), KEY
2259	AESDEC KEY STATE1
2260	AESDEC KEY STATE2
2261	AESDEC KEY STATE3
2262	AESDEC KEY STATE4
2263	movaps -0x50(TKEYP), KEY
2264	AESDEC KEY STATE1
2265	AESDEC KEY STATE2
2266	AESDEC KEY STATE3
2267	AESDEC KEY STATE4
2268.align 4
2269.L4dec192:
2270	movaps -0x40(TKEYP), KEY
2271	AESDEC KEY STATE1
2272	AESDEC KEY STATE2
2273	AESDEC KEY STATE3
2274	AESDEC KEY STATE4
2275	movaps -0x30(TKEYP), KEY
2276	AESDEC KEY STATE1
2277	AESDEC KEY STATE2
2278	AESDEC KEY STATE3
2279	AESDEC KEY STATE4
2280.align 4
2281.L4dec128:
2282	movaps -0x20(TKEYP), KEY
2283	AESDEC KEY STATE1
2284	AESDEC KEY STATE2
2285	AESDEC KEY STATE3
2286	AESDEC KEY STATE4
2287	movaps -0x10(TKEYP), KEY
2288	AESDEC KEY STATE1
2289	AESDEC KEY STATE2
2290	AESDEC KEY STATE3
2291	AESDEC KEY STATE4
2292	movaps (TKEYP), KEY
2293	AESDEC KEY STATE1
2294	AESDEC KEY STATE2
2295	AESDEC KEY STATE3
2296	AESDEC KEY STATE4
2297	movaps 0x10(TKEYP), KEY
2298	AESDEC KEY STATE1
2299	AESDEC KEY STATE2
2300	AESDEC KEY STATE3
2301	AESDEC KEY STATE4
2302	movaps 0x20(TKEYP), KEY
2303	AESDEC KEY STATE1
2304	AESDEC KEY STATE2
2305	AESDEC KEY STATE3
2306	AESDEC KEY STATE4
2307	movaps 0x30(TKEYP), KEY
2308	AESDEC KEY STATE1
2309	AESDEC KEY STATE2
2310	AESDEC KEY STATE3
2311	AESDEC KEY STATE4
2312	movaps 0x40(TKEYP), KEY
2313	AESDEC KEY STATE1
2314	AESDEC KEY STATE2
2315	AESDEC KEY STATE3
2316	AESDEC KEY STATE4
2317	movaps 0x50(TKEYP), KEY
2318	AESDEC KEY STATE1
2319	AESDEC KEY STATE2
2320	AESDEC KEY STATE3
2321	AESDEC KEY STATE4
2322	movaps 0x60(TKEYP), KEY
2323	AESDEC KEY STATE1
2324	AESDEC KEY STATE2
2325	AESDEC KEY STATE3
2326	AESDEC KEY STATE4
2327	movaps 0x70(TKEYP), KEY
2328	AESDECLAST KEY STATE1		# last round
2329	AESDECLAST KEY STATE2
2330	AESDECLAST KEY STATE3
2331	AESDECLAST KEY STATE4
2332	ret
2333ENDPROC(_aesni_dec4)
2334
2335/*
2336 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2337 *		      size_t len)
2338 */
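/*
* Worked example (added): the loop below consumes 64 bytes (four blocks)
* per _aesni_enc4 call and full 16-byte blocks one at a time afterwards.
* With len = 100, one four-block pass handles 64 bytes, two single-block
* passes handle 32 more, and the trailing 4 bytes are left untouched, so
* callers are expected to pass a multiple of the block size.
*/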
2339ENTRY(aesni_ecb_enc)
2340	FRAME_BEGIN
2341#ifndef __x86_64__
2342	pushl LEN
2343	pushl KEYP
2344	pushl KLEN
2345	movl (FRAME_OFFSET+16)(%esp), KEYP	# ctx
2346	movl (FRAME_OFFSET+20)(%esp), OUTP	# dst
2347	movl (FRAME_OFFSET+24)(%esp), INP	# src
2348	movl (FRAME_OFFSET+28)(%esp), LEN	# len
2349#endif
2350	test LEN, LEN		# check length
2351	jz .Lecb_enc_ret
2352	mov 480(KEYP), KLEN
2353	cmp $16, LEN
2354	jb .Lecb_enc_ret
2355	cmp $64, LEN
2356	jb .Lecb_enc_loop1
2357.align 4
2358.Lecb_enc_loop4:
2359	movups (INP), STATE1
2360	movups 0x10(INP), STATE2
2361	movups 0x20(INP), STATE3
2362	movups 0x30(INP), STATE4
2363	call _aesni_enc4
2364	movups STATE1, (OUTP)
2365	movups STATE2, 0x10(OUTP)
2366	movups STATE3, 0x20(OUTP)
2367	movups STATE4, 0x30(OUTP)
2368	sub $64, LEN
2369	add $64, INP
2370	add $64, OUTP
2371	cmp $64, LEN
2372	jge .Lecb_enc_loop4
2373	cmp $16, LEN
2374	jb .Lecb_enc_ret
2375.align 4
2376.Lecb_enc_loop1:
2377	movups (INP), STATE1
2378	call _aesni_enc1
2379	movups STATE1, (OUTP)
2380	sub $16, LEN
2381	add $16, INP
2382	add $16, OUTP
2383	cmp $16, LEN
2384	jge .Lecb_enc_loop1
2385.Lecb_enc_ret:
2386#ifndef __x86_64__
2387	popl KLEN
2388	popl KEYP
2389	popl LEN
2390#endif
2391	FRAME_END
2392	ret
2393ENDPROC(aesni_ecb_enc)
2394
2395/*
2396 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2397 *		      size_t len);
2398 */
2399ENTRY(aesni_ecb_dec)
2400	FRAME_BEGIN
2401#ifndef __x86_64__
2402	pushl LEN
2403	pushl KEYP
2404	pushl KLEN
2405	movl (FRAME_OFFSET+16)(%esp), KEYP	# ctx
2406	movl (FRAME_OFFSET+20)(%esp), OUTP	# dst
2407	movl (FRAME_OFFSET+24)(%esp), INP	# src
2408	movl (FRAME_OFFSET+28)(%esp), LEN	# len
2409#endif
2410	test LEN, LEN
2411	jz .Lecb_dec_ret
2412	mov 480(KEYP), KLEN
2413	add $240, KEYP
2414	cmp $16, LEN
2415	jb .Lecb_dec_ret
2416	cmp $64, LEN
2417	jb .Lecb_dec_loop1
2418.align 4
2419.Lecb_dec_loop4:
2420	movups (INP), STATE1
2421	movups 0x10(INP), STATE2
2422	movups 0x20(INP), STATE3
2423	movups 0x30(INP), STATE4
2424	call _aesni_dec4
2425	movups STATE1, (OUTP)
2426	movups STATE2, 0x10(OUTP)
2427	movups STATE3, 0x20(OUTP)
2428	movups STATE4, 0x30(OUTP)
2429	sub $64, LEN
2430	add $64, INP
2431	add $64, OUTP
2432	cmp $64, LEN
2433	jge .Lecb_dec_loop4
2434	cmp $16, LEN
2435	jb .Lecb_dec_ret
2436.align 4
2437.Lecb_dec_loop1:
2438	movups (INP), STATE1
2439	call _aesni_dec1
2440	movups STATE1, (OUTP)
2441	sub $16, LEN
2442	add $16, INP
2443	add $16, OUTP
2444	cmp $16, LEN
2445	jge .Lecb_dec_loop1
2446.Lecb_dec_ret:
2447#ifndef __x86_64__
2448	popl KLEN
2449	popl KEYP
2450	popl LEN
2451#endif
2452	FRAME_END
2453	ret
2454ENDPROC(aesni_ecb_dec)
2455
2456/*
2457 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2458 *		      size_t len, u8 *iv)
2459 */
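/*
* Added note: the loop below implements the CBC recurrence
*
*	C[i] = E_K(P[i] xor C[i-1]),  with C[-1] = iv
*
* keeping the running C[i] in STATE; the last ciphertext block is written
* back through IVP so a subsequent call can continue the chain.
*/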
2460ENTRY(aesni_cbc_enc)
2461	FRAME_BEGIN
2462#ifndef __x86_64__
2463	pushl IVP
2464	pushl LEN
2465	pushl KEYP
2466	pushl KLEN
2467	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
2468	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
2469	movl (FRAME_OFFSET+28)(%esp), INP	# src
2470	movl (FRAME_OFFSET+32)(%esp), LEN	# len
2471	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
2472#endif
2473	cmp $16, LEN
2474	jb .Lcbc_enc_ret
2475	mov 480(KEYP), KLEN
2476	movups (IVP), STATE	# load iv as initial state
2477.align 4
2478.Lcbc_enc_loop:
2479	movups (INP), IN	# load input
2480	pxor IN, STATE
2481	call _aesni_enc1
2482	movups STATE, (OUTP)	# store output
2483	sub $16, LEN
2484	add $16, INP
2485	add $16, OUTP
2486	cmp $16, LEN
2487	jge .Lcbc_enc_loop
2488	movups STATE, (IVP)
2489.Lcbc_enc_ret:
2490#ifndef __x86_64__
2491	popl KLEN
2492	popl KEYP
2493	popl LEN
2494	popl IVP
2495#endif
2496	FRAME_END
2497	ret
2498ENDPROC(aesni_cbc_enc)
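
CBC encryption cannot be parallelized, so this routine runs one block at a time: the plaintext is XORed into the running state (initially the IV), encrypted, written out, and the final ciphertext block is stored back through IVP so the next call can continue the chain. A C sketch of the same behaviour, reusing the hypothetical aes_encrypt_block() from the ECB sketch:

	#include <stddef.h>
	#include <stdint.h>
	#include <string.h>

	void aes_encrypt_block(const void *round_keys, uint8_t out[16],
			       const uint8_t in[16]);	/* hypothetical, as above */

	/* C model of aesni_cbc_enc: C[i] = E_K(P[i] ^ C[i-1]), with C[-1] = IV. */
	static void cbc_enc_model(const void *round_keys, uint8_t *dst,
				  const uint8_t *src, size_t len, uint8_t iv[16])
	{
		uint8_t state[16];

		if (len < 16)
			return;				/* .Lcbc_enc_ret: IV untouched */
		memcpy(state, iv, 16);			/* movups (IVP), STATE */
		while (len >= 16) {
			for (int i = 0; i < 16; i++)
				state[i] ^= src[i];	/* pxor IN, STATE */
			aes_encrypt_block(round_keys, state, state);
			memcpy(dst, state, 16);
			src += 16;
			dst += 16;
			len -= 16;
		}
		memcpy(iv, state, 16);			/* movups STATE, (IVP) */
	}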
2499
2500/*
2501 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2502 *		      size_t len, u8 *iv)
2503 */
2504ENTRY(aesni_cbc_dec)
2505	FRAME_BEGIN
2506#ifndef __x86_64__
2507	pushl IVP
2508	pushl LEN
2509	pushl KEYP
2510	pushl KLEN
2511	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
2512	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
2513	movl (FRAME_OFFSET+28)(%esp), INP	# src
2514	movl (FRAME_OFFSET+32)(%esp), LEN	# len
2515	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
2516#endif
2517	cmp $16, LEN
2518	jb .Lcbc_dec_just_ret
2519	mov 480(KEYP), KLEN
2520	add $240, KEYP
2521	movups (IVP), IV
2522	cmp $64, LEN
2523	jb .Lcbc_dec_loop1
2524.align 4
2525.Lcbc_dec_loop4:
2526	movups (INP), IN1
2527	movaps IN1, STATE1
2528	movups 0x10(INP), IN2
2529	movaps IN2, STATE2
2530#ifdef __x86_64__
2531	movups 0x20(INP), IN3
2532	movaps IN3, STATE3
2533	movups 0x30(INP), IN4
2534	movaps IN4, STATE4
2535#else
2536	movups 0x20(INP), IN1
2537	movaps IN1, STATE3
2538	movups 0x30(INP), IN2
2539	movaps IN2, STATE4
2540#endif
2541	call _aesni_dec4
2542	pxor IV, STATE1
2543#ifdef __x86_64__
2544	pxor IN1, STATE2
2545	pxor IN2, STATE3
2546	pxor IN3, STATE4
2547	movaps IN4, IV
2548#else
2549	pxor IN1, STATE4
2550	movaps IN2, IV
2551	movups (INP), IN1
2552	pxor IN1, STATE2
2553	movups 0x10(INP), IN2
2554	pxor IN2, STATE3
2555#endif
2556	movups STATE1, (OUTP)
2557	movups STATE2, 0x10(OUTP)
2558	movups STATE3, 0x20(OUTP)
2559	movups STATE4, 0x30(OUTP)
2560	sub $64, LEN
2561	add $64, INP
2562	add $64, OUTP
2563	cmp $64, LEN
2564	jge .Lcbc_dec_loop4
2565	cmp $16, LEN
2566	jb .Lcbc_dec_ret
2567.align 4
2568.Lcbc_dec_loop1:
2569	movups (INP), IN
2570	movaps IN, STATE
2571	call _aesni_dec1
2572	pxor IV, STATE
2573	movups STATE, (OUTP)
2574	movaps IN, IV
2575	sub $16, LEN
2576	add $16, INP
2577	add $16, OUTP
2578	cmp $16, LEN
2579	jge .Lcbc_dec_loop1
2580.Lcbc_dec_ret:
2581	movups IV, (IVP)
2582.Lcbc_dec_just_ret:
2583#ifndef __x86_64__
2584	popl KLEN
2585	popl KEYP
2586	popl LEN
2587	popl IVP
2588#endif
2589	FRAME_END
2590	ret
2591ENDPROC(aesni_cbc_dec)
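
Unlike encryption, CBC decryption is parallel: each output block depends only on two ciphertext blocks, so the assembly decrypts four at a time and only the cheap XOR chaining is serial. (The 32-bit branch has no IN3/IN4 registers, which is why it re-reads blocks 0 and 1 from memory after _aesni_dec4.) The per-block result is captured by this C sketch, again with a hypothetical single-block helper:

	#include <stddef.h>
	#include <stdint.h>
	#include <string.h>

	void aes_decrypt_block(const void *round_keys, uint8_t out[16],
			       const uint8_t in[16]);	/* hypothetical */

	/* C model of aesni_cbc_dec: P[i] = D_K(C[i]) ^ C[i-1], with C[-1] = IV. */
	static void cbc_dec_model(const void *dec_round_keys, uint8_t *dst,
				  const uint8_t *src, size_t len, uint8_t iv[16])
	{
		uint8_t prev[16], cur[16], out[16];

		if (len < 16)
			return;				/* .Lcbc_dec_just_ret */
		memcpy(prev, iv, 16);			/* movups (IVP), IV */
		while (len >= 16) {
			memcpy(cur, src, 16);
			aes_decrypt_block(dec_round_keys, out, cur);
			for (int i = 0; i < 16; i++)
				out[i] ^= prev[i];	/* pxor IV/INn, STATEn */
			memcpy(dst, out, 16);
			memcpy(prev, cur, 16);		/* this ciphertext chains the next block */
			src += 16;
			dst += 16;
			len -= 16;
		}
		memcpy(iv, prev, 16);			/* movups IV, (IVP) */
	}

Here dec_round_keys corresponds to the context pointer after "add $240, KEYP", i.e. the decryption key schedule.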
2592
2593#ifdef __x86_64__
2594.pushsection .rodata
2595.align 16
2596.Lbswap_mask:
2597	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
2598.popsection
2599
2600/*
2601 * _aesni_inc_init:	internal ABI
2602 *	setup registers used by _aesni_inc
2603 * input:
2604 *	IV
2605 * output:
2606 *	CTR:	== IV, in little endian
2607 *	TCTR_LOW: == lower qword of CTR
2608 *	INC:	== 1, in little endian
2609 *	BSWAP_MASK == endian swapping mask
2610 */
2611.align 4
2612_aesni_inc_init:
2613	movaps .Lbswap_mask, BSWAP_MASK
2614	movaps IV, CTR
2615	PSHUFB_XMM BSWAP_MASK CTR
2616	mov $1, TCTR_LOW
2617	MOVQ_R64_XMM TCTR_LOW INC
2618	MOVQ_R64_XMM CTR TCTR_LOW
2619	ret
2620ENDPROC(_aesni_inc_init)
2621
2622/*
2623 * _aesni_inc:		internal ABI
2624 *	Increase IV by 1, IV is in big endian
2625 * input:
2626 *	IV
2627 *	CTR:	== IV, in little endian
2628 *	TCTR_LOW: == lower qword of CTR
2629 *	INC:	== 1, in little endian
2630 *	BSWAP_MASK == endian swapping mask
2631 * output:
2632 *	IV:	increased by 1
2633 * changed:
2634 *	CTR:	== output IV, in little endian
2635 *	TCTR_LOW: == lower qword of CTR
2636 */
2637.align 4
2638_aesni_inc:
2639	paddq INC, CTR
2640	add $1, TCTR_LOW
2641	jnc .Linc_low
2642	pslldq $8, INC
2643	paddq INC, CTR
2644	psrldq $8, INC
2645.Linc_low:
2646	movaps CTR, IV
2647	PSHUFB_XMM BSWAP_MASK IV
2648	ret
2649ENDPROC(_aesni_inc)
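
_aesni_inc_init byte-swaps the caller's big-endian counter into CTR so it can be bumped with integer SSE adds, and mirrors the low qword in the general-purpose register TCTR_LOW purely to detect carry: paddq increments the low 64 bits, and only when "add $1, TCTR_LOW" wraps is a second paddq used to propagate the carry into the high qword. The net effect, sketched in C directly on the big-endian byte representation:

	#include <stdint.h>

	/* C model of _aesni_inc: the counter block in memory is big endian. */
	void ctr128_inc_be(uint8_t ctr[16])
	{
		uint64_t hi = 0, lo = 0;

		for (int i = 0; i < 8; i++) {	/* like the PSHUFB byte swap */
			hi = (hi << 8) | ctr[i];
			lo = (lo << 8) | ctr[i + 8];
		}
		lo++;				/* paddq INC, CTR / add $1, TCTR_LOW */
		if (lo == 0)			/* low qword wrapped: carry needed */
			hi++;			/* pslldq $8, INC; paddq INC, CTR */
		for (int i = 7; i >= 0; i--) {	/* swap back, like PSHUFB into IV */
			ctr[i + 8] = (uint8_t)lo; lo >>= 8;
			ctr[i]     = (uint8_t)hi; hi >>= 8;
		}
	}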
2650
2651/*
2652 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2653 *		      size_t len, u8 *iv)
2654 */
2655ENTRY(aesni_ctr_enc)
2656	FRAME_BEGIN
2657	cmp $16, LEN
2658	jb .Lctr_enc_just_ret
2659	mov 480(KEYP), KLEN
2660	movups (IVP), IV
2661	call _aesni_inc_init
2662	cmp $64, LEN
2663	jb .Lctr_enc_loop1
2664.align 4
2665.Lctr_enc_loop4:
2666	movaps IV, STATE1
2667	call _aesni_inc
2668	movups (INP), IN1
2669	movaps IV, STATE2
2670	call _aesni_inc
2671	movups 0x10(INP), IN2
2672	movaps IV, STATE3
2673	call _aesni_inc
2674	movups 0x20(INP), IN3
2675	movaps IV, STATE4
2676	call _aesni_inc
2677	movups 0x30(INP), IN4
2678	call _aesni_enc4
2679	pxor IN1, STATE1
2680	movups STATE1, (OUTP)
2681	pxor IN2, STATE2
2682	movups STATE2, 0x10(OUTP)
2683	pxor IN3, STATE3
2684	movups STATE3, 0x20(OUTP)
2685	pxor IN4, STATE4
2686	movups STATE4, 0x30(OUTP)
2687	sub $64, LEN
2688	add $64, INP
2689	add $64, OUTP
2690	cmp $64, LEN
2691	jge .Lctr_enc_loop4
2692	cmp $16, LEN
2693	jb .Lctr_enc_ret
2694.align 4
2695.Lctr_enc_loop1:
2696	movaps IV, STATE
2697	call _aesni_inc
2698	movups (INP), IN
2699	call _aesni_enc1
2700	pxor IN, STATE
2701	movups STATE, (OUTP)
2702	sub $16, LEN
2703	add $16, INP
2704	add $16, OUTP
2705	cmp $16, LEN
2706	jge .Lctr_enc_loop1
2707.Lctr_enc_ret:
2708	movups IV, (IVP)
2709.Lctr_enc_just_ret:
2710	FRAME_END
2711	ret
2712ENDPROC(aesni_ctr_enc)
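
With the increment helpers in place, CTR mode reduces to encrypting successive counter values and XORing the resulting keystream into the data; each block's keystream uses the counter value as it stood before that block's increment, so the first block is encrypted under the IV exactly as passed in, and the counter for the next call is written back through IVP. A C sketch using the hypothetical helpers from the sketches above:

	#include <stddef.h>
	#include <stdint.h>

	void aes_encrypt_block(const void *round_keys, uint8_t out[16],
			       const uint8_t in[16]);	/* hypothetical, as above */
	void ctr128_inc_be(uint8_t ctr[16]);		/* from the sketch above */

	/* C model of aesni_ctr_enc: dst = src ^ E_K(counter), then counter++. */
	static void ctr_enc_model(const void *round_keys, uint8_t *dst,
				  const uint8_t *src, size_t len, uint8_t iv[16])
	{
		uint8_t keystream[16];

		if (len < 16)
			return;			/* .Lctr_enc_just_ret: IV untouched */
		while (len >= 16) {
			aes_encrypt_block(round_keys, keystream, iv);
			ctr128_inc_be(iv);	/* _aesni_inc, for the next block */
			for (int i = 0; i < 16; i++)
				dst[i] = src[i] ^ keystream[i];
			src += 16;
			dst += 16;
			len -= 16;
		}
		/* iv now holds the next counter value (movups IV, (IVP)). */
	}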
2713
2714/*
2715 * _aesni_gf128mul_x_ble:		internal ABI
2716 *	Multiply in GF(2^128) for XTS IVs
2717 * input:
2718 *	IV:	current IV
2719 *	GF128MUL_MASK == mask with 0x87 and 0x01
2720 * output:
2721 *	IV:	next IV
2722 * changed:
2723 *	CTR:	== temporary value
2724 */
2725#define _aesni_gf128mul_x_ble() \
2726	pshufd $0x13, IV, CTR; \
2727	paddq IV, IV; \
2728	psrad $31, CTR; \
2729	pand GF128MUL_MASK, CTR; \
2730	pxor CTR, IV;
2731
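The pshufd/psrad/pand sequence above is a branch-free way to multiply the tweak by x in GF(2^128) under the little-endian-block ("ble") convention XTS uses: paddq doubles each 64-bit half, and the masked sign bits (0x01 for the bit crossing the qword boundary, 0x87 for the bit falling off the top, per the GF128MUL_MASK description above) patch in the missing carry and the polynomial reduction. Byte-wise, the net operation is the following C sketch:

	#include <stdint.h>

	/* C model of _aesni_gf128mul_x_ble(): multiply the XTS tweak by x. */
	void gf128mul_x_ble_model(uint8_t t[16])
	{
		int carry = t[15] >> 7;		/* bit shifted out of the top */

		for (int i = 15; i > 0; i--)	/* shift the 128-bit value left by one */
			t[i] = (uint8_t)((t[i] << 1) | (t[i - 1] >> 7));
		t[0] = (uint8_t)((t[0] << 1) ^ (carry ? 0x87 : 0x00));
	}
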
2732/*
2733 * void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2734 *			 bool enc, u8 *iv)
2735 */
2736ENTRY(aesni_xts_crypt8)
2737	FRAME_BEGIN
2738	cmpb $0, %cl
2739	movl $0, %ecx
2740	movl $240, %r10d
2741	leaq _aesni_enc4, %r11
2742	leaq _aesni_dec4, %rax
2743	cmovel %r10d, %ecx
2744	cmoveq %rax, %r11
2745
2746	movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
2747	movups (IVP), IV
2748
2749	mov 480(KEYP), KLEN
2750	addq %rcx, KEYP
2751
2752	movdqa IV, STATE1
2753	movdqu 0x00(INP), INC
2754	pxor INC, STATE1
2755	movdqu IV, 0x00(OUTP)
2756
2757	_aesni_gf128mul_x_ble()
2758	movdqa IV, STATE2
2759	movdqu 0x10(INP), INC
2760	pxor INC, STATE2
2761	movdqu IV, 0x10(OUTP)
2762
2763	_aesni_gf128mul_x_ble()
2764	movdqa IV, STATE3
2765	movdqu 0x20(INP), INC
2766	pxor INC, STATE3
2767	movdqu IV, 0x20(OUTP)
2768
2769	_aesni_gf128mul_x_ble()
2770	movdqa IV, STATE4
2771	movdqu 0x30(INP), INC
2772	pxor INC, STATE4
2773	movdqu IV, 0x30(OUTP)
2774
2775	CALL_NOSPEC %r11
2776
2777	movdqu 0x00(OUTP), INC
2778	pxor INC, STATE1
2779	movdqu STATE1, 0x00(OUTP)
2780
2781	_aesni_gf128mul_x_ble()
2782	movdqa IV, STATE1
2783	movdqu 0x40(INP), INC
2784	pxor INC, STATE1
2785	movdqu IV, 0x40(OUTP)
2786
2787	movdqu 0x10(OUTP), INC
2788	pxor INC, STATE2
2789	movdqu STATE2, 0x10(OUTP)
2790
2791	_aesni_gf128mul_x_ble()
2792	movdqa IV, STATE2
2793	movdqu 0x50(INP), INC
2794	pxor INC, STATE2
2795	movdqu IV, 0x50(OUTP)
2796
2797	movdqu 0x20(OUTP), INC
2798	pxor INC, STATE3
2799	movdqu STATE3, 0x20(OUTP)
2800
2801	_aesni_gf128mul_x_ble()
2802	movdqa IV, STATE3
2803	movdqu 0x60(INP), INC
2804	pxor INC, STATE3
2805	movdqu IV, 0x60(OUTP)
2806
2807	movdqu 0x30(OUTP), INC
2808	pxor INC, STATE4
2809	movdqu STATE4, 0x30(OUTP)
2810
2811	_aesni_gf128mul_x_ble()
2812	movdqa IV, STATE4
2813	movdqu 0x70(INP), INC
2814	pxor INC, STATE4
2815	movdqu IV, 0x70(OUTP)
2816
2817	_aesni_gf128mul_x_ble()
2818	movups IV, (IVP)
2819
2820	CALL_NOSPEC %r11
2821
2822	movdqu 0x40(OUTP), INC
2823	pxor INC, STATE1
2824	movdqu STATE1, 0x40(OUTP)
2825
2826	movdqu 0x50(OUTP), INC
2827	pxor INC, STATE2
2828	movdqu STATE2, 0x50(OUTP)
2829
2830	movdqu 0x60(OUTP), INC
2831	pxor INC, STATE3
2832	movdqu STATE3, 0x60(OUTP)
2833
2834	movdqu 0x70(OUTP), INC
2835	pxor INC, STATE4
2836	movdqu STATE4, 0x70(OUTP)
2837
2838	FRAME_END
2839	ret
2840ENDPROC(aesni_xts_crypt8)
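
aesni_xts_crypt8 always processes exactly eight blocks: the cmov sequence at the top selects _aesni_enc4 or _aesni_dec4 (and, for decryption, the key schedule at KEYP + 240), and the indirect call goes through the CALL_NOSPEC retpoline macro. Each tweak is parked in the output buffer across the four-block call so no extra registers are needed, then XORed back over the result. Ignoring that register-pressure trick, the per-call computation is the following C sketch, reusing the hypothetical helpers and gf128mul_x_ble_model() from the sketches above:

	#include <stdint.h>

	void aes_encrypt_block(const void *round_keys, uint8_t out[16],
			       const uint8_t in[16]);	/* hypothetical, as above */
	void aes_decrypt_block(const void *round_keys, uint8_t out[16],
			       const uint8_t in[16]);	/* hypothetical, as above */
	void gf128mul_x_ble_model(uint8_t t[16]);	/* from the sketch above */

	/* C model of one aesni_xts_crypt8 call: exactly eight 16-byte blocks. */
	static void xts_crypt8_model(const void *round_keys, uint8_t *dst,
				     const uint8_t *src, int enc, uint8_t iv[16])
	{
		uint8_t buf[16];

		for (int blk = 0; blk < 8; blk++) {
			for (int i = 0; i < 16; i++)	/* pre-whiten with the tweak */
				buf[i] = src[blk * 16 + i] ^ iv[i];
			if (enc)
				aes_encrypt_block(round_keys, buf, buf);
			else				/* decrypt path: keys at ctx + 240 */
				aes_decrypt_block(round_keys, buf, buf);
			for (int i = 0; i < 16; i++)	/* post-whiten with the same tweak */
				dst[blk * 16 + i] = buf[i] ^ iv[i];
			gf128mul_x_ble_model(iv);	/* _aesni_gf128mul_x_ble() */
		}
		/* iv now holds the tweak for the caller's next eight blocks,
		 * matching the final _aesni_gf128mul_x_ble() + movups IV, (IVP). */
	}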
2841
2842#endif