# v6.13.7
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Implement AES algorithm in Intel AES-NI instructions.
 *
 * The white paper of AES-NI instructions can be downloaded from:
 *   http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
 *
 * Copyright (C) 2008, Intel Corp.
 *    Author: Huang Ying <ying.huang@intel.com>
 *            Vinodh Gopal <vinodh.gopal@intel.com>
 *            Kahraman Akdemir
 *
 * Copyright (c) 2010, Intel Corporation.
 *
 * Ported x86_64 version to x86:
 *    Author: Mathias Krause <minipli@googlemail.com>
 */

#include <linux/linkage.h>
#include <asm/frame.h>

#define STATE1	%xmm0
#define STATE2	%xmm4
#define STATE3	%xmm5
#define STATE4	%xmm6
#define STATE	STATE1
#define IN1	%xmm1
#define IN2	%xmm7
#define IN3	%xmm8
#define IN4	%xmm9
#define IN	IN1
#define KEY	%xmm2
#define IV	%xmm3

#define BSWAP_MASK %xmm10
#define CTR	%xmm11
#define INC	%xmm12

#define GF128MUL_MASK %xmm7

#ifdef __x86_64__
#define AREG	%rax
#define KEYP	%rdi
#define OUTP	%rsi
#define UKEYP	OUTP
#define INP	%rdx
#define LEN	%rcx
#define IVP	%r8
#define KLEN	%r9d
#define T1	%r10
#define TKEYP	T1
#define T2	%r11
#define TCTR_LOW T2
#else
#define AREG	%eax
#define KEYP	%edi
#define OUTP	AREG
#define UKEYP	OUTP
#define INP	%edx
#define LEN	%esi
#define IVP	%ebp
#define KLEN	%ebx
#define T1	%ecx
#define TKEYP	T1
#endif

SYM_FUNC_START_LOCAL(_key_expansion_256a)
	pshufd $0b11111111, %xmm1, %xmm1
	shufps $0b00010000, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	shufps $0b10001100, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	pxor %xmm1, %xmm0
	movaps %xmm0, (TKEYP)
	add $0x10, TKEYP
	RET
SYM_FUNC_END(_key_expansion_256a)
SYM_FUNC_ALIAS_LOCAL(_key_expansion_128, _key_expansion_256a)

SYM_FUNC_START_LOCAL(_key_expansion_192a)
	pshufd $0b01010101, %xmm1, %xmm1
	shufps $0b00010000, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	shufps $0b10001100, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	pxor %xmm1, %xmm0

	movaps %xmm2, %xmm5
	movaps %xmm2, %xmm6
	pslldq $4, %xmm5
	pshufd $0b11111111, %xmm0, %xmm3
	pxor %xmm3, %xmm2
	pxor %xmm5, %xmm2

	movaps %xmm0, %xmm1
	shufps $0b01000100, %xmm0, %xmm6
	movaps %xmm6, (TKEYP)
	shufps $0b01001110, %xmm2, %xmm1
	movaps %xmm1, 0x10(TKEYP)
	add $0x20, TKEYP
	RET
SYM_FUNC_END(_key_expansion_192a)

SYM_FUNC_START_LOCAL(_key_expansion_192b)
	pshufd $0b01010101, %xmm1, %xmm1
	shufps $0b00010000, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	shufps $0b10001100, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	pxor %xmm1, %xmm0

	movaps %xmm2, %xmm5
	pslldq $4, %xmm5
	pshufd $0b11111111, %xmm0, %xmm3
	pxor %xmm3, %xmm2
	pxor %xmm5, %xmm2

	movaps %xmm0, (TKEYP)
	add $0x10, TKEYP
	RET
SYM_FUNC_END(_key_expansion_192b)

SYM_FUNC_START_LOCAL(_key_expansion_256b)
	pshufd $0b10101010, %xmm1, %xmm1
	shufps $0b00010000, %xmm2, %xmm4
	pxor %xmm4, %xmm2
	shufps $0b10001100, %xmm2, %xmm4
	pxor %xmm4, %xmm2
	pxor %xmm1, %xmm2
	movaps %xmm2, (TKEYP)
	add $0x10, TKEYP
	RET
SYM_FUNC_END(_key_expansion_256b)
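
/*
 * What the shufps/pxor dance above computes, as a C sketch (names such as
 * aes128_expand_step() and "kga" are illustrative placeholders, not kernel
 * code).  With %xmm4 zeroed by the caller, the two shufps/pxor pairs build
 * a running prefix XOR of the previous round key's words; the broadcast
 * AESKEYGENASSIST result (SubWord/RotWord/rcon) is then folded into all
 * four words, which matches the FIPS-197 schedule for a 128-bit key:
 *
 *	static void aes128_expand_step(uint32_t rk[4], uint32_t kga)
 *	{
 *		rk[1] ^= rk[0];		// w1 ^= w0
 *		rk[2] ^= rk[1];		// w2 ^= w1 ^ w0
 *		rk[3] ^= rk[2];		// w3 ^= w2 ^ w1 ^ w0
 *		rk[0] ^= kga;		// fold f(w3) into every word
 *		rk[1] ^= kga;
 *		rk[2] ^= kga;
 *		rk[3] ^= kga;
 *	}
 */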

/*
 * void aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
 *                    unsigned int key_len)
 */
SYM_FUNC_START(aesni_set_key)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl KEYP
	movl (FRAME_OFFSET+8)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+12)(%esp), UKEYP	# in_key
	movl (FRAME_OFFSET+16)(%esp), %edx	# key_len
#endif
	movups (UKEYP), %xmm0		# user key (first 16 bytes)
	movaps %xmm0, (KEYP)
	lea 0x10(KEYP), TKEYP		# key addr
	movl %edx, 480(KEYP)
	pxor %xmm4, %xmm4		# xmm4 is assumed 0 in _key_expansion_x
	cmp $24, %dl
	jb .Lenc_key128
	je .Lenc_key192
	movups 0x10(UKEYP), %xmm2	# other user key
	movaps %xmm2, (TKEYP)
	add $0x10, TKEYP
	aeskeygenassist $0x1, %xmm2, %xmm1	# round 1
	call _key_expansion_256a
	aeskeygenassist $0x1, %xmm0, %xmm1
	call _key_expansion_256b
	aeskeygenassist $0x2, %xmm2, %xmm1	# round 2
	call _key_expansion_256a
	aeskeygenassist $0x2, %xmm0, %xmm1
	call _key_expansion_256b
	aeskeygenassist $0x4, %xmm2, %xmm1	# round 3
	call _key_expansion_256a
	aeskeygenassist $0x4, %xmm0, %xmm1
	call _key_expansion_256b
	aeskeygenassist $0x8, %xmm2, %xmm1	# round 4
	call _key_expansion_256a
	aeskeygenassist $0x8, %xmm0, %xmm1
	call _key_expansion_256b
	aeskeygenassist $0x10, %xmm2, %xmm1	# round 5
	call _key_expansion_256a
	aeskeygenassist $0x10, %xmm0, %xmm1
	call _key_expansion_256b
	aeskeygenassist $0x20, %xmm2, %xmm1	# round 6
	call _key_expansion_256a
	aeskeygenassist $0x20, %xmm0, %xmm1
	call _key_expansion_256b
	aeskeygenassist $0x40, %xmm2, %xmm1	# round 7
	call _key_expansion_256a
	jmp .Ldec_key
.Lenc_key192:
	movq 0x10(UKEYP), %xmm2		# other user key
	aeskeygenassist $0x1, %xmm2, %xmm1	# round 1
	call _key_expansion_192a
	aeskeygenassist $0x2, %xmm2, %xmm1	# round 2
	call _key_expansion_192b
	aeskeygenassist $0x4, %xmm2, %xmm1	# round 3
	call _key_expansion_192a
	aeskeygenassist $0x8, %xmm2, %xmm1	# round 4
	call _key_expansion_192b
	aeskeygenassist $0x10, %xmm2, %xmm1	# round 5
	call _key_expansion_192a
	aeskeygenassist $0x20, %xmm2, %xmm1	# round 6
	call _key_expansion_192b
	aeskeygenassist $0x40, %xmm2, %xmm1	# round 7
	call _key_expansion_192a
	aeskeygenassist $0x80, %xmm2, %xmm1	# round 8
	call _key_expansion_192b
	jmp .Ldec_key
.Lenc_key128:
	aeskeygenassist $0x1, %xmm0, %xmm1	# round 1
	call _key_expansion_128
	aeskeygenassist $0x2, %xmm0, %xmm1	# round 2
	call _key_expansion_128
	aeskeygenassist $0x4, %xmm0, %xmm1	# round 3
	call _key_expansion_128
	aeskeygenassist $0x8, %xmm0, %xmm1	# round 4
	call _key_expansion_128
	aeskeygenassist $0x10, %xmm0, %xmm1	# round 5
	call _key_expansion_128
	aeskeygenassist $0x20, %xmm0, %xmm1	# round 6
	call _key_expansion_128
	aeskeygenassist $0x40, %xmm0, %xmm1	# round 7
	call _key_expansion_128
	aeskeygenassist $0x80, %xmm0, %xmm1	# round 8
	call _key_expansion_128
	aeskeygenassist $0x1b, %xmm0, %xmm1	# round 9
	call _key_expansion_128
	aeskeygenassist $0x36, %xmm0, %xmm1	# round 10
	call _key_expansion_128
.Ldec_key:
	sub $0x10, TKEYP
	movaps (KEYP), %xmm0
	movaps (TKEYP), %xmm1
	movaps %xmm0, 240(TKEYP)
	movaps %xmm1, 240(KEYP)
	add $0x10, KEYP
	lea 240-16(TKEYP), UKEYP
.align 4
.Ldec_key_loop:
	movaps (KEYP), %xmm0
	aesimc %xmm0, %xmm1
	movaps %xmm1, (UKEYP)
	add $0x10, KEYP
	sub $0x10, UKEYP
	cmp TKEYP, KEYP
	jb .Ldec_key_loop
#ifndef __x86_64__
	popl KEYP
#endif
	FRAME_END
	RET
SYM_FUNC_END(aesni_set_key)

/*
 * void aesni_enc(const void *ctx, u8 *dst, const u8 *src)
 */
SYM_FUNC_START(aesni_enc)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+12)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+16)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+20)(%esp), INP	# src
#endif
	movl 480(KEYP), KLEN		# key length
	movups (INP), STATE		# input
	call _aesni_enc1
	movups STATE, (OUTP)		# output
#ifndef __x86_64__
	popl KLEN
	popl KEYP
#endif
	FRAME_END
	RET
SYM_FUNC_END(aesni_enc)

/*
 * _aesni_enc1:		internal ABI
 * input:
 *	KEYP:		key struct pointer
 *	KLEN:		key length
 *	STATE:		initial state (input)
 * output:
 *	STATE:		final state (output)
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
SYM_FUNC_START_LOCAL(_aesni_enc1)
	movaps (KEYP), KEY		# key
	mov KEYP, TKEYP
	pxor KEY, STATE		# round 0
	add $0x30, TKEYP
	cmp $24, KLEN
	jb .Lenc128
	lea 0x20(TKEYP), TKEYP
	je .Lenc192
	add $0x20, TKEYP
	movaps -0x60(TKEYP), KEY
	aesenc KEY, STATE
	movaps -0x50(TKEYP), KEY
	aesenc KEY, STATE
.align 4
.Lenc192:
	movaps -0x40(TKEYP), KEY
	aesenc KEY, STATE
	movaps -0x30(TKEYP), KEY
	aesenc KEY, STATE
.align 4
.Lenc128:
	movaps -0x20(TKEYP), KEY
	aesenc KEY, STATE
	movaps -0x10(TKEYP), KEY
	aesenc KEY, STATE
	movaps (TKEYP), KEY
	aesenc KEY, STATE
	movaps 0x10(TKEYP), KEY
	aesenc KEY, STATE
	movaps 0x20(TKEYP), KEY
	aesenc KEY, STATE
	movaps 0x30(TKEYP), KEY
	aesenc KEY, STATE
	movaps 0x40(TKEYP), KEY
	aesenc KEY, STATE
	movaps 0x50(TKEYP), KEY
	aesenc KEY, STATE
	movaps 0x60(TKEYP), KEY
	aesenc KEY, STATE
	movaps 0x70(TKEYP), KEY
	aesenclast KEY, STATE
	RET
SYM_FUNC_END(_aesni_enc1)
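
/*
 * The KLEN dispatch above encodes the standard AES parameterization: the
 * key length stored at 480(KEYP) picks the round count, and the early
 * jumps simply skip the first two or four round keys of the longer
 * schedules.  As a C sketch (nrounds() is a placeholder name):
 *
 *	static int nrounds(unsigned int key_len)	// bytes: 16, 24 or 32
 *	{
 *		return key_len == 16 ? 10 : key_len == 24 ? 12 : 14;
 *	}
 */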

/*
 * _aesni_enc4:	internal ABI
 * input:
 *	KEYP:		key struct pointer
 *	KLEN:		key length
 *	STATE1:		initial state (input)
 *	STATE2
 *	STATE3
 *	STATE4
 * output:
 *	STATE1:		final state (output)
 *	STATE2
 *	STATE3
 *	STATE4
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
SYM_FUNC_START_LOCAL(_aesni_enc4)
	movaps (KEYP), KEY		# key
	mov KEYP, TKEYP
	pxor KEY, STATE1		# round 0
	pxor KEY, STATE2
	pxor KEY, STATE3
	pxor KEY, STATE4
	add $0x30, TKEYP
	cmp $24, KLEN
	jb .L4enc128
	lea 0x20(TKEYP), TKEYP
	je .L4enc192
	add $0x20, TKEYP
	movaps -0x60(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
	movaps -0x50(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
#.align 4
.L4enc192:
	movaps -0x40(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
	movaps -0x30(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
#.align 4
.L4enc128:
	movaps -0x20(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
	movaps -0x10(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
	movaps (TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
	movaps 0x10(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
	movaps 0x20(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
	movaps 0x30(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
	movaps 0x40(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
	movaps 0x50(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
	movaps 0x60(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
	movaps 0x70(TKEYP), KEY
	aesenclast KEY, STATE1		# last round
	aesenclast KEY, STATE2
	aesenclast KEY, STATE3
	aesenclast KEY, STATE4
	RET
SYM_FUNC_END(_aesni_enc4)

/*
 * void aesni_dec(const void *ctx, u8 *dst, const u8 *src)
 */
SYM_FUNC_START(aesni_dec)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+12)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+16)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+20)(%esp), INP	# src
#endif
	mov 480(KEYP), KLEN		# key length
	add $240, KEYP
	movups (INP), STATE		# input
	call _aesni_dec1
	movups STATE, (OUTP)		# output
#ifndef __x86_64__
	popl KLEN
	popl KEYP
#endif
	FRAME_END
	RET
SYM_FUNC_END(aesni_dec)

/*
 * _aesni_dec1:		internal ABI
 * input:
 *	KEYP:		key struct pointer
 *	KLEN:		key length
 *	STATE:		initial state (input)
 * output:
 *	STATE:		final state (output)
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
SYM_FUNC_START_LOCAL(_aesni_dec1)
	movaps (KEYP), KEY		# key
	mov KEYP, TKEYP
	pxor KEY, STATE		# round 0
	add $0x30, TKEYP
	cmp $24, KLEN
	jb .Ldec128
	lea 0x20(TKEYP), TKEYP
	je .Ldec192
	add $0x20, TKEYP
	movaps -0x60(TKEYP), KEY
	aesdec KEY, STATE
	movaps -0x50(TKEYP), KEY
	aesdec KEY, STATE
.align 4
.Ldec192:
	movaps -0x40(TKEYP), KEY
	aesdec KEY, STATE
	movaps -0x30(TKEYP), KEY
	aesdec KEY, STATE
.align 4
.Ldec128:
	movaps -0x20(TKEYP), KEY
	aesdec KEY, STATE
	movaps -0x10(TKEYP), KEY
	aesdec KEY, STATE
	movaps (TKEYP), KEY
	aesdec KEY, STATE
	movaps 0x10(TKEYP), KEY
	aesdec KEY, STATE
	movaps 0x20(TKEYP), KEY
	aesdec KEY, STATE
	movaps 0x30(TKEYP), KEY
	aesdec KEY, STATE
	movaps 0x40(TKEYP), KEY
	aesdec KEY, STATE
	movaps 0x50(TKEYP), KEY
	aesdec KEY, STATE
	movaps 0x60(TKEYP), KEY
	aesdec KEY, STATE
	movaps 0x70(TKEYP), KEY
	aesdeclast KEY, STATE
	RET
SYM_FUNC_END(_aesni_dec1)

/*
 * _aesni_dec4:	internal ABI
 * input:
 *	KEYP:		key struct pointer
 *	KLEN:		key length
 *	STATE1:		initial state (input)
 *	STATE2
 *	STATE3
 *	STATE4
 * output:
 *	STATE1:		final state (output)
 *	STATE2
 *	STATE3
 *	STATE4
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
SYM_FUNC_START_LOCAL(_aesni_dec4)
	movaps (KEYP), KEY		# key
	mov KEYP, TKEYP
	pxor KEY, STATE1		# round 0
	pxor KEY, STATE2
	pxor KEY, STATE3
	pxor KEY, STATE4
	add $0x30, TKEYP
	cmp $24, KLEN
	jb .L4dec128
	lea 0x20(TKEYP), TKEYP
	je .L4dec192
	add $0x20, TKEYP
	movaps -0x60(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps -0x50(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
.align 4
.L4dec192:
	movaps -0x40(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps -0x30(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
.align 4
.L4dec128:
	movaps -0x20(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps -0x10(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps (TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps 0x10(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps 0x20(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps 0x30(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps 0x40(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps 0x50(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps 0x60(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps 0x70(TKEYP), KEY
	aesdeclast KEY, STATE1		# last round
	aesdeclast KEY, STATE2
	aesdeclast KEY, STATE3
	aesdeclast KEY, STATE4
	RET
SYM_FUNC_END(_aesni_dec4)

/*
 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len)
 */
SYM_FUNC_START(aesni_ecb_enc)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+16)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+20)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+24)(%esp), INP	# src
	movl (FRAME_OFFSET+28)(%esp), LEN	# len
#endif
	test LEN, LEN		# check length
	jz .Lecb_enc_ret
	mov 480(KEYP), KLEN
	cmp $16, LEN
	jb .Lecb_enc_ret
	cmp $64, LEN
	jb .Lecb_enc_loop1
.align 4
.Lecb_enc_loop4:
	movups (INP), STATE1
	movups 0x10(INP), STATE2
	movups 0x20(INP), STATE3
	movups 0x30(INP), STATE4
	call _aesni_enc4
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jge .Lecb_enc_loop4
	cmp $16, LEN
	jb .Lecb_enc_ret
.align 4
.Lecb_enc_loop1:
	movups (INP), STATE1
	call _aesni_enc1
	movups STATE1, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lecb_enc_loop1
.Lecb_enc_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
#endif
	FRAME_END
	RET
SYM_FUNC_END(aesni_ecb_enc)
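
/*
 * Shape of the ECB walk above as a C sketch (helper names are
 * placeholders): ECB blocks are independent, so four blocks go through
 * _aesni_enc4 per iteration while at least 64 bytes remain, then the
 * tail is handled one block at a time; a trailing partial block is
 * ignored.
 *
 *	for (; len >= 64; len -= 64, src += 64, dst += 64)
 *		aes_enc4(ctx, dst, src);	// four blocks in parallel
 *	for (; len >= 16; len -= 16, src += 16, dst += 16)
 *		aes_enc1(ctx, dst, src);	// one block at a time
 */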

/*
 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len);
 */
SYM_FUNC_START(aesni_ecb_dec)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+16)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+20)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+24)(%esp), INP	# src
	movl (FRAME_OFFSET+28)(%esp), LEN	# len
#endif
	test LEN, LEN
	jz .Lecb_dec_ret
	mov 480(KEYP), KLEN
	add $240, KEYP
	cmp $16, LEN
	jb .Lecb_dec_ret
	cmp $64, LEN
	jb .Lecb_dec_loop1
.align 4
.Lecb_dec_loop4:
	movups (INP), STATE1
	movups 0x10(INP), STATE2
	movups 0x20(INP), STATE3
	movups 0x30(INP), STATE4
	call _aesni_dec4
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jge .Lecb_dec_loop4
	cmp $16, LEN
	jb .Lecb_dec_ret
.align 4
.Lecb_dec_loop1:
	movups (INP), STATE1
	call _aesni_dec1
	movups STATE1, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lecb_dec_loop1
.Lecb_dec_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
#endif
	FRAME_END
	RET
SYM_FUNC_END(aesni_ecb_dec)

/*
 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 */
SYM_FUNC_START(aesni_cbc_enc)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
#endif
	cmp $16, LEN
	jb .Lcbc_enc_ret
	mov 480(KEYP), KLEN
	movups (IVP), STATE	# load iv as initial state
.align 4
.Lcbc_enc_loop:
	movups (INP), IN	# load input
	pxor IN, STATE
	call _aesni_enc1
	movups STATE, (OUTP)	# store output
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lcbc_enc_loop
	movups STATE, (IVP)
.Lcbc_enc_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	FRAME_END
	RET
SYM_FUNC_END(aesni_cbc_enc)
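
/*
 * CBC chaining as done above, in C form (sketch; xor_16() and aes_enc1()
 * are placeholder names).  The running state doubles as the IV, which is
 * why a single block register suffices and why, unlike ECB, the loop
 * cannot be unrolled four-wide:
 *
 *	memcpy(state, iv, 16);
 *	for (; len >= 16; len -= 16, src += 16, dst += 16) {
 *		xor_16(state, src);		// P_i ^ C_{i-1}
 *		aes_enc1(ctx, state, state);	// C_i = E_K(P_i ^ C_{i-1})
 *		memcpy(dst, state, 16);
 *	}
 *	memcpy(iv, state, 16);			// last ciphertext becomes IV
 */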

/*
 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 */
SYM_FUNC_START(aesni_cbc_dec)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
#endif
	cmp $16, LEN
	jb .Lcbc_dec_just_ret
	mov 480(KEYP), KLEN
	add $240, KEYP
	movups (IVP), IV
	cmp $64, LEN
	jb .Lcbc_dec_loop1
.align 4
.Lcbc_dec_loop4:
	movups (INP), IN1
	movaps IN1, STATE1
	movups 0x10(INP), IN2
	movaps IN2, STATE2
#ifdef __x86_64__
	movups 0x20(INP), IN3
	movaps IN3, STATE3
	movups 0x30(INP), IN4
	movaps IN4, STATE4
#else
	movups 0x20(INP), IN1
	movaps IN1, STATE3
	movups 0x30(INP), IN2
	movaps IN2, STATE4
#endif
	call _aesni_dec4
	pxor IV, STATE1
#ifdef __x86_64__
	pxor IN1, STATE2
	pxor IN2, STATE3
	pxor IN3, STATE4
	movaps IN4, IV
#else
	pxor IN1, STATE4
	movaps IN2, IV
	movups (INP), IN1
	pxor IN1, STATE2
	movups 0x10(INP), IN2
	pxor IN2, STATE3
#endif
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jge .Lcbc_dec_loop4
	cmp $16, LEN
	jb .Lcbc_dec_ret
.align 4
.Lcbc_dec_loop1:
	movups (INP), IN
	movaps IN, STATE
	call _aesni_dec1
	pxor IV, STATE
	movups STATE, (OUTP)
	movaps IN, IV
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lcbc_dec_loop1
.Lcbc_dec_ret:
	movups IV, (IVP)
.Lcbc_dec_just_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	FRAME_END
	RET
SYM_FUNC_END(aesni_cbc_dec)

/*
 * void aesni_cts_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *			  size_t len, u8 *iv)
 */
SYM_FUNC_START(aesni_cts_cbc_enc)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
	lea .Lcts_permute_table, T1
#else
	lea .Lcts_permute_table(%rip), T1
#endif
	mov 480(KEYP), KLEN
	movups (IVP), STATE
	sub $16, LEN
	mov T1, IVP
	add $32, IVP
	add LEN, T1
	sub LEN, IVP
	movups (T1), %xmm4
	movups (IVP), %xmm5

	movups (INP), IN1
	add LEN, INP
	movups (INP), IN2

	pxor IN1, STATE
	call _aesni_enc1

	pshufb %xmm5, IN2
	pxor STATE, IN2
	pshufb %xmm4, STATE
	add OUTP, LEN
	movups STATE, (LEN)

	movaps IN2, STATE
	call _aesni_enc1
	movups STATE, (OUTP)

#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	FRAME_END
	RET
SYM_FUNC_END(aesni_cts_cbc_enc)

/*
 * void aesni_cts_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *			  size_t len, u8 *iv)
 */
SYM_FUNC_START(aesni_cts_cbc_dec)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
	lea .Lcts_permute_table, T1
#else
	lea .Lcts_permute_table(%rip), T1
#endif
	mov 480(KEYP), KLEN
	add $240, KEYP
	movups (IVP), IV
	sub $16, LEN
	mov T1, IVP
	add $32, IVP
	add LEN, T1
	sub LEN, IVP
	movups (T1), %xmm4

	movups (INP), STATE
	add LEN, INP
	movups (INP), IN1

	call _aesni_dec1
	movaps STATE, IN2
	pshufb %xmm4, STATE
	pxor IN1, STATE

	add OUTP, LEN
	movups STATE, (LEN)

	movups (IVP), %xmm0
	pshufb %xmm0, IN1
	pblendvb IN2, IN1
	movaps IN1, STATE
	call _aesni_dec1

	pxor IV, STATE
	movups STATE, (OUTP)

#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	FRAME_END
	RET
SYM_FUNC_END(aesni_cts_cbc_dec)

.pushsection .rodata
.align 16
.Lcts_permute_table:
	.byte		0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
	.byte		0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
	.byte		0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
	.byte		0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
	.byte		0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
	.byte		0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
#ifdef __x86_64__
.Lbswap_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
#endif
.popsection
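
/*
 * How .Lcts_permute_table is used (sketch): pshufb writes a zero byte for
 * any index with the top bit set, so a 16-byte window taken anywhere in
 * the 0x80.../0x00-0x0f/0x80... pattern acts as a byte-granular
 * shift-with-zero-fill mask.  The CTS code picks windows at table+LEN and
 * table+32-LEN to swap and pad the final partial block.  pshufb itself
 * behaves like:
 *
 *	for (i = 0; i < 16; i++)
 *		dst[i] = (mask[i] & 0x80) ? 0 : src[mask[i] & 0x0f];
 */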

#ifdef __x86_64__
/*
 * _aesni_inc_init:	internal ABI
 *	setup registers used by _aesni_inc
 * input:
 *	IV
 * output:
 *	CTR:	== IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 *	INC:	== 1, in little endian
 *	BSWAP_MASK == endian swapping mask
 */
SYM_FUNC_START_LOCAL(_aesni_inc_init)
	movaps .Lbswap_mask(%rip), BSWAP_MASK
	movaps IV, CTR
	pshufb BSWAP_MASK, CTR
	mov $1, TCTR_LOW
	movq TCTR_LOW, INC
	movq CTR, TCTR_LOW
	RET
SYM_FUNC_END(_aesni_inc_init)

/*
 * _aesni_inc:		internal ABI
 *	Increase IV by 1, IV is in big endian
 * input:
 *	IV
 *	CTR:	== IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 *	INC:	== 1, in little endian
 *	BSWAP_MASK == endian swapping mask
 * output:
 *	IV:	increased by 1
 * changed:
 *	CTR:	== output IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 */
SYM_FUNC_START_LOCAL(_aesni_inc)
	paddq INC, CTR
	add $1, TCTR_LOW
	jnc .Linc_low
	pslldq $8, INC
	paddq INC, CTR
	psrldq $8, INC
.Linc_low:
	movaps CTR, IV
	pshufb BSWAP_MASK, IV
	RET
SYM_FUNC_END(_aesni_inc)
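
/*
 * C model of the increment above (sketch; load_le64()/store_le64() are
 * placeholder helpers).  The counter is kept byte-reversed in CTR so the
 * low 64 bits can be bumped with an ordinary add; TCTR_LOW mirrors those
 * bits purely so the carry into the high qword shows up in the CPU carry
 * flag:
 *
 *	lo = load_le64(ctr);			// low qword of counter
 *	store_le64(ctr, lo + 1);
 *	if (lo + 1 == 0)			// carry out of low qword
 *		store_le64(ctr + 8, load_le64(ctr + 8) + 1);
 *	// IV is then CTR byte-swapped back to big endian
 */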

/*
 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 */
SYM_FUNC_START(aesni_ctr_enc)
	FRAME_BEGIN
	cmp $16, LEN
	jb .Lctr_enc_just_ret
	mov 480(KEYP), KLEN
	movups (IVP), IV
	call _aesni_inc_init
	cmp $64, LEN
	jb .Lctr_enc_loop1
.align 4
.Lctr_enc_loop4:
	movaps IV, STATE1
	call _aesni_inc
	movups (INP), IN1
	movaps IV, STATE2
	call _aesni_inc
	movups 0x10(INP), IN2
	movaps IV, STATE3
	call _aesni_inc
	movups 0x20(INP), IN3
	movaps IV, STATE4
	call _aesni_inc
	movups 0x30(INP), IN4
	call _aesni_enc4
	pxor IN1, STATE1
	movups STATE1, (OUTP)
	pxor IN2, STATE2
	movups STATE2, 0x10(OUTP)
	pxor IN3, STATE3
	movups STATE3, 0x20(OUTP)
	pxor IN4, STATE4
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jge .Lctr_enc_loop4
	cmp $16, LEN
	jb .Lctr_enc_ret
.align 4
.Lctr_enc_loop1:
	movaps IV, STATE
	call _aesni_inc
	movups (INP), IN
	call _aesni_enc1
	pxor IN, STATE
	movups STATE, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lctr_enc_loop1
.Lctr_enc_ret:
	movups IV, (IVP)
.Lctr_enc_just_ret:
	FRAME_END
	RET
SYM_FUNC_END(aesni_ctr_enc)

#endif

.section	.rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16
.align 16
.Lgf128mul_x_ble_mask:
	.octa 0x00000000000000010000000000000087
.previous

/*
 * _aesni_gf128mul_x_ble: Multiply in GF(2^128) for XTS IVs
 * input:
 *	IV:	current IV
 *	GF128MUL_MASK == mask with 0x87 and 0x01
 * output:
 *	IV:	next IV
 * changed:
 *	KEY:	== temporary value
 */
.macro _aesni_gf128mul_x_ble
	pshufd $0x13, IV, KEY
	paddq IV, IV
	psrad $31, KEY
	pand GF128MUL_MASK, KEY
	pxor KEY, IV
.endm
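
/*
 * What the macro computes, as a C sketch (modelled on the generic
 * gf128mul_x_ble() helper in include/crypto/gf128mul.h; types and names
 * here are illustrative).  The tweak, held as two little-endian qwords,
 * is doubled in GF(2^128); a carry out of bit 127 is reduced back in
 * with the polynomial 0x87:
 *
 *	static void gf128mul_x_ble(uint64_t t[2])	// t[0] low, t[1] high
 *	{
 *		uint64_t carry = t[1] >> 63;
 *
 *		t[1] = (t[1] << 1) | (t[0] >> 63);
 *		t[0] = (t[0] << 1) ^ (carry ? 0x87 : 0);
 *	}
 */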

.macro	_aesni_xts_crypt	enc
	FRAME_BEGIN
#ifndef __x86_64__
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
	movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
#else
	movdqa .Lgf128mul_x_ble_mask(%rip), GF128MUL_MASK
#endif
	movups (IVP), IV

	mov 480(KEYP), KLEN
.if !\enc
	add $240, KEYP

	test $15, LEN
	jz .Lxts_loop4\@
	sub $16, LEN
.endif

.Lxts_loop4\@:
	sub $64, LEN
	jl .Lxts_1x\@

	movdqa IV, STATE1
	movdqu 0x00(INP), IN
	pxor IN, STATE1
	movdqu IV, 0x00(OUTP)

	_aesni_gf128mul_x_ble
	movdqa IV, STATE2
	movdqu 0x10(INP), IN
	pxor IN, STATE2
	movdqu IV, 0x10(OUTP)

	_aesni_gf128mul_x_ble
	movdqa IV, STATE3
	movdqu 0x20(INP), IN
	pxor IN, STATE3
	movdqu IV, 0x20(OUTP)

	_aesni_gf128mul_x_ble
	movdqa IV, STATE4
	movdqu 0x30(INP), IN
	pxor IN, STATE4
	movdqu IV, 0x30(OUTP)

.if \enc
	call _aesni_enc4
.else
	call _aesni_dec4
.endif

	movdqu 0x00(OUTP), IN
	pxor IN, STATE1
	movdqu STATE1, 0x00(OUTP)

	movdqu 0x10(OUTP), IN
	pxor IN, STATE2
	movdqu STATE2, 0x10(OUTP)

	movdqu 0x20(OUTP), IN
	pxor IN, STATE3
	movdqu STATE3, 0x20(OUTP)

	movdqu 0x30(OUTP), IN
	pxor IN, STATE4
	movdqu STATE4, 0x30(OUTP)

	_aesni_gf128mul_x_ble

	add $64, INP
	add $64, OUTP
	test LEN, LEN
	jnz .Lxts_loop4\@

.Lxts_ret_iv\@:
	movups IV, (IVP)

.Lxts_ret\@:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	FRAME_END
	RET

.Lxts_1x\@:
	add $64, LEN
	jz .Lxts_ret_iv\@
.if \enc
	sub $16, LEN
	jl .Lxts_cts4\@
.endif

.Lxts_loop1\@:
	movdqu (INP), STATE
.if \enc
	pxor IV, STATE
	call _aesni_enc1
.else
	add $16, INP
	sub $16, LEN
	jl .Lxts_cts1\@
	pxor IV, STATE
	call _aesni_dec1
.endif
	pxor IV, STATE
	_aesni_gf128mul_x_ble

	test LEN, LEN
	jz .Lxts_out\@

.if \enc
	add $16, INP
	sub $16, LEN
	jl .Lxts_cts1\@
.endif

	movdqu STATE, (OUTP)
	add $16, OUTP
	jmp .Lxts_loop1\@

.Lxts_out\@:
	movdqu STATE, (OUTP)
	jmp .Lxts_ret_iv\@

.if \enc
.Lxts_cts4\@:
	movdqa STATE4, STATE
	sub $16, OUTP
.Lxts_cts1\@:
.else
.Lxts_cts1\@:
	movdqa IV, STATE4
	_aesni_gf128mul_x_ble

	pxor IV, STATE
	call _aesni_dec1
	pxor IV, STATE
.endif
#ifndef __x86_64__
	lea .Lcts_permute_table, T1
#else
	lea .Lcts_permute_table(%rip), T1
#endif
	add LEN, INP		/* rewind input pointer */
	add $16, LEN		/* # bytes in final block */
	movups (INP), IN1

	mov T1, IVP
	add $32, IVP
	add LEN, T1
	sub LEN, IVP
	add OUTP, LEN

	movups (T1), %xmm4
	movaps STATE, IN2
	pshufb %xmm4, STATE
	movups STATE, (LEN)

	movups (IVP), %xmm0
	pshufb %xmm0, IN1
	pblendvb IN2, IN1
	movaps IN1, STATE

.if \enc
	pxor IV, STATE
	call _aesni_enc1
	pxor IV, STATE
.else
	pxor STATE4, STATE
	call _aesni_dec1
	pxor STATE4, STATE
.endif

	movups STATE, (OUTP)
	jmp .Lxts_ret\@
.endm

/*
 * void aesni_xts_enc(const struct crypto_aes_ctx *ctx, u8 *dst,
 *		      const u8 *src, unsigned int len, le128 *iv)
 */
SYM_FUNC_START(aesni_xts_enc)
	_aesni_xts_crypt	1
SYM_FUNC_END(aesni_xts_enc)

/*
 * void aesni_xts_dec(const struct crypto_aes_ctx *ctx, u8 *dst,
 *		      const u8 *src, unsigned int len, le128 *iv)
 */
SYM_FUNC_START(aesni_xts_dec)
	_aesni_xts_crypt	0
SYM_FUNC_END(aesni_xts_dec)

# v5.9: earlier version of this file, still containing the RFC4106 AES-GCM code
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Implement AES algorithm in Intel AES-NI instructions.
 *
 * The white paper of AES-NI instructions can be downloaded from:
 *   http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
 *
 * Copyright (C) 2008, Intel Corp.
 *    Author: Huang Ying <ying.huang@intel.com>
 *            Vinodh Gopal <vinodh.gopal@intel.com>
 *            Kahraman Akdemir
 *
 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
 * interface for 64-bit kernels.
 *    Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
 *             Aidan O'Mahony (aidan.o.mahony@intel.com)
 *             Adrian Hoban <adrian.hoban@intel.com>
 *             James Guilford (james.guilford@intel.com)
 *             Gabriele Paoloni <gabriele.paoloni@intel.com>
 *             Tadeusz Struk (tadeusz.struk@intel.com)
 *             Wajdi Feghali (wajdi.k.feghali@intel.com)
 *    Copyright (c) 2010, Intel Corporation.
 *
 * Ported x86_64 version to x86:
 *    Author: Mathias Krause <minipli@googlemail.com>
 */

#include <linux/linkage.h>
#include <asm/frame.h>
#include <asm/nospec-branch.h>
  32/*
  33 * The following macros are used to move an (un)aligned 16 byte value to/from
  34 * an XMM register.  This can done for either FP or integer values, for FP use
  35 * movaps (move aligned packed single) or integer use movdqa (move double quad
  36 * aligned).  It doesn't make a performance difference which instruction is used
  37 * since Nehalem (original Core i7) was released.  However, the movaps is a byte
  38 * shorter, so that is the one we'll use for now. (same for unaligned).
  39 */
  40#define MOVADQ	movaps
  41#define MOVUDQ	movups
  42
#ifdef __x86_64__

# constants in mergeable sections, linker can reorder and merge
.section	.rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16
.align 16
.Lgf128mul_x_ble_mask:
	.octa 0x00000000000000010000000000000087
.section	.rodata.cst16.POLY, "aM", @progbits, 16
.align 16
POLY:   .octa 0xC2000000000000000000000000000001
.section	.rodata.cst16.TWOONE, "aM", @progbits, 16
.align 16
TWOONE: .octa 0x00000001000000000000000000000001

.section	.rodata.cst16.SHUF_MASK, "aM", @progbits, 16
.align 16
SHUF_MASK:  .octa 0x000102030405060708090A0B0C0D0E0F
.section	.rodata.cst16.MASK1, "aM", @progbits, 16
.align 16
MASK1:      .octa 0x0000000000000000ffffffffffffffff
.section	.rodata.cst16.MASK2, "aM", @progbits, 16
.align 16
MASK2:      .octa 0xffffffffffffffff0000000000000000
.section	.rodata.cst16.ONE, "aM", @progbits, 16
.align 16
ONE:        .octa 0x00000000000000000000000000000001
.section	.rodata.cst16.F_MIN_MASK, "aM", @progbits, 16
.align 16
F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
.section	.rodata.cst16.dec, "aM", @progbits, 16
.align 16
dec:        .octa 0x1
.section	.rodata.cst16.enc, "aM", @progbits, 16
.align 16
enc:        .octa 0x2

# order of these constants should not change.
# more specifically, ALL_F should follow SHIFT_MASK,
# and zero should follow ALL_F
.section	.rodata, "a", @progbits
.align 16
SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
ALL_F:      .octa 0xffffffffffffffffffffffffffffffff
            .octa 0x00000000000000000000000000000000

.text


#define	STACK_OFFSET    8*3

#define AadHash 16*0
#define AadLen 16*1
#define InLen (16*1)+8
#define PBlockEncKey 16*2
#define OrigIV 16*3
#define CurCount 16*4
#define PBlockLen 16*5
#define	HashKey		16*6	// store HashKey <<1 mod poly here
#define	HashKey_2	16*7	// store HashKey^2 <<1 mod poly here
#define	HashKey_3	16*8	// store HashKey^3 <<1 mod poly here
#define	HashKey_4	16*9	// store HashKey^4 <<1 mod poly here
#define	HashKey_k	16*10	// store XOR of High 64 bits and Low 64
				// bits of  HashKey <<1 mod poly here
				//(for Karatsuba purposes)
#define	HashKey_2_k	16*11	// store XOR of High 64 bits and Low 64
				// bits of  HashKey^2 <<1 mod poly here
				// (for Karatsuba purposes)
#define	HashKey_3_k	16*12	// store XOR of High 64 bits and Low 64
				// bits of  HashKey^3 <<1 mod poly here
				// (for Karatsuba purposes)
#define	HashKey_4_k	16*13	// store XOR of High 64 bits and Low 64
				// bits of  HashKey^4 <<1 mod poly here
				// (for Karatsuba purposes)

#define arg1 rdi
#define arg2 rsi
#define arg3 rdx
#define arg4 rcx
#define arg5 r8
#define arg6 r9
#define arg7 STACK_OFFSET+8(%rsp)
#define arg8 STACK_OFFSET+16(%rsp)
#define arg9 STACK_OFFSET+24(%rsp)
#define arg10 STACK_OFFSET+32(%rsp)
#define arg11 STACK_OFFSET+40(%rsp)
#define keysize 2*15*16(%arg1)
#endif


#define STATE1	%xmm0
#define STATE2	%xmm4
#define STATE3	%xmm5
#define STATE4	%xmm6
#define STATE	STATE1
#define IN1	%xmm1
#define IN2	%xmm7
#define IN3	%xmm8
#define IN4	%xmm9
#define IN	IN1
#define KEY	%xmm2
#define IV	%xmm3

#define BSWAP_MASK %xmm10
#define CTR	%xmm11
#define INC	%xmm12

#define GF128MUL_MASK %xmm10

#ifdef __x86_64__
#define AREG	%rax
#define KEYP	%rdi
#define OUTP	%rsi
#define UKEYP	OUTP
#define INP	%rdx
#define LEN	%rcx
#define IVP	%r8
#define KLEN	%r9d
#define T1	%r10
#define TKEYP	T1
#define T2	%r11
#define TCTR_LOW T2
#else
#define AREG	%eax
#define KEYP	%edi
#define OUTP	AREG
#define UKEYP	OUTP
#define INP	%edx
#define LEN	%esi
#define IVP	%ebp
#define KLEN	%ebx
#define T1	%ecx
#define TKEYP	T1
#endif

.macro FUNC_SAVE
	push	%r12
	push	%r13
	push	%r14
#
# states of %xmm registers %xmm6:%xmm15 not saved
# all %xmm registers are clobbered
#
.endm


.macro FUNC_RESTORE
	pop	%r14
	pop	%r13
	pop	%r12
.endm

# Precompute hashkeys.
# Input: Hash subkey.
# Output: HashKeys stored in gcm_context_data.  Only needs to be called
# once per key.
# clobbers r12, and tmp xmm registers.
.macro PRECOMPUTE SUBKEY TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 TMP7
	mov	\SUBKEY, %r12
	movdqu	(%r12), \TMP3
	movdqa	SHUF_MASK(%rip), \TMP2
	pshufb	\TMP2, \TMP3

	# precompute HashKey<<1 mod poly from the HashKey (required for GHASH)

	movdqa	\TMP3, \TMP2
	psllq	$1, \TMP3
	psrlq	$63, \TMP2
	movdqa	\TMP2, \TMP1
	pslldq	$8, \TMP2
	psrldq	$8, \TMP1
	por	\TMP2, \TMP3

	# reduce HashKey<<1

	pshufd	$0x24, \TMP1, \TMP2
	pcmpeqd TWOONE(%rip), \TMP2
	pand	POLY(%rip), \TMP2
	pxor	\TMP2, \TMP3
	movdqu	\TMP3, HashKey(%arg2)

	movdqa	   \TMP3, \TMP5
	pshufd	   $78, \TMP3, \TMP1
	pxor	   \TMP3, \TMP1
	movdqu	   \TMP1, HashKey_k(%arg2)

	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^2<<1 (mod poly)
	movdqu	   \TMP5, HashKey_2(%arg2)
# HashKey_2 = HashKey^2<<1 (mod poly)
	pshufd	   $78, \TMP5, \TMP1
	pxor	   \TMP5, \TMP1
	movdqu	   \TMP1, HashKey_2_k(%arg2)

	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^3<<1 (mod poly)
	movdqu	   \TMP5, HashKey_3(%arg2)
	pshufd	   $78, \TMP5, \TMP1
	pxor	   \TMP5, \TMP1
	movdqu	   \TMP1, HashKey_3_k(%arg2)

	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^4<<1 (mod poly)
	movdqu	   \TMP5, HashKey_4(%arg2)
	pshufd	   $78, \TMP5, \TMP1
	pxor	   \TMP5, \TMP1
	movdqu	   \TMP1, HashKey_4_k(%arg2)
.endm

# GCM_INIT initializes a gcm_context struct to prepare for encoding/decoding.
# Clobbers rax, r10-r13 and xmm0-xmm6, %xmm13
.macro GCM_INIT Iv SUBKEY AAD AADLEN
	mov \AADLEN, %r11
	mov %r11, AadLen(%arg2) # ctx_data.aad_length = aad_length
	xor %r11d, %r11d
	mov %r11, InLen(%arg2) # ctx_data.in_length = 0
	mov %r11, PBlockLen(%arg2) # ctx_data.partial_block_length = 0
	mov %r11, PBlockEncKey(%arg2) # ctx_data.partial_block_enc_key = 0
	mov \Iv, %rax
	movdqu (%rax), %xmm0
	movdqu %xmm0, OrigIV(%arg2) # ctx_data.orig_IV = iv

	movdqa  SHUF_MASK(%rip), %xmm2
	pshufb %xmm2, %xmm0
	movdqu %xmm0, CurCount(%arg2) # ctx_data.current_counter = iv

	PRECOMPUTE \SUBKEY, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7
	movdqu HashKey(%arg2), %xmm13

	CALC_AAD_HASH %xmm13, \AAD, \AADLEN, %xmm0, %xmm1, %xmm2, %xmm3, \
	%xmm4, %xmm5, %xmm6
.endm

# GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context
# struct has been initialized by GCM_INIT.
# Requires the input data be at least 1 byte long because of READ_PARTIAL_BLOCK
# Clobbers rax, r10-r13, and xmm0-xmm15
.macro GCM_ENC_DEC operation
	movdqu AadHash(%arg2), %xmm8
	movdqu HashKey(%arg2), %xmm13
	add %arg5, InLen(%arg2)

	xor %r11d, %r11d # initialise the data pointer offset as zero
	PARTIAL_BLOCK %arg3 %arg4 %arg5 %r11 %xmm8 \operation

	sub %r11, %arg5		# sub partial block data used
	mov %arg5, %r13		# save the number of bytes

	and $-16, %r13		# %r13 = %r13 - (%r13 mod 16)
	mov %r13, %r12
	# Encrypt/Decrypt first few blocks

	and	$(3<<4), %r12
	jz	_initial_num_blocks_is_0_\@
	cmp	$(2<<4), %r12
	jb	_initial_num_blocks_is_1_\@
	je	_initial_num_blocks_is_2_\@
_initial_num_blocks_is_3_\@:
	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, \operation
	sub	$48, %r13
	jmp	_initial_blocks_\@
_initial_num_blocks_is_2_\@:
	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, \operation
	sub	$32, %r13
	jmp	_initial_blocks_\@
_initial_num_blocks_is_1_\@:
	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, \operation
	sub	$16, %r13
	jmp	_initial_blocks_\@
_initial_num_blocks_is_0_\@:
	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, \operation
_initial_blocks_\@:

	# Main loop - Encrypt/Decrypt remaining blocks

	cmp	$0, %r13
	je	_zero_cipher_left_\@
	sub	$64, %r13
	je	_four_cipher_left_\@
_crypt_by_4_\@:
	GHASH_4_ENCRYPT_4_PARALLEL_\operation	%xmm9, %xmm10, %xmm11, %xmm12, \
	%xmm13, %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, \
	%xmm7, %xmm8, enc
	add	$64, %r11
	sub	$64, %r13
	jne	_crypt_by_4_\@
_four_cipher_left_\@:
	GHASH_LAST_4	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
_zero_cipher_left_\@:
	movdqu %xmm8, AadHash(%arg2)
	movdqu %xmm0, CurCount(%arg2)

	mov	%arg5, %r13
	and	$15, %r13			# %r13 = arg5 (mod 16)
	je	_multiple_of_16_bytes_\@

	mov %r13, PBlockLen(%arg2)

	# Handle the last <16 Byte block separately
	paddd ONE(%rip), %xmm0                # INCR CNT to get Yn
	movdqu %xmm0, CurCount(%arg2)
	movdqa SHUF_MASK(%rip), %xmm10
	pshufb %xmm10, %xmm0

	ENCRYPT_SINGLE_BLOCK	%xmm0, %xmm1        # Encrypt(K, Yn)
	movdqu %xmm0, PBlockEncKey(%arg2)

	cmp	$16, %arg5
	jge _large_enough_update_\@

	lea (%arg4,%r11,1), %r10
	mov %r13, %r12
	READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1
	jmp _data_read_\@

_large_enough_update_\@:
	sub	$16, %r11
	add	%r13, %r11

	# receive the last <16 Byte block
	movdqu	(%arg4, %r11, 1), %xmm1

	sub	%r13, %r11
	add	$16, %r11

	lea	SHIFT_MASK+16(%rip), %r12
	# adjust the shuffle mask pointer to be able to shift 16-r13 bytes
	# (r13 is the number of bytes in plaintext mod 16)
	sub	%r13, %r12
	# get the appropriate shuffle mask
	movdqu	(%r12), %xmm2
	# shift right 16-r13 bytes
	pshufb  %xmm2, %xmm1

_data_read_\@:
	lea ALL_F+16(%rip), %r12
	sub %r13, %r12

.ifc \operation, dec
	movdqa  %xmm1, %xmm2
.endif
	pxor	%xmm1, %xmm0            # XOR Encrypt(K, Yn)
	movdqu	(%r12), %xmm1
	# get the appropriate mask to mask out top 16-r13 bytes of xmm0
	pand	%xmm1, %xmm0            # mask out top 16-r13 bytes of xmm0
.ifc \operation, dec
	pand    %xmm1, %xmm2
	movdqa SHUF_MASK(%rip), %xmm10
	pshufb %xmm10, %xmm2

	pxor %xmm2, %xmm8
.else
	movdqa SHUF_MASK(%rip), %xmm10
	pshufb %xmm10, %xmm0

	pxor	%xmm0, %xmm8
.endif

	movdqu %xmm8, AadHash(%arg2)
.ifc \operation, enc
	# GHASH computation for the last <16 byte block
	movdqa SHUF_MASK(%rip), %xmm10
	# shuffle xmm0 back to output as ciphertext
	pshufb %xmm10, %xmm0
.endif

	# Output %r13 bytes
	movq %xmm0, %rax
	cmp $8, %r13
	jle _less_than_8_bytes_left_\@
	mov %rax, (%arg3, %r11, 1)
	add $8, %r11
	psrldq $8, %xmm0
	movq %xmm0, %rax
	sub $8, %r13
_less_than_8_bytes_left_\@:
	mov %al,  (%arg3, %r11, 1)
	add $1, %r11
	shr $8, %rax
	sub $1, %r13
	jne _less_than_8_bytes_left_\@
_multiple_of_16_bytes_\@:
.endm

# GCM_COMPLETE Finishes update of tag of last partial block
# Output: Authentication Tag (AUTH_TAG)
# Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
.macro GCM_COMPLETE AUTHTAG AUTHTAGLEN
	movdqu AadHash(%arg2), %xmm8
	movdqu HashKey(%arg2), %xmm13

	mov PBlockLen(%arg2), %r12

	cmp $0, %r12
	je _partial_done\@

	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6

_partial_done\@:
	mov AadLen(%arg2), %r12		  # %r12 = aadLen (number of bytes)
	shl	$3, %r12		  # convert into number of bits
	movd	%r12d, %xmm15		  # len(A) in %xmm15
	mov InLen(%arg2), %r12
	shl     $3, %r12                  # len(C) in bits (*128)
	movq    %r12, %xmm1

	pslldq	$8, %xmm15		  # %xmm15 = len(A)||0x0000000000000000
	pxor	%xmm1, %xmm15		  # %xmm15 = len(A)||len(C)
	pxor	%xmm15, %xmm8
	GHASH_MUL	%xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
	# final GHASH computation
	movdqa SHUF_MASK(%rip), %xmm10
	pshufb %xmm10, %xmm8

	movdqu OrigIV(%arg2), %xmm0       # %xmm0 = Y0
	ENCRYPT_SINGLE_BLOCK	%xmm0,  %xmm1	  # E(K, Y0)
	pxor	%xmm8, %xmm0
_return_T_\@:
	mov	\AUTHTAG, %r10                     # %r10 = authTag
	mov	\AUTHTAGLEN, %r11                    # %r11 = auth_tag_len
	cmp	$16, %r11
	je	_T_16_\@
	cmp	$8, %r11
	jl	_T_4_\@
_T_8_\@:
	movq	%xmm0, %rax
	mov	%rax, (%r10)
	add	$8, %r10
	sub	$8, %r11
	psrldq	$8, %xmm0
	cmp	$0, %r11
	je	_return_T_done_\@
_T_4_\@:
	movd	%xmm0, %eax
	mov	%eax, (%r10)
	add	$4, %r10
	sub	$4, %r11
	psrldq	$4, %xmm0
	cmp	$0, %r11
	je	_return_T_done_\@
_T_123_\@:
	movd	%xmm0, %eax
	cmp	$2, %r11
	jl	_T_1_\@
	mov	%ax, (%r10)
	cmp	$2, %r11
	je	_return_T_done_\@
	add	$2, %r10
	sar	$16, %eax
_T_1_\@:
	mov	%al, (%r10)
	jmp	_return_T_done_\@
_T_16_\@:
	movdqu	%xmm0, (%r10)
_return_T_done_\@:
.endm

#ifdef __x86_64__
/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
*
* Input: A and B (128-bits each, bit-reflected)
* Output: C = A*B*x mod poly, (i.e. >>1 )
* To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
* GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
*
*/
.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
	movdqa	  \GH, \TMP1
	pshufd	  $78, \GH, \TMP2
	pshufd	  $78, \HK, \TMP3
	pxor	  \GH, \TMP2            # TMP2 = a1+a0
	pxor	  \HK, \TMP3            # TMP3 = b1+b0
	pclmulqdq $0x11, \HK, \TMP1     # TMP1 = a1*b1
	pclmulqdq $0x00, \HK, \GH       # GH = a0*b0
	pclmulqdq $0x00, \TMP3, \TMP2   # TMP2 = (a0+a1)*(b1+b0)
	pxor	  \GH, \TMP2
	pxor	  \TMP1, \TMP2          # TMP2 = (a1*b0)+(a0*b1)
	movdqa	  \TMP2, \TMP3
	pslldq	  $8, \TMP3             # left shift TMP3 2 DWs
	psrldq	  $8, \TMP2             # right shift TMP2 2 DWs
	pxor	  \TMP3, \GH
	pxor	  \TMP2, \TMP1          # TMP1:GH holds the result of GH*HK

        # first phase of the reduction

	movdqa    \GH, \TMP2
	movdqa    \GH, \TMP3
	movdqa    \GH, \TMP4            # copy GH into TMP2, TMP3 and TMP4
					# in order to perform
					# independent shifts
	pslld     $31, \TMP2            # packed left shift <<31
	pslld     $30, \TMP3            # packed left shift <<30
	pslld     $25, \TMP4            # packed left shift <<25
	pxor      \TMP3, \TMP2          # xor the shifted versions
	pxor      \TMP4, \TMP2
	movdqa    \TMP2, \TMP5
	psrldq    $4, \TMP5             # right shift TMP5 1 DW
	pslldq    $12, \TMP2            # left shift TMP2 3 DWs
	pxor      \TMP2, \GH

        # second phase of the reduction

	movdqa    \GH,\TMP2             # copy GH into TMP2, TMP3 and TMP4
					# in order to perform
					# independent shifts
	movdqa    \GH,\TMP3
	movdqa    \GH,\TMP4
	psrld     $1,\TMP2              # packed right shift >>1
	psrld     $2,\TMP3              # packed right shift >>2
	psrld     $7,\TMP4              # packed right shift >>7
	pxor      \TMP3,\TMP2		# xor the shifted versions
	pxor      \TMP4,\TMP2
	pxor      \TMP5, \TMP2
	pxor      \TMP2, \GH
	pxor      \TMP1, \GH            # result is in GH
.endm
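
/*
 * Structure of the multiply above, as a sketch (clmul64() is a stand-in
 * for one PCLMULQDQ; the 128-bit shifts are written loosely).  Karatsuba
 * trades the fourth 64x64 carry-less multiply for a few XORs:
 *
 *	hi  = clmul64(a1, b1);				// pclmulqdq $0x11
 *	lo  = clmul64(a0, b0);				// pclmulqdq $0x00
 *	mid = clmul64(a0 ^ a1, b0 ^ b1) ^ hi ^ lo;	// middle term
 *	product = (hi << 128) ^ (mid << 64) ^ lo;	// then reduce mod poly
 */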

# Reads DLEN bytes starting at DPTR and stores in XMMDst
# where 0 < DLEN < 16
# Clobbers %rax, DLEN and XMM1
.macro READ_PARTIAL_BLOCK DPTR DLEN XMM1 XMMDst
	cmp $8, \DLEN
	jl _read_lt8_\@
	mov (\DPTR), %rax
	movq %rax, \XMMDst
	sub $8, \DLEN
	jz _done_read_partial_block_\@
	xor %eax, %eax
_read_next_byte_\@:
	shl $8, %rax
	mov 7(\DPTR, \DLEN, 1), %al
	dec \DLEN
	jnz _read_next_byte_\@
	movq %rax, \XMM1
	pslldq $8, \XMM1
	por \XMM1, \XMMDst
	jmp _done_read_partial_block_\@
_read_lt8_\@:
	xor %eax, %eax
_read_next_byte_lt8_\@:
	shl $8, %rax
	mov -1(\DPTR, \DLEN, 1), %al
	dec \DLEN
	jnz _read_next_byte_lt8_\@
	movq %rax, \XMMDst
_done_read_partial_block_\@:
.endm
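
/*
 * Semantics of READ_PARTIAL_BLOCK as a C sketch (names are placeholders):
 * the register games above exist only to load DLEN bytes without ever
 * touching memory past DPTR + DLEN, i.e. the equivalent of:
 *
 *	uint8_t block[16] = { 0 };
 *
 *	memcpy(block, dptr, dlen);	// 0 < dlen < 16: no over-read
 *	load_xmm(xmm_dst, block);	// zero-padded to a full block
 */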

# CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
# clobbers r10-11, xmm14
.macro CALC_AAD_HASH HASHKEY AAD AADLEN TMP1 TMP2 TMP3 TMP4 TMP5 \
	TMP6 TMP7
	MOVADQ	   SHUF_MASK(%rip), %xmm14
	mov	   \AAD, %r10		# %r10 = AAD
	mov	   \AADLEN, %r11	# %r11 = aadLen
	pxor	   \TMP7, \TMP7
	pxor	   \TMP6, \TMP6

	cmp	   $16, %r11
	jl	   _get_AAD_rest\@
_get_AAD_blocks\@:
	movdqu	   (%r10), \TMP7
	pshufb	   %xmm14, \TMP7 # byte-reflect the AAD data
	pxor	   \TMP7, \TMP6
	GHASH_MUL  \TMP6, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
	add	   $16, %r10
	sub	   $16, %r11
	cmp	   $16, %r11
	jge	   _get_AAD_blocks\@

	movdqu	   \TMP6, \TMP7

	/* read the last <16B of AAD */
_get_AAD_rest\@:
	cmp	   $0, %r11
	je	   _get_AAD_done\@

	READ_PARTIAL_BLOCK %r10, %r11, \TMP1, \TMP7
	pshufb	   %xmm14, \TMP7 # byte-reflect the AAD data
	pxor	   \TMP6, \TMP7
	GHASH_MUL  \TMP7, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
	movdqu \TMP7, \TMP6

_get_AAD_done\@:
	movdqu \TMP6, AadHash(%arg2)
.endm
 634
 635# PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks
 636# between update calls.
 637# Requires the input data be at least 1 byte long due to READ_PARTIAL_BLOCK
 638# Outputs encrypted bytes, and updates hash and partial info in gcm_data_context
 639# Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13
 640.macro PARTIAL_BLOCK CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \
 641	AAD_HASH operation
 642	mov 	PBlockLen(%arg2), %r13
 643	cmp	$0, %r13
 644	je	_partial_block_done_\@	# Leave Macro if no partial blocks
 645	# Read in input data without over reading
 646	cmp	$16, \PLAIN_CYPH_LEN
 647	jl	_fewer_than_16_bytes_\@
 648	movups	(\PLAIN_CYPH_IN), %xmm1	# If more than 16 bytes, just fill xmm
 649	jmp	_data_read_\@
 650
 651_fewer_than_16_bytes_\@:
 652	lea	(\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10
 653	mov	\PLAIN_CYPH_LEN, %r12
 654	READ_PARTIAL_BLOCK %r10 %r12 %xmm0 %xmm1
 655
 656	mov PBlockLen(%arg2), %r13
 657
 658_data_read_\@:				# Finished reading in data
 659
 660	movdqu	PBlockEncKey(%arg2), %xmm9
 661	movdqu	HashKey(%arg2), %xmm13
 662
 663	lea	SHIFT_MASK(%rip), %r12
 664
 665	# adjust the shuffle mask pointer to be able to shift r13 bytes
 666	# (16-r13 is the number of bytes in plaintext mod 16)
 667	add	%r13, %r12
 668	movdqu	(%r12), %xmm2		# get the appropriate shuffle mask
 669	pshufb	%xmm2, %xmm9		# shift right r13 bytes
 670
 671.ifc \operation, dec
 672	movdqa	%xmm1, %xmm3
 673	pxor	%xmm1, %xmm9		# Cyphertext XOR E(K, Yn)
 674
 675	mov	\PLAIN_CYPH_LEN, %r10
 676	add	%r13, %r10
 677	# Set r10 to be the amount of data left in PLAIN_CYPH_IN after filling
 678	sub	$16, %r10
 679	# Determine if the partial block is not being filled and
 680	# shift the mask accordingly
 681	jge	_no_extra_mask_1_\@
 682	sub	%r10, %r12
 683_no_extra_mask_1_\@:
 684
 685	movdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
 686	# get the appropriate mask to mask out bottom r13 bytes of xmm9
 687	pand	%xmm1, %xmm9		# mask out bottom r13 bytes of xmm9
 688
 689	pand	%xmm1, %xmm3
 690	movdqa	SHUF_MASK(%rip), %xmm10
 691	pshufb	%xmm10, %xmm3
 692	pshufb	%xmm2, %xmm3
 693	pxor	%xmm3, \AAD_HASH
 694
 695	cmp	$0, %r10
 696	jl	_partial_incomplete_1_\@
 697
 698	# GHASH computation for the last <16 Byte block
 699	GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
 700	xor	%eax, %eax
 701
 702	mov	%rax, PBlockLen(%arg2)
 703	jmp	_dec_done_\@
 704_partial_incomplete_1_\@:
 705	add	\PLAIN_CYPH_LEN, PBlockLen(%arg2)
 706_dec_done_\@:
 707	movdqu	\AAD_HASH, AadHash(%arg2)
 708.else
 709	pxor	%xmm1, %xmm9			# Plaintext XOR E(K, Yn)
 710
 711	mov	\PLAIN_CYPH_LEN, %r10
 712	add	%r13, %r10
 713	# Set r10 to be the amount of data left in PLAIN_CYPH_IN after filling
 714	sub	$16, %r10
 715	# Determine if the partial block is not being filled and
 716	# shift the mask accordingly
 717	jge	_no_extra_mask_2_\@
 718	sub	%r10, %r12
 719_no_extra_mask_2_\@:
 720
 721	movdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
 722	# get the appropriate mask to mask out bottom r13 bytes of xmm9
 723	pand	%xmm1, %xmm9
 724
 725	movdqa	SHUF_MASK(%rip), %xmm1
 726	pshufb	%xmm1, %xmm9
 727	pshufb	%xmm2, %xmm9
 728	pxor	%xmm9, \AAD_HASH
 729
 730	cmp	$0, %r10
 731	jl	_partial_incomplete_2_\@
 732
 733	# GHASH computation for the last <16 Byte block
 734	GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
 735	xor	%eax, %eax
 736
 737	mov	%rax, PBlockLen(%arg2)
 738	jmp	_encode_done_\@
 739_partial_incomplete_2_\@:
 740	add	\PLAIN_CYPH_LEN, PBlockLen(%arg2)
 741_encode_done_\@:
 742	movdqu	\AAD_HASH, AadHash(%arg2)
 743
 744	movdqa	SHUF_MASK(%rip), %xmm10
 745	# shuffle xmm9 back to output as ciphertext
 746	pshufb	%xmm10, %xmm9
 747	pshufb	%xmm2, %xmm9
 748.endif
 749	# output encrypted Bytes
 750	cmp	$0, %r10
 751	jl	_partial_fill_\@
 752	mov	%r13, %r12
 753	mov	$16, %r13
 754	# Set r13 to be the number of bytes to write out
 755	sub	%r12, %r13
 756	jmp	_count_set_\@
 757_partial_fill_\@:
 758	mov	\PLAIN_CYPH_LEN, %r13
 759_count_set_\@:
 760	movdqa	%xmm9, %xmm0
 761	movq	%xmm0, %rax
 762	cmp	$8, %r13
 763	jle	_less_than_8_bytes_left_\@
 764
 765	mov	%rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
 766	add	$8, \DATA_OFFSET
 767	psrldq	$8, %xmm0
 768	movq	%xmm0, %rax
 769	sub	$8, %r13
 770_less_than_8_bytes_left_\@:
 771	movb	%al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
 772	add	$1, \DATA_OFFSET
 773	shr	$8, %rax
 774	sub	$1, %r13
 775	jne	_less_than_8_bytes_left_\@
 776_partial_block_done_\@:
 777.endm # PARTIAL_BLOCK
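/*
 * The PBlockLen bookkeeping above is a small state machine (sketch
 * only, not literal code):
 *
 *	if (pblocklen + len < 16)
 *		pblocklen += len;	// block still incomplete, defer GHASH
 *	else
 *		pblocklen = 0;		// block completed, GHASH it now
 *
 * PBlockEncKey keeps E(K, Yn) around so the keystream of a counter
 * block that straddles two update calls can be reused.
 */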
 778
 779/*
 780* if a = number of total plaintext bytes
 781* b = floor(a/16)
 782* num_initial_blocks = b mod 4
 783* encrypt the initial num_initial_blocks blocks and apply ghash on
 784* the ciphertext
 785* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
 786* are clobbered
 787* arg1, %arg2, %arg3 are used as pointers only, not modified
 788*/
 789
 790
 791.macro INITIAL_BLOCKS_ENC_DEC TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
 792	XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
 793	MOVADQ		SHUF_MASK(%rip), %xmm14
 794
 795	movdqu AadHash(%arg2), %xmm\i		    # %xmm\i = current AAD hash
 796
 797	# start AES for num_initial_blocks blocks
 798
 799	movdqu CurCount(%arg2), \XMM0                # XMM0 = Y0
 800
 801.if (\i == 5) || (\i == 6) || (\i == 7)
 802
 803	MOVADQ		ONE(%rip),\TMP1
 804	MOVADQ		0(%arg1),\TMP2
 805.irpc index, \i_seq
 806	paddd		\TMP1, \XMM0                 # INCR Y0
 807.ifc \operation, dec
 808        movdqa     \XMM0, %xmm\index
 809.else
 810	MOVADQ		\XMM0, %xmm\index
 811.endif
 812	pshufb	%xmm14, %xmm\index      # perform a 16 byte swap
 813	pxor		\TMP2, %xmm\index
 814.endr
 815	lea	0x10(%arg1),%r10
 816	mov	keysize,%eax
 817	shr	$2,%eax				# 128->4, 192->6, 256->8
 818	add	$5,%eax			      # 128->9, 192->11, 256->13
 819
 820aes_loop_initial_\@:
 821	MOVADQ	(%r10),\TMP1
 822.irpc	index, \i_seq
 823	aesenc	\TMP1, %xmm\index
 824.endr
 825	add	$16,%r10
 826	sub	$1,%eax
 827	jnz	aes_loop_initial_\@
 828
 829	MOVADQ	(%r10), \TMP1
 830.irpc index, \i_seq
 831	aesenclast \TMP1, %xmm\index         # Last Round
 832.endr
 833.irpc index, \i_seq
 834	movdqu	   (%arg4 , %r11, 1), \TMP1
 835	pxor	   \TMP1, %xmm\index
 836	movdqu	   %xmm\index, (%arg3 , %r11, 1)
 837	# write back plaintext/ciphertext for num_initial_blocks
 838	add	   $16, %r11
 839
 840.ifc \operation, dec
 841	movdqa     \TMP1, %xmm\index
 842.endif
 843	pshufb	   %xmm14, %xmm\index
 844
 845		# prepare plaintext/ciphertext for GHASH computation
 846.endr
 847.endif
 848
 849        # apply GHASH on num_initial_blocks blocks
 850
 851.if \i == 5
 852        pxor       %xmm5, %xmm6
 853	GHASH_MUL  %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 854        pxor       %xmm6, %xmm7
 855	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 856        pxor       %xmm7, %xmm8
 857	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 858.elseif \i == 6
 859        pxor       %xmm6, %xmm7
 860	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 861        pxor       %xmm7, %xmm8
 862	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 863.elseif \i == 7
 864        pxor       %xmm7, %xmm8
 865	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 866.endif
 867	cmp	   $64, %r13
 868	jl	_initial_blocks_done\@
 869	# no need for precomputed values
 870/*
 871*
 872* Precomputations for HashKey parallel with encryption of first 4 blocks.
 873* HashKey_i_k holds XORed values of the low and high parts of HashKey_i
 874*/
 875	MOVADQ	   ONE(%rip),\TMP1
 876	paddd	   \TMP1, \XMM0              # INCR Y0
 877	MOVADQ	   \XMM0, \XMM1
 878	pshufb  %xmm14, \XMM1        # perform a 16 byte swap
 879
 880	paddd	   \TMP1, \XMM0              # INCR Y0
 881	MOVADQ	   \XMM0, \XMM2
 882	pshufb  %xmm14, \XMM2        # perform a 16 byte swap
 883
 884	paddd	   \TMP1, \XMM0              # INCR Y0
 885	MOVADQ	   \XMM0, \XMM3
 886	pshufb %xmm14, \XMM3        # perform a 16 byte swap
 887
 888	paddd	   \TMP1, \XMM0              # INCR Y0
 889	MOVADQ	   \XMM0, \XMM4
 890	pshufb %xmm14, \XMM4        # perform a 16 byte swap
 891
 892	MOVADQ	   0(%arg1),\TMP1
 893	pxor	   \TMP1, \XMM1
 894	pxor	   \TMP1, \XMM2
 895	pxor	   \TMP1, \XMM3
 896	pxor	   \TMP1, \XMM4
 897.irpc index, 1234 # do 4 rounds
 898	movaps 0x10*\index(%arg1), \TMP1
 899	aesenc	   \TMP1, \XMM1
 900	aesenc	   \TMP1, \XMM2
 901	aesenc	   \TMP1, \XMM3
 902	aesenc	   \TMP1, \XMM4
 903.endr
 904.irpc index, 56789 # do next 5 rounds
 905	movaps 0x10*\index(%arg1), \TMP1
 906	aesenc	   \TMP1, \XMM1
 907	aesenc	   \TMP1, \XMM2
 908	aesenc	   \TMP1, \XMM3
 909	aesenc	   \TMP1, \XMM4
 910.endr
 911	lea	   0xa0(%arg1),%r10
 912	mov	   keysize,%eax
 913	shr	   $2,%eax			# 128->4, 192->6, 256->8
 914	sub	   $4,%eax			# 128->0, 192->2, 256->4
 915	jz	   aes_loop_pre_done\@
 916
 917aes_loop_pre_\@:
 918	MOVADQ	   (%r10),\TMP2
 919.irpc	index, 1234
 920	aesenc	   \TMP2, %xmm\index
 921.endr
 922	add	   $16,%r10
 923	sub	   $1,%eax
 924	jnz	   aes_loop_pre_\@
 925
 926aes_loop_pre_done\@:
 927	MOVADQ	   (%r10), \TMP2
 928	aesenclast \TMP2, \XMM1
 929	aesenclast \TMP2, \XMM2
 930	aesenclast \TMP2, \XMM3
 931	aesenclast \TMP2, \XMM4
 932	movdqu	   16*0(%arg4 , %r11 , 1), \TMP1
 933	pxor	   \TMP1, \XMM1
 934.ifc \operation, dec
 935	movdqu     \XMM1, 16*0(%arg3 , %r11 , 1)
 936	movdqa     \TMP1, \XMM1
 937.endif
 938	movdqu	   16*1(%arg4 , %r11 , 1), \TMP1
 939	pxor	   \TMP1, \XMM2
 940.ifc \operation, dec
 941	movdqu     \XMM2, 16*1(%arg3 , %r11 , 1)
 942	movdqa     \TMP1, \XMM2
 943.endif
 944	movdqu	   16*2(%arg4 , %r11 , 1), \TMP1
 945	pxor	   \TMP1, \XMM3
 946.ifc \operation, dec
 947	movdqu     \XMM3, 16*2(%arg3 , %r11 , 1)
 948	movdqa     \TMP1, \XMM3
 949.endif
 950	movdqu	   16*3(%arg4 , %r11 , 1), \TMP1
 951	pxor	   \TMP1, \XMM4
 952.ifc \operation, dec
 953	movdqu     \XMM4, 16*3(%arg3 , %r11 , 1)
 954	movdqa     \TMP1, \XMM4
 955.else
 956	movdqu     \XMM1, 16*0(%arg3 , %r11 , 1)
 957	movdqu     \XMM2, 16*1(%arg3 , %r11 , 1)
 958	movdqu     \XMM3, 16*2(%arg3 , %r11 , 1)
 959	movdqu     \XMM4, 16*3(%arg3 , %r11 , 1)
 960.endif
 961
 962	add	   $64, %r11
 963	pshufb %xmm14, \XMM1 # perform a 16 byte swap
 964	pxor	   \XMMDst, \XMM1
 965# combine GHASHed value with the corresponding ciphertext
 966	pshufb %xmm14, \XMM2 # perform a 16 byte swap
 967	pshufb %xmm14, \XMM3 # perform a 16 byte swap
 968	pshufb %xmm14, \XMM4 # perform a 16 byte swap
 969
 970_initial_blocks_done\@:
 971
 972.endm
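/*
 * Worked example of the formula above: a = 100 plaintext bytes gives
 * b = floor(100/16) = 6 whole blocks, so num_initial_blocks = 6 mod 4
 * = 2; two blocks are peeled off here and the remaining four run
 * through the 4-wide GHASH_4_ENCRYPT_4_PARALLEL loop below.
 */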
 973
 974/*
 975* encrypt 4 blocks at a time
 976* ghash the 4 previously encrypted ciphertext blocks
 977* arg1, %arg3, %arg4 are used as pointers only, not modified
 978* %r11 is the data offset value
 979*/
 980.macro GHASH_4_ENCRYPT_4_PARALLEL_enc TMP1 TMP2 TMP3 TMP4 TMP5 \
 981TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
 982
 983	movdqa	  \XMM1, \XMM5
 984	movdqa	  \XMM2, \XMM6
 985	movdqa	  \XMM3, \XMM7
 986	movdqa	  \XMM4, \XMM8
 987
 988        movdqa    SHUF_MASK(%rip), %xmm15
 989        # multiply XMM5 * HashKey_4 using Karatsuba
 990
 991	movdqa	  \XMM5, \TMP4
 992	pshufd	  $78, \XMM5, \TMP6
 993	pxor	  \XMM5, \TMP6
 994	paddd     ONE(%rip), \XMM0		# INCR CNT
 995	movdqu	  HashKey_4(%arg2), \TMP5
 996	pclmulqdq $0x11, \TMP5, \TMP4           # TMP4 = a1*b1
 997	movdqa    \XMM0, \XMM1
 998	paddd     ONE(%rip), \XMM0		# INCR CNT
 999	movdqa    \XMM0, \XMM2
1000	paddd     ONE(%rip), \XMM0		# INCR CNT
1001	movdqa    \XMM0, \XMM3
1002	paddd     ONE(%rip), \XMM0		# INCR CNT
1003	movdqa    \XMM0, \XMM4
1004	pshufb %xmm15, \XMM1	# perform a 16 byte swap
1005	pclmulqdq $0x00, \TMP5, \XMM5           # XMM5 = a0*b0
1006	pshufb %xmm15, \XMM2	# perform a 16 byte swap
1007	pshufb %xmm15, \XMM3	# perform a 16 byte swap
1008	pshufb %xmm15, \XMM4	# perform a 16 byte swap
1009
1010	pxor	  (%arg1), \XMM1
1011	pxor	  (%arg1), \XMM2
1012	pxor	  (%arg1), \XMM3
1013	pxor	  (%arg1), \XMM4
1014	movdqu	  HashKey_4_k(%arg2), \TMP5
1015	pclmulqdq $0x00, \TMP5, \TMP6       # TMP6 = (a1+a0)*(b1+b0)
1016	movaps 0x10(%arg1), \TMP1
1017	aesenc	  \TMP1, \XMM1              # Round 1
1018	aesenc	  \TMP1, \XMM2
1019	aesenc	  \TMP1, \XMM3
1020	aesenc	  \TMP1, \XMM4
1021	movaps 0x20(%arg1), \TMP1
1022	aesenc	  \TMP1, \XMM1              # Round 2
1023	aesenc	  \TMP1, \XMM2
1024	aesenc	  \TMP1, \XMM3
1025	aesenc	  \TMP1, \XMM4
1026	movdqa	  \XMM6, \TMP1
1027	pshufd	  $78, \XMM6, \TMP2
1028	pxor	  \XMM6, \TMP2
1029	movdqu	  HashKey_3(%arg2), \TMP5
1030	pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1 * b1
1031	movaps 0x30(%arg1), \TMP3
1032	aesenc    \TMP3, \XMM1              # Round 3
1033	aesenc    \TMP3, \XMM2
1034	aesenc    \TMP3, \XMM3
1035	aesenc    \TMP3, \XMM4
1036	pclmulqdq $0x00, \TMP5, \XMM6       # XMM6 = a0*b0
1037	movaps 0x40(%arg1), \TMP3
1038	aesenc	  \TMP3, \XMM1              # Round 4
1039	aesenc	  \TMP3, \XMM2
1040	aesenc	  \TMP3, \XMM3
1041	aesenc	  \TMP3, \XMM4
1042	movdqu	  HashKey_3_k(%arg2), \TMP5
1043	pclmulqdq $0x00, \TMP5, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1044	movaps 0x50(%arg1), \TMP3
1045	aesenc	  \TMP3, \XMM1              # Round 5
1046	aesenc	  \TMP3, \XMM2
1047	aesenc	  \TMP3, \XMM3
1048	aesenc	  \TMP3, \XMM4
1049	pxor	  \TMP1, \TMP4
1050# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1051	pxor	  \XMM6, \XMM5
1052	pxor	  \TMP2, \TMP6
1053	movdqa	  \XMM7, \TMP1
1054	pshufd	  $78, \XMM7, \TMP2
1055	pxor	  \XMM7, \TMP2
1056	movdqu	  HashKey_2(%arg2), \TMP5
1057
1058        # Multiply XMM7 * HashKey_2 using Karatsuba
1059
1060	pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1*b1
1061	movaps 0x60(%arg1), \TMP3
1062	aesenc	  \TMP3, \XMM1              # Round 6
1063	aesenc	  \TMP3, \XMM2
1064	aesenc	  \TMP3, \XMM3
1065	aesenc	  \TMP3, \XMM4
1066	pclmulqdq $0x00, \TMP5, \XMM7       # XMM7 = a0*b0
1067	movaps 0x70(%arg1), \TMP3
1068	aesenc	  \TMP3, \XMM1              # Round 7
1069	aesenc	  \TMP3, \XMM2
1070	aesenc	  \TMP3, \XMM3
1071	aesenc	  \TMP3, \XMM4
1072	movdqu	  HashKey_2_k(%arg2), \TMP5
1073	pclmulqdq $0x00, \TMP5, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1074	movaps 0x80(%arg1), \TMP3
1075	aesenc	  \TMP3, \XMM1              # Round 8
1076	aesenc	  \TMP3, \XMM2
1077	aesenc	  \TMP3, \XMM3
1078	aesenc	  \TMP3, \XMM4
1079	pxor	  \TMP1, \TMP4
1080# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1081	pxor	  \XMM7, \XMM5
1082	pxor	  \TMP2, \TMP6
1083
1084        # Multiply XMM8 * HashKey
1085        # XMM8 and TMP5 hold the values for the two operands
1086
1087	movdqa	  \XMM8, \TMP1
1088	pshufd	  $78, \XMM8, \TMP2
1089	pxor	  \XMM8, \TMP2
1090	movdqu	  HashKey(%arg2), \TMP5
1091	pclmulqdq $0x11, \TMP5, \TMP1      # TMP1 = a1*b1
1092	movaps 0x90(%arg1), \TMP3
1093	aesenc	  \TMP3, \XMM1             # Round 9
1094	aesenc	  \TMP3, \XMM2
1095	aesenc	  \TMP3, \XMM3
1096	aesenc	  \TMP3, \XMM4
1097	pclmulqdq $0x00, \TMP5, \XMM8      # XMM8 = a0*b0
1098	lea	  0xa0(%arg1),%r10
1099	mov	  keysize,%eax
1100	shr	  $2,%eax			# 128->4, 192->6, 256->8
1101	sub	  $4,%eax			# 128->0, 192->2, 256->4
1102	jz	  aes_loop_par_enc_done\@
1103
1104aes_loop_par_enc\@:
1105	MOVADQ	  (%r10),\TMP3
1106.irpc	index, 1234
1107	aesenc	  \TMP3, %xmm\index
1108.endr
1109	add	  $16,%r10
1110	sub	  $1,%eax
1111	jnz	  aes_loop_par_enc\@
1112
1113aes_loop_par_enc_done\@:
1114	MOVADQ	  (%r10), \TMP3
1115	aesenclast \TMP3, \XMM1           # last round
1116	aesenclast \TMP3, \XMM2
1117	aesenclast \TMP3, \XMM3
1118	aesenclast \TMP3, \XMM4
1119	movdqu    HashKey_k(%arg2), \TMP5
1120	pclmulqdq $0x00, \TMP5, \TMP2          # TMP2 = (a1+a0)*(b1+b0)
1121	movdqu	  (%arg4,%r11,1), \TMP3
1122	pxor	  \TMP3, \XMM1                 # Ciphertext/Plaintext XOR EK
1123	movdqu	  16(%arg4,%r11,1), \TMP3
1124	pxor	  \TMP3, \XMM2                 # Ciphertext/Plaintext XOR EK
1125	movdqu	  32(%arg4,%r11,1), \TMP3
1126	pxor	  \TMP3, \XMM3                 # Ciphertext/Plaintext XOR EK
1127	movdqu	  48(%arg4,%r11,1), \TMP3
1128	pxor	  \TMP3, \XMM4                 # Ciphertext/Plaintext XOR EK
1129        movdqu    \XMM1, (%arg3,%r11,1)        # Write to the ciphertext buffer
1130        movdqu    \XMM2, 16(%arg3,%r11,1)      # Write to the ciphertext buffer
1131        movdqu    \XMM3, 32(%arg3,%r11,1)      # Write to the ciphertext buffer
1132        movdqu    \XMM4, 48(%arg3,%r11,1)      # Write to the ciphertext buffer
1133	pshufb %xmm15, \XMM1        # perform a 16 byte swap
1134	pshufb %xmm15, \XMM2	# perform a 16 byte swap
1135	pshufb %xmm15, \XMM3	# perform a 16 byte swap
1136	pshufb %xmm15, \XMM4	# perform a 16 byte swap
1137
1138	pxor	  \TMP4, \TMP1
1139	pxor	  \XMM8, \XMM5
1140	pxor	  \TMP6, \TMP2
1141	pxor	  \TMP1, \TMP2
1142	pxor	  \XMM5, \TMP2
1143	movdqa	  \TMP2, \TMP3
1144	pslldq	  $8, \TMP3                    # left shift TMP3 2 DWs
1145	psrldq	  $8, \TMP2                    # right shift TMP2 2 DWs
1146	pxor	  \TMP3, \XMM5
1147	pxor	  \TMP2, \TMP1	  # accumulate the results in TMP1:XMM5
1148
1149        # first phase of reduction
1150
1151	movdqa    \XMM5, \TMP2
1152	movdqa    \XMM5, \TMP3
1153	movdqa    \XMM5, \TMP4
1154# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
1155	pslld     $31, \TMP2                   # packed left shift << 31
1156	pslld     $30, \TMP3                   # packed left shift << 30
1157	pslld     $25, \TMP4                   # packed left shift << 25
1158	pxor      \TMP3, \TMP2	               # xor the shifted versions
1159	pxor      \TMP4, \TMP2
1160	movdqa    \TMP2, \TMP5
1161	psrldq    $4, \TMP5                    # right shift TMP5 1 DW
1162	pslldq    $12, \TMP2                   # left shift TMP2 3 DWs
1163	pxor      \TMP2, \XMM5
1164
1165        # second phase of reduction
1166
1167	movdqa    \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
1168	movdqa    \XMM5,\TMP3
1169	movdqa    \XMM5,\TMP4
1170	psrld     $1, \TMP2                    # packed right shift >>1
1171	psrld     $2, \TMP3                    # packed right shift >>2
1172	psrld     $7, \TMP4                    # packed right shift >>7
1173	pxor      \TMP3,\TMP2		       # xor the shifted versions
1174	pxor      \TMP4,\TMP2
1175	pxor      \TMP5, \TMP2
1176	pxor      \TMP2, \XMM5
1177	pxor      \TMP1, \XMM5                 # result is in XMM5
1178
1179	pxor	  \XMM5, \XMM1
1180.endm
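/*
 * The three pclmulqdq per block above are one Karatsuba step over
 * GF(2): writing a = a1*x^64 + a0 and b = b1*x^64 + b0,
 *
 *	a*b = a1*b1*x^128
 *	    + ((a1^a0)*(b1^b0) ^ a1*b1 ^ a0*b0)*x^64
 *	    + a0*b0
 *
 * (addition is XOR), so the middle term costs one multiply instead of
 * two. The HashKey_i_k values are the precomputed (b1^b0) halves.
 */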
1181
1182/*
1183* decrypt 4 blocks at a time
1184* ghash the 4 previously decrypted ciphertext blocks
1185* arg1, %arg3, %arg4 are used as pointers only, not modified
1186* %r11 is the data offset value
1187*/
1188.macro GHASH_4_ENCRYPT_4_PARALLEL_dec TMP1 TMP2 TMP3 TMP4 TMP5 \
1189TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
1190
1191	movdqa	  \XMM1, \XMM5
1192	movdqa	  \XMM2, \XMM6
1193	movdqa	  \XMM3, \XMM7
1194	movdqa	  \XMM4, \XMM8
1195
1196        movdqa    SHUF_MASK(%rip), %xmm15
1197        # multiply XMM5 * HashKey_4 using Karatsuba
1198
1199	movdqa	  \XMM5, \TMP4
1200	pshufd	  $78, \XMM5, \TMP6
1201	pxor	  \XMM5, \TMP6
1202	paddd     ONE(%rip), \XMM0		# INCR CNT
1203	movdqu	  HashKey_4(%arg2), \TMP5
1204	pclmulqdq $0x11, \TMP5, \TMP4           # TMP4 = a1*b1
1205	movdqa    \XMM0, \XMM1
1206	paddd     ONE(%rip), \XMM0		# INCR CNT
1207	movdqa    \XMM0, \XMM2
1208	paddd     ONE(%rip), \XMM0		# INCR CNT
1209	movdqa    \XMM0, \XMM3
1210	paddd     ONE(%rip), \XMM0		# INCR CNT
1211	movdqa    \XMM0, \XMM4
1212	pshufb %xmm15, \XMM1	# perform a 16 byte swap
1213	pclmulqdq $0x00, \TMP5, \XMM5           # XMM5 = a0*b0
1214	pshufb %xmm15, \XMM2	# perform a 16 byte swap
1215	pshufb %xmm15, \XMM3	# perform a 16 byte swap
1216	pshufb %xmm15, \XMM4	# perform a 16 byte swap
1217
1218	pxor	  (%arg1), \XMM1
1219	pxor	  (%arg1), \XMM2
1220	pxor	  (%arg1), \XMM3
1221	pxor	  (%arg1), \XMM4
1222	movdqu	  HashKey_4_k(%arg2), \TMP5
1223	pclmulqdq $0x00, \TMP5, \TMP6       # TMP6 = (a1+a0)*(b1+b0)
1224	movaps 0x10(%arg1), \TMP1
1225	aesenc	  \TMP1, \XMM1              # Round 1
1226	aesenc	  \TMP1, \XMM2
1227	aesenc	  \TMP1, \XMM3
1228	aesenc	  \TMP1, \XMM4
1229	movaps 0x20(%arg1), \TMP1
1230	aesenc	  \TMP1, \XMM1              # Round 2
1231	aesenc	  \TMP1, \XMM2
1232	aesenc	  \TMP1, \XMM3
1233	aesenc	  \TMP1, \XMM4
1234	movdqa	  \XMM6, \TMP1
1235	pshufd	  $78, \XMM6, \TMP2
1236	pxor	  \XMM6, \TMP2
1237	movdqu	  HashKey_3(%arg2), \TMP5
1238	pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1 * b1
1239	movaps 0x30(%arg1), \TMP3
1240	aesenc    \TMP3, \XMM1              # Round 3
1241	aesenc    \TMP3, \XMM2
1242	aesenc    \TMP3, \XMM3
1243	aesenc    \TMP3, \XMM4
1244	pclmulqdq $0x00, \TMP5, \XMM6       # XMM6 = a0*b0
1245	movaps 0x40(%arg1), \TMP3
1246	aesenc	  \TMP3, \XMM1              # Round 4
1247	aesenc	  \TMP3, \XMM2
1248	aesenc	  \TMP3, \XMM3
1249	aesenc	  \TMP3, \XMM4
1250	movdqu	  HashKey_3_k(%arg2), \TMP5
1251	pclmulqdq $0x00, \TMP5, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1252	movaps 0x50(%arg1), \TMP3
1253	aesenc	  \TMP3, \XMM1              # Round 5
1254	aesenc	  \TMP3, \XMM2
1255	aesenc	  \TMP3, \XMM3
1256	aesenc	  \TMP3, \XMM4
1257	pxor	  \TMP1, \TMP4
1258# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1259	pxor	  \XMM6, \XMM5
1260	pxor	  \TMP2, \TMP6
1261	movdqa	  \XMM7, \TMP1
1262	pshufd	  $78, \XMM7, \TMP2
1263	pxor	  \XMM7, \TMP2
1264	movdqu	  HashKey_2(%arg2), \TMP5
1265
1266        # Multiply XMM7 * HashKey_2 using Karatsuba
1267
1268	pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1*b1
1269	movaps 0x60(%arg1), \TMP3
1270	aesenc	  \TMP3, \XMM1              # Round 6
1271	aesenc	  \TMP3, \XMM2
1272	aesenc	  \TMP3, \XMM3
1273	aesenc	  \TMP3, \XMM4
1274	pclmulqdq $0x00, \TMP5, \XMM7       # XMM7 = a0*b0
1275	movaps 0x70(%arg1), \TMP3
1276	aesenc	  \TMP3, \XMM1              # Round 7
1277	aesenc	  \TMP3, \XMM2
1278	aesenc	  \TMP3, \XMM3
1279	aesenc	  \TMP3, \XMM4
1280	movdqu	  HashKey_2_k(%arg2), \TMP5
1281	pclmulqdq $0x00, \TMP5, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1282	movaps 0x80(%arg1), \TMP3
1283	aesenc	  \TMP3, \XMM1              # Round 8
1284	aesenc	  \TMP3, \XMM2
1285	aesenc	  \TMP3, \XMM3
1286	aesenc	  \TMP3, \XMM4
1287	pxor	  \TMP1, \TMP4
1288# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1289	pxor	  \XMM7, \XMM5
1290	pxor	  \TMP2, \TMP6
1291
1292        # Multiply XMM8 * HashKey
1293        # XMM8 and TMP5 hold the values for the two operands
1294
1295	movdqa	  \XMM8, \TMP1
1296	pshufd	  $78, \XMM8, \TMP2
1297	pxor	  \XMM8, \TMP2
1298	movdqu	  HashKey(%arg2), \TMP5
1299	pclmulqdq $0x11, \TMP5, \TMP1      # TMP1 = a1*b1
1300	movaps 0x90(%arg1), \TMP3
1301	aesenc	  \TMP3, \XMM1             # Round 9
1302	aesenc	  \TMP3, \XMM2
1303	aesenc	  \TMP3, \XMM3
1304	aesenc	  \TMP3, \XMM4
1305	pclmulqdq $0x00, \TMP5, \XMM8      # XMM8 = a0*b0
1306	lea	  0xa0(%arg1),%r10
1307	mov	  keysize,%eax
1308	shr	  $2,%eax		        # 128->4, 192->6, 256->8
1309	sub	  $4,%eax			# 128->0, 192->2, 256->4
1310	jz	  aes_loop_par_dec_done\@
1311
1312aes_loop_par_dec\@:
1313	MOVADQ	  (%r10),\TMP3
1314.irpc	index, 1234
1315	aesenc	  \TMP3, %xmm\index
1316.endr
1317	add	  $16,%r10
1318	sub	  $1,%eax
1319	jnz	  aes_loop_par_dec\@
1320
1321aes_loop_par_dec_done\@:
1322	MOVADQ	  (%r10), \TMP3
1323	aesenclast \TMP3, \XMM1           # last round
1324	aesenclast \TMP3, \XMM2
1325	aesenclast \TMP3, \XMM3
1326	aesenclast \TMP3, \XMM4
1327	movdqu    HashKey_k(%arg2), \TMP5
1328	pclmulqdq $0x00, \TMP5, \TMP2          # TMP2 = (a1+a0)*(b1+b0)
1329	movdqu	  (%arg4,%r11,1), \TMP3
1330	pxor	  \TMP3, \XMM1                 # Ciphertext/Plaintext XOR EK
1331	movdqu	  \XMM1, (%arg3,%r11,1)        # Write to plaintext buffer
1332	movdqa    \TMP3, \XMM1
1333	movdqu	  16(%arg4,%r11,1), \TMP3
1334	pxor	  \TMP3, \XMM2                 # Ciphertext/Plaintext XOR EK
1335	movdqu	  \XMM2, 16(%arg3,%r11,1)      # Write to plaintext buffer
1336	movdqa    \TMP3, \XMM2
1337	movdqu	  32(%arg4,%r11,1), \TMP3
1338	pxor	  \TMP3, \XMM3                 # Ciphertext/Plaintext XOR EK
1339	movdqu	  \XMM3, 32(%arg3,%r11,1)      # Write to plaintext buffer
1340	movdqa    \TMP3, \XMM3
1341	movdqu	  48(%arg4,%r11,1), \TMP3
1342	pxor	  \TMP3, \XMM4                 # Ciphertext/Plaintext XOR EK
1343	movdqu	  \XMM4, 48(%arg3,%r11,1)      # Write to plaintext buffer
1344	movdqa    \TMP3, \XMM4
1345	pshufb %xmm15, \XMM1        # perform a 16 byte swap
1346	pshufb %xmm15, \XMM2	# perform a 16 byte swap
1347	pshufb %xmm15, \XMM3	# perform a 16 byte swap
1348	pshufb %xmm15, \XMM4	# perform a 16 byte swap
1349
1350	pxor	  \TMP4, \TMP1
1351	pxor	  \XMM8, \XMM5
1352	pxor	  \TMP6, \TMP2
1353	pxor	  \TMP1, \TMP2
1354	pxor	  \XMM5, \TMP2
1355	movdqa	  \TMP2, \TMP3
1356	pslldq	  $8, \TMP3                    # left shift TMP3 2 DWs
1357	psrldq	  $8, \TMP2                    # right shift TMP2 2 DWs
1358	pxor	  \TMP3, \XMM5
1359	pxor	  \TMP2, \TMP1	  # accumulate the results in TMP1:XMM5
1360
1361        # first phase of reduction
1362
1363	movdqa    \XMM5, \TMP2
1364	movdqa    \XMM5, \TMP3
1365	movdqa    \XMM5, \TMP4
1366# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
1367	pslld     $31, \TMP2                   # packed left shift << 31
1368	pslld     $30, \TMP3                   # packed left shift << 30
1369	pslld     $25, \TMP4                   # packed left shift << 25
1370	pxor      \TMP3, \TMP2	               # xor the shifted versions
1371	pxor      \TMP4, \TMP2
1372	movdqa    \TMP2, \TMP5
1373	psrldq    $4, \TMP5                    # right shift TMP5 1 DW
1374	pslldq    $12, \TMP2                   # left shift TMP2 3 DWs
1375	pxor      \TMP2, \XMM5
1376
1377        # second phase of reduction
1378
1379	movdqa    \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
1380	movdqa    \XMM5,\TMP3
1381	movdqa    \XMM5,\TMP4
1382	psrld     $1, \TMP2                    # packed right shift >>1
1383	psrld     $2, \TMP3                    # packed right shift >>2
1384	psrld     $7, \TMP4                    # packed right shift >>7
1385	pxor      \TMP3,\TMP2		       # xor the shifted versions
1386	pxor      \TMP4,\TMP2
1387	pxor      \TMP5, \TMP2
1388	pxor      \TMP2, \XMM5
1389	pxor      \TMP1, \XMM5                 # result is in XMM5
1390
1391	pxor	  \XMM5, \XMM1
1392.endm
1393
1394/* GHASH the last 4 ciphertext blocks. */
1395.macro	GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
1396TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
1397
1398        # Multiply XMM1 * HashKey_4 (using Karatsuba)
1399
1400	movdqa	  \XMM1, \TMP6
1401	pshufd	  $78, \XMM1, \TMP2
1402	pxor	  \XMM1, \TMP2
1403	movdqu	  HashKey_4(%arg2), \TMP5
1404	pclmulqdq $0x11, \TMP5, \TMP6       # TMP6 = a1*b1
1405	pclmulqdq $0x00, \TMP5, \XMM1       # XMM1 = a0*b0
1406	movdqu	  HashKey_4_k(%arg2), \TMP4
1407	pclmulqdq $0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1408	movdqa	  \XMM1, \XMMDst
1409	movdqa	  \TMP2, \XMM1              # result in TMP6, XMMDst, XMM1
1410
1411        # Multiply XMM2 * HashKey_3 (using Karatsuba)
1412
1413	movdqa	  \XMM2, \TMP1
1414	pshufd	  $78, \XMM2, \TMP2
1415	pxor	  \XMM2, \TMP2
1416	movdqu	  HashKey_3(%arg2), \TMP5
1417	pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1*b1
1418	pclmulqdq $0x00, \TMP5, \XMM2       # XMM2 = a0*b0
1419	movdqu	  HashKey_3_k(%arg2), \TMP4
1420	pclmulqdq $0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1421	pxor	  \TMP1, \TMP6
1422	pxor	  \XMM2, \XMMDst
1423	pxor	  \TMP2, \XMM1
1424# results accumulated in TMP6, XMMDst, XMM1
1425
1426        # Multiply XMM3 * HashKey_2 (using Karatsuba)
1427
1428	movdqa	  \XMM3, \TMP1
1429	pshufd	  $78, \XMM3, \TMP2
1430	pxor	  \XMM3, \TMP2
1431	movdqu	  HashKey_2(%arg2), \TMP5
1432	pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1*b1
1433	pclmulqdq $0x00, \TMP5, \XMM3       # XMM3 = a0*b0
1434	movdqu	  HashKey_2_k(%arg2), \TMP4
1435	pclmulqdq $0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1436	pxor	  \TMP1, \TMP6
1437	pxor	  \XMM3, \XMMDst
1438	pxor	  \TMP2, \XMM1   # results accumulated in TMP6, XMMDst, XMM1
1439
1440        # Multiply XMM4 * HashKey (using Karatsuba)
1441	movdqa	  \XMM4, \TMP1
1442	pshufd	  $78, \XMM4, \TMP2
1443	pxor	  \XMM4, \TMP2
1444	movdqu	  HashKey(%arg2), \TMP5
1445	pclmulqdq $0x11, \TMP5, \TMP1	    # TMP1 = a1*b1
1446	pclmulqdq $0x00, \TMP5, \XMM4       # XMM4 = a0*b0
1447	movdqu	  HashKey_k(%arg2), \TMP4
1448	pclmulqdq $0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1449	pxor	  \TMP1, \TMP6
1450	pxor	  \XMM4, \XMMDst
1451	pxor	  \XMM1, \TMP2
1452	pxor	  \TMP6, \TMP2
1453	pxor	  \XMMDst, \TMP2
1454	# middle section of the temp results combined as in karatsuba algorithm
1455	movdqa	  \TMP2, \TMP4
1456	pslldq	  $8, \TMP4                 # left shift TMP4 2 DWs
1457	psrldq	  $8, \TMP2                 # right shift TMP2 2 DWs
1458	pxor	  \TMP4, \XMMDst
1459	pxor	  \TMP2, \TMP6
1460# TMP6:XMMDst holds the result of the accumulated carry-less multiplications
1461	# first phase of the reduction
1462	movdqa    \XMMDst, \TMP2
1463	movdqa    \XMMDst, \TMP3
1464	movdqa    \XMMDst, \TMP4
1465# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
1466	pslld     $31, \TMP2                # packed left shifting << 31
1467	pslld     $30, \TMP3                # packed left shifting << 30
1468	pslld     $25, \TMP4                # packed left shifting << 25
1469	pxor      \TMP3, \TMP2              # xor the shifted versions
1470	pxor      \TMP4, \TMP2
1471	movdqa    \TMP2, \TMP7
1472	psrldq    $4, \TMP7                 # right shift TMP7 1 DW
1473	pslldq    $12, \TMP2                # left shift TMP2 3 DWs
1474	pxor      \TMP2, \XMMDst
1475
1476        # second phase of the reduction
1477	movdqa    \XMMDst, \TMP2
1478	# make 3 copies of XMMDst for doing 3 shift operations
1479	movdqa    \XMMDst, \TMP3
1480	movdqa    \XMMDst, \TMP4
1481	psrld     $1, \TMP2                 # packed right shift >> 1
1482	psrld     $2, \TMP3                 # packed right shift >> 2
1483	psrld     $7, \TMP4                 # packed right shift >> 7
1484	pxor      \TMP3, \TMP2              # xor the shifted versions
1485	pxor      \TMP4, \TMP2
1486	pxor      \TMP7, \TMP2
1487	pxor      \TMP2, \XMMDst
1488	pxor      \TMP6, \XMMDst            # reduced result is in XMMDst
1489.endm
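/*
 * GHASH_LAST_4 uses the aggregated form of the GHASH recurrence:
 * instead of four dependent multiplies
 *
 *	((((X^C1)*H ^ C2)*H ^ C3)*H ^ C4)*H
 *
 * it evaluates the equivalent
 *
 *	(X^C1)*H^4 ^ C2*H^3 ^ C3*H^2 ^ C4*H
 *
 * so the four products are independent and only summed at the end
 * (the prior hash X was already folded into XMM1 by the callers).
 */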
1490
1491
1492/* Encryption of a single block
1493* uses eax & r10
1494*/
1495
1496.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1
1497
1498	pxor		(%arg1), \XMM0
1499	mov		keysize,%eax
1500	shr		$2,%eax			# 128->4, 192->6, 256->8
1501	add		$5,%eax			# 128->9, 192->11, 256->13
1502	lea		16(%arg1), %r10	  # get first expanded key address
1503
1504_esb_loop_\@:
1505	MOVADQ		(%r10),\TMP1
1506	aesenc		\TMP1,\XMM0
1507	add		$16,%r10
1508	sub		$1,%eax
1509	jnz		_esb_loop_\@
1510
1511	MOVADQ		(%r10),\TMP1
1512	aesenclast	\TMP1,\XMM0
1513.endm
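/*
 * Round-count arithmetic used above and throughout: keysize holds the
 * key length in bytes, so 16/24/32 >> 2 = 4/6/8, plus 5 gives 9/11/13
 * aesenc rounds; together with the closing aesenclast that is the
 * spec's 10/12/14 rounds for AES-128/192/256.
 */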
1514/*****************************************************************************
1515* void aesni_gcm_dec(void *aes_ctx,    // AES Key schedule. Starts on a 16 byte boundary.
1516*                   struct gcm_context_data *data
1517*                                      // Context data
1518*                   u8 *out,           // Plaintext output. Encrypt in-place is allowed.
1519*                   const u8 *in,      // Ciphertext input
1520*                   u64 plaintext_len, // Length of data in bytes for decryption.
1521*                   u8 *iv,            // Pre-counter block j0: 4 byte salt (from Security Association)
1522*                                      // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1523*                                      // concatenated with 0x00000001. 16-byte aligned pointer.
1524*                   u8 *hash_subkey,   // H, the Hash sub key input. Data starts on a 16-byte boundary.
1525*                   const u8 *aad,     // Additional Authentication Data (AAD)
1526*                   u64 aad_len,       // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1527*                   u8  *auth_tag,     // Authenticated Tag output. The driver will compare this to the
1528*                                      // given authentication tag and only return the plaintext if they match.
1529*                   u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
1530*                                      // (most likely), 12 or 8.
1531*
1532* Assumptions:
1533*
1534* keys:
1535*       keys are pre-expanded and aligned to 16 bytes. we are using the first
1536*       set of 11 keys in the data structure void *aes_ctx
1537*
1538* iv:
1539*       0                   1                   2                   3
1540*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1541*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1542*       |                             Salt  (From the SA)               |
1543*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1544*       |                     Initialization Vector                     |
1545*       |         (This is the sequence number from IPSec header)       |
1546*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1547*       |                              0x1                              |
1548*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1549*
1550*
1551*
1552* AAD:
1553*       AAD padded to 128 bits with 0
1554*       for example, assume AAD is a u32 vector
1555*
1556*       if AAD is 8 bytes:
1557*       AAD[3] = {A0, A1};
1558*       padded AAD in xmm register = {A1 A0 0 0}
1559*
1560*       0                   1                   2                   3
1561*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1562*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1563*       |                               SPI (A1)                        |
1564*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1565*       |                     32-bit Sequence Number (A0)               |
1566*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1567*       |                              0x0                              |
1568*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1569*
1570*                                       AAD Format with 32-bit Sequence Number
1571*
1572*       if AAD is 12 bytes:
1573*       AAD[3] = {A0, A1, A2};
1574*       padded AAD in xmm register = {A2 A1 A0 0}
1575*
1576*       0                   1                   2                   3
1577*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1578*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1581*       |                               SPI (A2)                        |
1582*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1583*       |                 64-bit Extended Sequence Number {A1,A0}       |
1584*       |                                                               |
1585*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1586*       |                              0x0                              |
1587*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1588*
1589*                        AAD Format with 64-bit Extended Sequence Number
1590*
1591* poly = x^128 + x^127 + x^126 + x^121 + 1
1592*
1593*****************************************************************************/
1594SYM_FUNC_START(aesni_gcm_dec)
1595	FUNC_SAVE
1596
1597	GCM_INIT %arg6, arg7, arg8, arg9
1598	GCM_ENC_DEC dec
1599	GCM_COMPLETE arg10, arg11
1600	FUNC_RESTORE
1601	ret
1602SYM_FUNC_END(aesni_gcm_dec)
1603
1604
1605/*****************************************************************************
1606* void aesni_gcm_enc(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1607*                    struct gcm_context_data *data
1608*                                        // Context data
1609*                    u8 *out,            // Ciphertext output. Encrypt in-place is allowed.
1610*                    const u8 *in,       // Plaintext input
1611*                    u64 plaintext_len,  // Length of data in bytes for encryption.
1612*                    u8 *iv,             // Pre-counter block j0: 4 byte salt (from Security Association)
1613*                                        // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1614*                                        // concatenated with 0x00000001. 16-byte aligned pointer.
1615*                    u8 *hash_subkey,    // H, the Hash sub key input. Data starts on a 16-byte boundary.
1616*                    const u8 *aad,      // Additional Authentication Data (AAD)
1617*                    u64 aad_len,        // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1618*                    u8 *auth_tag,       // Authenticated Tag output.
1619*                    u64 auth_tag_len);  // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
1620*                                        // 12 or 8.
1621*
1622* Assumptions:
1623*
1624* keys:
1625*       keys are pre-expanded and aligned to 16 bytes. we are using the
1626*       first set of 11 keys in the data structure void *aes_ctx
1627*
1628*
1629* iv:
1630*       0                   1                   2                   3
1631*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1632*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1633*       |                             Salt  (From the SA)               |
1634*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1635*       |                     Initialization Vector                     |
1636*       |         (This is the sequence number from IPSec header)       |
1637*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1638*       |                              0x1                              |
1639*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1640*
1641*
1642*
1643* AAD:
1644*       AAD padded to 128 bits with 0
1645*       for example, assume AAD is a u32 vector
1646*
1647*       if AAD is 8 bytes:
1648*       AAD[3] = {A0, A1};
1649*       padded AAD in xmm register = {A1 A0 0 0}
1650*
1651*       0                   1                   2                   3
1652*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1653*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1654*       |                               SPI (A1)                        |
1655*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1656*       |                     32-bit Sequence Number (A0)               |
1657*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1658*       |                              0x0                              |
1659*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1660*
1661*                                 AAD Format with 32-bit Sequence Number
1662*
1663*       if AAD is 12 bytes:
1664*       AAD[3] = {A0, A1, A2};
1665*       padded AAD in xmm register = {A2 A1 A0 0}
1666*
1667*       0                   1                   2                   3
1668*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1669*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1670*       |                               SPI (A2)                        |
1671*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1672*       |                 64-bit Extended Sequence Number {A1,A0}       |
1673*       |                                                               |
1674*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1675*       |                              0x0                              |
1676*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1677*
1678*                         AAD Format with 64-bit Extended Sequence Number
1679*
1680* poly = x^128 + x^127 + x^126 + x^121 + 1
1681***************************************************************************/
1682SYM_FUNC_START(aesni_gcm_enc)
1683	FUNC_SAVE
1684
1685	GCM_INIT %arg6, arg7, arg8, arg9
1686	GCM_ENC_DEC enc
1687
1688	GCM_COMPLETE arg10, arg11
1689	FUNC_RESTORE
1690	ret
1691SYM_FUNC_END(aesni_gcm_enc)
1692
1693/*****************************************************************************
1694* void aesni_gcm_init(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1695*                     struct gcm_context_data *data,
1696*                                         // context data
1697*                     u8 *iv,             // Pre-counter block j0: 4 byte salt (from Security Association)
1698*                                         // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1699*                                         // concatenated with 0x00000001. 16-byte aligned pointer.
1700*                     u8 *hash_subkey,    // H, the Hash sub key input. Data starts on a 16-byte boundary.
1701*                     const u8 *aad,      // Additional Authentication Data (AAD)
1702*                     u64 aad_len)        // Length of AAD in bytes.
1703*/
1704SYM_FUNC_START(aesni_gcm_init)
1705	FUNC_SAVE
1706	GCM_INIT %arg3, %arg4, %arg5, %arg6
1707	FUNC_RESTORE
1708	ret
1709SYM_FUNC_END(aesni_gcm_init)
1710
1711/*****************************************************************************
1712* void aesni_gcm_enc_update(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1713*                    struct gcm_context_data *data,
1714*                                        // context data
1715*                    u8 *out,            // Ciphertext output. Encrypt in-place is allowed.
1716*                    const u8 *in,       // Plaintext input
1717*                    u64 plaintext_len,  // Length of data in bytes for encryption.
1718*/
1719SYM_FUNC_START(aesni_gcm_enc_update)
1720	FUNC_SAVE
1721	GCM_ENC_DEC enc
1722	FUNC_RESTORE
1723	ret
1724SYM_FUNC_END(aesni_gcm_enc_update)
1725
1726/*****************************************************************************
1727* void aesni_gcm_dec_update(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1728*                    struct gcm_context_data *data,
1729*                                        // context data
1730*                    u8 *out,            // Ciphertext output. Encrypt in-place is allowed.
1731*                    const u8 *in,       // Plaintext input
1732*                    u64 plaintext_len,  // Length of data in bytes for encryption.
1733*/
1734SYM_FUNC_START(aesni_gcm_dec_update)
1735	FUNC_SAVE
1736	GCM_ENC_DEC dec
1737	FUNC_RESTORE
1738	ret
1739SYM_FUNC_END(aesni_gcm_dec_update)
1740
1741/*****************************************************************************
1742* void aesni_gcm_finalize(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1743*                    struct gcm_context_data *data,
1744*                                        // context data
1745*                    u8 *auth_tag,       // Authenticated Tag output.
1746*                    u64 auth_tag_len);  // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
1747*                                        // 12 or 8.
1748*/
1749SYM_FUNC_START(aesni_gcm_finalize)
1750	FUNC_SAVE
1751	GCM_COMPLETE %arg3, %arg4
1752	FUNC_RESTORE
1753	ret
1754SYM_FUNC_END(aesni_gcm_finalize)
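/*
 * A hypothetical caller of the streaming entry points (the real driver
 * is the scatter-gather glue in aesni-intel_glue.c) would look roughly
 * like:
 *
 *	struct gcm_context_data data;
 *
 *	aesni_gcm_init(aes_ctx, &data, iv, hash_subkey, aad, aad_len);
 *	while (more_input)	// hypothetical loop condition
 *		aesni_gcm_enc_update(aes_ctx, &data, out, in, part_len);
 *	aesni_gcm_finalize(aes_ctx, &data, auth_tag, auth_tag_len);
 *
 * The one-shot aesni_gcm_enc()/aesni_gcm_dec() above wrap the same
 * three phases.
 */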
1755
1756#endif
1757
1758
1759SYM_FUNC_START_LOCAL_ALIAS(_key_expansion_128)
1760SYM_FUNC_START_LOCAL(_key_expansion_256a)
1761	pshufd $0b11111111, %xmm1, %xmm1
1762	shufps $0b00010000, %xmm0, %xmm4
1763	pxor %xmm4, %xmm0
1764	shufps $0b10001100, %xmm0, %xmm4
1765	pxor %xmm4, %xmm0
1766	pxor %xmm1, %xmm0
1767	movaps %xmm0, (TKEYP)
1768	add $0x10, TKEYP
1769	ret
1770SYM_FUNC_END(_key_expansion_256a)
1771SYM_FUNC_END_ALIAS(_key_expansion_128)
1772
1773SYM_FUNC_START_LOCAL(_key_expansion_192a)
1774	pshufd $0b01010101, %xmm1, %xmm1
1775	shufps $0b00010000, %xmm0, %xmm4
1776	pxor %xmm4, %xmm0
1777	shufps $0b10001100, %xmm0, %xmm4
1778	pxor %xmm4, %xmm0
1779	pxor %xmm1, %xmm0
1780
1781	movaps %xmm2, %xmm5
1782	movaps %xmm2, %xmm6
1783	pslldq $4, %xmm5
1784	pshufd $0b11111111, %xmm0, %xmm3
1785	pxor %xmm3, %xmm2
1786	pxor %xmm5, %xmm2
1787
1788	movaps %xmm0, %xmm1
1789	shufps $0b01000100, %xmm0, %xmm6
1790	movaps %xmm6, (TKEYP)
1791	shufps $0b01001110, %xmm2, %xmm1
1792	movaps %xmm1, 0x10(TKEYP)
1793	add $0x20, TKEYP
1794	ret
1795SYM_FUNC_END(_key_expansion_192a)
1796
1797SYM_FUNC_START_LOCAL(_key_expansion_192b)
1798	pshufd $0b01010101, %xmm1, %xmm1
1799	shufps $0b00010000, %xmm0, %xmm4
1800	pxor %xmm4, %xmm0
1801	shufps $0b10001100, %xmm0, %xmm4
1802	pxor %xmm4, %xmm0
1803	pxor %xmm1, %xmm0
1804
1805	movaps %xmm2, %xmm5
1806	pslldq $4, %xmm5
1807	pshufd $0b11111111, %xmm0, %xmm3
1808	pxor %xmm3, %xmm2
1809	pxor %xmm5, %xmm2
1810
1811	movaps %xmm0, (TKEYP)
1812	add $0x10, TKEYP
1813	ret
1814SYM_FUNC_END(_key_expansion_192b)
1815
1816SYM_FUNC_START_LOCAL(_key_expansion_256b)
1817	pshufd $0b10101010, %xmm1, %xmm1
1818	shufps $0b00010000, %xmm2, %xmm4
1819	pxor %xmm4, %xmm2
1820	shufps $0b10001100, %xmm2, %xmm4
1821	pxor %xmm4, %xmm2
1822	pxor %xmm1, %xmm2
1823	movaps %xmm2, (TKEYP)
1824	add $0x10, TKEYP
1825	ret
1826SYM_FUNC_END(_key_expansion_256b)
1827
1828/*
1829 * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
1830 *                   unsigned int key_len)
1831 */
1832SYM_FUNC_START(aesni_set_key)
1833	FRAME_BEGIN
1834#ifndef __x86_64__
1835	pushl KEYP
1836	movl (FRAME_OFFSET+8)(%esp), KEYP	# ctx
1837	movl (FRAME_OFFSET+12)(%esp), UKEYP	# in_key
1838	movl (FRAME_OFFSET+16)(%esp), %edx	# key_len
1839#endif
1840	movups (UKEYP), %xmm0		# user key (first 16 bytes)
1841	movaps %xmm0, (KEYP)
1842	lea 0x10(KEYP), TKEYP		# key addr
1843	movl %edx, 480(KEYP)
1844	pxor %xmm4, %xmm4		# xmm4 is assumed 0 in _key_expansion_x
1845	cmp $24, %dl
1846	jb .Lenc_key128
1847	je .Lenc_key192
1848	movups 0x10(UKEYP), %xmm2	# other user key
1849	movaps %xmm2, (TKEYP)
1850	add $0x10, TKEYP
1851	aeskeygenassist $0x1, %xmm2, %xmm1	# round 1
1852	call _key_expansion_256a
1853	aeskeygenassist $0x1, %xmm0, %xmm1
1854	call _key_expansion_256b
1855	aeskeygenassist $0x2, %xmm2, %xmm1	# round 2
1856	call _key_expansion_256a
1857	aeskeygenassist $0x2, %xmm0, %xmm1
1858	call _key_expansion_256b
1859	aeskeygenassist $0x4, %xmm2, %xmm1	# round 3
1860	call _key_expansion_256a
1861	aeskeygenassist $0x4, %xmm0, %xmm1
1862	call _key_expansion_256b
1863	aeskeygenassist $0x8, %xmm2, %xmm1	# round 4
1864	call _key_expansion_256a
1865	aeskeygenassist $0x8, %xmm0, %xmm1
1866	call _key_expansion_256b
1867	aeskeygenassist $0x10, %xmm2, %xmm1	# round 5
1868	call _key_expansion_256a
1869	aeskeygenassist $0x10, %xmm0, %xmm1
1870	call _key_expansion_256b
1871	aeskeygenassist $0x20, %xmm2, %xmm1	# round 6
1872	call _key_expansion_256a
1873	aeskeygenassist $0x20, %xmm0, %xmm1
1874	call _key_expansion_256b
1875	aeskeygenassist $0x40, %xmm2, %xmm1	# round 7
1876	call _key_expansion_256a
1877	jmp .Ldec_key
1878.Lenc_key192:
1879	movq 0x10(UKEYP), %xmm2		# other user key
1880	aeskeygenassist $0x1, %xmm2, %xmm1	# round 1
1881	call _key_expansion_192a
1882	aeskeygenassist $0x2, %xmm2, %xmm1	# round 2
1883	call _key_expansion_192b
1884	aeskeygenassist $0x4, %xmm2, %xmm1	# round 3
1885	call _key_expansion_192a
1886	aeskeygenassist $0x8, %xmm2, %xmm1	# round 4
1887	call _key_expansion_192b
1888	aeskeygenassist $0x10, %xmm2, %xmm1	# round 5
1889	call _key_expansion_192a
1890	aeskeygenassist $0x20, %xmm2, %xmm1	# round 6
1891	call _key_expansion_192b
1892	aeskeygenassist $0x40, %xmm2, %xmm1	# round 7
1893	call _key_expansion_192a
1894	aeskeygenassist $0x80, %xmm2, %xmm1	# round 8
1895	call _key_expansion_192b
1896	jmp .Ldec_key
1897.Lenc_key128:
1898	aeskeygenassist $0x1, %xmm0, %xmm1	# round 1
1899	call _key_expansion_128
1900	aeskeygenassist $0x2, %xmm0, %xmm1	# round 2
1901	call _key_expansion_128
1902	aeskeygenassist $0x4, %xmm0, %xmm1	# round 3
1903	call _key_expansion_128
1904	aeskeygenassist $0x8, %xmm0, %xmm1	# round 4
1905	call _key_expansion_128
1906	aeskeygenassist $0x10, %xmm0, %xmm1	# round 5
1907	call _key_expansion_128
1908	aeskeygenassist $0x20, %xmm0, %xmm1	# round 6
1909	call _key_expansion_128
1910	aeskeygenassist $0x40, %xmm0, %xmm1	# round 7
1911	call _key_expansion_128
1912	aeskeygenassist $0x80, %xmm0, %xmm1	# round 8
1913	call _key_expansion_128
1914	aeskeygenassist $0x1b, %xmm0, %xmm1	# round 9
1915	call _key_expansion_128
1916	aeskeygenassist $0x36, %xmm0, %xmm1	# round 10
1917	call _key_expansion_128
1918.Ldec_key:
1919	sub $0x10, TKEYP
1920	movaps (KEYP), %xmm0
1921	movaps (TKEYP), %xmm1
1922	movaps %xmm0, 240(TKEYP)
1923	movaps %xmm1, 240(KEYP)
1924	add $0x10, KEYP
1925	lea 240-16(TKEYP), UKEYP
1926.align 4
1927.Ldec_key_loop:
1928	movaps (KEYP), %xmm0
1929	aesimc %xmm0, %xmm1
1930	movaps %xmm1, (UKEYP)
1931	add $0x10, KEYP
1932	sub $0x10, UKEYP
1933	cmp TKEYP, KEYP
1934	jb .Ldec_key_loop
1935	xor AREG, AREG
1936#ifndef __x86_64__
1937	popl KEYP
1938#endif
1939	FRAME_END
1940	ret
1941SYM_FUNC_END(aesni_set_key)
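/*
 * The hard-coded offsets in aesni_set_key (0, 240 and 480) follow the
 * layout of struct crypto_aes_ctx from <crypto/aes.h>:
 *
 *	struct crypto_aes_ctx {
 *		u32 key_enc[60];	// offset 0: encryption round keys
 *		u32 key_dec[60];	// offset 240: aesimc'd decryption keys
 *		u32 key_length;		// offset 480: 16, 24 or 32
 *	};
 *
 * which is also why the decryption paths below start with
 * "add $240, KEYP".
 */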
1942
1943/*
1944 * void aesni_enc(const void *ctx, u8 *dst, const u8 *src)
1945 */
1946SYM_FUNC_START(aesni_enc)
1947	FRAME_BEGIN
1948#ifndef __x86_64__
1949	pushl KEYP
1950	pushl KLEN
1951	movl (FRAME_OFFSET+12)(%esp), KEYP	# ctx
1952	movl (FRAME_OFFSET+16)(%esp), OUTP	# dst
1953	movl (FRAME_OFFSET+20)(%esp), INP	# src
1954#endif
1955	movl 480(KEYP), KLEN		# key length
1956	movups (INP), STATE		# input
1957	call _aesni_enc1
1958	movups STATE, (OUTP)		# output
1959#ifndef __x86_64__
1960	popl KLEN
1961	popl KEYP
1962#endif
1963	FRAME_END
1964	ret
1965SYM_FUNC_END(aesni_enc)
1966
1967/*
1968 * _aesni_enc1:		internal ABI
1969 * input:
1970 *	KEYP:		key struct pointer
1971 *	KLEN:		key length
1972 *	STATE:		initial state (input)
1973 * output:
1974 *	STATE:		final state (output)
1975 * changed:
1976 *	KEY
1977 *	TKEYP (T1)
1978 */
1979SYM_FUNC_START_LOCAL(_aesni_enc1)
1980	movaps (KEYP), KEY		# key
1981	mov KEYP, TKEYP
1982	pxor KEY, STATE		# round 0
1983	add $0x30, TKEYP
1984	cmp $24, KLEN
1985	jb .Lenc128
1986	lea 0x20(TKEYP), TKEYP
1987	je .Lenc192
1988	add $0x20, TKEYP
1989	movaps -0x60(TKEYP), KEY
1990	aesenc KEY, STATE
1991	movaps -0x50(TKEYP), KEY
1992	aesenc KEY, STATE
1993.align 4
1994.Lenc192:
1995	movaps -0x40(TKEYP), KEY
1996	aesenc KEY, STATE
1997	movaps -0x30(TKEYP), KEY
1998	aesenc KEY, STATE
1999.align 4
2000.Lenc128:
2001	movaps -0x20(TKEYP), KEY
2002	aesenc KEY, STATE
2003	movaps -0x10(TKEYP), KEY
2004	aesenc KEY, STATE
2005	movaps (TKEYP), KEY
2006	aesenc KEY, STATE
2007	movaps 0x10(TKEYP), KEY
2008	aesenc KEY, STATE
2009	movaps 0x20(TKEYP), KEY
2010	aesenc KEY, STATE
2011	movaps 0x30(TKEYP), KEY
2012	aesenc KEY, STATE
2013	movaps 0x40(TKEYP), KEY
2014	aesenc KEY, STATE
2015	movaps 0x50(TKEYP), KEY
2016	aesenc KEY, STATE
2017	movaps 0x60(TKEYP), KEY
2018	aesenc KEY, STATE
2019	movaps 0x70(TKEYP), KEY
2020	aesenclast KEY, STATE
2021	ret
2022SYM_FUNC_END(_aesni_enc1)
2023
2024/*
2025 * _aesni_enc4:	internal ABI
2026 * input:
2027 *	KEYP:		key struct pointer
2028 *	KLEN:		round count
2029 *	STATE1:		initial state (input)
2030 *	STATE2
2031 *	STATE3
2032 *	STATE4
2033 * output:
2034 *	STATE1:		final state (output)
2035 *	STATE2
2036 *	STATE3
2037 *	STATE4
2038 * changed:
2039 *	KEY
2040 *	TKEYP (T1)
2041 */
2042SYM_FUNC_START_LOCAL(_aesni_enc4)
2043	movaps (KEYP), KEY		# key
2044	mov KEYP, TKEYP
2045	pxor KEY, STATE1		# round 0
2046	pxor KEY, STATE2
2047	pxor KEY, STATE3
2048	pxor KEY, STATE4
2049	add $0x30, TKEYP
2050	cmp $24, KLEN
2051	jb .L4enc128
2052	lea 0x20(TKEYP), TKEYP
2053	je .L4enc192
2054	add $0x20, TKEYP
2055	movaps -0x60(TKEYP), KEY
2056	aesenc KEY, STATE1
2057	aesenc KEY, STATE2
2058	aesenc KEY, STATE3
2059	aesenc KEY, STATE4
2060	movaps -0x50(TKEYP), KEY
2061	aesenc KEY, STATE1
2062	aesenc KEY, STATE2
2063	aesenc KEY, STATE3
2064	aesenc KEY, STATE4
2065#.align 4
2066.L4enc192:
2067	movaps -0x40(TKEYP), KEY
2068	aesenc KEY, STATE1
2069	aesenc KEY, STATE2
2070	aesenc KEY, STATE3
2071	aesenc KEY, STATE4
2072	movaps -0x30(TKEYP), KEY
2073	aesenc KEY, STATE1
2074	aesenc KEY, STATE2
2075	aesenc KEY, STATE3
2076	aesenc KEY, STATE4
2077#.align 4
2078.L4enc128:
2079	movaps -0x20(TKEYP), KEY
2080	aesenc KEY, STATE1
2081	aesenc KEY, STATE2
2082	aesenc KEY, STATE3
2083	aesenc KEY, STATE4
2084	movaps -0x10(TKEYP), KEY
2085	aesenc KEY, STATE1
2086	aesenc KEY, STATE2
2087	aesenc KEY, STATE3
2088	aesenc KEY, STATE4
2089	movaps (TKEYP), KEY
2090	aesenc KEY, STATE1
2091	aesenc KEY, STATE2
2092	aesenc KEY, STATE3
2093	aesenc KEY, STATE4
2094	movaps 0x10(TKEYP), KEY
2095	aesenc KEY, STATE1
2096	aesenc KEY, STATE2
2097	aesenc KEY, STATE3
2098	aesenc KEY, STATE4
2099	movaps 0x20(TKEYP), KEY
2100	aesenc KEY, STATE1
2101	aesenc KEY, STATE2
2102	aesenc KEY, STATE3
2103	aesenc KEY, STATE4
2104	movaps 0x30(TKEYP), KEY
2105	aesenc KEY, STATE1
2106	aesenc KEY, STATE2
2107	aesenc KEY, STATE3
2108	aesenc KEY, STATE4
2109	movaps 0x40(TKEYP), KEY
2110	aesenc KEY, STATE1
2111	aesenc KEY, STATE2
2112	aesenc KEY, STATE3
2113	aesenc KEY, STATE4
2114	movaps 0x50(TKEYP), KEY
2115	aesenc KEY, STATE1
2116	aesenc KEY, STATE2
2117	aesenc KEY, STATE3
2118	aesenc KEY, STATE4
2119	movaps 0x60(TKEYP), KEY
2120	aesenc KEY, STATE1
2121	aesenc KEY, STATE2
2122	aesenc KEY, STATE3
2123	aesenc KEY, STATE4
2124	movaps 0x70(TKEYP), KEY
2125	aesenclast KEY, STATE1		# last round
2126	aesenclast KEY, STATE2
2127	aesenclast KEY, STATE3
2128	aesenclast KEY, STATE4
2129	ret
2130SYM_FUNC_END(_aesni_enc4)
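/*
 * _aesni_enc4 (and _aesni_dec4 below) interleave four independent
 * states because aesenc/aesdec have multi-cycle latency but are
 * pipelined on typical AES-NI implementations; keeping four blocks in
 * flight hides that latency, which is why the ECB loops below prefer
 * the 4-block path while at least 64 bytes remain.
 */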
2131
2132/*
2133 * void aesni_dec (const void *ctx, u8 *dst, const u8 *src)
2134 */
2135SYM_FUNC_START(aesni_dec)
2136	FRAME_BEGIN
2137#ifndef __x86_64__
2138	pushl KEYP
2139	pushl KLEN
2140	movl (FRAME_OFFSET+12)(%esp), KEYP	# ctx
2141	movl (FRAME_OFFSET+16)(%esp), OUTP	# dst
2142	movl (FRAME_OFFSET+20)(%esp), INP	# src
2143#endif
2144	mov 480(KEYP), KLEN		# key length
2145	add $240, KEYP
2146	movups (INP), STATE		# input
2147	call _aesni_dec1
2148	movups STATE, (OUTP)		# output
2149#ifndef __x86_64__
2150	popl KLEN
2151	popl KEYP
2152#endif
2153	FRAME_END
2154	ret
2155SYM_FUNC_END(aesni_dec)
2156
2157/*
2158 * _aesni_dec1:		internal ABI
2159 * input:
2160 *	KEYP:		key struct pointer
2161 *	KLEN:		key length
2162 *	STATE:		initial state (input)
2163 * output:
2164 *	STATE:		final state (output)
2165 * changed:
2166 *	KEY
2167 *	TKEYP (T1)
2168 */
2169SYM_FUNC_START_LOCAL(_aesni_dec1)
2170	movaps (KEYP), KEY		# key
2171	mov KEYP, TKEYP
2172	pxor KEY, STATE		# round 0
2173	add $0x30, TKEYP
2174	cmp $24, KLEN
2175	jb .Ldec128
2176	lea 0x20(TKEYP), TKEYP
2177	je .Ldec192
2178	add $0x20, TKEYP
2179	movaps -0x60(TKEYP), KEY
2180	aesdec KEY, STATE
2181	movaps -0x50(TKEYP), KEY
2182	aesdec KEY, STATE
2183.align 4
2184.Ldec192:
2185	movaps -0x40(TKEYP), KEY
2186	aesdec KEY, STATE
2187	movaps -0x30(TKEYP), KEY
2188	aesdec KEY, STATE
2189.align 4
2190.Ldec128:
2191	movaps -0x20(TKEYP), KEY
2192	aesdec KEY, STATE
2193	movaps -0x10(TKEYP), KEY
2194	aesdec KEY, STATE
2195	movaps (TKEYP), KEY
2196	aesdec KEY, STATE
2197	movaps 0x10(TKEYP), KEY
2198	aesdec KEY, STATE
2199	movaps 0x20(TKEYP), KEY
2200	aesdec KEY, STATE
2201	movaps 0x30(TKEYP), KEY
2202	aesdec KEY, STATE
2203	movaps 0x40(TKEYP), KEY
2204	aesdec KEY, STATE
2205	movaps 0x50(TKEYP), KEY
2206	aesdec KEY, STATE
2207	movaps 0x60(TKEYP), KEY
2208	aesdec KEY, STATE
2209	movaps 0x70(TKEYP), KEY
2210	aesdeclast KEY, STATE
2211	ret
2212SYM_FUNC_END(_aesni_dec1)
2213
2214/*
2215 * _aesni_dec4:	internal ABI
2216 * input:
2217 *	KEYP:		key struct pointer
2218 *	KLEN:		key length
2219 *	STATE1:		initial state (input)
2220 *	STATE2
2221 *	STATE3
2222 *	STATE4
2223 * output:
2224 *	STATE1:		final state (output)
2225 *	STATE2
2226 *	STATE3
2227 *	STATE4
2228 * changed:
2229 *	KEY
2230 *	TKEYP (T1)
2231 */
2232SYM_FUNC_START_LOCAL(_aesni_dec4)
2233	movaps (KEYP), KEY		# key
2234	mov KEYP, TKEYP
2235	pxor KEY, STATE1		# round 0
2236	pxor KEY, STATE2
2237	pxor KEY, STATE3
2238	pxor KEY, STATE4
2239	add $0x30, TKEYP
2240	cmp $24, KLEN
2241	jb .L4dec128
2242	lea 0x20(TKEYP), TKEYP
2243	je .L4dec192
2244	add $0x20, TKEYP
2245	movaps -0x60(TKEYP), KEY
2246	aesdec KEY, STATE1
2247	aesdec KEY, STATE2
2248	aesdec KEY, STATE3
2249	aesdec KEY, STATE4
2250	movaps -0x50(TKEYP), KEY
2251	aesdec KEY, STATE1
2252	aesdec KEY, STATE2
2253	aesdec KEY, STATE3
2254	aesdec KEY, STATE4
2255.align 4
2256.L4dec192:
2257	movaps -0x40(TKEYP), KEY
2258	aesdec KEY, STATE1
2259	aesdec KEY, STATE2
2260	aesdec KEY, STATE3
2261	aesdec KEY, STATE4
2262	movaps -0x30(TKEYP), KEY
2263	aesdec KEY, STATE1
2264	aesdec KEY, STATE2
2265	aesdec KEY, STATE3
2266	aesdec KEY, STATE4
2267.align 4
2268.L4dec128:
2269	movaps -0x20(TKEYP), KEY
2270	aesdec KEY, STATE1
2271	aesdec KEY, STATE2
2272	aesdec KEY, STATE3
2273	aesdec KEY, STATE4
2274	movaps -0x10(TKEYP), KEY
2275	aesdec KEY, STATE1
2276	aesdec KEY, STATE2
2277	aesdec KEY, STATE3
2278	aesdec KEY, STATE4
2279	movaps (TKEYP), KEY
2280	aesdec KEY, STATE1
2281	aesdec KEY, STATE2
2282	aesdec KEY, STATE3
2283	aesdec KEY, STATE4
2284	movaps 0x10(TKEYP), KEY
2285	aesdec KEY, STATE1
2286	aesdec KEY, STATE2
2287	aesdec KEY, STATE3
2288	aesdec KEY, STATE4
2289	movaps 0x20(TKEYP), KEY
2290	aesdec KEY, STATE1
2291	aesdec KEY, STATE2
2292	aesdec KEY, STATE3
2293	aesdec KEY, STATE4
2294	movaps 0x30(TKEYP), KEY
2295	aesdec KEY, STATE1
2296	aesdec KEY, STATE2
2297	aesdec KEY, STATE3
2298	aesdec KEY, STATE4
2299	movaps 0x40(TKEYP), KEY
2300	aesdec KEY, STATE1
2301	aesdec KEY, STATE2
2302	aesdec KEY, STATE3
2303	aesdec KEY, STATE4
2304	movaps 0x50(TKEYP), KEY
2305	aesdec KEY, STATE1
2306	aesdec KEY, STATE2
2307	aesdec KEY, STATE3
2308	aesdec KEY, STATE4
2309	movaps 0x60(TKEYP), KEY
2310	aesdec KEY, STATE1
2311	aesdec KEY, STATE2
2312	aesdec KEY, STATE3
2313	aesdec KEY, STATE4
2314	movaps 0x70(TKEYP), KEY
2315	aesdeclast KEY, STATE1		# last round
2316	aesdeclast KEY, STATE2
2317	aesdeclast KEY, STATE3
2318	aesdeclast KEY, STATE4
2319	ret
2320SYM_FUNC_END(_aesni_dec4)
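
/*
 * The four-wide variant interleaves four independent blocks so the
 * multi-cycle latency of aesdec is hidden by pipelining; illustrative
 * C for one middle round:
 *
 *	key = rk[i];
 *	s1 = aesdec(s1, key);
 *	s2 = aesdec(s2, key);
 *	s3 = aesdec(s3, key);
 *	s4 = aesdec(s4, key);
 *
 * Each aesdec depends only on its own state, so several of them can
 * be in flight at once for every round key load.
 */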
2321
2322/*
2323 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2324 *		      size_t len)
2325 */
2326SYM_FUNC_START(aesni_ecb_enc)
2327	FRAME_BEGIN
2328#ifndef __x86_64__
2329	pushl LEN
2330	pushl KEYP
2331	pushl KLEN
2332	movl (FRAME_OFFSET+16)(%esp), KEYP	# ctx
2333	movl (FRAME_OFFSET+20)(%esp), OUTP	# dst
2334	movl (FRAME_OFFSET+24)(%esp), INP	# src
2335	movl (FRAME_OFFSET+28)(%esp), LEN	# len
2336#endif
2337	test LEN, LEN		# check length
2338	jz .Lecb_enc_ret
2339	mov 480(KEYP), KLEN
2340	cmp $16, LEN
2341	jb .Lecb_enc_ret
2342	cmp $64, LEN
2343	jb .Lecb_enc_loop1
2344.align 4
2345.Lecb_enc_loop4:
2346	movups (INP), STATE1
2347	movups 0x10(INP), STATE2
2348	movups 0x20(INP), STATE3
2349	movups 0x30(INP), STATE4
2350	call _aesni_enc4
2351	movups STATE1, (OUTP)
2352	movups STATE2, 0x10(OUTP)
2353	movups STATE3, 0x20(OUTP)
2354	movups STATE4, 0x30(OUTP)
2355	sub $64, LEN
2356	add $64, INP
2357	add $64, OUTP
2358	cmp $64, LEN
2359	jge .Lecb_enc_loop4
2360	cmp $16, LEN
2361	jb .Lecb_enc_ret
2362.align 4
2363.Lecb_enc_loop1:
2364	movups (INP), STATE1
2365	call _aesni_enc1
2366	movups STATE1, (OUTP)
2367	sub $16, LEN
2368	add $16, INP
2369	add $16, OUTP
2370	cmp $16, LEN
2371	jge .Lecb_enc_loop1
2372.Lecb_enc_ret:
2373#ifndef __x86_64__
2374	popl KLEN
2375	popl KEYP
2376	popl LEN
2377#endif
2378	FRAME_END
2379	ret
2380SYM_FUNC_END(aesni_ecb_enc)
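
/*
 * Loop structure as illustrative C (enc1/enc4 stand in for the
 * _aesni_enc1/_aesni_enc4 helpers): four blocks at a time while at
 * least 64 bytes remain, then single blocks; any sub-block tail is
 * left untouched:
 *
 *	while (len >= 64) {
 *		enc4(out, in);
 *		in += 64; out += 64; len -= 64;
 *	}
 *	while (len >= 16) {
 *		enc1(out, in);
 *		in += 16; out += 16; len -= 16;
 *	}
 */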
2381
2382/*
2383 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2384 *		      size_t len);
2385 */
2386SYM_FUNC_START(aesni_ecb_dec)
2387	FRAME_BEGIN
2388#ifndef __x86_64__
2389	pushl LEN
2390	pushl KEYP
2391	pushl KLEN
2392	movl (FRAME_OFFSET+16)(%esp), KEYP	# ctx
2393	movl (FRAME_OFFSET+20)(%esp), OUTP	# dst
2394	movl (FRAME_OFFSET+24)(%esp), INP	# src
2395	movl (FRAME_OFFSET+28)(%esp), LEN	# len
2396#endif
2397	test LEN, LEN
2398	jz .Lecb_dec_ret
2399	mov 480(KEYP), KLEN
2400	add $240, KEYP
2401	cmp $16, LEN
2402	jb .Lecb_dec_ret
2403	cmp $64, LEN
2404	jb .Lecb_dec_loop1
2405.align 4
2406.Lecb_dec_loop4:
2407	movups (INP), STATE1
2408	movups 0x10(INP), STATE2
2409	movups 0x20(INP), STATE3
2410	movups 0x30(INP), STATE4
2411	call _aesni_dec4
2412	movups STATE1, (OUTP)
2413	movups STATE2, 0x10(OUTP)
2414	movups STATE3, 0x20(OUTP)
2415	movups STATE4, 0x30(OUTP)
2416	sub $64, LEN
2417	add $64, INP
2418	add $64, OUTP
2419	cmp $64, LEN
2420	jge .Lecb_dec_loop4
2421	cmp $16, LEN
2422	jb .Lecb_dec_ret
2423.align 4
2424.Lecb_dec_loop1:
2425	movups (INP), STATE1
2426	call _aesni_dec1
2427	movups STATE1, (OUTP)
2428	sub $16, LEN
2429	add $16, INP
2430	add $16, OUTP
2431	cmp $16, LEN
2432	jge .Lecb_dec_loop1
2433.Lecb_dec_ret:
2434#ifndef __x86_64__
2435	popl KLEN
2436	popl KEYP
2437	popl LEN
2438#endif
2439	FRAME_END
2440	ret
2441SYM_FUNC_END(aesni_ecb_dec)
2442
2443/*
2444 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2445 *		      size_t len, u8 *iv)
2446 */
2447SYM_FUNC_START(aesni_cbc_enc)
2448	FRAME_BEGIN
2449#ifndef __x86_64__
2450	pushl IVP
2451	pushl LEN
2452	pushl KEYP
2453	pushl KLEN
2454	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
2455	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
2456	movl (FRAME_OFFSET+28)(%esp), INP	# src
2457	movl (FRAME_OFFSET+32)(%esp), LEN	# len
2458	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
2459#endif
2460	cmp $16, LEN
2461	jb .Lcbc_enc_ret
2462	mov 480(KEYP), KLEN
2463	movups (IVP), STATE	# load iv as initial state
2464.align 4
2465.Lcbc_enc_loop:
2466	movups (INP), IN	# load input
2467	pxor IN, STATE
2468	call _aesni_enc1
2469	movups STATE, (OUTP)	# store output
2470	sub $16, LEN
2471	add $16, INP
2472	add $16, OUTP
2473	cmp $16, LEN
2474	jge .Lcbc_enc_loop
2475	movups STATE, (IVP)
2476.Lcbc_enc_ret:
2477#ifndef __x86_64__
2478	popl KLEN
2479	popl KEYP
2480	popl LEN
2481	popl IVP
2482#endif
2483	FRAME_END
2484	ret
2485SYM_FUNC_END(aesni_cbc_enc)
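
/*
 * Mode sketch in illustrative C: CBC encryption is inherently serial,
 * each ciphertext block feeds the next, so only the one-block helper
 * is usable; the last ciphertext block is written back through IVP
 * for the caller's next call:
 *
 *	state = iv;
 *	for (i = 0; i < nblocks; i++) {
 *		state = enc1(state ^ in[i]);	C[i] = E(P[i] ^ C[i-1])
 *		out[i] = state;
 *	}
 *	iv = state;
 */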
2486
2487/*
2488 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2489 *		      size_t len, u8 *iv)
2490 */
2491SYM_FUNC_START(aesni_cbc_dec)
2492	FRAME_BEGIN
2493#ifndef __x86_64__
2494	pushl IVP
2495	pushl LEN
2496	pushl KEYP
2497	pushl KLEN
2498	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
2499	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
2500	movl (FRAME_OFFSET+28)(%esp), INP	# src
2501	movl (FRAME_OFFSET+32)(%esp), LEN	# len
2502	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
2503#endif
2504	cmp $16, LEN
2505	jb .Lcbc_dec_just_ret
2506	mov 480(KEYP), KLEN
2507	add $240, KEYP
2508	movups (IVP), IV
2509	cmp $64, LEN
2510	jb .Lcbc_dec_loop1
2511.align 4
2512.Lcbc_dec_loop4:
2513	movups (INP), IN1
2514	movaps IN1, STATE1
2515	movups 0x10(INP), IN2
2516	movaps IN2, STATE2
2517#ifdef __x86_64__
2518	movups 0x20(INP), IN3
2519	movaps IN3, STATE3
2520	movups 0x30(INP), IN4
2521	movaps IN4, STATE4
2522#else
2523	movups 0x20(INP), IN1
2524	movaps IN1, STATE3
2525	movups 0x30(INP), IN2
2526	movaps IN2, STATE4
2527#endif
2528	call _aesni_dec4
2529	pxor IV, STATE1
2530#ifdef __x86_64__
2531	pxor IN1, STATE2
2532	pxor IN2, STATE3
2533	pxor IN3, STATE4
2534	movaps IN4, IV
2535#else
2536	pxor IN1, STATE4
2537	movaps IN2, IV
2538	movups (INP), IN1
2539	pxor IN1, STATE2
2540	movups 0x10(INP), IN2
2541	pxor IN2, STATE3
2542#endif
2543	movups STATE1, (OUTP)
2544	movups STATE2, 0x10(OUTP)
2545	movups STATE3, 0x20(OUTP)
2546	movups STATE4, 0x30(OUTP)
2547	sub $64, LEN
2548	add $64, INP
2549	add $64, OUTP
2550	cmp $64, LEN
2551	jge .Lcbc_dec_loop4
2552	cmp $16, LEN
2553	jb .Lcbc_dec_ret
2554.align 4
2555.Lcbc_dec_loop1:
2556	movups (INP), IN
2557	movaps IN, STATE
2558	call _aesni_dec1
2559	pxor IV, STATE
2560	movups STATE, (OUTP)
2561	movaps IN, IV
2562	sub $16, LEN
2563	add $16, INP
2564	add $16, OUTP
2565	cmp $16, LEN
2566	jge .Lcbc_dec_loop1
2567.Lcbc_dec_ret:
2568	movups IV, (IVP)
2569.Lcbc_dec_just_ret:
2570#ifndef __x86_64__
2571	popl KLEN
2572	popl KEYP
2573	popl LEN
2574	popl IVP
2575#endif
2576	FRAME_END
2577	ret
2578SYM_FUNC_END(aesni_cbc_dec)
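
/*
 * Mode sketch in illustrative C: CBC decryption has no dependency
 * between block cipher invocations, which is what makes the four-wide
 * helper usable here:
 *
 *	for (i = 0; i < nblocks; i++) {
 *		out[i] = dec(in[i]) ^ prev;	P[i] = D(C[i]) ^ C[i-1]
 *		prev = in[i];
 *	}
 *	iv = prev;
 *
 * On 32-bit, IN3/IN4 (%xmm8/%xmm9) do not exist, so the loop reloads
 * the first two ciphertext blocks from INP after _aesni_dec4 instead
 * of keeping all four inputs live in registers.
 */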
2579
2580#ifdef __x86_64__
2581.pushsection .rodata
2582.align 16
2583.Lbswap_mask:
2584	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
2585.popsection
2586
2587/*
2588 * _aesni_inc_init:	internal ABI
2589 *	set up the registers used by _aesni_inc
2590 * input:
2591 *	IV
2592 * output:
2593 *	CTR:	== IV, in little endian
2594 *	TCTR_LOW: == lower qword of CTR
2595 *	INC:	== 1, in little endian
2596 *	BSWAP_MASK == endian swapping mask
2597 */
2598SYM_FUNC_START_LOCAL(_aesni_inc_init)
2599	movaps .Lbswap_mask, BSWAP_MASK
2600	movaps IV, CTR
2601	pshufb BSWAP_MASK, CTR
2602	mov $1, TCTR_LOW
2603	movq TCTR_LOW, INC
2604	movq CTR, TCTR_LOW
2605	ret
2606SYM_FUNC_END(_aesni_inc_init)
2607
2608/*
2609 * _aesni_inc:		internal ABI
2610 *	Increase IV by 1; IV is in big endian
2611 * input:
2612 *	IV
2613 *	CTR:	== IV, in little endian
2614 *	TCTR_LOW: == lower qword of CTR
2615 *	INC:	== 1, in little endian
2616 *	BSWAP_MASK == endian swapping mask
2617 * output:
2618 *	IV:	increased by 1
2619 * changed:
2620 *	CTR:	== output IV, in little endian
2621 *	TCTR_LOW: == lower qword of CTR
2622 */
2623SYM_FUNC_START_LOCAL(_aesni_inc)
2624	paddq INC, CTR
2625	add $1, TCTR_LOW
2626	jnc .Linc_low
2627	pslldq $8, INC
2628	paddq INC, CTR
2629	psrldq $8, INC
2630.Linc_low:
2631	movaps CTR, IV
2632	pshufb BSWAP_MASK, IV
2633	ret
2634SYM_FUNC_END(_aesni_inc)
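
/*
 * Carry handling as illustrative C: the counter lives byte-reversed
 * in CTR so paddq can do the arithmetic, and TCTR_LOW mirrors the low
 * qword in a general-purpose register purely to detect overflow:
 *
 *	ctr.lo++;
 *	if (ctr.lo == 0)		carry out of the low qword
 *		ctr.hi++;
 *	iv = bswap128(ctr);		back to big endian
 *
 * When add $1, TCTR_LOW carries, INC is shifted up into the high
 * qword so a second paddq can propagate the carry, then shifted back.
 */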
2635
2636/*
2637 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2638 *		      size_t len, u8 *iv)
2639 */
2640SYM_FUNC_START(aesni_ctr_enc)
2641	FRAME_BEGIN
2642	cmp $16, LEN
2643	jb .Lctr_enc_just_ret
2644	mov 480(KEYP), KLEN
2645	movups (IVP), IV
2646	call _aesni_inc_init
2647	cmp $64, LEN
2648	jb .Lctr_enc_loop1
2649.align 4
2650.Lctr_enc_loop4:
2651	movaps IV, STATE1
2652	call _aesni_inc
2653	movups (INP), IN1
2654	movaps IV, STATE2
2655	call _aesni_inc
2656	movups 0x10(INP), IN2
2657	movaps IV, STATE3
2658	call _aesni_inc
2659	movups 0x20(INP), IN3
2660	movaps IV, STATE4
2661	call _aesni_inc
2662	movups 0x30(INP), IN4
2663	call _aesni_enc4
2664	pxor IN1, STATE1
2665	movups STATE1, (OUTP)
2666	pxor IN2, STATE2
2667	movups STATE2, 0x10(OUTP)
2668	pxor IN3, STATE3
2669	movups STATE3, 0x20(OUTP)
2670	pxor IN4, STATE4
2671	movups STATE4, 0x30(OUTP)
2672	sub $64, LEN
2673	add $64, INP
2674	add $64, OUTP
2675	cmp $64, LEN
2676	jge .Lctr_enc_loop4
2677	cmp $16, LEN
2678	jb .Lctr_enc_ret
2679.align 4
2680.Lctr_enc_loop1:
2681	movaps IV, STATE
2682	call _aesni_inc
2683	movups (INP), IN
2684	call _aesni_enc1
2685	pxor IN, STATE
2686	movups STATE, (OUTP)
2687	sub $16, LEN
2688	add $16, INP
2689	add $16, OUTP
2690	cmp $16, LEN
2691	jge .Lctr_enc_loop1
2692.Lctr_enc_ret:
2693	movups IV, (IVP)
2694.Lctr_enc_just_ret:
2695	FRAME_END
2696	ret
2697SYM_FUNC_END(aesni_ctr_enc)
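
/*
 * Mode sketch in illustrative C: CTR encrypts successive counter
 * values and XORs them into the data, so the same routine serves
 * encryption and decryption; the advanced counter is stored back
 * through IVP on exit:
 *
 *	for (i = 0; i < nblocks; i++) {
 *		out[i] = in[i] ^ enc1(iv);	keystream = E(counter)
 *		iv = be128_inc(iv);
 *	}
 *
 * The counter increments are interleaved with the input loads above
 * so all four keystream blocks come from a single _aesni_enc4 call.
 */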
2698
2699/*
2700 * _aesni_gf128mul_x_ble:		internal ABI
2701 *	Multiply in GF(2^128) for XTS IVs
2702 * input:
2703 *	IV:	current IV
2704 *	GF128MUL_MASK == mask with 0x87 and 0x01
2705 * output:
2706 *	IV:	next IV
2707 * changed:
2708 *	CTR:	== temporary value
2709 */
2710#define _aesni_gf128mul_x_ble() \
2711	pshufd $0x13, IV, CTR; \
2712	paddq IV, IV; \
2713	psrad $31, CTR; \
2714	pand GF128MUL_MASK, CTR; \
2715	pxor CTR, IV;
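
/*
 * Arithmetic sketch in illustrative C (IV viewed as two u64 halves of
 * a little-endian block): this doubles the XTS tweak in GF(2^128)
 * with the x^128 = x^7 + x^2 + x + 1 reduction:
 *
 *	carry = hi >> 63;
 *	hi    = (hi << 1) | (lo >> 63);
 *	lo    = (lo << 1) ^ (carry ? 0x87 : 0);
 *
 * pshufd $0x13 + psrad $31 broadcast the two relevant sign bits, and
 * the AND with GF128MUL_MASK (0x87 in the low qword, 0x01 in the
 * high) turns them into exactly the cross-qword carry and the
 * reduction constant; paddq IV, IV provides the per-qword left shift.
 */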
2716
2717/*
2718 * void aesni_xts_crypt8(const struct crypto_aes_ctx *ctx, u8 *dst,
2719 *			 const u8 *src, bool enc, le128 *iv)
2720 */
2721SYM_FUNC_START(aesni_xts_crypt8)
2722	FRAME_BEGIN
2723	cmpb $0, %cl
2724	movl $0, %ecx
2725	movl $240, %r10d
2726	leaq _aesni_enc4, %r11
2727	leaq _aesni_dec4, %rax
2728	cmovel %r10d, %ecx
2729	cmoveq %rax, %r11
2730
2731	movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
2732	movups (IVP), IV
2733
2734	mov 480(KEYP), KLEN
2735	addq %rcx, KEYP
2736
2737	movdqa IV, STATE1
2738	movdqu 0x00(INP), INC
2739	pxor INC, STATE1
2740	movdqu IV, 0x00(OUTP)
2741
2742	_aesni_gf128mul_x_ble()
2743	movdqa IV, STATE2
2744	movdqu 0x10(INP), INC
2745	pxor INC, STATE2
2746	movdqu IV, 0x10(OUTP)
2747
2748	_aesni_gf128mul_x_ble()
2749	movdqa IV, STATE3
2750	movdqu 0x20(INP), INC
2751	pxor INC, STATE3
2752	movdqu IV, 0x20(OUTP)
2753
2754	_aesni_gf128mul_x_ble()
2755	movdqa IV, STATE4
2756	movdqu 0x30(INP), INC
2757	pxor INC, STATE4
2758	movdqu IV, 0x30(OUTP)
2759
2760	CALL_NOSPEC r11
2761
2762	movdqu 0x00(OUTP), INC
2763	pxor INC, STATE1
2764	movdqu STATE1, 0x00(OUTP)
2765
2766	_aesni_gf128mul_x_ble()
2767	movdqa IV, STATE1
2768	movdqu 0x40(INP), INC
2769	pxor INC, STATE1
2770	movdqu IV, 0x40(OUTP)
2771
2772	movdqu 0x10(OUTP), INC
2773	pxor INC, STATE2
2774	movdqu STATE2, 0x10(OUTP)
2775
2776	_aesni_gf128mul_x_ble()
2777	movdqa IV, STATE2
2778	movdqu 0x50(INP), INC
2779	pxor INC, STATE2
2780	movdqu IV, 0x50(OUTP)
2781
2782	movdqu 0x20(OUTP), INC
2783	pxor INC, STATE3
2784	movdqu STATE3, 0x20(OUTP)
2785
2786	_aesni_gf128mul_x_ble()
2787	movdqa IV, STATE3
2788	movdqu 0x60(INP), INC
2789	pxor INC, STATE3
2790	movdqu IV, 0x60(OUTP)
2791
2792	movdqu 0x30(OUTP), INC
2793	pxor INC, STATE4
2794	movdqu STATE4, 0x30(OUTP)
2795
2796	_aesni_gf128mul_x_ble()
2797	movdqa IV, STATE4
2798	movdqu 0x70(INP), INC
2799	pxor INC, STATE4
2800	movdqu IV, 0x70(OUTP)
2801
2802	_aesni_gf128mul_x_ble()
2803	movups IV, (IVP)
2804
2805	CALL_NOSPEC r11
2806
2807	movdqu 0x40(OUTP), INC
2808	pxor INC, STATE1
2809	movdqu STATE1, 0x40(OUTP)
2810
2811	movdqu 0x50(OUTP), INC
2812	pxor INC, STATE2
2813	movdqu STATE2, 0x50(OUTP)
2814
2815	movdqu 0x60(OUTP), INC
2816	pxor INC, STATE3
2817	movdqu STATE3, 0x60(OUTP)
2818
2819	movdqu 0x70(OUTP), INC
2820	pxor INC, STATE4
2821	movdqu STATE4, 0x70(OUTP)
2822
2823	FRAME_END
2824	ret
2825SYM_FUNC_END(aesni_xts_crypt8)
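
/*
 * Mode sketch in illustrative C: every block is whitened with the
 * current tweak before and after the block cipher, and the tweak is
 * doubled in GF(2^128) between blocks; the output buffer doubles as
 * scratch space for the precomputed tweaks while the cipher runs:
 *
 *	for (i = 0; i < 8; i++) {
 *		out[i] = crypt(in[i] ^ t) ^ t;	crypt = enc4 or dec4
 *		t = gf128mul_x_ble(t);
 *	}
 *	iv = t;
 *
 * The enc/dec choice is made branchlessly up front: cmove selects
 * _aesni_enc4 or _aesni_dec4 into %r11 and 0 or 240 (the key_dec
 * offset) into %ecx, and CALL_NOSPEC dispatches through the
 * retpoline-safe thunk.
 */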
2826
2827#endif