   1/* SPDX-License-Identifier: GPL-2.0-or-later */
   2/*
   3 * ARIA Cipher 16-way parallel algorithm (AVX)
   4 *
   5 * Copyright (c) 2022 Taehee Yoo <ap420073@gmail.com>
   6 *
   7 */
   8
   9#include <linux/linkage.h>
  10#include <linux/cfi_types.h>
  11#include <asm/asm-offsets.h>
  12#include <asm/frame.h>
   13
  14/* register macros */
  15#define CTX %rdi
  16
  17
  18#define BV8(a0, a1, a2, a3, a4, a5, a6, a7)		\
  19	( (((a0) & 1) << 0) |				\
  20	  (((a1) & 1) << 1) |				\
  21	  (((a2) & 1) << 2) |				\
  22	  (((a3) & 1) << 3) |				\
  23	  (((a4) & 1) << 4) |				\
  24	  (((a5) & 1) << 5) |				\
  25	  (((a6) & 1) << 6) |				\
  26	  (((a7) & 1) << 7) )
  27
  28#define BM8X8(l0, l1, l2, l3, l4, l5, l6, l7)		\
  29	( ((l7) << (0 * 8)) |				\
  30	  ((l6) << (1 * 8)) |				\
  31	  ((l5) << (2 * 8)) |				\
  32	  ((l4) << (3 * 8)) |				\
  33	  ((l3) << (4 * 8)) |				\
  34	  ((l2) << (5 * 8)) |				\
  35	  ((l1) << (6 * 8)) |				\
  36	  ((l0) << (7 * 8)) )
  37
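/*
 * Note on the helpers above: BV8() packs eight bits into one byte
 * (bit i = a_i) and BM8X8() packs eight such row bytes into a 64-bit
 * value with row l0 in the most significant byte, used as the 8x8
 * bit-matrix operands of the GF2P8AFFINE* instructions further below.
 * For example, BV8(1, 1, 0, 0, 0, 1, 1, 0) evaluates to 0x63, the
 * additive constant of the AES affine transform (tf_aff_const).
 */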
  38#define inc_le128(x, minus_one, tmp)			\
  39	vpcmpeqq minus_one, x, tmp;			\
  40	vpsubq minus_one, x, x;				\
  41	vpslldq $8, tmp, tmp;				\
  42	vpsubq tmp, x, x;
  43
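/*
 * inc_le128() above increments a 128-bit little-endian counter.  It
 * expects 'minus_one' to hold { low = -1, high = 0 } (see the CTR
 * keystream routine below): the first vpsubq then adds 1 to the low
 * qword only, and the vpcmpeqq/vpslldq/vpsubq sequence propagates the
 * carry into the high qword when the low qword was all-ones.
 */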
  44#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0)	\
  45	vpand x, mask4bit, tmp0;			\
  46	vpandn x, mask4bit, x;				\
  47	vpsrld $4, x, x;				\
  48							\
  49	vpshufb tmp0, lo_t, tmp0;			\
  50	vpshufb x, hi_t, x;				\
  51	vpxor tmp0, x, x;
  52
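/*
 * filter_8bit() above applies a byte-wise map via two 4-bit table
 * lookups: for each byte b of x the result, returned in x, is
 * lo_t[b & 0xf] ^ hi_t[b >> 4].  mask4bit must hold 0x0f in every
 * byte (.L0f0f0f0f broadcast) and tmp0 is scratch.
 */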
  53#define transpose_4x4(x0, x1, x2, x3, t1, t2)		\
  54	vpunpckhdq x1, x0, t2;				\
  55	vpunpckldq x1, x0, x0;				\
  56							\
  57	vpunpckldq x3, x2, t1;				\
  58	vpunpckhdq x3, x2, x2;				\
  59							\
  60	vpunpckhqdq t1, x0, x1;				\
  61	vpunpcklqdq t1, x0, x0;				\
  62							\
  63	vpunpckhqdq x2, t2, x3;				\
  64	vpunpcklqdq x2, t2, x2;
  65
  66#define byteslice_16x16b(a0, b0, c0, d0,		\
  67			 a1, b1, c1, d1,		\
  68			 a2, b2, c2, d2,		\
  69			 a3, b3, c3, d3,		\
  70			 st0, st1)			\
  71	vmovdqu d2, st0;				\
  72	vmovdqu d3, st1;				\
  73	transpose_4x4(a0, a1, a2, a3, d2, d3);		\
  74	transpose_4x4(b0, b1, b2, b3, d2, d3);		\
  75	vmovdqu st0, d2;				\
  76	vmovdqu st1, d3;				\
  77							\
  78	vmovdqu a0, st0;				\
  79	vmovdqu a1, st1;				\
  80	transpose_4x4(c0, c1, c2, c3, a0, a1);		\
  81	transpose_4x4(d0, d1, d2, d3, a0, a1);		\
  82							\
  83	vmovdqu .Lshufb_16x16b(%rip), a0;		\
  84	vmovdqu st1, a1;				\
  85	vpshufb a0, a2, a2;				\
  86	vpshufb a0, a3, a3;				\
  87	vpshufb a0, b0, b0;				\
  88	vpshufb a0, b1, b1;				\
  89	vpshufb a0, b2, b2;				\
  90	vpshufb a0, b3, b3;				\
  91	vpshufb a0, a1, a1;				\
  92	vpshufb a0, c0, c0;				\
  93	vpshufb a0, c1, c1;				\
  94	vpshufb a0, c2, c2;				\
  95	vpshufb a0, c3, c3;				\
  96	vpshufb a0, d0, d0;				\
  97	vpshufb a0, d1, d1;				\
  98	vpshufb a0, d2, d2;				\
  99	vpshufb a0, d3, d3;				\
 100	vmovdqu d3, st1;				\
 101	vmovdqu st0, d3;				\
 102	vpshufb a0, d3, a0;				\
 103	vmovdqu d2, st0;				\
 104							\
 105	transpose_4x4(a0, b0, c0, d0, d2, d3);		\
 106	transpose_4x4(a1, b1, c1, d1, d2, d3);		\
 107	vmovdqu st0, d2;				\
 108	vmovdqu st1, d3;				\
 109							\
 110	vmovdqu b0, st0;				\
 111	vmovdqu b1, st1;				\
 112	transpose_4x4(a2, b2, c2, d2, b0, b1);		\
 113	transpose_4x4(a3, b3, c3, d3, b0, b1);		\
 114	vmovdqu st0, b0;				\
 115	vmovdqu st1, b1;				\
 116	/* does not adjust output bytes inside vectors */
 117
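/*
 * byteslice_16x16b() above treats its 16 input registers as a 16x16
 * byte matrix (16 blocks of 16 bytes each) and transposes it with the
 * help of .Lshufb_16x16b, so that afterwards each register holds one
 * byte position of all 16 blocks ("byte-sliced" form).
 * debyteslice_16x16b() below performs the inverse conversion.
 */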
 118#define debyteslice_16x16b(a0, b0, c0, d0,		\
 119			   a1, b1, c1, d1,		\
 120			   a2, b2, c2, d2,		\
 121			   a3, b3, c3, d3,		\
 122			   st0, st1)			\
 123	vmovdqu d2, st0;				\
 124	vmovdqu d3, st1;				\
 125	transpose_4x4(a0, a1, a2, a3, d2, d3);		\
 126	transpose_4x4(b0, b1, b2, b3, d2, d3);		\
 127	vmovdqu st0, d2;				\
 128	vmovdqu st1, d3;				\
 129							\
 130	vmovdqu a0, st0;				\
 131	vmovdqu a1, st1;				\
 132	transpose_4x4(c0, c1, c2, c3, a0, a1);		\
 133	transpose_4x4(d0, d1, d2, d3, a0, a1);		\
 134							\
 135	vmovdqu .Lshufb_16x16b(%rip), a0;		\
 136	vmovdqu st1, a1;				\
 137	vpshufb a0, a2, a2;				\
 138	vpshufb a0, a3, a3;				\
 139	vpshufb a0, b0, b0;				\
 140	vpshufb a0, b1, b1;				\
 141	vpshufb a0, b2, b2;				\
 142	vpshufb a0, b3, b3;				\
 143	vpshufb a0, a1, a1;				\
 144	vpshufb a0, c0, c0;				\
 145	vpshufb a0, c1, c1;				\
 146	vpshufb a0, c2, c2;				\
 147	vpshufb a0, c3, c3;				\
 148	vpshufb a0, d0, d0;				\
 149	vpshufb a0, d1, d1;				\
 150	vpshufb a0, d2, d2;				\
 151	vpshufb a0, d3, d3;				\
 152	vmovdqu d3, st1;				\
 153	vmovdqu st0, d3;				\
 154	vpshufb a0, d3, a0;				\
 155	vmovdqu d2, st0;				\
 156							\
 157	transpose_4x4(c0, d0, a0, b0, d2, d3);		\
 158	transpose_4x4(c1, d1, a1, b1, d2, d3);		\
 159	vmovdqu st0, d2;				\
 160	vmovdqu st1, d3;				\
 161							\
 162	vmovdqu b0, st0;				\
 163	vmovdqu b1, st1;				\
 164	transpose_4x4(c2, d2, a2, b2, b0, b1);		\
 165	transpose_4x4(c3, d3, a3, b3, b0, b1);		\
 166	vmovdqu st0, b0;				\
 167	vmovdqu st1, b1;				\
 168	/* does not adjust output bytes inside vectors */
 169
 170/* load blocks to registers and apply pre-whitening */
 171#define inpack16_pre(x0, x1, x2, x3,			\
 172		     x4, x5, x6, x7,			\
 173		     y0, y1, y2, y3,			\
 174		     y4, y5, y6, y7,			\
 175		     rio)				\
 176	vmovdqu (0 * 16)(rio), x0;			\
 177	vmovdqu (1 * 16)(rio), x1;			\
 178	vmovdqu (2 * 16)(rio), x2;			\
 179	vmovdqu (3 * 16)(rio), x3;			\
 180	vmovdqu (4 * 16)(rio), x4;			\
 181	vmovdqu (5 * 16)(rio), x5;			\
 182	vmovdqu (6 * 16)(rio), x6;			\
 183	vmovdqu (7 * 16)(rio), x7;			\
 184	vmovdqu (8 * 16)(rio), y0;			\
 185	vmovdqu (9 * 16)(rio), y1;			\
 186	vmovdqu (10 * 16)(rio), y2;			\
 187	vmovdqu (11 * 16)(rio), y3;			\
 188	vmovdqu (12 * 16)(rio), y4;			\
 189	vmovdqu (13 * 16)(rio), y5;			\
 190	vmovdqu (14 * 16)(rio), y6;			\
 191	vmovdqu (15 * 16)(rio), y7;
 192
 193/* byteslice pre-whitened blocks and store to temporary memory */
 194#define inpack16_post(x0, x1, x2, x3,			\
 195		      x4, x5, x6, x7,			\
 196		      y0, y1, y2, y3,			\
 197		      y4, y5, y6, y7,			\
 198		      mem_ab, mem_cd)			\
 199	byteslice_16x16b(x0, x1, x2, x3,		\
 200			 x4, x5, x6, x7,		\
 201			 y0, y1, y2, y3,		\
 202			 y4, y5, y6, y7,		\
 203			 (mem_ab), (mem_cd));		\
 204							\
 205	vmovdqu x0, 0 * 16(mem_ab);			\
 206	vmovdqu x1, 1 * 16(mem_ab);			\
 207	vmovdqu x2, 2 * 16(mem_ab);			\
 208	vmovdqu x3, 3 * 16(mem_ab);			\
 209	vmovdqu x4, 4 * 16(mem_ab);			\
 210	vmovdqu x5, 5 * 16(mem_ab);			\
 211	vmovdqu x6, 6 * 16(mem_ab);			\
 212	vmovdqu x7, 7 * 16(mem_ab);			\
 213	vmovdqu y0, 0 * 16(mem_cd);			\
 214	vmovdqu y1, 1 * 16(mem_cd);			\
 215	vmovdqu y2, 2 * 16(mem_cd);			\
 216	vmovdqu y3, 3 * 16(mem_cd);			\
 217	vmovdqu y4, 4 * 16(mem_cd);			\
 218	vmovdqu y5, 5 * 16(mem_cd);			\
 219	vmovdqu y6, 6 * 16(mem_cd);			\
 220	vmovdqu y7, 7 * 16(mem_cd);
 221
 222#define write_output(x0, x1, x2, x3,			\
 223		     x4, x5, x6, x7,			\
 224		     y0, y1, y2, y3,			\
 225		     y4, y5, y6, y7,			\
 226		     mem)				\
 227	vmovdqu x0, 0 * 16(mem);			\
 228	vmovdqu x1, 1 * 16(mem);			\
 229	vmovdqu x2, 2 * 16(mem);			\
 230	vmovdqu x3, 3 * 16(mem);			\
 231	vmovdqu x4, 4 * 16(mem);			\
 232	vmovdqu x5, 5 * 16(mem);			\
 233	vmovdqu x6, 6 * 16(mem);			\
 234	vmovdqu x7, 7 * 16(mem);			\
 235	vmovdqu y0, 8 * 16(mem);			\
 236	vmovdqu y1, 9 * 16(mem);			\
 237	vmovdqu y2, 10 * 16(mem);			\
 238	vmovdqu y3, 11 * 16(mem);			\
 239	vmovdqu y4, 12 * 16(mem);			\
 240	vmovdqu y5, 13 * 16(mem);			\
 241	vmovdqu y6, 14 * 16(mem);			\
 242	vmovdqu y7, 15 * 16(mem);			\
 243
 244#define aria_store_state_8way(x0, x1, x2, x3,		\
 245			      x4, x5, x6, x7,		\
 246			      mem_tmp, idx)		\
 247	vmovdqu x0, ((idx + 0) * 16)(mem_tmp);		\
 248	vmovdqu x1, ((idx + 1) * 16)(mem_tmp);		\
 249	vmovdqu x2, ((idx + 2) * 16)(mem_tmp);		\
 250	vmovdqu x3, ((idx + 3) * 16)(mem_tmp);		\
 251	vmovdqu x4, ((idx + 4) * 16)(mem_tmp);		\
 252	vmovdqu x5, ((idx + 5) * 16)(mem_tmp);		\
 253	vmovdqu x6, ((idx + 6) * 16)(mem_tmp);		\
 254	vmovdqu x7, ((idx + 7) * 16)(mem_tmp);
 255
 256#define aria_load_state_8way(x0, x1, x2, x3,		\
 257			     x4, x5, x6, x7,		\
 258			     mem_tmp, idx)		\
 259	vmovdqu ((idx + 0) * 16)(mem_tmp), x0;		\
 260	vmovdqu ((idx + 1) * 16)(mem_tmp), x1;		\
 261	vmovdqu ((idx + 2) * 16)(mem_tmp), x2;		\
 262	vmovdqu ((idx + 3) * 16)(mem_tmp), x3;		\
 263	vmovdqu ((idx + 4) * 16)(mem_tmp), x4;		\
 264	vmovdqu ((idx + 5) * 16)(mem_tmp), x5;		\
 265	vmovdqu ((idx + 6) * 16)(mem_tmp), x6;		\
 266	vmovdqu ((idx + 7) * 16)(mem_tmp), x7;
 267
 268#define aria_ark_8way(x0, x1, x2, x3,			\
 269		      x4, x5, x6, x7,			\
 270		      t0, t1, t2, rk,			\
 271		      idx, round)			\
 272	/* AddRoundKey */                               \
 273	vbroadcastss ((round * 16) + idx + 0)(rk), t0;	\
 274	vpsrld $24, t0, t2;				\
 275	vpshufb t1, t2, t2;				\
 276	vpxor t2, x0, x0;				\
 277	vpsrld $16, t0, t2;				\
 278	vpshufb t1, t2, t2;				\
 279	vpxor t2, x1, x1;				\
 280	vpsrld $8, t0, t2;				\
 281	vpshufb t1, t2, t2;				\
 282	vpxor t2, x2, x2;				\
 283	vpshufb t1, t0, t2;				\
 284	vpxor t2, x3, x3;				\
 285	vbroadcastss ((round * 16) + idx + 4)(rk), t0;	\
 286	vpsrld $24, t0, t2;				\
 287	vpshufb t1, t2, t2;				\
 288	vpxor t2, x4, x4;				\
 289	vpsrld $16, t0, t2;				\
 290	vpshufb t1, t2, t2;				\
 291	vpxor t2, x5, x5;				\
 292	vpsrld $8, t0, t2;				\
 293	vpshufb t1, t2, t2;				\
 294	vpxor t2, x6, x6;				\
 295	vpshufb t1, t0, t2;				\
 296	vpxor t2, x7, x7;
 297
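/*
 * aria_ark_8way() above XORs one 32-bit word of the 16-byte round key
 * (selected by 'round' and 'idx') into four byte-sliced registers at a
 * time: vbroadcastss replicates the key word, vpsrld isolates one key
 * byte in the low byte of each lane, and vpshufb with t1 (which the
 * callers pass as an all-zero register) broadcasts that byte to every
 * byte position before the XOR.
 */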
 298#ifdef CONFIG_AS_GFNI
 299#define aria_sbox_8way_gfni(x0, x1, x2, x3,		\
 300			    x4, x5, x6, x7,		\
 301			    t0, t1, t2, t3,		\
 302			    t4, t5, t6, t7)		\
 303	vmovdqa .Ltf_s2_bitmatrix(%rip), t0;		\
 304	vmovdqa .Ltf_inv_bitmatrix(%rip), t1;		\
 305	vmovdqa .Ltf_id_bitmatrix(%rip), t2;		\
 306	vmovdqa .Ltf_aff_bitmatrix(%rip), t3;		\
 307	vmovdqa .Ltf_x2_bitmatrix(%rip), t4;		\
 308	vgf2p8affineinvqb $(tf_s2_const), t0, x1, x1;	\
 309	vgf2p8affineinvqb $(tf_s2_const), t0, x5, x5;	\
 310	vgf2p8affineqb $(tf_inv_const), t1, x2, x2;	\
 311	vgf2p8affineqb $(tf_inv_const), t1, x6, x6;	\
 312	vgf2p8affineinvqb $0, t2, x2, x2;		\
 313	vgf2p8affineinvqb $0, t2, x6, x6;		\
 314	vgf2p8affineinvqb $(tf_aff_const), t3, x0, x0;	\
 315	vgf2p8affineinvqb $(tf_aff_const), t3, x4, x4;	\
 316	vgf2p8affineqb $(tf_x2_const), t4, x3, x3;	\
 317	vgf2p8affineqb $(tf_x2_const), t4, x7, x7;	\
 318	vgf2p8affineinvqb $0, t2, x3, x3;		\
 319	vgf2p8affineinvqb $0, t2, x7, x7
 320
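/*
 * GFNI variant of the substitution layer: vgf2p8affineqb applies an
 * 8x8 bit matrix plus a constant, and vgf2p8affineinvqb additionally
 * inverts the byte in GF(2^8) first, so the ARIA S-box layer can be
 * expressed directly with the tf_*_bitmatrix / tf_*_const values
 * defined in .rodata below, avoiding the AES-NI based construction
 * used by aria_sbox_8way().
 */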
 321#endif /* CONFIG_AS_GFNI */
 322
 323#define aria_sbox_8way(x0, x1, x2, x3,            	\
 324		       x4, x5, x6, x7,			\
 325		       t0, t1, t2, t3,			\
 326		       t4, t5, t6, t7)			\
 327	vmovdqa .Linv_shift_row(%rip), t0;		\
 328	vmovdqa .Lshift_row(%rip), t1;			\
 329	vbroadcastss .L0f0f0f0f(%rip), t6;		\
 330	vmovdqa .Ltf_lo__inv_aff__and__s2(%rip), t2;	\
 331	vmovdqa .Ltf_hi__inv_aff__and__s2(%rip), t3;	\
 332	vmovdqa .Ltf_lo__x2__and__fwd_aff(%rip), t4;	\
  333	vmovdqa .Ltf_hi__x2__and__fwd_aff(%rip), t5;	\
 334							\
 335	vaesenclast t7, x0, x0;				\
 336	vaesenclast t7, x4, x4;				\
 337	vaesenclast t7, x1, x1;				\
 338	vaesenclast t7, x5, x5;				\
 339	vaesdeclast t7, x2, x2;				\
 340	vaesdeclast t7, x6, x6;				\
 341							\
 342	/* AES inverse shift rows */			\
 343	vpshufb t0, x0, x0;				\
 344	vpshufb t0, x4, x4;				\
 345	vpshufb t0, x1, x1;				\
 346	vpshufb t0, x5, x5;				\
 347	vpshufb t1, x3, x3;				\
 348	vpshufb t1, x7, x7;				\
 349	vpshufb t1, x2, x2;				\
 350	vpshufb t1, x6, x6;				\
 351							\
 352	/* affine transformation for S2 */		\
 353	filter_8bit(x1, t2, t3, t6, t0);		\
 354	/* affine transformation for S2 */		\
 355	filter_8bit(x5, t2, t3, t6, t0);		\
 356							\
 357	/* affine transformation for X2 */		\
 358	filter_8bit(x3, t4, t5, t6, t0);		\
 359	/* affine transformation for X2 */		\
 360	filter_8bit(x7, t4, t5, t6, t0);		\
 361	vaesdeclast t7, x3, x3;				\
 362	vaesdeclast t7, x7, x7;
 363
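/*
 * AES-NI based substitution layer: t7 must be all-zero (the callers
 * pass a zeroed y7).  With a zero round key, vaesenclast reduces to
 * ShiftRows + SubBytes and vaesdeclast to InvShiftRows + InvSubBytes;
 * the .Linv_shift_row / .Lshift_row shuffles compensate for the row
 * permutation so that only the AES S-box or its inverse remains, and
 * the filter_8bit() affine transforms combine with those to build
 * ARIA's S2 and X2 substitutions.
 */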
 364#define aria_diff_m(x0, x1, x2, x3,			\
 365		    t0, t1, t2, t3)			\
 366	/* T = rotr32(X, 8); */				\
 367	/* X ^= T */					\
 368	vpxor x0, x3, t0;				\
 369	vpxor x1, x0, t1;				\
 370	vpxor x2, x1, t2;				\
 371	vpxor x3, x2, t3;				\
 372	/* X = T ^ rotr(X, 16); */			\
 373	vpxor t2, x0, x0;				\
 374	vpxor x1, t3, t3;				\
 375	vpxor t0, x2, x2;				\
 376	vpxor t1, x3, x1;				\
 377	vmovdqu t3, x3;
 378
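/*
 * aria_diff_m() above computes the per-word diffusion described by its
 * inline comments on byte-sliced data: since x0..x3 hold the four byte
 * planes of each 32-bit word, rotr32() by 8 or 16 bits is merely a
 * renaming of the planes, so the whole step reduces to register XORs.
 */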
 379#define aria_diff_word(x0, x1, x2, x3,			\
 380		       x4, x5, x6, x7,			\
 381		       y0, y1, y2, y3,			\
 382		       y4, y5, y6, y7)			\
 383	/* t1 ^= t2; */					\
 384	vpxor y0, x4, x4;				\
 385	vpxor y1, x5, x5;				\
 386	vpxor y2, x6, x6;				\
 387	vpxor y3, x7, x7;				\
 388							\
 389	/* t2 ^= t3; */					\
 390	vpxor y4, y0, y0;				\
 391	vpxor y5, y1, y1;				\
 392	vpxor y6, y2, y2;				\
 393	vpxor y7, y3, y3;				\
 394							\
 395	/* t0 ^= t1; */					\
 396	vpxor x4, x0, x0;				\
 397	vpxor x5, x1, x1;				\
 398	vpxor x6, x2, x2;				\
 399	vpxor x7, x3, x3;				\
 400							\
 401	/* t3 ^= t1; */					\
 402	vpxor x4, y4, y4;				\
 403	vpxor x5, y5, y5;				\
 404	vpxor x6, y6, y6;				\
 405	vpxor x7, y7, y7;				\
 406							\
 407	/* t2 ^= t0; */					\
 408	vpxor x0, y0, y0;				\
 409	vpxor x1, y1, y1;				\
 410	vpxor x2, y2, y2;				\
 411	vpxor x3, y3, y3;				\
 412							\
 413	/* t1 ^= t2; */					\
 414	vpxor y0, x4, x4;				\
 415	vpxor y1, x5, x5;				\
 416	vpxor y2, x6, x6;				\
 417	vpxor y3, x7, x7;
 418
 419#define aria_fe(x0, x1, x2, x3,				\
 420		x4, x5, x6, x7,				\
 421		y0, y1, y2, y3,				\
 422		y4, y5, y6, y7,				\
 423		mem_tmp, rk, round)			\
 424	vpxor y7, y7, y7;				\
 425	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
 426		      y0, y7, y2, rk, 8, round);	\
 427							\
 428	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,	\
 429		       y0, y1, y2, y3, y4, y5, y6, y7);	\
 430							\
 431	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
 432	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
 433	aria_store_state_8way(x0, x1, x2, x3,		\
 434			      x4, x5, x6, x7,		\
 435			      mem_tmp, 8);		\
 436							\
 437	aria_load_state_8way(x0, x1, x2, x3,		\
 438			     x4, x5, x6, x7,		\
 439			     mem_tmp, 0);		\
 440	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
 441		      y0, y7, y2, rk, 0, round);	\
 442							\
 443	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,	\
 444		       y0, y1, y2, y3, y4, y5, y6, y7);	\
 445							\
 446	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
 447	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
 448	aria_store_state_8way(x0, x1, x2, x3,		\
 449			      x4, x5, x6, x7,		\
 450			      mem_tmp, 0);		\
 451	aria_load_state_8way(y0, y1, y2, y3,		\
 452			     y4, y5, y6, y7,		\
 453			     mem_tmp, 8);		\
 454	aria_diff_word(x0, x1, x2, x3,			\
 455		       x4, x5, x6, x7,			\
 456		       y0, y1, y2, y3,			\
 457		       y4, y5, y6, y7);			\
 458	/* aria_diff_byte() 				\
 459	 * T3 = ABCD -> BADC 				\
 460	 * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6 	\
 461	 * T0 = ABCD -> CDAB 				\
 462	 * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1 	\
 463	 * T1 = ABCD -> DCBA 				\
 464	 * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4	\
 465	 */						\
 466	aria_diff_word(x2, x3, x0, x1,			\
 467		       x7, x6, x5, x4,			\
 468		       y0, y1, y2, y3,			\
 469		       y5, y4, y7, y6);			\
 470	aria_store_state_8way(x3, x2, x1, x0,		\
 471			      x6, x7, x4, x5,		\
 472			      mem_tmp, 0);
 473
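/*
 * aria_fe() above and aria_fo()/aria_ff() below implement the two ARIA
 * round types (the two substitution-layer orderings) and the final
 * round for 16 blocks at once.  Only half of the byte-sliced state
 * (8 registers) is live at a time, the other half being parked in
 * mem_tmp; the aria_diff_word() calls with permuted register arguments
 * realize the byte-rotation part of the diffusion layer, as described
 * by the aria_diff_byte() comments inside the macros.
 */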
 474#define aria_fo(x0, x1, x2, x3,				\
 475		x4, x5, x6, x7,				\
 476		y0, y1, y2, y3,				\
 477		y4, y5, y6, y7,				\
 478		mem_tmp, rk, round)			\
 479	vpxor y7, y7, y7;				\
 480	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
 481		      y0, y7, y2, rk, 8, round);	\
 482							\
 483	aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
 484		       y0, y1, y2, y3, y4, y5, y6, y7);	\
 485							\
 486	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
 487	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
 488	aria_store_state_8way(x0, x1, x2, x3,		\
 489			      x4, x5, x6, x7,		\
 490			      mem_tmp, 8);		\
 491							\
 492	aria_load_state_8way(x0, x1, x2, x3,		\
 493			     x4, x5, x6, x7,		\
 494			     mem_tmp, 0);		\
 495	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
 496		      y0, y7, y2, rk, 0, round);	\
 497							\
 498	aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
 499		       y0, y1, y2, y3, y4, y5, y6, y7);	\
 500							\
 501	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
 502	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
 503	aria_store_state_8way(x0, x1, x2, x3,		\
 504			      x4, x5, x6, x7,		\
 505			      mem_tmp, 0);		\
 506	aria_load_state_8way(y0, y1, y2, y3,		\
 507			     y4, y5, y6, y7,		\
 508			     mem_tmp, 8);		\
 509	aria_diff_word(x0, x1, x2, x3,			\
 510		       x4, x5, x6, x7,			\
 511		       y0, y1, y2, y3,			\
 512		       y4, y5, y6, y7);			\
 513	/* aria_diff_byte() 				\
 514	 * T1 = ABCD -> BADC 				\
 515	 * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6	\
 516	 * T2 = ABCD -> CDAB 				\
 517	 * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1 	\
 518	 * T3 = ABCD -> DCBA 				\
 519	 * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4 	\
 520	 */						\
 521	aria_diff_word(x0, x1, x2, x3,			\
 522		       x5, x4, x7, x6,			\
 523		       y2, y3, y0, y1,			\
 524		       y7, y6, y5, y4);			\
 525	aria_store_state_8way(x3, x2, x1, x0,		\
 526			      x6, x7, x4, x5,		\
 527			      mem_tmp, 0);
 528
 529#define aria_ff(x0, x1, x2, x3,				\
 530		x4, x5, x6, x7,				\
 531		y0, y1, y2, y3,				\
 532		y4, y5, y6, y7,				\
 533		mem_tmp, rk, round, last_round)		\
 534	vpxor y7, y7, y7;				\
 535	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
 536		      y0, y7, y2, rk, 8, round);	\
 537							\
 538	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,	\
 539		       y0, y1, y2, y3, y4, y5, y6, y7);	\
 540							\
 541	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
 542		      y0, y7, y2, rk, 8, last_round);	\
 543							\
 544	aria_store_state_8way(x0, x1, x2, x3,		\
 545			      x4, x5, x6, x7,		\
 546			      mem_tmp, 8);		\
 547							\
 548	aria_load_state_8way(x0, x1, x2, x3,		\
 549			     x4, x5, x6, x7,		\
 550			     mem_tmp, 0);		\
 551	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
 552		      y0, y7, y2, rk, 0, round);	\
 553							\
 554	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,	\
 555		       y0, y1, y2, y3, y4, y5, y6, y7);	\
 556							\
 557	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
 558		      y0, y7, y2, rk, 0, last_round);	\
 559							\
 560	aria_load_state_8way(y0, y1, y2, y3,		\
 561			     y4, y5, y6, y7,		\
 562			     mem_tmp, 8);
 563
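/*
 * aria_ff() above is the last round: AddRoundKey, the substitution
 * layer, then a second AddRoundKey with the output-whitening key
 * (last_round), with no diffusion layer applied.
 */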
 564#ifdef CONFIG_AS_GFNI
 565#define aria_fe_gfni(x0, x1, x2, x3,			\
 566		     x4, x5, x6, x7,			\
 567		     y0, y1, y2, y3,			\
 568		     y4, y5, y6, y7,			\
 569		     mem_tmp, rk, round)		\
 570	vpxor y7, y7, y7;				\
 571	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
 572		      y0, y7, y2, rk, 8, round);	\
 573							\
 574	aria_sbox_8way_gfni(x2, x3, x0, x1, 		\
 575			    x6, x7, x4, x5,		\
 576			    y0, y1, y2, y3, 		\
 577			    y4, y5, y6, y7);		\
 578							\
 579	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
 580	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
 581	aria_store_state_8way(x0, x1, x2, x3,		\
 582			      x4, x5, x6, x7,		\
 583			      mem_tmp, 8);		\
 584							\
 585	aria_load_state_8way(x0, x1, x2, x3,		\
 586			     x4, x5, x6, x7,		\
 587			     mem_tmp, 0);		\
 588	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
 589		      y0, y7, y2, rk, 0, round);	\
 590							\
 591	aria_sbox_8way_gfni(x2, x3, x0, x1, 		\
 592			    x6, x7, x4, x5,		\
 593			    y0, y1, y2, y3, 		\
 594			    y4, y5, y6, y7);		\
 595							\
 596	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
 597	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
 598	aria_store_state_8way(x0, x1, x2, x3,		\
 599			      x4, x5, x6, x7,		\
 600			      mem_tmp, 0);		\
 601	aria_load_state_8way(y0, y1, y2, y3,		\
 602			     y4, y5, y6, y7,		\
 603			     mem_tmp, 8);		\
 604	aria_diff_word(x0, x1, x2, x3,			\
 605		       x4, x5, x6, x7,			\
 606		       y0, y1, y2, y3,			\
 607		       y4, y5, y6, y7);			\
 608	/* aria_diff_byte() 				\
 609	 * T3 = ABCD -> BADC 				\
 610	 * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6 	\
 611	 * T0 = ABCD -> CDAB 				\
 612	 * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1 	\
 613	 * T1 = ABCD -> DCBA 				\
 614	 * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4	\
 615	 */						\
 616	aria_diff_word(x2, x3, x0, x1,			\
 617		       x7, x6, x5, x4,			\
 618		       y0, y1, y2, y3,			\
 619		       y5, y4, y7, y6);			\
 620	aria_store_state_8way(x3, x2, x1, x0,		\
 621			      x6, x7, x4, x5,		\
 622			      mem_tmp, 0);
 623
 624#define aria_fo_gfni(x0, x1, x2, x3,			\
 625		     x4, x5, x6, x7,			\
 626		     y0, y1, y2, y3,			\
 627		     y4, y5, y6, y7,			\
 628		     mem_tmp, rk, round)		\
 629	vpxor y7, y7, y7;				\
 630	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
 631		      y0, y7, y2, rk, 8, round);	\
 632							\
 633	aria_sbox_8way_gfni(x0, x1, x2, x3, 		\
 634			    x4, x5, x6, x7,		\
 635			    y0, y1, y2, y3, 		\
 636			    y4, y5, y6, y7);		\
 637							\
 638	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
 639	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
 640	aria_store_state_8way(x0, x1, x2, x3,		\
 641			      x4, x5, x6, x7,		\
 642			      mem_tmp, 8);		\
 643							\
 644	aria_load_state_8way(x0, x1, x2, x3,		\
 645			     x4, x5, x6, x7,		\
 646			     mem_tmp, 0);		\
 647	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
 648		      y0, y7, y2, rk, 0, round);	\
 649							\
 650	aria_sbox_8way_gfni(x0, x1, x2, x3, 		\
 651			    x4, x5, x6, x7,		\
 652			    y0, y1, y2, y3, 		\
 653			    y4, y5, y6, y7);		\
 654							\
 655	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
 656	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
 657	aria_store_state_8way(x0, x1, x2, x3,		\
 658			      x4, x5, x6, x7,		\
 659			      mem_tmp, 0);		\
 660	aria_load_state_8way(y0, y1, y2, y3,		\
 661			     y4, y5, y6, y7,		\
 662			     mem_tmp, 8);		\
 663	aria_diff_word(x0, x1, x2, x3,			\
 664		       x4, x5, x6, x7,			\
 665		       y0, y1, y2, y3,			\
 666		       y4, y5, y6, y7);			\
 667	/* aria_diff_byte() 				\
 668	 * T1 = ABCD -> BADC 				\
 669	 * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6	\
 670	 * T2 = ABCD -> CDAB 				\
 671	 * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1 	\
 672	 * T3 = ABCD -> DCBA 				\
 673	 * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4 	\
 674	 */						\
 675	aria_diff_word(x0, x1, x2, x3,			\
 676		       x5, x4, x7, x6,			\
 677		       y2, y3, y0, y1,			\
 678		       y7, y6, y5, y4);			\
 679	aria_store_state_8way(x3, x2, x1, x0,		\
 680			      x6, x7, x4, x5,		\
 681			      mem_tmp, 0);
 682
 683#define aria_ff_gfni(x0, x1, x2, x3,			\
 684		x4, x5, x6, x7,				\
 685		y0, y1, y2, y3,				\
 686		y4, y5, y6, y7,				\
 687		mem_tmp, rk, round, last_round)		\
 688	vpxor y7, y7, y7;				\
 689	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
 690		      y0, y7, y2, rk, 8, round);	\
 691							\
 692	aria_sbox_8way_gfni(x2, x3, x0, x1, 		\
 693			    x6, x7, x4, x5,		\
 694			    y0, y1, y2, y3, 		\
 695			    y4, y5, y6, y7);		\
 696							\
 697	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
 698		      y0, y7, y2, rk, 8, last_round);	\
 699							\
 700	aria_store_state_8way(x0, x1, x2, x3,		\
 701			      x4, x5, x6, x7,		\
 702			      mem_tmp, 8);		\
 703							\
 704	aria_load_state_8way(x0, x1, x2, x3,		\
 705			     x4, x5, x6, x7,		\
 706			     mem_tmp, 0);		\
 707	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
 708		      y0, y7, y2, rk, 0, round);	\
 709							\
 710	aria_sbox_8way_gfni(x2, x3, x0, x1, 		\
 711			    x6, x7, x4, x5,		\
 712			    y0, y1, y2, y3, 		\
 713			    y4, y5, y6, y7);		\
 714							\
 715	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
 716		      y0, y7, y2, rk, 0, last_round);	\
 717							\
 718	aria_load_state_8way(y0, y1, y2, y3,		\
 719			     y4, y5, y6, y7,		\
 720			     mem_tmp, 8);
 721
 722#endif /* CONFIG_AS_GFNI */
 723
 724/* NB: section is mergeable, all elements must be aligned 16-byte blocks */
 725.section	.rodata.cst16, "aM", @progbits, 16
 726.align 16
 727
 728#define SHUFB_BYTES(idx) \
 729	0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)
 730
 731.Lshufb_16x16b:
 732	.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3);
 733/* For isolating SubBytes from AESENCLAST, inverse shift row */
 734.Linv_shift_row:
 735	.byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
 736	.byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
 737.Lshift_row:
 738	.byte 0x00, 0x05, 0x0a, 0x0f, 0x04, 0x09, 0x0e, 0x03
 739	.byte 0x08, 0x0d, 0x02, 0x07, 0x0c, 0x01, 0x06, 0x0b
 740/* For CTR-mode IV byteswap */
 741.Lbswap128_mask:
 742	.byte 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08
 743	.byte 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00
 744
 745/* AES inverse affine and S2 combined:
 746 *      1 1 0 0 0 0 0 1     x0     0
 747 *      0 1 0 0 1 0 0 0     x1     0
 748 *      1 1 0 0 1 1 1 1     x2     0
 749 *      0 1 1 0 1 0 0 1     x3     1
 750 *      0 1 0 0 1 1 0 0  *  x4  +  0
 751 *      0 1 0 1 1 0 0 0     x5     0
 752 *      0 0 0 0 0 1 0 1     x6     0
 753 *      1 1 1 0 0 1 1 1     x7     1
 754 */
 755.Ltf_lo__inv_aff__and__s2:
 756	.octa 0x92172DA81A9FA520B2370D883ABF8500
 757.Ltf_hi__inv_aff__and__s2:
 758	.octa 0x2B15FFC1AF917B45E6D8320C625CB688
 759
 760/* X2 and AES forward affine combined:
 761 *      1 0 1 1 0 0 0 1     x0     0
 762 *      0 1 1 1 1 0 1 1     x1     0
 763 *      0 0 0 1 1 0 1 0     x2     1
 764 *      0 1 0 0 0 1 0 0     x3     0
 765 *      0 0 1 1 1 0 1 1  *  x4  +  0
 766 *      0 1 0 0 1 0 0 0     x5     0
 767 *      1 1 0 1 0 0 1 1     x6     0
 768 *      0 1 0 0 1 0 1 0     x7     0
 769 */
 770.Ltf_lo__x2__and__fwd_aff:
 771	.octa 0xEFAE0544FCBD1657B8F95213ABEA4100
 772.Ltf_hi__x2__and__fwd_aff:
 773	.octa 0x3F893781E95FE1576CDA64D2BA0CB204
 774
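/*
 * The .Ltf_lo__*/.Ltf_hi__* pairs above are the nibble lookup tables
 * consumed by filter_8bit(): for the affine map f shown in the matrix
 * comments, the "lo" table holds the contribution of the low nibble
 * and the "hi" table that of the high nibble (with the additive
 * constant folded into one of them), so f(b) = lo[b & 0xf] ^ hi[b >> 4].
 */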
  775#ifdef CONFIG_AS_GFNI
 776/* AES affine: */
 777#define tf_aff_const BV8(1, 1, 0, 0, 0, 1, 1, 0)
 778.Ltf_aff_bitmatrix:
 779	.quad BM8X8(BV8(1, 0, 0, 0, 1, 1, 1, 1),
 780		    BV8(1, 1, 0, 0, 0, 1, 1, 1),
 781		    BV8(1, 1, 1, 0, 0, 0, 1, 1),
 782		    BV8(1, 1, 1, 1, 0, 0, 0, 1),
 783		    BV8(1, 1, 1, 1, 1, 0, 0, 0),
 784		    BV8(0, 1, 1, 1, 1, 1, 0, 0),
 785		    BV8(0, 0, 1, 1, 1, 1, 1, 0),
 786		    BV8(0, 0, 0, 1, 1, 1, 1, 1))
 787	.quad BM8X8(BV8(1, 0, 0, 0, 1, 1, 1, 1),
 788		    BV8(1, 1, 0, 0, 0, 1, 1, 1),
 789		    BV8(1, 1, 1, 0, 0, 0, 1, 1),
 790		    BV8(1, 1, 1, 1, 0, 0, 0, 1),
 791		    BV8(1, 1, 1, 1, 1, 0, 0, 0),
 792		    BV8(0, 1, 1, 1, 1, 1, 0, 0),
 793		    BV8(0, 0, 1, 1, 1, 1, 1, 0),
 794		    BV8(0, 0, 0, 1, 1, 1, 1, 1))
 795
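/*
 * Each bit matrix in this section is emitted as two identical .quad
 * entries so the constant fills a full 16-byte .rodata.cst16 slot and
 * both 64-bit lanes of the loaded XMM register carry the same 8x8
 * matrix; VGF2P8AFFINE*QB takes its matrix per 64-bit lane, so all
 * 16 bytes are transformed identically.
 */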
 796/* AES inverse affine: */
 797#define tf_inv_const BV8(1, 0, 1, 0, 0, 0, 0, 0)
 798.Ltf_inv_bitmatrix:
 799	.quad BM8X8(BV8(0, 0, 1, 0, 0, 1, 0, 1),
 800		    BV8(1, 0, 0, 1, 0, 0, 1, 0),
 801		    BV8(0, 1, 0, 0, 1, 0, 0, 1),
 802		    BV8(1, 0, 1, 0, 0, 1, 0, 0),
 803		    BV8(0, 1, 0, 1, 0, 0, 1, 0),
 804		    BV8(0, 0, 1, 0, 1, 0, 0, 1),
 805		    BV8(1, 0, 0, 1, 0, 1, 0, 0),
 806		    BV8(0, 1, 0, 0, 1, 0, 1, 0))
 807	.quad BM8X8(BV8(0, 0, 1, 0, 0, 1, 0, 1),
 808		    BV8(1, 0, 0, 1, 0, 0, 1, 0),
 809		    BV8(0, 1, 0, 0, 1, 0, 0, 1),
 810		    BV8(1, 0, 1, 0, 0, 1, 0, 0),
 811		    BV8(0, 1, 0, 1, 0, 0, 1, 0),
 812		    BV8(0, 0, 1, 0, 1, 0, 0, 1),
 813		    BV8(1, 0, 0, 1, 0, 1, 0, 0),
 814		    BV8(0, 1, 0, 0, 1, 0, 1, 0))
 815
 816/* S2: */
 817#define tf_s2_const BV8(0, 1, 0, 0, 0, 1, 1, 1)
 818.Ltf_s2_bitmatrix:
 819	.quad BM8X8(BV8(0, 1, 0, 1, 0, 1, 1, 1),
 820		    BV8(0, 0, 1, 1, 1, 1, 1, 1),
 821		    BV8(1, 1, 1, 0, 1, 1, 0, 1),
 822		    BV8(1, 1, 0, 0, 0, 0, 1, 1),
 823		    BV8(0, 1, 0, 0, 0, 0, 1, 1),
 824		    BV8(1, 1, 0, 0, 1, 1, 1, 0),
 825		    BV8(0, 1, 1, 0, 0, 0, 1, 1),
 826		    BV8(1, 1, 1, 1, 0, 1, 1, 0))
 827	.quad BM8X8(BV8(0, 1, 0, 1, 0, 1, 1, 1),
 828		    BV8(0, 0, 1, 1, 1, 1, 1, 1),
 829		    BV8(1, 1, 1, 0, 1, 1, 0, 1),
 830		    BV8(1, 1, 0, 0, 0, 0, 1, 1),
 831		    BV8(0, 1, 0, 0, 0, 0, 1, 1),
 832		    BV8(1, 1, 0, 0, 1, 1, 1, 0),
 833		    BV8(0, 1, 1, 0, 0, 0, 1, 1),
 834		    BV8(1, 1, 1, 1, 0, 1, 1, 0))
 835
 836/* X2: */
 837#define tf_x2_const BV8(0, 0, 1, 1, 0, 1, 0, 0)
 838.Ltf_x2_bitmatrix:
 839	.quad BM8X8(BV8(0, 0, 0, 1, 1, 0, 0, 0),
 840		    BV8(0, 0, 1, 0, 0, 1, 1, 0),
 841		    BV8(0, 0, 0, 0, 1, 0, 1, 0),
 842		    BV8(1, 1, 1, 0, 0, 0, 1, 1),
 843		    BV8(1, 1, 1, 0, 1, 1, 0, 0),
 844		    BV8(0, 1, 1, 0, 1, 0, 1, 1),
 845		    BV8(1, 0, 1, 1, 1, 1, 0, 1),
 846		    BV8(1, 0, 0, 1, 0, 0, 1, 1))
 847	.quad BM8X8(BV8(0, 0, 0, 1, 1, 0, 0, 0),
 848		    BV8(0, 0, 1, 0, 0, 1, 1, 0),
 849		    BV8(0, 0, 0, 0, 1, 0, 1, 0),
 850		    BV8(1, 1, 1, 0, 0, 0, 1, 1),
 851		    BV8(1, 1, 1, 0, 1, 1, 0, 0),
 852		    BV8(0, 1, 1, 0, 1, 0, 1, 1),
 853		    BV8(1, 0, 1, 1, 1, 1, 0, 1),
 854		    BV8(1, 0, 0, 1, 0, 0, 1, 1))
 855
 856/* Identity matrix: */
 857.Ltf_id_bitmatrix:
 858	.quad BM8X8(BV8(1, 0, 0, 0, 0, 0, 0, 0),
 859		    BV8(0, 1, 0, 0, 0, 0, 0, 0),
 860		    BV8(0, 0, 1, 0, 0, 0, 0, 0),
 861		    BV8(0, 0, 0, 1, 0, 0, 0, 0),
 862		    BV8(0, 0, 0, 0, 1, 0, 0, 0),
 863		    BV8(0, 0, 0, 0, 0, 1, 0, 0),
 864		    BV8(0, 0, 0, 0, 0, 0, 1, 0),
 865		    BV8(0, 0, 0, 0, 0, 0, 0, 1))
 866	.quad BM8X8(BV8(1, 0, 0, 0, 0, 0, 0, 0),
 867		    BV8(0, 1, 0, 0, 0, 0, 0, 0),
 868		    BV8(0, 0, 1, 0, 0, 0, 0, 0),
 869		    BV8(0, 0, 0, 1, 0, 0, 0, 0),
 870		    BV8(0, 0, 0, 0, 1, 0, 0, 0),
 871		    BV8(0, 0, 0, 0, 0, 1, 0, 0),
 872		    BV8(0, 0, 0, 0, 0, 0, 1, 0),
 873		    BV8(0, 0, 0, 0, 0, 0, 0, 1))
 874#endif /* CONFIG_AS_GFNI */
 875
 876/* 4-bit mask */
 877.section	.rodata.cst4.L0f0f0f0f, "aM", @progbits, 4
 878.align 4
 879.L0f0f0f0f:
 880	.long 0x0f0f0f0f
 881
 882.text
 883
 884SYM_FUNC_START_LOCAL(__aria_aesni_avx_crypt_16way)
 885	/* input:
 886	*      %r9: rk
 887	*      %rsi: dst
 888	*      %rdx: src
 889	*      %xmm0..%xmm15: 16 byte-sliced blocks
 890	*/
 891
 892	FRAME_BEGIN
 893
 894	movq %rsi, %rax;
 895	leaq 8 * 16(%rax), %r8;
 896
 897	inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
 898		      %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
 899		      %xmm15, %rax, %r8);
 900	aria_fo(%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15,
 901		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
 902		%rax, %r9, 0);
 903	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
 904		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
 905		%xmm15, %rax, %r9, 1);
 906	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
 907		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
 908		%rax, %r9, 2);
 909	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
 910		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
 911		%xmm15, %rax, %r9, 3);
 912	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
 913		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
 914		%rax, %r9, 4);
 915	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
 916		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
 917		%xmm15, %rax, %r9, 5);
 918	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
 919		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
 920		%rax, %r9, 6);
 921	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
 922		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
 923		%xmm15, %rax, %r9, 7);
 924	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
 925		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
 926		%rax, %r9, 8);
 927	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
 928		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
 929		%xmm15, %rax, %r9, 9);
 930	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
 931		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
 932		%rax, %r9, 10);
 933	cmpl $12, ARIA_CTX_rounds(CTX);
 934	jne .Laria_192;
 935	aria_ff(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
 936		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
 937		%xmm15, %rax, %r9, 11, 12);
 938	jmp .Laria_end;
 939.Laria_192:
 940	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
 941		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
 942		%xmm15, %rax, %r9, 11);
 943	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
 944		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
 945		%rax, %r9, 12);
 946	cmpl $14, ARIA_CTX_rounds(CTX);
 947	jne .Laria_256;
 948	aria_ff(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
 949		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
 950		%xmm15, %rax, %r9, 13, 14);
 951	jmp .Laria_end;
 952.Laria_256:
 953	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
 954		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
 955		%xmm15, %rax, %r9, 13);
 956	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
 957		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
 958		%rax, %r9, 14);
 959	aria_ff(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
 960		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
 961		%xmm15, %rax, %r9, 15, 16);
 962.Laria_end:
 963	debyteslice_16x16b(%xmm8, %xmm12, %xmm1, %xmm4,
 964			   %xmm9, %xmm13, %xmm0, %xmm5,
 965			   %xmm10, %xmm14, %xmm3, %xmm6,
 966			   %xmm11, %xmm15, %xmm2, %xmm7,
 967			   (%rax), (%r8));
 968
 969	FRAME_END
 970	RET;
 971SYM_FUNC_END(__aria_aesni_avx_crypt_16way)
 972
 973SYM_TYPED_FUNC_START(aria_aesni_avx_encrypt_16way)
 974	/* input:
 975	*      %rdi: ctx, CTX
 976	*      %rsi: dst
 977	*      %rdx: src
 978	*/
 979
 980	FRAME_BEGIN
 981
 982	leaq ARIA_CTX_enc_key(CTX), %r9;
 983
 984	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
 985		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
 986		     %xmm15, %rdx);
 987
 988	call __aria_aesni_avx_crypt_16way;
 989
 990	write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
 991		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
 992		     %xmm15, %rax);
 993
 994	FRAME_END
 995	RET;
 996SYM_FUNC_END(aria_aesni_avx_encrypt_16way)
 997
 998SYM_TYPED_FUNC_START(aria_aesni_avx_decrypt_16way)
 999	/* input:
1000	*      %rdi: ctx, CTX
1001	*      %rsi: dst
1002	*      %rdx: src
1003	*/
1004
1005	FRAME_BEGIN
1006
1007	leaq ARIA_CTX_dec_key(CTX), %r9;
1008
1009	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
1010		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
1011		     %xmm15, %rdx);
1012
1013	call __aria_aesni_avx_crypt_16way;
1014
1015	write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
1016		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
1017		     %xmm15, %rax);
1018
1019	FRAME_END
1020	RET;
1021SYM_FUNC_END(aria_aesni_avx_decrypt_16way)
1022
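/*
 * Generates 16 big-endian counter blocks from the IV at (%r8): blocks
 * IV..IV+7 are written to the keystream buffer and reloaded into
 * %xmm0..%xmm7, blocks IV+8..IV+15 are left in %xmm8..%xmm15, and
 * IV+16 is written back to (%r8) as the next IV.  The caller then
 * encrypts these counters and XORs the result with the source data.
 */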
1023SYM_FUNC_START_LOCAL(__aria_aesni_avx_ctr_gen_keystream_16way)
1024	/* input:
1025	*      %rdi: ctx
1026	*      %rsi: dst
1027	*      %rdx: src
1028	*      %rcx: keystream
1029	*      %r8: iv (big endian, 128bit)
1030	*/
1031
1032	FRAME_BEGIN
1033	/* load IV and byteswap */
1034	vmovdqu (%r8), %xmm8;
1035
1036	vmovdqa .Lbswap128_mask (%rip), %xmm1;
1037	vpshufb %xmm1, %xmm8, %xmm3; /* be => le */
1038
1039	vpcmpeqd %xmm0, %xmm0, %xmm0;
1040	vpsrldq $8, %xmm0, %xmm0; /* low: -1, high: 0 */
1041
1042	/* construct IVs */
1043	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1044	vpshufb %xmm1, %xmm3, %xmm9;
1045	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1046	vpshufb %xmm1, %xmm3, %xmm10;
1047	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1048	vpshufb %xmm1, %xmm3, %xmm11;
1049	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1050	vpshufb %xmm1, %xmm3, %xmm12;
1051	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1052	vpshufb %xmm1, %xmm3, %xmm13;
1053	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1054	vpshufb %xmm1, %xmm3, %xmm14;
1055	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1056	vpshufb %xmm1, %xmm3, %xmm15;
1057	vmovdqu %xmm8, (0 * 16)(%rcx);
1058	vmovdqu %xmm9, (1 * 16)(%rcx);
1059	vmovdqu %xmm10, (2 * 16)(%rcx);
1060	vmovdqu %xmm11, (3 * 16)(%rcx);
1061	vmovdqu %xmm12, (4 * 16)(%rcx);
1062	vmovdqu %xmm13, (5 * 16)(%rcx);
1063	vmovdqu %xmm14, (6 * 16)(%rcx);
1064	vmovdqu %xmm15, (7 * 16)(%rcx);
1065
1066	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1067	vpshufb %xmm1, %xmm3, %xmm8;
1068	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1069	vpshufb %xmm1, %xmm3, %xmm9;
1070	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1071	vpshufb %xmm1, %xmm3, %xmm10;
1072	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1073	vpshufb %xmm1, %xmm3, %xmm11;
1074	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1075	vpshufb %xmm1, %xmm3, %xmm12;
1076	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1077	vpshufb %xmm1, %xmm3, %xmm13;
1078	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1079	vpshufb %xmm1, %xmm3, %xmm14;
1080	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1081	vpshufb %xmm1, %xmm3, %xmm15;
1082	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1083	vpshufb %xmm1, %xmm3, %xmm4;
1084	vmovdqu %xmm4, (%r8);
1085
1086	vmovdqu (0 * 16)(%rcx), %xmm0;
1087	vmovdqu (1 * 16)(%rcx), %xmm1;
1088	vmovdqu (2 * 16)(%rcx), %xmm2;
1089	vmovdqu (3 * 16)(%rcx), %xmm3;
1090	vmovdqu (4 * 16)(%rcx), %xmm4;
1091	vmovdqu (5 * 16)(%rcx), %xmm5;
1092	vmovdqu (6 * 16)(%rcx), %xmm6;
1093	vmovdqu (7 * 16)(%rcx), %xmm7;
1094
1095	FRAME_END
1096	RET;
1097SYM_FUNC_END(__aria_aesni_avx_ctr_gen_keystream_16way)
1098
1099SYM_TYPED_FUNC_START(aria_aesni_avx_ctr_crypt_16way)
1100	/* input:
1101	*      %rdi: ctx
1102	*      %rsi: dst
1103	*      %rdx: src
1104	*      %rcx: keystream
1105	*      %r8: iv (big endian, 128bit)
1106	*/
1107	FRAME_BEGIN
1108
1109	call __aria_aesni_avx_ctr_gen_keystream_16way;
1110
1111	leaq (%rsi), %r10;
1112	leaq (%rdx), %r11;
1113	leaq (%rcx), %rsi;
1114	leaq (%rcx), %rdx;
1115	leaq ARIA_CTX_enc_key(CTX), %r9;
1116
1117	call __aria_aesni_avx_crypt_16way;
1118
1119	vpxor (0 * 16)(%r11), %xmm1, %xmm1;
1120	vpxor (1 * 16)(%r11), %xmm0, %xmm0;
1121	vpxor (2 * 16)(%r11), %xmm3, %xmm3;
1122	vpxor (3 * 16)(%r11), %xmm2, %xmm2;
1123	vpxor (4 * 16)(%r11), %xmm4, %xmm4;
1124	vpxor (5 * 16)(%r11), %xmm5, %xmm5;
1125	vpxor (6 * 16)(%r11), %xmm6, %xmm6;
1126	vpxor (7 * 16)(%r11), %xmm7, %xmm7;
1127	vpxor (8 * 16)(%r11), %xmm8, %xmm8;
1128	vpxor (9 * 16)(%r11), %xmm9, %xmm9;
1129	vpxor (10 * 16)(%r11), %xmm10, %xmm10;
1130	vpxor (11 * 16)(%r11), %xmm11, %xmm11;
1131	vpxor (12 * 16)(%r11), %xmm12, %xmm12;
1132	vpxor (13 * 16)(%r11), %xmm13, %xmm13;
1133	vpxor (14 * 16)(%r11), %xmm14, %xmm14;
1134	vpxor (15 * 16)(%r11), %xmm15, %xmm15;
1135	write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
1136		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
1137		     %xmm15, %r10);
1138
1139	FRAME_END
1140	RET;
1141SYM_FUNC_END(aria_aesni_avx_ctr_crypt_16way)
1142
1143#ifdef CONFIG_AS_GFNI
1144SYM_FUNC_START_LOCAL(__aria_aesni_avx_gfni_crypt_16way)
1145	/* input:
1146	*      %r9: rk
1147	*      %rsi: dst
1148	*      %rdx: src
1149	*      %xmm0..%xmm15: 16 byte-sliced blocks
1150	*/
1151
1152	FRAME_BEGIN
1153
1154	movq %rsi, %rax;
1155	leaq 8 * 16(%rax), %r8;
1156
1157	inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3,
1158		      %xmm4, %xmm5, %xmm6, %xmm7,
1159		      %xmm8, %xmm9, %xmm10, %xmm11,
1160		      %xmm12, %xmm13, %xmm14,
1161		      %xmm15, %rax, %r8);
1162	aria_fo_gfni(%xmm8, %xmm9, %xmm10, %xmm11,
1163		     %xmm12, %xmm13, %xmm14, %xmm15,
1164		     %xmm0, %xmm1, %xmm2, %xmm3,
1165		     %xmm4, %xmm5, %xmm6, %xmm7,
1166		     %rax, %r9, 0);
1167	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
1168		     %xmm4, %xmm5, %xmm6, %xmm7,
1169		     %xmm8, %xmm9, %xmm10, %xmm11,
1170		     %xmm12, %xmm13, %xmm14,
1171		     %xmm15, %rax, %r9, 1);
1172	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
1173		     %xmm12, %xmm13, %xmm14, %xmm15,
1174		     %xmm0, %xmm1, %xmm2, %xmm3,
1175		     %xmm4, %xmm5, %xmm6, %xmm7,
1176		     %rax, %r9, 2);
1177	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
1178		     %xmm4, %xmm5, %xmm6, %xmm7,
1179		     %xmm8, %xmm9, %xmm10, %xmm11,
1180		     %xmm12, %xmm13, %xmm14,
1181		     %xmm15, %rax, %r9, 3);
1182	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
1183		     %xmm12, %xmm13, %xmm14, %xmm15,
1184		     %xmm0, %xmm1, %xmm2, %xmm3,
1185		     %xmm4, %xmm5, %xmm6, %xmm7,
1186		     %rax, %r9, 4);
1187	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
1188		     %xmm4, %xmm5, %xmm6, %xmm7,
1189		     %xmm8, %xmm9, %xmm10, %xmm11,
1190		     %xmm12, %xmm13, %xmm14,
1191		     %xmm15, %rax, %r9, 5);
1192	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
1193		     %xmm12, %xmm13, %xmm14, %xmm15,
1194		     %xmm0, %xmm1, %xmm2, %xmm3,
1195		     %xmm4, %xmm5, %xmm6, %xmm7,
1196		     %rax, %r9, 6);
1197	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
1198		     %xmm4, %xmm5, %xmm6, %xmm7,
1199		     %xmm8, %xmm9, %xmm10, %xmm11,
1200		     %xmm12, %xmm13, %xmm14,
1201		     %xmm15, %rax, %r9, 7);
1202	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
1203		     %xmm12, %xmm13, %xmm14, %xmm15,
1204		     %xmm0, %xmm1, %xmm2, %xmm3,
1205		     %xmm4, %xmm5, %xmm6, %xmm7,
1206		     %rax, %r9, 8);
1207	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
1208		     %xmm4, %xmm5, %xmm6, %xmm7,
1209		     %xmm8, %xmm9, %xmm10, %xmm11,
1210		     %xmm12, %xmm13, %xmm14,
1211		     %xmm15, %rax, %r9, 9);
1212	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
1213		     %xmm12, %xmm13, %xmm14, %xmm15,
1214		     %xmm0, %xmm1, %xmm2, %xmm3,
1215		     %xmm4, %xmm5, %xmm6, %xmm7,
1216		     %rax, %r9, 10);
1217	cmpl $12, ARIA_CTX_rounds(CTX);
1218	jne .Laria_gfni_192;
1219	aria_ff_gfni(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
1220		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
1221		%xmm15, %rax, %r9, 11, 12);
1222	jmp .Laria_gfni_end;
1223.Laria_gfni_192:
1224	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
1225		     %xmm4, %xmm5, %xmm6, %xmm7,
1226		     %xmm8, %xmm9, %xmm10, %xmm11,
1227		     %xmm12, %xmm13, %xmm14,
1228		     %xmm15, %rax, %r9, 11);
1229	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
1230		     %xmm12, %xmm13, %xmm14, %xmm15,
1231		     %xmm0, %xmm1, %xmm2, %xmm3,
1232		     %xmm4, %xmm5, %xmm6, %xmm7,
1233		     %rax, %r9, 12);
1234	cmpl $14, ARIA_CTX_rounds(CTX);
1235	jne .Laria_gfni_256;
1236	aria_ff_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
1237		     %xmm4, %xmm5, %xmm6, %xmm7,
1238		     %xmm8, %xmm9, %xmm10, %xmm11,
1239		     %xmm12, %xmm13, %xmm14,
1240		     %xmm15, %rax, %r9, 13, 14);
1241	jmp .Laria_gfni_end;
1242.Laria_gfni_256:
1243	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
1244		     %xmm4, %xmm5, %xmm6, %xmm7,
1245		     %xmm8, %xmm9, %xmm10, %xmm11,
1246		     %xmm12, %xmm13, %xmm14,
1247		     %xmm15, %rax, %r9, 13);
1248	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
1249		     %xmm12, %xmm13, %xmm14, %xmm15,
1250		     %xmm0, %xmm1, %xmm2, %xmm3,
1251		     %xmm4, %xmm5, %xmm6, %xmm7,
1252		     %rax, %r9, 14);
1253	aria_ff_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
1254		     %xmm4, %xmm5, %xmm6, %xmm7,
1255		     %xmm8, %xmm9, %xmm10, %xmm11,
1256		     %xmm12, %xmm13, %xmm14,
1257		     %xmm15, %rax, %r9, 15, 16);
1258.Laria_gfni_end:
1259	debyteslice_16x16b(%xmm8, %xmm12, %xmm1, %xmm4,
1260			   %xmm9, %xmm13, %xmm0, %xmm5,
1261			   %xmm10, %xmm14, %xmm3, %xmm6,
1262			   %xmm11, %xmm15, %xmm2, %xmm7,
1263			   (%rax), (%r8));
1264
1265	FRAME_END
1266	RET;
1267SYM_FUNC_END(__aria_aesni_avx_gfni_crypt_16way)
1268
1269SYM_TYPED_FUNC_START(aria_aesni_avx_gfni_encrypt_16way)
1270	/* input:
1271	*      %rdi: ctx, CTX
1272	*      %rsi: dst
1273	*      %rdx: src
1274	*/
1275
1276	FRAME_BEGIN
1277
1278	leaq ARIA_CTX_enc_key(CTX), %r9;
1279
1280	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
1281		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
1282		     %xmm15, %rdx);
1283
1284	call __aria_aesni_avx_gfni_crypt_16way;
1285
1286	write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
1287		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
1288		     %xmm15, %rax);
1289
1290	FRAME_END
1291	RET;
1292SYM_FUNC_END(aria_aesni_avx_gfni_encrypt_16way)
1293
1294SYM_TYPED_FUNC_START(aria_aesni_avx_gfni_decrypt_16way)
1295	/* input:
1296	*      %rdi: ctx, CTX
1297	*      %rsi: dst
1298	*      %rdx: src
1299	*/
1300
1301	FRAME_BEGIN
1302
1303	leaq ARIA_CTX_dec_key(CTX), %r9;
1304
1305	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
1306		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
1307		     %xmm15, %rdx);
1308
1309	call __aria_aesni_avx_gfni_crypt_16way;
1310
1311	write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
1312		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
1313		     %xmm15, %rax);
1314
1315	FRAME_END
1316	RET;
1317SYM_FUNC_END(aria_aesni_avx_gfni_decrypt_16way)
1318
1319SYM_TYPED_FUNC_START(aria_aesni_avx_gfni_ctr_crypt_16way)
1320	/* input:
1321	*      %rdi: ctx
1322	*      %rsi: dst
1323	*      %rdx: src
1324	*      %rcx: keystream
1325	*      %r8: iv (big endian, 128bit)
1326	*/
1327	FRAME_BEGIN
1328
1329	call __aria_aesni_avx_ctr_gen_keystream_16way
1330
1331	leaq (%rsi), %r10;
1332	leaq (%rdx), %r11;
1333	leaq (%rcx), %rsi;
1334	leaq (%rcx), %rdx;
1335	leaq ARIA_CTX_enc_key(CTX), %r9;
1336
1337	call __aria_aesni_avx_gfni_crypt_16way;
1338
1339	vpxor (0 * 16)(%r11), %xmm1, %xmm1;
1340	vpxor (1 * 16)(%r11), %xmm0, %xmm0;
1341	vpxor (2 * 16)(%r11), %xmm3, %xmm3;
1342	vpxor (3 * 16)(%r11), %xmm2, %xmm2;
1343	vpxor (4 * 16)(%r11), %xmm4, %xmm4;
1344	vpxor (5 * 16)(%r11), %xmm5, %xmm5;
1345	vpxor (6 * 16)(%r11), %xmm6, %xmm6;
1346	vpxor (7 * 16)(%r11), %xmm7, %xmm7;
1347	vpxor (8 * 16)(%r11), %xmm8, %xmm8;
1348	vpxor (9 * 16)(%r11), %xmm9, %xmm9;
1349	vpxor (10 * 16)(%r11), %xmm10, %xmm10;
1350	vpxor (11 * 16)(%r11), %xmm11, %xmm11;
1351	vpxor (12 * 16)(%r11), %xmm12, %xmm12;
1352	vpxor (13 * 16)(%r11), %xmm13, %xmm13;
1353	vpxor (14 * 16)(%r11), %xmm14, %xmm14;
1354	vpxor (15 * 16)(%r11), %xmm15, %xmm15;
1355	write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
1356		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
1357		     %xmm15, %r10);
1358
1359	FRAME_END
1360	RET;
1361SYM_FUNC_END(aria_aesni_avx_gfni_ctr_crypt_16way)
1362#endif /* CONFIG_AS_GFNI */
v6.2
   1/* SPDX-License-Identifier: GPL-2.0-or-later */
   2/*
   3 * ARIA Cipher 16-way parallel algorithm (AVX)
   4 *
   5 * Copyright (c) 2022 Taehee Yoo <ap420073@gmail.com>
   6 *
   7 */
   8
   9#include <linux/linkage.h>
  10#include <linux/cfi_types.h>
 
  11#include <asm/frame.h>
  12
  13/* struct aria_ctx: */
  14#define enc_key 0
  15#define dec_key 272
  16#define rounds 544
  17
  18/* register macros */
  19#define CTX %rdi
  20
  21
  22#define BV8(a0, a1, a2, a3, a4, a5, a6, a7)		\
  23	( (((a0) & 1) << 0) |				\
  24	  (((a1) & 1) << 1) |				\
  25	  (((a2) & 1) << 2) |				\
  26	  (((a3) & 1) << 3) |				\
  27	  (((a4) & 1) << 4) |				\
  28	  (((a5) & 1) << 5) |				\
  29	  (((a6) & 1) << 6) |				\
  30	  (((a7) & 1) << 7) )
  31
  32#define BM8X8(l0, l1, l2, l3, l4, l5, l6, l7)		\
  33	( ((l7) << (0 * 8)) |				\
  34	  ((l6) << (1 * 8)) |				\
  35	  ((l5) << (2 * 8)) |				\
  36	  ((l4) << (3 * 8)) |				\
  37	  ((l3) << (4 * 8)) |				\
  38	  ((l2) << (5 * 8)) |				\
  39	  ((l1) << (6 * 8)) |				\
  40	  ((l0) << (7 * 8)) )
  41
  42#define inc_le128(x, minus_one, tmp)			\
  43	vpcmpeqq minus_one, x, tmp;			\
  44	vpsubq minus_one, x, x;				\
  45	vpslldq $8, tmp, tmp;				\
  46	vpsubq tmp, x, x;
  47
  48#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0)	\
  49	vpand x, mask4bit, tmp0;			\
  50	vpandn x, mask4bit, x;				\
  51	vpsrld $4, x, x;				\
  52							\
  53	vpshufb tmp0, lo_t, tmp0;			\
  54	vpshufb x, hi_t, x;				\
  55	vpxor tmp0, x, x;
  56
  57#define transpose_4x4(x0, x1, x2, x3, t1, t2)		\
  58	vpunpckhdq x1, x0, t2;				\
  59	vpunpckldq x1, x0, x0;				\
  60							\
  61	vpunpckldq x3, x2, t1;				\
  62	vpunpckhdq x3, x2, x2;				\
  63							\
  64	vpunpckhqdq t1, x0, x1;				\
  65	vpunpcklqdq t1, x0, x0;				\
  66							\
  67	vpunpckhqdq x2, t2, x3;				\
  68	vpunpcklqdq x2, t2, x2;
  69
  70#define byteslice_16x16b(a0, b0, c0, d0,		\
  71			 a1, b1, c1, d1,		\
  72			 a2, b2, c2, d2,		\
  73			 a3, b3, c3, d3,		\
  74			 st0, st1)			\
  75	vmovdqu d2, st0;				\
  76	vmovdqu d3, st1;				\
  77	transpose_4x4(a0, a1, a2, a3, d2, d3);		\
  78	transpose_4x4(b0, b1, b2, b3, d2, d3);		\
  79	vmovdqu st0, d2;				\
  80	vmovdqu st1, d3;				\
  81							\
  82	vmovdqu a0, st0;				\
  83	vmovdqu a1, st1;				\
  84	transpose_4x4(c0, c1, c2, c3, a0, a1);		\
  85	transpose_4x4(d0, d1, d2, d3, a0, a1);		\
  86							\
  87	vmovdqu .Lshufb_16x16b, a0;			\
  88	vmovdqu st1, a1;				\
  89	vpshufb a0, a2, a2;				\
  90	vpshufb a0, a3, a3;				\
  91	vpshufb a0, b0, b0;				\
  92	vpshufb a0, b1, b1;				\
  93	vpshufb a0, b2, b2;				\
  94	vpshufb a0, b3, b3;				\
  95	vpshufb a0, a1, a1;				\
  96	vpshufb a0, c0, c0;				\
  97	vpshufb a0, c1, c1;				\
  98	vpshufb a0, c2, c2;				\
  99	vpshufb a0, c3, c3;				\
 100	vpshufb a0, d0, d0;				\
 101	vpshufb a0, d1, d1;				\
 102	vpshufb a0, d2, d2;				\
 103	vpshufb a0, d3, d3;				\
 104	vmovdqu d3, st1;				\
 105	vmovdqu st0, d3;				\
 106	vpshufb a0, d3, a0;				\
 107	vmovdqu d2, st0;				\
 108							\
 109	transpose_4x4(a0, b0, c0, d0, d2, d3);		\
 110	transpose_4x4(a1, b1, c1, d1, d2, d3);		\
 111	vmovdqu st0, d2;				\
 112	vmovdqu st1, d3;				\
 113							\
 114	vmovdqu b0, st0;				\
 115	vmovdqu b1, st1;				\
 116	transpose_4x4(a2, b2, c2, d2, b0, b1);		\
 117	transpose_4x4(a3, b3, c3, d3, b0, b1);		\
 118	vmovdqu st0, b0;				\
 119	vmovdqu st1, b1;				\
 120	/* does not adjust output bytes inside vectors */
 121
 122#define debyteslice_16x16b(a0, b0, c0, d0,		\
 123			   a1, b1, c1, d1,		\
 124			   a2, b2, c2, d2,		\
 125			   a3, b3, c3, d3,		\
 126			   st0, st1)			\
 127	vmovdqu d2, st0;				\
 128	vmovdqu d3, st1;				\
 129	transpose_4x4(a0, a1, a2, a3, d2, d3);		\
 130	transpose_4x4(b0, b1, b2, b3, d2, d3);		\
 131	vmovdqu st0, d2;				\
 132	vmovdqu st1, d3;				\
 133							\
 134	vmovdqu a0, st0;				\
 135	vmovdqu a1, st1;				\
 136	transpose_4x4(c0, c1, c2, c3, a0, a1);		\
 137	transpose_4x4(d0, d1, d2, d3, a0, a1);		\
 138							\
 139	vmovdqu .Lshufb_16x16b, a0;			\
 140	vmovdqu st1, a1;				\
 141	vpshufb a0, a2, a2;				\
 142	vpshufb a0, a3, a3;				\
 143	vpshufb a0, b0, b0;				\
 144	vpshufb a0, b1, b1;				\
 145	vpshufb a0, b2, b2;				\
 146	vpshufb a0, b3, b3;				\
 147	vpshufb a0, a1, a1;				\
 148	vpshufb a0, c0, c0;				\
 149	vpshufb a0, c1, c1;				\
 150	vpshufb a0, c2, c2;				\
 151	vpshufb a0, c3, c3;				\
 152	vpshufb a0, d0, d0;				\
 153	vpshufb a0, d1, d1;				\
 154	vpshufb a0, d2, d2;				\
 155	vpshufb a0, d3, d3;				\
 156	vmovdqu d3, st1;				\
 157	vmovdqu st0, d3;				\
 158	vpshufb a0, d3, a0;				\
 159	vmovdqu d2, st0;				\
 160							\
 161	transpose_4x4(c0, d0, a0, b0, d2, d3);		\
 162	transpose_4x4(c1, d1, a1, b1, d2, d3);		\
 163	vmovdqu st0, d2;				\
 164	vmovdqu st1, d3;				\
 165							\
 166	vmovdqu b0, st0;				\
 167	vmovdqu b1, st1;				\
 168	transpose_4x4(c2, d2, a2, b2, b0, b1);		\
 169	transpose_4x4(c3, d3, a3, b3, b0, b1);		\
 170	vmovdqu st0, b0;				\
 171	vmovdqu st1, b1;				\
 172	/* does not adjust output bytes inside vectors */
 173
 174/* load blocks to registers and apply pre-whitening */
 175#define inpack16_pre(x0, x1, x2, x3,			\
 176		     x4, x5, x6, x7,			\
 177		     y0, y1, y2, y3,			\
 178		     y4, y5, y6, y7,			\
 179		     rio)				\
 180	vmovdqu (0 * 16)(rio), x0;			\
 181	vmovdqu (1 * 16)(rio), x1;			\
 182	vmovdqu (2 * 16)(rio), x2;			\
 183	vmovdqu (3 * 16)(rio), x3;			\
 184	vmovdqu (4 * 16)(rio), x4;			\
 185	vmovdqu (5 * 16)(rio), x5;			\
 186	vmovdqu (6 * 16)(rio), x6;			\
 187	vmovdqu (7 * 16)(rio), x7;			\
 188	vmovdqu (8 * 16)(rio), y0;			\
 189	vmovdqu (9 * 16)(rio), y1;			\
 190	vmovdqu (10 * 16)(rio), y2;			\
 191	vmovdqu (11 * 16)(rio), y3;			\
 192	vmovdqu (12 * 16)(rio), y4;			\
 193	vmovdqu (13 * 16)(rio), y5;			\
 194	vmovdqu (14 * 16)(rio), y6;			\
 195	vmovdqu (15 * 16)(rio), y7;
 196
 197/* byteslice pre-whitened blocks and store to temporary memory */
 198#define inpack16_post(x0, x1, x2, x3,			\
 199		      x4, x5, x6, x7,			\
 200		      y0, y1, y2, y3,			\
 201		      y4, y5, y6, y7,			\
 202		      mem_ab, mem_cd)			\
 203	byteslice_16x16b(x0, x1, x2, x3,		\
 204			 x4, x5, x6, x7,		\
 205			 y0, y1, y2, y3,		\
 206			 y4, y5, y6, y7,		\
 207			 (mem_ab), (mem_cd));		\
 208							\
 209	vmovdqu x0, 0 * 16(mem_ab);			\
 210	vmovdqu x1, 1 * 16(mem_ab);			\
 211	vmovdqu x2, 2 * 16(mem_ab);			\
 212	vmovdqu x3, 3 * 16(mem_ab);			\
 213	vmovdqu x4, 4 * 16(mem_ab);			\
 214	vmovdqu x5, 5 * 16(mem_ab);			\
 215	vmovdqu x6, 6 * 16(mem_ab);			\
 216	vmovdqu x7, 7 * 16(mem_ab);			\
 217	vmovdqu y0, 0 * 16(mem_cd);			\
 218	vmovdqu y1, 1 * 16(mem_cd);			\
 219	vmovdqu y2, 2 * 16(mem_cd);			\
 220	vmovdqu y3, 3 * 16(mem_cd);			\
 221	vmovdqu y4, 4 * 16(mem_cd);			\
 222	vmovdqu y5, 5 * 16(mem_cd);			\
 223	vmovdqu y6, 6 * 16(mem_cd);			\
 224	vmovdqu y7, 7 * 16(mem_cd);
 225
 226#define write_output(x0, x1, x2, x3,			\
 227		     x4, x5, x6, x7,			\
 228		     y0, y1, y2, y3,			\
 229		     y4, y5, y6, y7,			\
 230		     mem)				\
 231	vmovdqu x0, 0 * 16(mem);			\
 232	vmovdqu x1, 1 * 16(mem);			\
 233	vmovdqu x2, 2 * 16(mem);			\
 234	vmovdqu x3, 3 * 16(mem);			\
 235	vmovdqu x4, 4 * 16(mem);			\
 236	vmovdqu x5, 5 * 16(mem);			\
 237	vmovdqu x6, 6 * 16(mem);			\
 238	vmovdqu x7, 7 * 16(mem);			\
 239	vmovdqu y0, 8 * 16(mem);			\
 240	vmovdqu y1, 9 * 16(mem);			\
 241	vmovdqu y2, 10 * 16(mem);			\
 242	vmovdqu y3, 11 * 16(mem);			\
 243	vmovdqu y4, 12 * 16(mem);			\
 244	vmovdqu y5, 13 * 16(mem);			\
 245	vmovdqu y6, 14 * 16(mem);			\
 246	vmovdqu y7, 15 * 16(mem);			\
 247
 248#define aria_store_state_8way(x0, x1, x2, x3,		\
 249			      x4, x5, x6, x7,		\
 250			      mem_tmp, idx)		\
 251	vmovdqu x0, ((idx + 0) * 16)(mem_tmp);		\
 252	vmovdqu x1, ((idx + 1) * 16)(mem_tmp);		\
 253	vmovdqu x2, ((idx + 2) * 16)(mem_tmp);		\
 254	vmovdqu x3, ((idx + 3) * 16)(mem_tmp);		\
 255	vmovdqu x4, ((idx + 4) * 16)(mem_tmp);		\
 256	vmovdqu x5, ((idx + 5) * 16)(mem_tmp);		\
 257	vmovdqu x6, ((idx + 6) * 16)(mem_tmp);		\
 258	vmovdqu x7, ((idx + 7) * 16)(mem_tmp);
 259
 260#define aria_load_state_8way(x0, x1, x2, x3,		\
 261			     x4, x5, x6, x7,		\
 262			     mem_tmp, idx)		\
 263	vmovdqu ((idx + 0) * 16)(mem_tmp), x0;		\
 264	vmovdqu ((idx + 1) * 16)(mem_tmp), x1;		\
 265	vmovdqu ((idx + 2) * 16)(mem_tmp), x2;		\
 266	vmovdqu ((idx + 3) * 16)(mem_tmp), x3;		\
 267	vmovdqu ((idx + 4) * 16)(mem_tmp), x4;		\
 268	vmovdqu ((idx + 5) * 16)(mem_tmp), x5;		\
 269	vmovdqu ((idx + 6) * 16)(mem_tmp), x6;		\
 270	vmovdqu ((idx + 7) * 16)(mem_tmp), x7;
 271
 272#define aria_ark_8way(x0, x1, x2, x3,			\
 273		      x4, x5, x6, x7,			\
 274		      t0, rk, idx, round)		\
 
 275	/* AddRoundKey */                               \
 276	vpbroadcastb ((round * 16) + idx + 3)(rk), t0;	\
 277	vpxor t0, x0, x0;				\
 278	vpbroadcastb ((round * 16) + idx + 2)(rk), t0;	\
 279	vpxor t0, x1, x1;				\
 280	vpbroadcastb ((round * 16) + idx + 1)(rk), t0;	\
 281	vpxor t0, x2, x2;				\
 282	vpbroadcastb ((round * 16) + idx + 0)(rk), t0;	\
 283	vpxor t0, x3, x3;				\
 284	vpbroadcastb ((round * 16) + idx + 7)(rk), t0;	\
 285	vpxor t0, x4, x4;				\
 286	vpbroadcastb ((round * 16) + idx + 6)(rk), t0;	\
 287	vpxor t0, x5, x5;				\
 288	vpbroadcastb ((round * 16) + idx + 5)(rk), t0;	\
 289	vpxor t0, x6, x6;				\
 290	vpbroadcastb ((round * 16) + idx + 4)(rk), t0;	\
 291	vpxor t0, x7, x7;
 
 
 
 
 
 
 
 
 292
 
 293#define aria_sbox_8way_gfni(x0, x1, x2, x3,		\
 294			    x4, x5, x6, x7,		\
 295			    t0, t1, t2, t3,		\
 296			    t4, t5, t6, t7)		\
 297	vpbroadcastq .Ltf_s2_bitmatrix, t0;		\
 298	vpbroadcastq .Ltf_inv_bitmatrix, t1;		\
 299	vpbroadcastq .Ltf_id_bitmatrix, t2;		\
 300	vpbroadcastq .Ltf_aff_bitmatrix, t3;		\
 301	vpbroadcastq .Ltf_x2_bitmatrix, t4;		\
 302	vgf2p8affineinvqb $(tf_s2_const), t0, x1, x1;	\
 303	vgf2p8affineinvqb $(tf_s2_const), t0, x5, x5;	\
 304	vgf2p8affineqb $(tf_inv_const), t1, x2, x2;	\
 305	vgf2p8affineqb $(tf_inv_const), t1, x6, x6;	\
 306	vgf2p8affineinvqb $0, t2, x2, x2;		\
 307	vgf2p8affineinvqb $0, t2, x6, x6;		\
 308	vgf2p8affineinvqb $(tf_aff_const), t3, x0, x0;	\
 309	vgf2p8affineinvqb $(tf_aff_const), t3, x4, x4;	\
 310	vgf2p8affineqb $(tf_x2_const), t4, x3, x3;	\
 311	vgf2p8affineqb $(tf_x2_const), t4, x7, x7;	\
 312	vgf2p8affineinvqb $0, t2, x3, x3;		\
 313	vgf2p8affineinvqb $0, t2, x7, x7
 314
 
 
 315#define aria_sbox_8way(x0, x1, x2, x3,            	\
 316		       x4, x5, x6, x7,			\
 317		       t0, t1, t2, t3,			\
 318		       t4, t5, t6, t7)			\
 319	vpxor t7, t7, t7;				\
 320	vmovdqa .Linv_shift_row, t0;			\
 321	vmovdqa .Lshift_row, t1;			\
 322	vpbroadcastd .L0f0f0f0f, t6;			\
 323	vmovdqa .Ltf_lo__inv_aff__and__s2, t2;		\
 324	vmovdqa .Ltf_hi__inv_aff__and__s2, t3;		\
 325	vmovdqa .Ltf_lo__x2__and__fwd_aff, t4;		\
 326	vmovdqa .Ltf_hi__x2__and__fwd_aff, t5;		\
 327							\
 328	vaesenclast t7, x0, x0;				\
 329	vaesenclast t7, x4, x4;				\
 330	vaesenclast t7, x1, x1;				\
 331	vaesenclast t7, x5, x5;				\
 332	vaesdeclast t7, x2, x2;				\
 333	vaesdeclast t7, x6, x6;				\
 334							\
 335	/* AES inverse shift rows */			\
 336	vpshufb t0, x0, x0;				\
 337	vpshufb t0, x4, x4;				\
 338	vpshufb t0, x1, x1;				\
 339	vpshufb t0, x5, x5;				\
 340	vpshufb t1, x3, x3;				\
 341	vpshufb t1, x7, x7;				\
 342	vpshufb t1, x2, x2;				\
 343	vpshufb t1, x6, x6;				\
 344							\
 345	/* affine transformation for S2 */		\
 346	filter_8bit(x1, t2, t3, t6, t0);		\
 347	/* affine transformation for S2 */		\
 348	filter_8bit(x5, t2, t3, t6, t0);		\
 349							\
 350	/* affine transformation for X2 */		\
 351	filter_8bit(x3, t4, t5, t6, t0);		\
 352	/* affine transformation for X2 */		\
 353	filter_8bit(x7, t4, t5, t6, t0);		\
 354	vaesdeclast t7, x3, x3;				\
 355	vaesdeclast t7, x7, x7;
 356
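/*
 * aria_sbox_8way() above builds the same S-boxes from AES-NI.  With an
 * all-zero round key, vaesenclast computes SubBytes(ShiftRows(x)) and
 * vaesdeclast computes InvSubBytes(InvShiftRows(x)); since (Inv)SubBytes is
 * byte-wise it commutes with the row permutation, so one extra vpshufb with
 * .Linv_shift_row/.Lshift_row cancels the permutation and leaves the bare AES
 * S-box (ARIA's S1) or its inverse.  Illustrative C sketch, assuming
 * hypothetical helpers shift_rows()/sub_bytes() and a permutation table
 * inv_sr[16] matching .Linv_shift_row:
 *
 *	void isolate_sub_bytes(uint8_t out[16], const uint8_t in[16])
 *	{
 *		uint8_t t[16];
 *		int i;
 *
 *		shift_rows(t, in);		// done inside AESENCLAST
 *		sub_bytes(t, t);		// round key is zero
 *		for (i = 0; i < 16; i++)	// vpshufb .Linv_shift_row
 *			out[i] = t[inv_sr[i]];	// undoes ShiftRows: out = S1(in)
 *	}
 *
 * filter_8bit() then applies the extra affine transforms (the .Ltf_lo_ and
 * .Ltf_hi_ tables below) that turn the AES S-box cores into ARIA's S2 and
 * S2^-1.
 */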
 357#define aria_diff_m(x0, x1, x2, x3,			\
 358		    t0, t1, t2, t3)			\
 359	/* T = rotr32(X, 8); */				\
 360	/* X ^= T */					\
 361	vpxor x0, x3, t0;				\
 362	vpxor x1, x0, t1;				\
 363	vpxor x2, x1, t2;				\
 364	vpxor x3, x2, t3;				\
 365	/* X = T ^ rotr(X, 16); */			\
 366	vpxor t2, x0, x0;				\
 367	vpxor x1, t3, t3;				\
 368	vpxor t0, x2, x2;				\
 369	vpxor t1, x3, x1;				\
 370	vmovdqu t3, x3;
 371
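/*
 * aria_diff_m() above is the per-word mixing step of the diffusion layer.  In
 * the byte-sliced representation a 32-bit rotation by a multiple of 8 is just
 * a renaming of the four byte-position registers, so the operation in the
 * comments collapses into the XORs above.  On one ordinary 32-bit word it
 * reads (illustrative C sketch):
 *
 *	static uint32_t rotr32(uint32_t v, int n)
 *	{
 *		return (v >> n) | (v << (32 - n));
 *	}
 *
 *	static uint32_t aria_diff_m_word(uint32_t x)
 *	{
 *		uint32_t t = rotr32(x, 8);
 *
 *		x ^= t;
 *		return t ^ rotr32(x, 16);
 *	}
 */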
 372#define aria_diff_word(x0, x1, x2, x3,			\
 373		       x4, x5, x6, x7,			\
 374		       y0, y1, y2, y3,			\
 375		       y4, y5, y6, y7)			\
 376	/* t1 ^= t2; */					\
 377	vpxor y0, x4, x4;				\
 378	vpxor y1, x5, x5;				\
 379	vpxor y2, x6, x6;				\
 380	vpxor y3, x7, x7;				\
 381							\
 382	/* t2 ^= t3; */					\
 383	vpxor y4, y0, y0;				\
 384	vpxor y5, y1, y1;				\
 385	vpxor y6, y2, y2;				\
 386	vpxor y7, y3, y3;				\
 387							\
 388	/* t0 ^= t1; */					\
 389	vpxor x4, x0, x0;				\
 390	vpxor x5, x1, x1;				\
 391	vpxor x6, x2, x2;				\
 392	vpxor x7, x3, x3;				\
 393							\
 394	/* t3 ^= t1; */					\
 395	vpxor x4, y4, y4;				\
 396	vpxor x5, y5, y5;				\
 397	vpxor x6, y6, y6;				\
 398	vpxor x7, y7, y7;				\
 399							\
 400	/* t2 ^= t0; */					\
 401	vpxor x0, y0, y0;				\
 402	vpxor x1, y1, y1;				\
 403	vpxor x2, y2, y2;				\
 404	vpxor x3, y3, y3;				\
 405							\
 406	/* t1 ^= t2; */					\
 407	vpxor y0, x4, x4;				\
 408	vpxor y1, x5, x5;				\
 409	vpxor y2, x6, x6;				\
 410	vpxor y3, x7, x7;
 411
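/*
 * aria_diff_word() above is the word-wise XOR network of the diffusion layer;
 * the comments name the four word groups t0..t3 (t0 = x0..x3, t1 = x4..x7,
 * t2 = y0..y3, t3 = y4..y7).  On plain 32-bit words the same sequence is
 * (illustrative C sketch):
 *
 *	static void aria_diff_word(uint32_t *t0, uint32_t *t1,
 *				   uint32_t *t2, uint32_t *t3)
 *	{
 *		*t1 ^= *t2;
 *		*t2 ^= *t3;
 *		*t0 ^= *t1;
 *		*t3 ^= *t1;
 *		*t2 ^= *t0;
 *		*t1 ^= *t2;
 *	}
 */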
 412#define aria_fe(x0, x1, x2, x3,				\
 413		x4, x5, x6, x7,				\
 414		y0, y1, y2, y3,				\
 415		y4, y5, y6, y7,				\
 416		mem_tmp, rk, round)			\
 417	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
 418		      y0, rk, 8, round);		\
 419							\
 420	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,	\
 421		       y0, y1, y2, y3, y4, y5, y6, y7);	\
 422							\
 423	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
 424	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
 425	aria_store_state_8way(x0, x1, x2, x3,		\
 426			      x4, x5, x6, x7,		\
 427			      mem_tmp, 8);		\
 428							\
 429	aria_load_state_8way(x0, x1, x2, x3,		\
 430			     x4, x5, x6, x7,		\
 431			     mem_tmp, 0);		\
 432	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
 433		      y0, rk, 0, round);		\
 434							\
 435	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,	\
 436		       y0, y1, y2, y3, y4, y5, y6, y7);	\
 437							\
 438	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
 439	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
 440	aria_store_state_8way(x0, x1, x2, x3,		\
 441			      x4, x5, x6, x7,		\
 442			      mem_tmp, 0);		\
 443	aria_load_state_8way(y0, y1, y2, y3,		\
 444			     y4, y5, y6, y7,		\
 445			     mem_tmp, 8);		\
 446	aria_diff_word(x0, x1, x2, x3,			\
 447		       x4, x5, x6, x7,			\
 448		       y0, y1, y2, y3,			\
 449		       y4, y5, y6, y7);			\
 450	/* aria_diff_byte() 				\
 451	 * T3 = ABCD -> BADC 				\
 452	 * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6 	\
 453	 * T0 = ABCD -> CDAB 				\
 454	 * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1 	\
 455	 * T1 = ABCD -> DCBA 				\
 456	 * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4	\
 457	 */						\
 458	aria_diff_word(x2, x3, x0, x1,			\
 459		       x7, x6, x5, x4,			\
 460		       y0, y1, y2, y3,			\
 461		       y5, y4, y7, y6);			\
 462	aria_store_state_8way(x3, x2, x1, x0,		\
 463			      x6, x7, x4, x5,		\
 464			      mem_tmp, 0);
 465
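/*
 * aria_fe() above (and aria_fo() below) is one full, non-final ARIA round for
 * 16 blocks.  Only 16 xmm registers are available, so the 16 byte positions
 * are processed as two batches of 8 with spills to mem_tmp, and the byte
 * permutation of the diffusion layer (the aria_diff_byte() comments) is
 * folded into the register ordering rather than done as a separate shuffle.
 * Per block, one round is roughly (illustrative C sketch; the helpers are
 * hypothetical stand-ins for the macros above):
 *
 *	static void aria_round(uint8_t state[16], const uint8_t *rk, int round)
 *	{
 *		add_round_key(state, rk, round);   // aria_ark_8way
 *		substitution_layer(state);         // aria_sbox_8way[_gfni]
 *		diffusion_a(state);                // aria_diff_m/word/byte
 *	}
 */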
 466#define aria_fo(x0, x1, x2, x3,				\
 467		x4, x5, x6, x7,				\
 468		y0, y1, y2, y3,				\
 469		y4, y5, y6, y7,				\
 470		mem_tmp, rk, round)			\
 471	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
 472		      y0, rk, 8, round);		\
 473							\
 474	aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
 475		       y0, y1, y2, y3, y4, y5, y6, y7);	\
 476							\
 477	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
 478	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
 479	aria_store_state_8way(x0, x1, x2, x3,		\
 480			      x4, x5, x6, x7,		\
 481			      mem_tmp, 8);		\
 482							\
 483	aria_load_state_8way(x0, x1, x2, x3,		\
 484			     x4, x5, x6, x7,		\
 485			     mem_tmp, 0);		\
 486	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
 487		      y0, rk, 0, round);		\
 488							\
 489	aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
 490		       y0, y1, y2, y3, y4, y5, y6, y7);	\
 491							\
 492	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
 493	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
 494	aria_store_state_8way(x0, x1, x2, x3,		\
 495			      x4, x5, x6, x7,		\
 496			      mem_tmp, 0);		\
 497	aria_load_state_8way(y0, y1, y2, y3,		\
 498			     y4, y5, y6, y7,		\
 499			     mem_tmp, 8);		\
 500	aria_diff_word(x0, x1, x2, x3,			\
 501		       x4, x5, x6, x7,			\
 502		       y0, y1, y2, y3,			\
 503		       y4, y5, y6, y7);			\
 504	/* aria_diff_byte() 				\
 505	 * T1 = ABCD -> BADC 				\
 506	 * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6	\
 507	 * T2 = ABCD -> CDAB 				\
 508	 * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1 	\
 509	 * T3 = ABCD -> DCBA 				\
 510	 * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4 	\
 511	 */						\
 512	aria_diff_word(x0, x1, x2, x3,			\
 513		       x5, x4, x7, x6,			\
 514		       y2, y3, y0, y1,			\
 515		       y7, y6, y5, y4);			\
 516	aria_store_state_8way(x3, x2, x1, x0,		\
 517			      x6, x7, x4, x5,		\
 518			      mem_tmp, 0);
 519
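/*
 * aria_fo() and aria_fe() differ only in which substitution layer they feed
 * to aria_sbox_8way() (via the argument order) and in the byte permutation
 * noted in the aria_diff_byte() comments.  RFC 5794 defines the two layers as
 * follows (illustrative C sketch; s1[], s2[], s1_inv[], s2_inv[] are assumed
 * 256-entry S-box tables):
 *
 *	static void sl1(uint8_t b[16])		// odd rounds (aria_fo)
 *	{
 *		int i;
 *
 *		for (i = 0; i < 16; i += 4) {
 *			b[i + 0] = s1[b[i + 0]];
 *			b[i + 1] = s2[b[i + 1]];
 *			b[i + 2] = s1_inv[b[i + 2]];
 *			b[i + 3] = s2_inv[b[i + 3]];
 *		}
 *	}
 *
 *	static void sl2(uint8_t b[16])		// even rounds (aria_fe)
 *	{
 *		int i;
 *
 *		for (i = 0; i < 16; i += 4) {
 *			b[i + 0] = s1_inv[b[i + 0]];
 *			b[i + 1] = s2_inv[b[i + 1]];
 *			b[i + 2] = s1[b[i + 2]];
 *			b[i + 3] = s2[b[i + 3]];
 *		}
 *	}
 */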
 520#define aria_ff(x0, x1, x2, x3,				\
 521		x4, x5, x6, x7,				\
 522		y0, y1, y2, y3,				\
 523		y4, y5, y6, y7,				\
 524		mem_tmp, rk, round, last_round)		\
 525	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
 526		      y0, rk, 8, round);		\
 527							\
 528	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,	\
 529		       y0, y1, y2, y3, y4, y5, y6, y7);	\
 530							\
 531	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
 532		      y0, rk, 8, last_round);		\
 533							\
 534	aria_store_state_8way(x0, x1, x2, x3,		\
 535			      x4, x5, x6, x7,		\
 536			      mem_tmp, 8);		\
 537							\
 538	aria_load_state_8way(x0, x1, x2, x3,		\
 539			     x4, x5, x6, x7,		\
 540			     mem_tmp, 0);		\
 541	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
 542		      y0, rk, 0, round);		\
 543							\
 544	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,	\
 545		       y0, y1, y2, y3, y4, y5, y6, y7);	\
 546							\
 547	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
 548		      y0, rk, 0, last_round);		\
 549							\
 550	aria_load_state_8way(y0, y1, y2, y3,		\
 551			     y4, y5, y6, y7,		\
 552			     mem_tmp, 8);
 553
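/*
 * aria_ff() above is the final round: key addition, substitution (SL2
 * ordering, as in aria_fe()), then a second key addition with the following
 * round key, and no diffusion layer.  Per block (illustrative C sketch,
 * reusing the hypothetical helpers from the aria_round() sketch above):
 *
 *	static void aria_final_round(uint8_t state[16], const uint8_t *rk,
 *				     int round, int last_round)
 *	{
 *		add_round_key(state, rk, round);
 *		substitution_layer(state);
 *		add_round_key(state, rk, last_round);
 *	}
 */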
 554#define aria_fe_gfni(x0, x1, x2, x3,			\
 555		     x4, x5, x6, x7,			\
 556		     y0, y1, y2, y3,			\
 557		     y4, y5, y6, y7,			\
 558		     mem_tmp, rk, round)		\
 559	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
 560		      y0, rk, 8, round);		\
 561							\
 562	aria_sbox_8way_gfni(x2, x3, x0, x1, 		\
 563			    x6, x7, x4, x5,		\
 564			    y0, y1, y2, y3, 		\
 565			    y4, y5, y6, y7);		\
 566							\
 567	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
 568	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
 569	aria_store_state_8way(x0, x1, x2, x3,		\
 570			      x4, x5, x6, x7,		\
 571			      mem_tmp, 8);		\
 572							\
 573	aria_load_state_8way(x0, x1, x2, x3,		\
 574			     x4, x5, x6, x7,		\
 575			     mem_tmp, 0);		\
 576	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
 577		      y0, rk, 0, round);		\
 578							\
 579	aria_sbox_8way_gfni(x2, x3, x0, x1, 		\
 580			    x6, x7, x4, x5,		\
 581			    y0, y1, y2, y3, 		\
 582			    y4, y5, y6, y7);		\
 583							\
 584	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
 585	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
 586	aria_store_state_8way(x0, x1, x2, x3,		\
 587			      x4, x5, x6, x7,		\
 588			      mem_tmp, 0);		\
 589	aria_load_state_8way(y0, y1, y2, y3,		\
 590			     y4, y5, y6, y7,		\
 591			     mem_tmp, 8);		\
 592	aria_diff_word(x0, x1, x2, x3,			\
 593		       x4, x5, x6, x7,			\
 594		       y0, y1, y2, y3,			\
 595		       y4, y5, y6, y7);			\
 596	/* aria_diff_byte() 				\
 597	 * T3 = ABCD -> BADC 				\
 598	 * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6 	\
 599	 * T0 = ABCD -> CDAB 				\
 600	 * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1 	\
 601	 * T1 = ABCD -> DCBA 				\
 602	 * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4	\
 603	 */						\
 604	aria_diff_word(x2, x3, x0, x1,			\
 605		       x7, x6, x5, x4,			\
 606		       y0, y1, y2, y3,			\
 607		       y5, y4, y7, y6);			\
 608	aria_store_state_8way(x3, x2, x1, x0,		\
 609			      x6, x7, x4, x5,		\
 610			      mem_tmp, 0);
 611
 612#define aria_fo_gfni(x0, x1, x2, x3,			\
 613		     x4, x5, x6, x7,			\
 614		     y0, y1, y2, y3,			\
 615		     y4, y5, y6, y7,			\
 616		     mem_tmp, rk, round)		\
 617	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
 618		      y0, rk, 8, round);		\
 619							\
 620	aria_sbox_8way_gfni(x0, x1, x2, x3, 		\
 621			    x4, x5, x6, x7,		\
 622			    y0, y1, y2, y3, 		\
 623			    y4, y5, y6, y7);		\
 624							\
 625	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
 626	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
 627	aria_store_state_8way(x0, x1, x2, x3,		\
 628			      x4, x5, x6, x7,		\
 629			      mem_tmp, 8);		\
 630							\
 631	aria_load_state_8way(x0, x1, x2, x3,		\
 632			     x4, x5, x6, x7,		\
 633			     mem_tmp, 0);		\
 634	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
 635		      y0, rk, 0, round);		\
 636							\
 637	aria_sbox_8way_gfni(x0, x1, x2, x3, 		\
 638			    x4, x5, x6, x7,		\
 639			    y0, y1, y2, y3, 		\
 640			    y4, y5, y6, y7);		\
 641							\
 642	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
 643	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
 644	aria_store_state_8way(x0, x1, x2, x3,		\
 645			      x4, x5, x6, x7,		\
 646			      mem_tmp, 0);		\
 647	aria_load_state_8way(y0, y1, y2, y3,		\
 648			     y4, y5, y6, y7,		\
 649			     mem_tmp, 8);		\
 650	aria_diff_word(x0, x1, x2, x3,			\
 651		       x4, x5, x6, x7,			\
 652		       y0, y1, y2, y3,			\
 653		       y4, y5, y6, y7);			\
 654	/* aria_diff_byte() 				\
 655	 * T1 = ABCD -> BADC 				\
 656	 * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6	\
 657	 * T2 = ABCD -> CDAB 				\
 658	 * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1 	\
 659	 * T3 = ABCD -> DCBA 				\
 660	 * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4 	\
 661	 */						\
 662	aria_diff_word(x0, x1, x2, x3,			\
 663		       x5, x4, x7, x6,			\
 664		       y2, y3, y0, y1,			\
 665		       y7, y6, y5, y4);			\
 666	aria_store_state_8way(x3, x2, x1, x0,		\
 667			      x6, x7, x4, x5,		\
 668			      mem_tmp, 0);
 669
 670#define aria_ff_gfni(x0, x1, x2, x3,			\
 671		x4, x5, x6, x7,				\
 672		y0, y1, y2, y3,				\
 673		y4, y5, y6, y7,				\
 674		mem_tmp, rk, round, last_round)		\
 675	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
 676		      y0, rk, 8, round);		\
 677							\
 678	aria_sbox_8way_gfni(x2, x3, x0, x1, 		\
 679			    x6, x7, x4, x5,		\
 680			    y0, y1, y2, y3, 		\
 681			    y4, y5, y6, y7);		\
 682							\
 683	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
 684		      y0, rk, 8, last_round);		\
 685							\
 686	aria_store_state_8way(x0, x1, x2, x3,		\
 687			      x4, x5, x6, x7,		\
 688			      mem_tmp, 8);		\
 689							\
 690	aria_load_state_8way(x0, x1, x2, x3,		\
 691			     x4, x5, x6, x7,		\
 692			     mem_tmp, 0);		\
 693	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
 694		      y0, rk, 0, round);		\
 695							\
 696	aria_sbox_8way_gfni(x2, x3, x0, x1, 		\
 697			    x6, x7, x4, x5,		\
 698			    y0, y1, y2, y3, 		\
 699			    y4, y5, y6, y7);		\
 700							\
 701	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
 702		      y0, rk, 0, last_round);		\
 703							\
 704	aria_load_state_8way(y0, y1, y2, y3,		\
 705			     y4, y5, y6, y7,		\
 706			     mem_tmp, 8);
 707
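/*
 * The *_gfni macros above are identical to the AES-NI based ones except that
 * they use aria_sbox_8way_gfni() for the substitution layer.  Which set of
 * entry points below gets used is decided by the C glue code at init time,
 * roughly along these lines (illustrative sketch only, not the actual glue
 * code; the ops structure and field name are hypothetical):
 *
 *	if (boot_cpu_has(X86_FEATURE_GFNI))
 *		ops->encrypt_16way = aria_aesni_avx_gfni_encrypt_16way;
 *	else
 *		ops->encrypt_16way = aria_aesni_avx_encrypt_16way;
 */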
 708/* NB: section is mergeable, all elements must be aligned 16-byte blocks */
 709.section	.rodata.cst16, "aM", @progbits, 16
 710.align 16
 711
 712#define SHUFB_BYTES(idx) \
 713	0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)
 714
 715.Lshufb_16x16b:
 716	.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3);
 717/* For isolating SubBytes from AESENCLAST, inverse shift row */
 718.Linv_shift_row:
 719	.byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
 720	.byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
 721.Lshift_row:
 722	.byte 0x00, 0x05, 0x0a, 0x0f, 0x04, 0x09, 0x0e, 0x03
 723	.byte 0x08, 0x0d, 0x02, 0x07, 0x0c, 0x01, 0x06, 0x0b
 724/* For CTR-mode IV byteswap */
 725.Lbswap128_mask:
 726	.byte 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08
 727	.byte 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00
 728
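/*
 * .Linv_shift_row, .Lshift_row and .Lbswap128_mask are vpshufb control masks:
 * for mask bytes with bit 7 clear, vpshufb gathers dst[i] = src[mask[i] & 0x0f]
 * within each 16-byte lane, so a constant mask encodes an arbitrary byte
 * permutation (here: undoing the row shift of AESENCLAST/AESDECLAST, and
 * reversing the 16 IV bytes for CTR).  Illustrative C model of one lane:
 *
 *	static void pshufb_lane(uint8_t dst[16], const uint8_t src[16],
 *				const uint8_t mask[16])
 *	{
 *		int i;
 *
 *		for (i = 0; i < 16; i++)
 *			dst[i] = (mask[i] & 0x80) ? 0 : src[mask[i] & 0x0f];
 *	}
 */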
 729/* AES inverse affine and S2 combined:
 730 *      1 1 0 0 0 0 0 1     x0     0
 731 *      0 1 0 0 1 0 0 0     x1     0
 732 *      1 1 0 0 1 1 1 1     x2     0
 733 *      0 1 1 0 1 0 0 1     x3     1
 734 *      0 1 0 0 1 1 0 0  *  x4  +  0
 735 *      0 1 0 1 1 0 0 0     x5     0
 736 *      0 0 0 0 0 1 0 1     x6     0
 737 *      1 1 1 0 0 1 1 1     x7     1
 738 */
 739.Ltf_lo__inv_aff__and__s2:
 740	.octa 0x92172DA81A9FA520B2370D883ABF8500
 741.Ltf_hi__inv_aff__and__s2:
 742	.octa 0x2B15FFC1AF917B45E6D8320C625CB688
 743
 744/* X2 and AES forward affine combined:
 745 *      1 0 1 1 0 0 0 1     x0     0
 746 *      0 1 1 1 1 0 1 1     x1     0
 747 *      0 0 0 1 1 0 1 0     x2     1
 748 *      0 1 0 0 0 1 0 0     x3     0
 749 *      0 0 1 1 1 0 1 1  *  x4  +  0
 750 *      0 1 0 0 1 0 0 0     x5     0
 751 *      1 1 0 1 0 0 1 1     x6     0
 752 *      0 1 0 0 1 0 1 0     x7     0
 753 */
 754.Ltf_lo__x2__and__fwd_aff:
 755	.octa 0xEFAE0544FCBD1657B8F95213ABEA4100
 756.Ltf_hi__x2__and__fwd_aff:
 757	.octa 0x3F893781E95FE1576CDA64D2BA0CB204
 758
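/*
 * The .Ltf_lo_ and .Ltf_hi_ pairs above are the nibble lookup tables consumed
 * by filter_8bit(): an 8-bit affine map over GF(2) can be evaluated as one
 * 16-entry table lookup on the low nibble XORed with one on the high nibble,
 * each done with vpshufb.  Illustrative C model of one byte:
 *
 *	static uint8_t filter_8bit_byte(uint8_t x, const uint8_t lo[16],
 *					const uint8_t hi[16])
 *	{
 *		return lo[x & 0x0f] ^ hi[x >> 4];
 *	}
 *
 * Each .octa constant above is one such 16-byte table, with the affine
 * constant folded into the tables.
 */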
 759.section	.rodata.cst8, "aM", @progbits, 8
 760.align 8
 761/* AES affine: */
 762#define tf_aff_const BV8(1, 1, 0, 0, 0, 1, 1, 0)
 763.Ltf_aff_bitmatrix:
 764	.quad BM8X8(BV8(1, 0, 0, 0, 1, 1, 1, 1),
 765		    BV8(1, 1, 0, 0, 0, 1, 1, 1),
 766		    BV8(1, 1, 1, 0, 0, 0, 1, 1),
 767		    BV8(1, 1, 1, 1, 0, 0, 0, 1),
 768		    BV8(1, 1, 1, 1, 1, 0, 0, 0),
 769		    BV8(0, 1, 1, 1, 1, 1, 0, 0),
 770		    BV8(0, 0, 1, 1, 1, 1, 1, 0),
 771		    BV8(0, 0, 0, 1, 1, 1, 1, 1))
 772
 773/* AES inverse affine: */
 774#define tf_inv_const BV8(1, 0, 1, 0, 0, 0, 0, 0)
 775.Ltf_inv_bitmatrix:
 776	.quad BM8X8(BV8(0, 0, 1, 0, 0, 1, 0, 1),
 777		    BV8(1, 0, 0, 1, 0, 0, 1, 0),
 778		    BV8(0, 1, 0, 0, 1, 0, 0, 1),
 779		    BV8(1, 0, 1, 0, 0, 1, 0, 0),
 780		    BV8(0, 1, 0, 1, 0, 0, 1, 0),
 781		    BV8(0, 0, 1, 0, 1, 0, 0, 1),
 782		    BV8(1, 0, 0, 1, 0, 1, 0, 0),
 783		    BV8(0, 1, 0, 0, 1, 0, 1, 0))
 784
 785/* S2: */
 786#define tf_s2_const BV8(0, 1, 0, 0, 0, 1, 1, 1)
 787.Ltf_s2_bitmatrix:
 788	.quad BM8X8(BV8(0, 1, 0, 1, 0, 1, 1, 1),
 789		    BV8(0, 0, 1, 1, 1, 1, 1, 1),
 790		    BV8(1, 1, 1, 0, 1, 1, 0, 1),
 791		    BV8(1, 1, 0, 0, 0, 0, 1, 1),
 792		    BV8(0, 1, 0, 0, 0, 0, 1, 1),
 793		    BV8(1, 1, 0, 0, 1, 1, 1, 0),
 794		    BV8(0, 1, 1, 0, 0, 0, 1, 1),
 795		    BV8(1, 1, 1, 1, 0, 1, 1, 0))
 796
 797/* X2: */
 798#define tf_x2_const BV8(0, 0, 1, 1, 0, 1, 0, 0)
 799.Ltf_x2_bitmatrix:
 800	.quad BM8X8(BV8(0, 0, 0, 1, 1, 0, 0, 0),
 801		    BV8(0, 0, 1, 0, 0, 1, 1, 0),
 802		    BV8(0, 0, 0, 0, 1, 0, 1, 0),
 803		    BV8(1, 1, 1, 0, 0, 0, 1, 1),
 804		    BV8(1, 1, 1, 0, 1, 1, 0, 0),
 805		    BV8(0, 1, 1, 0, 1, 0, 1, 1),
 806		    BV8(1, 0, 1, 1, 1, 1, 0, 1),
 807		    BV8(1, 0, 0, 1, 0, 0, 1, 1))
 808
 809/* Identity matrix: */
 810.Ltf_id_bitmatrix:
 811	.quad BM8X8(BV8(1, 0, 0, 0, 0, 0, 0, 0),
 812		    BV8(0, 1, 0, 0, 0, 0, 0, 0),
 813		    BV8(0, 0, 1, 0, 0, 0, 0, 0),
 814		    BV8(0, 0, 0, 1, 0, 0, 0, 0),
 815		    BV8(0, 0, 0, 0, 1, 0, 0, 0),
 816		    BV8(0, 0, 0, 0, 0, 1, 0, 0),
 817		    BV8(0, 0, 0, 0, 0, 0, 1, 0),
 818		    BV8(0, 0, 0, 0, 0, 0, 0, 1))
 819
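/*
 * The .quad constants above pack 8x8 GF(2) bit matrices for
 * vgf2p8affineqb/vgf2p8affineinvqb: BV8() builds one matrix row (argument j
 * becomes bit j) and BM8X8() places row 0 in the most significant byte, so
 * result bit i is driven by byte (7 - i) of the 64-bit matrix operand, as in
 * the Intel SDM pseudocode.  Illustrative C model of applying such a matrix
 * to one byte (GCC/Clang __builtin_parity assumed):
 *
 *	#include <stdint.h>
 *
 *	static uint8_t gf2p8affine_byte(uint64_t m, uint8_t x, uint8_t imm)
 *	{
 *		uint8_t y = 0;
 *		int i;
 *
 *		for (i = 0; i < 8; i++) {
 *			uint8_t row = m >> (8 * (7 - i));
 *
 *			y |= (uint8_t)((__builtin_parity(row & x) ^
 *					((imm >> i) & 1)) << i);
 *		}
 *		return y;
 *	}
 *
 * vgf2p8affineinvqb feeds the GF(2^8) inverse of x (polynomial
 * x^8 + x^4 + x^3 + x + 1, inverse(0) = 0) through the same map; the
 * tf_*_const defines are the 8-bit immediates paired with each matrix.
 */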
 820/* 4-bit mask */
 821.section	.rodata.cst4.L0f0f0f0f, "aM", @progbits, 4
 822.align 4
 823.L0f0f0f0f:
 824	.long 0x0f0f0f0f
 825
 826.text
 827
 828SYM_FUNC_START_LOCAL(__aria_aesni_avx_crypt_16way)
 829	/* input:
 830	*      %r9: rk
 831	*      %rsi: dst
 832	*      %rdx: src
 833	*      %xmm0..%xmm15: 16 byte-sliced blocks
 834	*/
 835
 836	FRAME_BEGIN
 837
 838	movq %rsi, %rax;
 839	leaq 8 * 16(%rax), %r8;
 840
 841	inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
 842		      %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
 843		      %xmm15, %rax, %r8);
 844	aria_fo(%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15,
 845		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
 846		%rax, %r9, 0);
 847	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
 848		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
 849		%xmm15, %rax, %r9, 1);
 850	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
 851		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
 852		%rax, %r9, 2);
 853	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
 854		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
 855		%xmm15, %rax, %r9, 3);
 856	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
 857		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
 858		%rax, %r9, 4);
 859	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
 860		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
 861		%xmm15, %rax, %r9, 5);
 862	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
 863		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
 864		%rax, %r9, 6);
 865	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
 866		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
 867		%xmm15, %rax, %r9, 7);
 868	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
 869		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
 870		%rax, %r9, 8);
 871	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
 872		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
 873		%xmm15, %rax, %r9, 9);
 874	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
 875		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
 876		%rax, %r9, 10);
 877	cmpl $12, rounds(CTX);
 878	jne .Laria_192;
 879	aria_ff(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
 880		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
 881		%xmm15, %rax, %r9, 11, 12);
 882	jmp .Laria_end;
 883.Laria_192:
 884	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
 885		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
 886		%xmm15, %rax, %r9, 11);
 887	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
 888		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
 889		%rax, %r9, 12);
 890	cmpl $14, rounds(CTX);
 891	jne .Laria_256;
 892	aria_ff(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
 893		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
 894		%xmm15, %rax, %r9, 13, 14);
 895	jmp .Laria_end;
 896.Laria_256:
 897	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
 898		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
 899		%xmm15, %rax, %r9, 13);
 900	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
 901		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
 902		%rax, %r9, 14);
 903	aria_ff(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
 904		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
 905		%xmm15, %rax, %r9, 15, 16);
 906.Laria_end:
 907	debyteslice_16x16b(%xmm8, %xmm12, %xmm1, %xmm4,
 908			   %xmm9, %xmm13, %xmm0, %xmm5,
 909			   %xmm10, %xmm14, %xmm3, %xmm6,
 910			   %xmm11, %xmm15, %xmm2, %xmm7,
 911			   (%rax), (%r8));
 912
 913	FRAME_END
 914	RET;
 915SYM_FUNC_END(__aria_aesni_avx_crypt_16way)
 916
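/*
 * __aria_aesni_avx_crypt_16way() above unrolls the round schedule and picks
 * the tail from rounds(CTX): ARIA uses 12, 14 or 16 rounds for 128-, 192- and
 * 256-bit keys, with rounds + 1 round keys.  Per 16-block batch the flow is
 * roughly (illustrative C sketch; fo/fe/ff stand for the macros above):
 *
 *	for (r = 0; r < rounds - 1; r++) {
 *		if (r & 1)
 *			fe(state, rk, r);
 *		else
 *			fo(state, rk, r);
 *	}
 *	ff(state, rk, rounds - 1, rounds);	// final round, two key additions
 */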
 917SYM_TYPED_FUNC_START(aria_aesni_avx_encrypt_16way)
 918	/* input:
 919	*      %rdi: ctx, CTX
 920	*      %rsi: dst
 921	*      %rdx: src
 922	*/
 923
 924	FRAME_BEGIN
 925
 926	leaq enc_key(CTX), %r9;
 927
 928	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
 929		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
 930		     %xmm15, %rdx);
 931
 932	call __aria_aesni_avx_crypt_16way;
 933
 934	write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
 935		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
 936		     %xmm15, %rax);
 937
 938	FRAME_END
 939	RET;
 940SYM_FUNC_END(aria_aesni_avx_encrypt_16way)
 941
 942SYM_TYPED_FUNC_START(aria_aesni_avx_decrypt_16way)
 943	/* input:
 944	*      %rdi: ctx, CTX
 945	*      %rsi: dst
 946	*      %rdx: src
 947	*/
 948
 949	FRAME_BEGIN
 950
 951	leaq dec_key(CTX), %r9;
 952
 953	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
 954		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
 955		     %xmm15, %rdx);
 956
 957	call __aria_aesni_avx_crypt_16way;
 958
 959	write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
 960		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
 961		     %xmm15, %rax);
 962
 963	FRAME_END
 964	RET;
 965SYM_FUNC_END(aria_aesni_avx_decrypt_16way)
 966
 967SYM_FUNC_START_LOCAL(__aria_aesni_avx_ctr_gen_keystream_16way)
 968	/* input:
 969	*      %rdi: ctx
 970	*      %rsi: dst
 971	*      %rdx: src
 972	*      %rcx: keystream
 973	*      %r8: iv (big endian, 128bit)
 974	*/
 975
 976	FRAME_BEGIN
 977	/* load IV and byteswap */
 978	vmovdqu (%r8), %xmm8;
 979
 980	vmovdqa .Lbswap128_mask (%rip), %xmm1;
 981	vpshufb %xmm1, %xmm8, %xmm3; /* be => le */
 982
 983	vpcmpeqd %xmm0, %xmm0, %xmm0;
 984	vpsrldq $8, %xmm0, %xmm0; /* low: -1, high: 0 */
 985
 986	/* construct IVs */
 987	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
 988	vpshufb %xmm1, %xmm3, %xmm9;
 989	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
 990	vpshufb %xmm1, %xmm3, %xmm10;
 991	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
 992	vpshufb %xmm1, %xmm3, %xmm11;
 993	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
 994	vpshufb %xmm1, %xmm3, %xmm12;
 995	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
 996	vpshufb %xmm1, %xmm3, %xmm13;
 997	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
 998	vpshufb %xmm1, %xmm3, %xmm14;
 999	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1000	vpshufb %xmm1, %xmm3, %xmm15;
1001	vmovdqu %xmm8, (0 * 16)(%rcx);
1002	vmovdqu %xmm9, (1 * 16)(%rcx);
1003	vmovdqu %xmm10, (2 * 16)(%rcx);
1004	vmovdqu %xmm11, (3 * 16)(%rcx);
1005	vmovdqu %xmm12, (4 * 16)(%rcx);
1006	vmovdqu %xmm13, (5 * 16)(%rcx);
1007	vmovdqu %xmm14, (6 * 16)(%rcx);
1008	vmovdqu %xmm15, (7 * 16)(%rcx);
1009
1010	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1011	vpshufb %xmm1, %xmm3, %xmm8;
1012	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1013	vpshufb %xmm1, %xmm3, %xmm9;
1014	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1015	vpshufb %xmm1, %xmm3, %xmm10;
1016	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1017	vpshufb %xmm1, %xmm3, %xmm11;
1018	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1019	vpshufb %xmm1, %xmm3, %xmm12;
1020	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1021	vpshufb %xmm1, %xmm3, %xmm13;
1022	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1023	vpshufb %xmm1, %xmm3, %xmm14;
1024	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1025	vpshufb %xmm1, %xmm3, %xmm15;
1026	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1027	vpshufb %xmm1, %xmm3, %xmm4;
1028	vmovdqu %xmm4, (%r8);
1029
1030	vmovdqu (0 * 16)(%rcx), %xmm0;
1031	vmovdqu (1 * 16)(%rcx), %xmm1;
1032	vmovdqu (2 * 16)(%rcx), %xmm2;
1033	vmovdqu (3 * 16)(%rcx), %xmm3;
1034	vmovdqu (4 * 16)(%rcx), %xmm4;
1035	vmovdqu (5 * 16)(%rcx), %xmm5;
1036	vmovdqu (6 * 16)(%rcx), %xmm6;
1037	vmovdqu (7 * 16)(%rcx), %xmm7;
1038
1039	FRAME_END
1040	RET;
1041SYM_FUNC_END(__aria_aesni_avx_ctr_gen_keystream_16way)
1042
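/*
 * __aria_aesni_avx_ctr_gen_keystream_16way() above expands the big-endian IV
 * into 16 counter blocks: the IV is byte-swapped to little endian so
 * inc_le128() can increment it with vpsubq plus a carry fix-up, each value is
 * swapped back (blocks 0-7 are stored to the keystream buffer, 8-15 stay in
 * xmm8-xmm15) and the IV in memory is advanced by 16.  The callers below then
 * encrypt the counter blocks and XOR them into the input.  The increment
 * itself is plain big-endian counter arithmetic (illustrative C sketch):
 *
 *	static void ctr128_inc_be(uint8_t iv[16])
 *	{
 *		int i;
 *
 *		for (i = 15; i >= 0; i--)
 *			if (++iv[i])
 *				break;
 *	}
 */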
1043SYM_TYPED_FUNC_START(aria_aesni_avx_ctr_crypt_16way)
1044	/* input:
1045	*      %rdi: ctx
1046	*      %rsi: dst
1047	*      %rdx: src
1048	*      %rcx: keystream
1049	*      %r8: iv (big endian, 128bit)
1050	*/
1051	FRAME_BEGIN
1052
1053	call __aria_aesni_avx_ctr_gen_keystream_16way;
1054
1055	leaq (%rsi), %r10;
1056	leaq (%rdx), %r11;
1057	leaq (%rcx), %rsi;
1058	leaq (%rcx), %rdx;
1059	leaq enc_key(CTX), %r9;
1060
1061	call __aria_aesni_avx_crypt_16way;
1062
1063	vpxor (0 * 16)(%r11), %xmm1, %xmm1;
1064	vpxor (1 * 16)(%r11), %xmm0, %xmm0;
1065	vpxor (2 * 16)(%r11), %xmm3, %xmm3;
1066	vpxor (3 * 16)(%r11), %xmm2, %xmm2;
1067	vpxor (4 * 16)(%r11), %xmm4, %xmm4;
1068	vpxor (5 * 16)(%r11), %xmm5, %xmm5;
1069	vpxor (6 * 16)(%r11), %xmm6, %xmm6;
1070	vpxor (7 * 16)(%r11), %xmm7, %xmm7;
1071	vpxor (8 * 16)(%r11), %xmm8, %xmm8;
1072	vpxor (9 * 16)(%r11), %xmm9, %xmm9;
1073	vpxor (10 * 16)(%r11), %xmm10, %xmm10;
1074	vpxor (11 * 16)(%r11), %xmm11, %xmm11;
1075	vpxor (12 * 16)(%r11), %xmm12, %xmm12;
1076	vpxor (13 * 16)(%r11), %xmm13, %xmm13;
1077	vpxor (14 * 16)(%r11), %xmm14, %xmm14;
1078	vpxor (15 * 16)(%r11), %xmm15, %xmm15;
1079	write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
1080		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
1081		     %xmm15, %r10);
1082
1083	FRAME_END
1084	RET;
1085SYM_FUNC_END(aria_aesni_avx_ctr_crypt_16way)
1086
1087SYM_FUNC_START_LOCAL(__aria_aesni_avx_gfni_crypt_16way)
1088	/* input:
1089	*      %r9: rk
1090	*      %rsi: dst
1091	*      %rdx: src
1092	*      %xmm0..%xmm15: 16 byte-sliced blocks
1093	*/
1094
1095	FRAME_BEGIN
1096
1097	movq %rsi, %rax;
1098	leaq 8 * 16(%rax), %r8;
1099
1100	inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3,
1101		      %xmm4, %xmm5, %xmm6, %xmm7,
1102		      %xmm8, %xmm9, %xmm10, %xmm11,
1103		      %xmm12, %xmm13, %xmm14,
1104		      %xmm15, %rax, %r8);
1105	aria_fo_gfni(%xmm8, %xmm9, %xmm10, %xmm11,
1106		     %xmm12, %xmm13, %xmm14, %xmm15,
1107		     %xmm0, %xmm1, %xmm2, %xmm3,
1108		     %xmm4, %xmm5, %xmm6, %xmm7,
1109		     %rax, %r9, 0);
1110	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
1111		     %xmm4, %xmm5, %xmm6, %xmm7,
1112		     %xmm8, %xmm9, %xmm10, %xmm11,
1113		     %xmm12, %xmm13, %xmm14,
1114		     %xmm15, %rax, %r9, 1);
1115	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
1116		     %xmm12, %xmm13, %xmm14, %xmm15,
1117		     %xmm0, %xmm1, %xmm2, %xmm3,
1118		     %xmm4, %xmm5, %xmm6, %xmm7,
1119		     %rax, %r9, 2);
1120	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
1121		     %xmm4, %xmm5, %xmm6, %xmm7,
1122		     %xmm8, %xmm9, %xmm10, %xmm11,
1123		     %xmm12, %xmm13, %xmm14,
1124		     %xmm15, %rax, %r9, 3);
1125	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
1126		     %xmm12, %xmm13, %xmm14, %xmm15,
1127		     %xmm0, %xmm1, %xmm2, %xmm3,
1128		     %xmm4, %xmm5, %xmm6, %xmm7,
1129		     %rax, %r9, 4);
1130	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
1131		     %xmm4, %xmm5, %xmm6, %xmm7,
1132		     %xmm8, %xmm9, %xmm10, %xmm11,
1133		     %xmm12, %xmm13, %xmm14,
1134		     %xmm15, %rax, %r9, 5);
1135	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
1136		     %xmm12, %xmm13, %xmm14, %xmm15,
1137		     %xmm0, %xmm1, %xmm2, %xmm3,
1138		     %xmm4, %xmm5, %xmm6, %xmm7,
1139		     %rax, %r9, 6);
1140	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
1141		     %xmm4, %xmm5, %xmm6, %xmm7,
1142		     %xmm8, %xmm9, %xmm10, %xmm11,
1143		     %xmm12, %xmm13, %xmm14,
1144		     %xmm15, %rax, %r9, 7);
1145	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
1146		     %xmm12, %xmm13, %xmm14, %xmm15,
1147		     %xmm0, %xmm1, %xmm2, %xmm3,
1148		     %xmm4, %xmm5, %xmm6, %xmm7,
1149		     %rax, %r9, 8);
1150	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
1151		     %xmm4, %xmm5, %xmm6, %xmm7,
1152		     %xmm8, %xmm9, %xmm10, %xmm11,
1153		     %xmm12, %xmm13, %xmm14,
1154		     %xmm15, %rax, %r9, 9);
1155	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
1156		     %xmm12, %xmm13, %xmm14, %xmm15,
1157		     %xmm0, %xmm1, %xmm2, %xmm3,
1158		     %xmm4, %xmm5, %xmm6, %xmm7,
1159		     %rax, %r9, 10);
1160	cmpl $12, rounds(CTX);
1161	jne .Laria_gfni_192;
1162	aria_ff_gfni(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
1163		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
1164		%xmm15, %rax, %r9, 11, 12);
1165	jmp .Laria_gfni_end;
1166.Laria_gfni_192:
1167	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
1168		     %xmm4, %xmm5, %xmm6, %xmm7,
1169		     %xmm8, %xmm9, %xmm10, %xmm11,
1170		     %xmm12, %xmm13, %xmm14,
1171		     %xmm15, %rax, %r9, 11);
1172	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
1173		     %xmm12, %xmm13, %xmm14, %xmm15,
1174		     %xmm0, %xmm1, %xmm2, %xmm3,
1175		     %xmm4, %xmm5, %xmm6, %xmm7,
1176		     %rax, %r9, 12);
1177	cmpl $14, rounds(CTX);
1178	jne .Laria_gfni_256;
1179	aria_ff_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
1180		     %xmm4, %xmm5, %xmm6, %xmm7,
1181		     %xmm8, %xmm9, %xmm10, %xmm11,
1182		     %xmm12, %xmm13, %xmm14,
1183		     %xmm15, %rax, %r9, 13, 14);
1184	jmp .Laria_gfni_end;
1185.Laria_gfni_256:
1186	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
1187		     %xmm4, %xmm5, %xmm6, %xmm7,
1188		     %xmm8, %xmm9, %xmm10, %xmm11,
1189		     %xmm12, %xmm13, %xmm14,
1190		     %xmm15, %rax, %r9, 13);
1191	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
1192		     %xmm12, %xmm13, %xmm14, %xmm15,
1193		     %xmm0, %xmm1, %xmm2, %xmm3,
1194		     %xmm4, %xmm5, %xmm6, %xmm7,
1195		     %rax, %r9, 14);
1196	aria_ff_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
1197		     %xmm4, %xmm5, %xmm6, %xmm7,
1198		     %xmm8, %xmm9, %xmm10, %xmm11,
1199		     %xmm12, %xmm13, %xmm14,
1200		     %xmm15, %rax, %r9, 15, 16);
1201.Laria_gfni_end:
1202	debyteslice_16x16b(%xmm8, %xmm12, %xmm1, %xmm4,
1203			   %xmm9, %xmm13, %xmm0, %xmm5,
1204			   %xmm10, %xmm14, %xmm3, %xmm6,
1205			   %xmm11, %xmm15, %xmm2, %xmm7,
1206			   (%rax), (%r8));
1207
1208	FRAME_END
1209	RET;
1210SYM_FUNC_END(__aria_aesni_avx_gfni_crypt_16way)
1211
1212SYM_TYPED_FUNC_START(aria_aesni_avx_gfni_encrypt_16way)
1213	/* input:
1214	*      %rdi: ctx, CTX
1215	*      %rsi: dst
1216	*      %rdx: src
1217	*/
1218
1219	FRAME_BEGIN
1220
1221	leaq enc_key(CTX), %r9;
1222
1223	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
1224		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
1225		     %xmm15, %rdx);
1226
1227	call __aria_aesni_avx_gfni_crypt_16way;
1228
1229	write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
1230		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
1231		     %xmm15, %rax);
1232
1233	FRAME_END
1234	RET;
1235SYM_FUNC_END(aria_aesni_avx_gfni_encrypt_16way)
1236
1237SYM_TYPED_FUNC_START(aria_aesni_avx_gfni_decrypt_16way)
1238	/* input:
1239	*      %rdi: ctx, CTX
1240	*      %rsi: dst
1241	*      %rdx: src
1242	*/
1243
1244	FRAME_BEGIN
1245
1246	leaq dec_key(CTX), %r9;
1247
1248	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
1249		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
1250		     %xmm15, %rdx);
1251
1252	call __aria_aesni_avx_gfni_crypt_16way;
1253
1254	write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
1255		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
1256		     %xmm15, %rax);
1257
1258	FRAME_END
1259	RET;
1260SYM_FUNC_END(aria_aesni_avx_gfni_decrypt_16way)
1261
1262SYM_TYPED_FUNC_START(aria_aesni_avx_gfni_ctr_crypt_16way)
1263	/* input:
1264	*      %rdi: ctx
1265	*      %rsi: dst
1266	*      %rdx: src
1267	*      %rcx: keystream
1268	*      %r8: iv (big endian, 128bit)
1269	*/
1270	FRAME_BEGIN
1271
1272	call __aria_aesni_avx_ctr_gen_keystream_16way
1273
1274	leaq (%rsi), %r10;
1275	leaq (%rdx), %r11;
1276	leaq (%rcx), %rsi;
1277	leaq (%rcx), %rdx;
1278	leaq enc_key(CTX), %r9;
1279
1280	call __aria_aesni_avx_gfni_crypt_16way;
1281
1282	vpxor (0 * 16)(%r11), %xmm1, %xmm1;
1283	vpxor (1 * 16)(%r11), %xmm0, %xmm0;
1284	vpxor (2 * 16)(%r11), %xmm3, %xmm3;
1285	vpxor (3 * 16)(%r11), %xmm2, %xmm2;
1286	vpxor (4 * 16)(%r11), %xmm4, %xmm4;
1287	vpxor (5 * 16)(%r11), %xmm5, %xmm5;
1288	vpxor (6 * 16)(%r11), %xmm6, %xmm6;
1289	vpxor (7 * 16)(%r11), %xmm7, %xmm7;
1290	vpxor (8 * 16)(%r11), %xmm8, %xmm8;
1291	vpxor (9 * 16)(%r11), %xmm9, %xmm9;
1292	vpxor (10 * 16)(%r11), %xmm10, %xmm10;
1293	vpxor (11 * 16)(%r11), %xmm11, %xmm11;
1294	vpxor (12 * 16)(%r11), %xmm12, %xmm12;
1295	vpxor (13 * 16)(%r11), %xmm13, %xmm13;
1296	vpxor (14 * 16)(%r11), %xmm14, %xmm14;
1297	vpxor (15 * 16)(%r11), %xmm15, %xmm15;
1298	write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
1299		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
1300		     %xmm15, %r10);
1301
1302	FRAME_END
1303	RET;
1304SYM_FUNC_END(aria_aesni_avx_gfni_ctr_crypt_16way)