/* -*- linux-c -*- --------------------------------------------------------
 *
 *   Copyright (C) 2016 Intel Corporation
 *
 *   Author: Gayatri Kammela <gayatri.kammela@intel.com>
 *   Author: Megha Dey <megha.dey@linux.intel.com>
 *
 *   Based on avx2.c: Copyright 2012 Yuanhan Liu All Rights Reserved
 *   Based on sse2.c: Copyright 2002 H. Peter Anvin - All Rights Reserved
 *
 *   This program is free software; you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, Inc., 53 Temple Place Ste 330,
 *   Boston MA 02111-1307, USA; either version 2 of the License, or
 *   (at your option) any later version; incorporated herein by reference.
 *
 * -----------------------------------------------------------------------
 */

/*
 * AVX512 implementation of RAID-6 syndrome functions
 *
 */

#ifdef CONFIG_AS_AVX512

#include <linux/raid/pq.h>
#include "x86.h"

static const struct raid6_avx512_constants {
	u64 x1d[8];
} raid6_avx512_constants __aligned(512/8) = {
	{ 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,
	  0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,
	  0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,
	  0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,},
};
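
/*
 * Editor's note (not part of the original file): the 0x1d pattern above is
 * the low byte of the RAID-6 generator polynomial x^8 + x^4 + x^3 + x^2 + 1.
 * The vpcmpgtb/vpmovm2b/vpaddb/vpandq/vpxorq sequences below implement a
 * byte-wise multiply-by-2 in GF(2^8) without a variable shift: the signed
 * compare against zero flags the bytes whose top bit is set, vpmovm2b
 * expands those flags to 0xff bytes, the add doubles each byte, and the
 * masked XOR folds 0x1d back in where the top bit was set.  A scalar sketch
 * of the same step:
 */
#if 0	/* illustrative sketch only */
static inline u8 raid6_gf_x2_sketch(u8 v)
{
	/* (v << 1) reduced modulo the generator polynomial 0x11d */
	return (u8)((v << 1) ^ ((v & 0x80) ? 0x1d : 0));
}
#endif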

static int raid6_have_avx512(void)
{
	return boot_cpu_has(X86_FEATURE_AVX2) &&
		boot_cpu_has(X86_FEATURE_AVX) &&
		boot_cpu_has(X86_FEATURE_AVX512F) &&
		boot_cpu_has(X86_FEATURE_AVX512BW) &&
		boot_cpu_has(X86_FEATURE_AVX512VL) &&
		boot_cpu_has(X86_FEATURE_AVX512DQ);
}

static void raid6_avx5121_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = disks - 3;         /* Highest data disk */
	p = dptr[z0+1];         /* XOR parity */
	q = dptr[z0+2];         /* RS syndrome */

	kernel_fpu_begin();

	asm volatile("vmovdqa64 %0,%%zmm0\n\t"
		     "vpxorq %%zmm1,%%zmm1,%%zmm1" /* Zero temp */
		     :
		     : "m" (raid6_avx512_constants.x1d[0]));

	for (d = 0; d < bytes; d += 64) {
		asm volatile("prefetchnta %0\n\t"
			     "vmovdqa64 %0,%%zmm2\n\t"     /* P[0] */
			     "prefetchnta %1\n\t"
			     "vmovdqa64 %%zmm2,%%zmm4\n\t" /* Q[0] */
			     "vmovdqa64 %1,%%zmm6"
			     :
			     : "m" (dptr[z0][d]), "m" (dptr[z0-1][d]));
		for (z = z0-2; z >= 0; z--) {
			asm volatile("prefetchnta %0\n\t"
				     "vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t"
				     "vpmovm2b %%k1,%%zmm5\n\t"
				     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
				     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
				     "vpxorq %%zmm6,%%zmm2,%%zmm2\n\t"
				     "vpxorq %%zmm6,%%zmm4,%%zmm4\n\t"
				     "vmovdqa64 %0,%%zmm6"
				     :
				     : "m" (dptr[z][d]));
		}
		asm volatile("vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t"
			     "vpmovm2b %%k1,%%zmm5\n\t"
			     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
			     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
			     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
			     "vpxorq %%zmm6,%%zmm2,%%zmm2\n\t"
			     "vpxorq %%zmm6,%%zmm4,%%zmm4\n\t"
			     "vmovntdq %%zmm2,%0\n\t"
			     "vpxorq %%zmm2,%%zmm2,%%zmm2\n\t"
			     "vmovntdq %%zmm4,%1\n\t"
			     "vpxorq %%zmm4,%%zmm4,%%zmm4"
			     :
			     : "m" (p[d]), "m" (q[d]));
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}
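
/*
 * Editor's note (not part of the original file): a plain-C sketch of what
 * one 64-byte lane of the loop above computes.  ptrs[] holds disks-2 data
 * block pointers followed by the P and Q destination blocks; P is the XOR
 * of all data bytes and Q is the Reed-Solomon syndrome sum(d_z * 2^z) over
 * GF(2^8), evaluated by Horner's rule from the highest data disk down.
 * raid6_gf_x2_sketch() is the illustrative helper sketched earlier.
 */
#if 0	/* illustrative sketch only */
static void raid6_gen_syndrome_sketch(int disks, size_t bytes, void **ptrs)
{
	u8 **d = (u8 **)ptrs;
	u8 *p = d[disks - 2], *q = d[disks - 1];
	size_t i;
	int z;

	for (i = 0; i < bytes; i++) {
		u8 wp = d[disks - 3][i];	/* highest data disk */
		u8 wq = wp;

		for (z = disks - 4; z >= 0; z--) {
			wp ^= d[z][i];				/* XOR parity */
			wq = raid6_gf_x2_sketch(wq) ^ d[z][i];	/* Horner step */
		}
		p[i] = wp;
		q[i] = wq;
	}
}
#endif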

static void raid6_avx5121_xor_syndrome(int disks, int start, int stop,
				       size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = stop;		/* P/Q right side optimization */
	p = dptr[disks-2];	/* XOR parity */
	q = dptr[disks-1];	/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("vmovdqa64 %0,%%zmm0"
		     : : "m" (raid6_avx512_constants.x1d[0]));

	for (d = 0 ; d < bytes ; d += 64) {
		asm volatile("vmovdqa64 %0,%%zmm4\n\t"
			     "vmovdqa64 %1,%%zmm2\n\t"
			     "vpxorq %%zmm4,%%zmm2,%%zmm2"
			     :
			     : "m" (dptr[z0][d]), "m" (p[d]));
		/* P/Q data pages */
		for (z = z0-1 ; z >= start ; z--) {
			asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
				     "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
				     "vpmovm2b %%k1,%%zmm5\n\t"
				     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
				     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
				     "vmovdqa64 %0,%%zmm5\n\t"
				     "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t"
				     "vpxorq %%zmm5,%%zmm4,%%zmm4"
				     :
				     : "m" (dptr[z][d]));
		}
		/* P/Q left side optimization */
		for (z = start-1 ; z >= 0 ; z--) {
			asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
				     "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
				     "vpmovm2b %%k1,%%zmm5\n\t"
				     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
				     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
				     "vpxorq %%zmm5,%%zmm4,%%zmm4"
				     :
				     : );
		}
		asm volatile("vpxorq %0,%%zmm4,%%zmm4\n\t"
			     /* Don't use movntdq for r/w memory area < cache line */
			     "vmovdqa64 %%zmm4,%0\n\t"
			     "vmovdqa64 %%zmm2,%1"
			     :
			     : "m" (q[d]), "m" (p[d]));
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}
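
/*
 * Editor's note (not part of the original file): xor_syndrome() is the
 * read-modify-write helper.  It computes the P/Q contribution of just the
 * data disks in [start, stop] and XORs it into the existing P and Q pages.
 * Disks above 'stop' contribute nothing, so the Horner recurrence starts at
 * 'stop' ("right side optimization"); disks below 'start' contribute no
 * data either, so for them the step degenerates to multiplying Q by 2
 * ("left side optimization"), which is all the second inner loop above
 * does.  Scalar sketch, reusing the helper sketched earlier:
 */
#if 0	/* illustrative sketch only */
static void raid6_xor_syndrome_sketch(int disks, int start, int stop,
				      size_t bytes, void **ptrs)
{
	u8 **d = (u8 **)ptrs;
	u8 *p = d[disks - 2], *q = d[disks - 1];
	size_t i;
	int z;

	for (i = 0; i < bytes; i++) {
		u8 wp = d[stop][i];
		u8 wq = wp;

		for (z = stop - 1; z >= start; z--) {	/* P/Q data blocks */
			wp ^= d[z][i];
			wq = raid6_gf_x2_sketch(wq) ^ d[z][i];
		}
		for (z = start - 1; z >= 0; z--)	/* left side: x2 only */
			wq = raid6_gf_x2_sketch(wq);

		p[i] ^= wp;
		q[i] ^= wq;
	}
}
#endif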

const struct raid6_calls raid6_avx512x1 = {
	raid6_avx5121_gen_syndrome,
	raid6_avx5121_xor_syndrome,
	raid6_have_avx512,
	"avx512x1",
	1                       /* Has cache hints */
};
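
/*
 * Editor's note (not part of the original file): the positional initializer
 * above maps onto struct raid6_calls from <linux/raid/pq.h> roughly as in
 * the designated form below.  The field names (gen_syndrome, xor_syndrome,
 * valid, name, prefer) are quoted from memory and should be checked against
 * the tree in use; the last field records that the routine uses cache hints
 * (non-temporal stores and prefetches).  lib/raid6/algos.c benchmarks the
 * registered implementations at boot and picks the fastest one.
 */
#if 0	/* illustrative sketch only */
const struct raid6_calls raid6_avx512x1_alt = {
	.gen_syndrome	= raid6_avx5121_gen_syndrome,
	.xor_syndrome	= raid6_avx5121_xor_syndrome,
	.valid		= raid6_have_avx512,
	.name		= "avx512x1",
	.prefer		= 1,	/* has cache hints */
};
#endif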

/*
 * Unrolled-by-2 AVX512 implementation
 */
static void raid6_avx5122_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = disks - 3;         /* Highest data disk */
	p = dptr[z0+1];         /* XOR parity */
	q = dptr[z0+2];         /* RS syndrome */

	kernel_fpu_begin();

	asm volatile("vmovdqa64 %0,%%zmm0\n\t"
		     "vpxorq %%zmm1,%%zmm1,%%zmm1" /* Zero temp */
		     :
		     : "m" (raid6_avx512_constants.x1d[0]));

	/* We uniformly assume a single prefetch covers at least 64 bytes */
	for (d = 0; d < bytes; d += 128) {
		asm volatile("prefetchnta %0\n\t"
			     "prefetchnta %1\n\t"
			     "vmovdqa64 %0,%%zmm2\n\t"      /* P[0] */
			     "vmovdqa64 %1,%%zmm3\n\t"      /* P[1] */
			     "vmovdqa64 %%zmm2,%%zmm4\n\t"  /* Q[0] */
			     "vmovdqa64 %%zmm3,%%zmm6"      /* Q[1] */
			     :
			     : "m" (dptr[z0][d]), "m" (dptr[z0][d+64]));
		for (z = z0-1; z >= 0; z--) {
			asm volatile("prefetchnta %0\n\t"
				     "prefetchnta %1\n\t"
				     "vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t"
				     "vpcmpgtb %%zmm6,%%zmm1,%%k2\n\t"
				     "vpmovm2b %%k1,%%zmm5\n\t"
				     "vpmovm2b %%k2,%%zmm7\n\t"
				     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
				     "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
				     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
				     "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
				     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
				     "vmovdqa64 %0,%%zmm5\n\t"
				     "vmovdqa64 %1,%%zmm7\n\t"
				     "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t"
				     "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t"
				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
				     "vpxorq %%zmm7,%%zmm6,%%zmm6"
				     :
				     : "m" (dptr[z][d]), "m" (dptr[z][d+64]));
		}
		asm volatile("vmovntdq %%zmm2,%0\n\t"
			     "vmovntdq %%zmm3,%1\n\t"
			     "vmovntdq %%zmm4,%2\n\t"
			     "vmovntdq %%zmm6,%3"
			     :
			     : "m" (p[d]), "m" (p[d+64]), "m" (q[d]),
			       "m" (q[d+64]));
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}

static void raid6_avx5122_xor_syndrome(int disks, int start, int stop,
				       size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = stop;		/* P/Q right side optimization */
	p = dptr[disks-2];	/* XOR parity */
	q = dptr[disks-1];	/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("vmovdqa64 %0,%%zmm0"
		     : : "m" (raid6_avx512_constants.x1d[0]));

	for (d = 0 ; d < bytes ; d += 128) {
		asm volatile("vmovdqa64 %0,%%zmm4\n\t"
			     "vmovdqa64 %1,%%zmm6\n\t"
			     "vmovdqa64 %2,%%zmm2\n\t"
			     "vmovdqa64 %3,%%zmm3\n\t"
			     "vpxorq %%zmm4,%%zmm2,%%zmm2\n\t"
			     "vpxorq %%zmm6,%%zmm3,%%zmm3"
			     :
			     : "m" (dptr[z0][d]), "m" (dptr[z0][d+64]),
			       "m" (p[d]), "m" (p[d+64]));
		/* P/Q data pages */
		for (z = z0-1 ; z >= start ; z--) {
			asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
				     "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t"
				     "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
				     "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t"
				     "vpmovm2b %%k1,%%zmm5\n\t"
				     "vpmovm2b %%k2,%%zmm7\n\t"
				     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
				     "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
				     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
				     "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
				     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
				     "vmovdqa64 %0,%%zmm5\n\t"
				     "vmovdqa64 %1,%%zmm7\n\t"
				     "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t"
				     "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t"
				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
				     "vpxorq %%zmm7,%%zmm6,%%zmm6"
				     :
				     : "m" (dptr[z][d]), "m" (dptr[z][d+64]));
		}
		/* P/Q left side optimization */
		for (z = start-1 ; z >= 0 ; z--) {
			asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
				     "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t"
				     "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
				     "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t"
				     "vpmovm2b %%k1,%%zmm5\n\t"
				     "vpmovm2b %%k2,%%zmm7\n\t"
				     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
				     "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
				     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
				     "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
				     "vpxorq %%zmm7,%%zmm6,%%zmm6"
				     :
				     : );
		}
		asm volatile("vpxorq %0,%%zmm4,%%zmm4\n\t"
			     "vpxorq %1,%%zmm6,%%zmm6\n\t"
			     /* Don't use movntdq for r/w
			      * memory area < cache line
			      */
			     "vmovdqa64 %%zmm4,%0\n\t"
			     "vmovdqa64 %%zmm6,%1\n\t"
			     "vmovdqa64 %%zmm2,%2\n\t"
			     "vmovdqa64 %%zmm3,%3"
			     :
			     : "m" (q[d]), "m" (q[d+64]), "m" (p[d]),
			       "m" (p[d+64]));
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}

const struct raid6_calls raid6_avx512x2 = {
	raid6_avx5122_gen_syndrome,
	raid6_avx5122_xor_syndrome,
	raid6_have_avx512,
	"avx512x2",
	1                       /* Has cache hints */
};

#ifdef CONFIG_X86_64

/*
 * Unrolled-by-4 AVX512 implementation
 */
static void raid6_avx5124_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = disks - 3;         /* Highest data disk */
	p = dptr[z0+1];         /* XOR parity */
	q = dptr[z0+2];         /* RS syndrome */

	kernel_fpu_begin();

	asm volatile("vmovdqa64 %0,%%zmm0\n\t"
		     "vpxorq %%zmm1,%%zmm1,%%zmm1\n\t"       /* Zero temp */
		     "vpxorq %%zmm2,%%zmm2,%%zmm2\n\t"       /* P[0] */
		     "vpxorq %%zmm3,%%zmm3,%%zmm3\n\t"       /* P[1] */
		     "vpxorq %%zmm4,%%zmm4,%%zmm4\n\t"       /* Q[0] */
		     "vpxorq %%zmm6,%%zmm6,%%zmm6\n\t"       /* Q[1] */
		     "vpxorq %%zmm10,%%zmm10,%%zmm10\n\t"    /* P[2] */
		     "vpxorq %%zmm11,%%zmm11,%%zmm11\n\t"    /* P[3] */
		     "vpxorq %%zmm12,%%zmm12,%%zmm12\n\t"    /* Q[2] */
		     "vpxorq %%zmm14,%%zmm14,%%zmm14"        /* Q[3] */
		     :
		     : "m" (raid6_avx512_constants.x1d[0]));

	for (d = 0; d < bytes; d += 256) {
		for (z = z0; z >= 0; z--) {
		asm volatile("prefetchnta %0\n\t"
			     "prefetchnta %1\n\t"
			     "prefetchnta %2\n\t"
			     "prefetchnta %3\n\t"
			     "vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t"
			     "vpcmpgtb %%zmm6,%%zmm1,%%k2\n\t"
			     "vpcmpgtb %%zmm12,%%zmm1,%%k3\n\t"
			     "vpcmpgtb %%zmm14,%%zmm1,%%k4\n\t"
			     "vpmovm2b %%k1,%%zmm5\n\t"
			     "vpmovm2b %%k2,%%zmm7\n\t"
			     "vpmovm2b %%k3,%%zmm13\n\t"
			     "vpmovm2b %%k4,%%zmm15\n\t"
			     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
			     "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
			     "vpaddb %%zmm12,%%zmm12,%%zmm12\n\t"
			     "vpaddb %%zmm14,%%zmm14,%%zmm14\n\t"
			     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
			     "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
			     "vpandq %%zmm0,%%zmm13,%%zmm13\n\t"
			     "vpandq %%zmm0,%%zmm15,%%zmm15\n\t"
			     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
			     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
			     "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
			     "vpxorq %%zmm15,%%zmm14,%%zmm14\n\t"
			     "vmovdqa64 %0,%%zmm5\n\t"
			     "vmovdqa64 %1,%%zmm7\n\t"
			     "vmovdqa64 %2,%%zmm13\n\t"
			     "vmovdqa64 %3,%%zmm15\n\t"
			     "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t"
			     "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t"
			     "vpxorq %%zmm13,%%zmm10,%%zmm10\n\t"
			     "vpxorq %%zmm15,%%zmm11,%%zmm11\n\t"
			     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
			     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
			     "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
			     "vpxorq %%zmm15,%%zmm14,%%zmm14"
			     :
			     : "m" (dptr[z][d]), "m" (dptr[z][d+64]),
			       "m" (dptr[z][d+128]), "m" (dptr[z][d+192]));
		}
		asm volatile("vmovntdq %%zmm2,%0\n\t"
			     "vpxorq %%zmm2,%%zmm2,%%zmm2\n\t"
			     "vmovntdq %%zmm3,%1\n\t"
			     "vpxorq %%zmm3,%%zmm3,%%zmm3\n\t"
			     "vmovntdq %%zmm10,%2\n\t"
			     "vpxorq %%zmm10,%%zmm10,%%zmm10\n\t"
			     "vmovntdq %%zmm11,%3\n\t"
			     "vpxorq %%zmm11,%%zmm11,%%zmm11\n\t"
			     "vmovntdq %%zmm4,%4\n\t"
			     "vpxorq %%zmm4,%%zmm4,%%zmm4\n\t"
			     "vmovntdq %%zmm6,%5\n\t"
			     "vpxorq %%zmm6,%%zmm6,%%zmm6\n\t"
			     "vmovntdq %%zmm12,%6\n\t"
			     "vpxorq %%zmm12,%%zmm12,%%zmm12\n\t"
			     "vmovntdq %%zmm14,%7\n\t"
			     "vpxorq %%zmm14,%%zmm14,%%zmm14"
			     :
			     : "m" (p[d]), "m" (p[d+64]), "m" (p[d+128]),
			       "m" (p[d+192]), "m" (q[d]), "m" (q[d+64]),
			       "m" (q[d+128]), "m" (q[d+192]));
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}
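
/*
 * Editor's note (not part of the original file): unlike the x1/x2 variants,
 * the unrolled-by-4 version zeroes its P/Q accumulators up front and folds
 * every data disk (z0 down to 0) into a single inner loop, then streams the
 * results out with vmovntdq and re-zeroes the accumulators for the next
 * 256-byte chunk, keeping all eight P/Q accumulators resident in zmm
 * registers for the whole pass.
 */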

static void raid6_avx5124_xor_syndrome(int disks, int start, int stop,
				       size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = stop;		/* P/Q right side optimization */
	p = dptr[disks-2];	/* XOR parity */
	q = dptr[disks-1];	/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("vmovdqa64 %0,%%zmm0"
		     :: "m" (raid6_avx512_constants.x1d[0]));

	for (d = 0 ; d < bytes ; d += 256) {
		asm volatile("vmovdqa64 %0,%%zmm4\n\t"
			     "vmovdqa64 %1,%%zmm6\n\t"
			     "vmovdqa64 %2,%%zmm12\n\t"
			     "vmovdqa64 %3,%%zmm14\n\t"
			     "vmovdqa64 %4,%%zmm2\n\t"
			     "vmovdqa64 %5,%%zmm3\n\t"
			     "vmovdqa64 %6,%%zmm10\n\t"
			     "vmovdqa64 %7,%%zmm11\n\t"
			     "vpxorq %%zmm4,%%zmm2,%%zmm2\n\t"
			     "vpxorq %%zmm6,%%zmm3,%%zmm3\n\t"
			     "vpxorq %%zmm12,%%zmm10,%%zmm10\n\t"
			     "vpxorq %%zmm14,%%zmm11,%%zmm11"
			     :
			     : "m" (dptr[z0][d]), "m" (dptr[z0][d+64]),
			       "m" (dptr[z0][d+128]), "m" (dptr[z0][d+192]),
			       "m" (p[d]), "m" (p[d+64]), "m" (p[d+128]),
			       "m" (p[d+192]));
		/* P/Q data pages */
		for (z = z0-1 ; z >= start ; z--) {
			asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
				     "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t"
				     "vpxorq %%zmm13,%%zmm13,%%zmm13\n\t"
				     "vpxorq %%zmm15,%%zmm15,%%zmm15\n\t"
				     "prefetchnta %0\n\t"
				     "prefetchnta %2\n\t"
				     "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
				     "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t"
				     "vpcmpgtb %%zmm12,%%zmm13,%%k3\n\t"
				     "vpcmpgtb %%zmm14,%%zmm15,%%k4\n\t"
				     "vpmovm2b %%k1,%%zmm5\n\t"
				     "vpmovm2b %%k2,%%zmm7\n\t"
				     "vpmovm2b %%k3,%%zmm13\n\t"
				     "vpmovm2b %%k4,%%zmm15\n\t"
				     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
				     "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
				     "vpaddb %%zmm12,%%zmm12,%%zmm12\n\t"
				     "vpaddb %%zmm14,%%zmm14,%%zmm14\n\t"
				     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
				     "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
				     "vpandq %%zmm0,%%zmm13,%%zmm13\n\t"
				     "vpandq %%zmm0,%%zmm15,%%zmm15\n\t"
				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
				     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
				     "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
				     "vpxorq %%zmm15,%%zmm14,%%zmm14\n\t"
				     "vmovdqa64 %0,%%zmm5\n\t"
				     "vmovdqa64 %1,%%zmm7\n\t"
				     "vmovdqa64 %2,%%zmm13\n\t"
				     "vmovdqa64 %3,%%zmm15\n\t"
				     "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t"
				     "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t"
				     "vpxorq %%zmm13,%%zmm10,%%zmm10\n\t"
				     "vpxorq %%zmm15,%%zmm11,%%zmm11\n\t"
				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
				     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
				     "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
				     "vpxorq %%zmm15,%%zmm14,%%zmm14"
				     :
				     : "m" (dptr[z][d]), "m" (dptr[z][d+64]),
				       "m" (dptr[z][d+128]),
				       "m" (dptr[z][d+192]));
		}
		asm volatile("prefetchnta %0\n\t"
			     "prefetchnta %1\n\t"
			     :
			     : "m" (q[d]), "m" (q[d+128]));
		/* P/Q left side optimization */
		for (z = start-1 ; z >= 0 ; z--) {
			asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
				     "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t"
				     "vpxorq %%zmm13,%%zmm13,%%zmm13\n\t"
				     "vpxorq %%zmm15,%%zmm15,%%zmm15\n\t"
				     "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
				     "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t"
				     "vpcmpgtb %%zmm12,%%zmm13,%%k3\n\t"
				     "vpcmpgtb %%zmm14,%%zmm15,%%k4\n\t"
				     "vpmovm2b %%k1,%%zmm5\n\t"
				     "vpmovm2b %%k2,%%zmm7\n\t"
				     "vpmovm2b %%k3,%%zmm13\n\t"
				     "vpmovm2b %%k4,%%zmm15\n\t"
				     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
				     "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
				     "vpaddb %%zmm12,%%zmm12,%%zmm12\n\t"
				     "vpaddb %%zmm14,%%zmm14,%%zmm14\n\t"
				     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
				     "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
				     "vpandq %%zmm0,%%zmm13,%%zmm13\n\t"
				     "vpandq %%zmm0,%%zmm15,%%zmm15\n\t"
				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
				     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
				     "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
				     "vpxorq %%zmm15,%%zmm14,%%zmm14"
				     :
				     : );
		}
		asm volatile("vmovntdq %%zmm2,%0\n\t"
			     "vmovntdq %%zmm3,%1\n\t"
			     "vmovntdq %%zmm10,%2\n\t"
			     "vmovntdq %%zmm11,%3\n\t"
			     "vpxorq %4,%%zmm4,%%zmm4\n\t"
			     "vpxorq %5,%%zmm6,%%zmm6\n\t"
			     "vpxorq %6,%%zmm12,%%zmm12\n\t"
			     "vpxorq %7,%%zmm14,%%zmm14\n\t"
			     "vmovntdq %%zmm4,%4\n\t"
			     "vmovntdq %%zmm6,%5\n\t"
			     "vmovntdq %%zmm12,%6\n\t"
			     "vmovntdq %%zmm14,%7"
			     :
			     : "m" (p[d]), "m" (p[d+64]), "m" (p[d+128]),
			       "m" (p[d+192]), "m" (q[d]), "m" (q[d+64]),
			       "m" (q[d+128]), "m" (q[d+192]));
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}

const struct raid6_calls raid6_avx512x4 = {
	raid6_avx5124_gen_syndrome,
	raid6_avx5124_xor_syndrome,
	raid6_have_avx512,
	"avx512x4",
	1                       /* Has cache hints */
};
#endif /* CONFIG_X86_64 */

#endif /* CONFIG_AS_AVX512 */