// SPDX-License-Identifier: GPL-2.0-or-later
/* -*- linux-c -*- --------------------------------------------------------
 *
 *   Copyright (C) 2016 Intel Corporation
 *
 *   Author: Gayatri Kammela <gayatri.kammela@intel.com>
 *   Author: Megha Dey <megha.dey@linux.intel.com>
 *
 *   Based on avx2.c: Copyright 2012 Yuanhan Liu All Rights Reserved
 *   Based on sse2.c: Copyright 2002 H. Peter Anvin - All Rights Reserved
 *
 * -----------------------------------------------------------------------
 */

/*
 * AVX512 implementation of RAID-6 syndrome functions
 *
 */
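
/*
 * RAID-6 maintains two syndromes over the data disks D_0..D_n:
 *
 *	P = D_0 ^ D_1 ^ ... ^ D_n			(XOR parity)
 *	Q = g^0*D_0 ^ g^1*D_1 ^ ... ^ g^n*D_n		(Reed-Solomon syndrome)
 *
 * with multiplication in GF(2^8), generator g = 2 and field polynomial
 * x^8 + x^4 + x^3 + x^2 + 1 (0x11d).  Multiplying a byte by 2 is a left
 * shift plus a conditional XOR with 0x1d when the top bit was set; the
 * vpcmpgtb/vpmovm2b/vpaddb/vpandq/vpxorq sequences below do exactly that
 * on 64 bytes per zmm register.
 *
 * For orientation only, a scalar sketch of one Horner step of the inner
 * loop (illustrative, not code used by this file; wp, wq and wd stand
 * for the P accumulator, the Q accumulator and the current data byte):
 *
 *	wq = (u8)(wq << 1) ^ ((wq & 0x80) ? 0x1d : 0);	- multiply Q by g
 *	wp ^= wd;					- fold data into P
 *	wq ^= wd;					- fold data into Q
 */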

#ifdef CONFIG_AS_AVX512

#include <linux/raid/pq.h>
#include "x86.h"

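/*
 * 64 bytes of 0x1d: the GF(2^8) reduction constant (the low byte of the
 * RAID-6 field polynomial 0x11d), replicated to fill one zmm register.
 */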
static const struct raid6_avx512_constants {
	u64 x1d[8];
} raid6_avx512_constants __aligned(512/8) = {
	{ 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,
	  0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,
	  0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,
	  0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,},
};

static int raid6_have_avx512(void)
{
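	/*
	 * AVX512BW provides the byte-granular vpcmpgtb/vpaddb/vpmovm2b used
	 * below; F, VL and DQ (plus the AVX/AVX2 baseline) are checked as
	 * well so this routine set is only advertised on CPUs with a
	 * full-featured AVX-512 implementation.
	 */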
	return boot_cpu_has(X86_FEATURE_AVX2) &&
		boot_cpu_has(X86_FEATURE_AVX) &&
		boot_cpu_has(X86_FEATURE_AVX512F) &&
		boot_cpu_has(X86_FEATURE_AVX512BW) &&
		boot_cpu_has(X86_FEATURE_AVX512VL) &&
		boot_cpu_has(X86_FEATURE_AVX512DQ);
}

static void raid6_avx5121_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = disks - 3;         /* Highest data disk */
	p = dptr[z0+1];         /* XOR parity */
	q = dptr[z0+2];         /* RS syndrome */

	kernel_fpu_begin();

	asm volatile("vmovdqa64 %0,%%zmm0\n\t"
		     "vpxorq %%zmm1,%%zmm1,%%zmm1" /* Zero temp */
		     :
		     : "m" (raid6_avx512_constants.x1d[0]));

	for (d = 0; d < bytes; d += 64) {
		asm volatile("prefetchnta %0\n\t"
			     "vmovdqa64 %0,%%zmm2\n\t"     /* P[0] */
			     "prefetchnta %1\n\t"
			     "vmovdqa64 %%zmm2,%%zmm4\n\t" /* Q[0] */
			     "vmovdqa64 %1,%%zmm6"
			     :
			     : "m" (dptr[z0][d]), "m" (dptr[z0-1][d]));
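		/*
		 * Each pass below multiplies the Q accumulator (zmm4) by g:
		 * vpcmpgtb/vpmovm2b build a byte mask of lanes whose top bit
		 * is set, vpaddb doubles every byte (shift left by one), and
		 * the masked XOR with 0x1d (zmm0) performs the GF(2^8)
		 * reduction.  The current data block (zmm6) is then folded
		 * into both P (zmm2) and Q (zmm4).
		 */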
		for (z = z0-2; z >= 0; z--) {
			asm volatile("prefetchnta %0\n\t"
				     "vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t"
				     "vpmovm2b %%k1,%%zmm5\n\t"
				     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
				     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
				     "vpxorq %%zmm6,%%zmm2,%%zmm2\n\t"
				     "vpxorq %%zmm6,%%zmm4,%%zmm4\n\t"
				     "vmovdqa64 %0,%%zmm6"
				     :
				     : "m" (dptr[z][d]));
		}
		asm volatile("vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t"
			     "vpmovm2b %%k1,%%zmm5\n\t"
			     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
			     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
			     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
			     "vpxorq %%zmm6,%%zmm2,%%zmm2\n\t"
			     "vpxorq %%zmm6,%%zmm4,%%zmm4\n\t"
			     "vmovntdq %%zmm2,%0\n\t"
			     "vpxorq %%zmm2,%%zmm2,%%zmm2\n\t"
			     "vmovntdq %%zmm4,%1\n\t"
			     "vpxorq %%zmm4,%%zmm4,%%zmm4"
			     :
			     : "m" (p[d]), "m" (q[d]));
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}

static void raid6_avx5121_xor_syndrome(int disks, int start, int stop,
				       size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = stop;		/* P/Q right side optimization */
	p = dptr[disks-2];	/* XOR parity */
	q = dptr[disks-1];	/* RS syndrome */
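	/*
	 * Partial stripe update: only the blocks for data disks start..stop
	 * are folded into the existing P and Q.  The "left side" loop below
	 * keeps multiplying the Q accumulator by g once per lower-numbered
	 * disk, so every contribution is weighted by its correct power of g
	 * before being XORed into the stored Q.
	 */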

	kernel_fpu_begin();

	asm volatile("vmovdqa64 %0,%%zmm0"
		     : : "m" (raid6_avx512_constants.x1d[0]));

	for (d = 0 ; d < bytes ; d += 64) {
		asm volatile("vmovdqa64 %0,%%zmm4\n\t"
			     "vmovdqa64 %1,%%zmm2\n\t"
			     "vpxorq %%zmm4,%%zmm2,%%zmm2"
			     :
			     : "m" (dptr[z0][d]),  "m" (p[d]));
		/* P/Q data pages */
		for (z = z0-1 ; z >= start ; z--) {
			asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
				     "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
				     "vpmovm2b %%k1,%%zmm5\n\t"
				     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
				     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
				     "vmovdqa64 %0,%%zmm5\n\t"
				     "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t"
				     "vpxorq %%zmm5,%%zmm4,%%zmm4"
				     :
				     : "m" (dptr[z][d]));
		}
		/* P/Q left side optimization */
		for (z = start-1 ; z >= 0 ; z--) {
			asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
				     "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
				     "vpmovm2b %%k1,%%zmm5\n\t"
				     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
				     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
				     "vpxorq %%zmm5,%%zmm4,%%zmm4"
				     :
				     : );
		}
		asm volatile("vpxorq %0,%%zmm4,%%zmm4\n\t"
		/* Don't use movntdq for r/w memory area < cache line */
			     "vmovdqa64 %%zmm4,%0\n\t"
			     "vmovdqa64 %%zmm2,%1"
			     :
			     : "m" (q[d]), "m" (p[d]));
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}

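/*
 * Exported descriptor; the field order follows struct raid6_calls in
 * <linux/raid/pq.h>: syndrome generator, partial-update (xor) variant,
 * CPU feature check, name, and a preference/cache-hint flag.
 */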
const struct raid6_calls raid6_avx512x1 = {
	raid6_avx5121_gen_syndrome,
	raid6_avx5121_xor_syndrome,
	raid6_have_avx512,
	"avx512x1",
	1                       /* Has cache hints */
};

/*
 * Unrolled-by-2 AVX512 implementation
 */
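/*
 * Each iteration handles 128 bytes using two independent P/Q register
 * pairs (zmm2/zmm4 and zmm3/zmm6), which lets the dependent GF(2^8)
 * multiply chains of the two halves overlap and hide latency.
 */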
static void raid6_avx5122_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = disks - 3;         /* Highest data disk */
	p = dptr[z0+1];         /* XOR parity */
	q = dptr[z0+2];         /* RS syndrome */

	kernel_fpu_begin();

	asm volatile("vmovdqa64 %0,%%zmm0\n\t"
		     "vpxorq %%zmm1,%%zmm1,%%zmm1" /* Zero temp */
		     :
		     : "m" (raid6_avx512_constants.x1d[0]));

	/* We uniformly assume a single prefetch covers at least 64 bytes */
	for (d = 0; d < bytes; d += 128) {
		asm volatile("prefetchnta %0\n\t"
			     "prefetchnta %1\n\t"
			     "vmovdqa64 %0,%%zmm2\n\t"      /* P[0] */
			     "vmovdqa64 %1,%%zmm3\n\t"      /* P[1] */
			     "vmovdqa64 %%zmm2,%%zmm4\n\t"  /* Q[0] */
			     "vmovdqa64 %%zmm3,%%zmm6"      /* Q[1] */
			     :
			     : "m" (dptr[z0][d]), "m" (dptr[z0][d+64]));
		for (z = z0-1; z >= 0; z--) {
			asm volatile("prefetchnta %0\n\t"
				     "prefetchnta %1\n\t"
				     "vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t"
				     "vpcmpgtb %%zmm6,%%zmm1,%%k2\n\t"
				     "vpmovm2b %%k1,%%zmm5\n\t"
				     "vpmovm2b %%k2,%%zmm7\n\t"
				     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
				     "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
				     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
				     "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
				     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
				     "vmovdqa64 %0,%%zmm5\n\t"
				     "vmovdqa64 %1,%%zmm7\n\t"
				     "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t"
				     "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t"
				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
				     "vpxorq %%zmm7,%%zmm6,%%zmm6"
				     :
				     : "m" (dptr[z][d]), "m" (dptr[z][d+64]));
		}
		asm volatile("vmovntdq %%zmm2,%0\n\t"
			     "vmovntdq %%zmm3,%1\n\t"
			     "vmovntdq %%zmm4,%2\n\t"
			     "vmovntdq %%zmm6,%3"
			     :
			     : "m" (p[d]), "m" (p[d+64]), "m" (q[d]),
			       "m" (q[d+64]));
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}

static void raid6_avx5122_xor_syndrome(int disks, int start, int stop,
				       size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = stop;		/* P/Q right side optimization */
	p = dptr[disks-2];	/* XOR parity */
	q = dptr[disks-1];	/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("vmovdqa64 %0,%%zmm0"
		     : : "m" (raid6_avx512_constants.x1d[0]));

	for (d = 0 ; d < bytes ; d += 128) {
		asm volatile("vmovdqa64 %0,%%zmm4\n\t"
			     "vmovdqa64 %1,%%zmm6\n\t"
			     "vmovdqa64 %2,%%zmm2\n\t"
			     "vmovdqa64 %3,%%zmm3\n\t"
			     "vpxorq %%zmm4,%%zmm2,%%zmm2\n\t"
			     "vpxorq %%zmm6,%%zmm3,%%zmm3"
			     :
			     : "m" (dptr[z0][d]), "m" (dptr[z0][d+64]),
			       "m" (p[d]), "m" (p[d+64]));
		/* P/Q data pages */
		for (z = z0-1 ; z >= start ; z--) {
			asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
				     "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t"
				     "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
				     "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t"
				     "vpmovm2b %%k1,%%zmm5\n\t"
				     "vpmovm2b %%k2,%%zmm7\n\t"
				     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
				     "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
				     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
				     "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
				     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
				     "vmovdqa64 %0,%%zmm5\n\t"
				     "vmovdqa64 %1,%%zmm7\n\t"
				     "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t"
				     "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t"
				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
				     "vpxorq %%zmm7,%%zmm6,%%zmm6"
				     :
				     : "m" (dptr[z][d]),  "m" (dptr[z][d+64]));
		}
		/* P/Q left side optimization */
		for (z = start-1 ; z >= 0 ; z--) {
			asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
				     "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t"
				     "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
				     "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t"
				     "vpmovm2b %%k1,%%zmm5\n\t"
				     "vpmovm2b %%k2,%%zmm7\n\t"
				     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
				     "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
				     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
				     "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
				     "vpxorq %%zmm7,%%zmm6,%%zmm6"
				     :
				     : );
		}
		asm volatile("vpxorq %0,%%zmm4,%%zmm4\n\t"
			     "vpxorq %1,%%zmm6,%%zmm6\n\t"
			     /* Don't use movntdq for r/w
			      * memory area < cache line
			      */
			     "vmovdqa64 %%zmm4,%0\n\t"
			     "vmovdqa64 %%zmm6,%1\n\t"
			     "vmovdqa64 %%zmm2,%2\n\t"
			     "vmovdqa64 %%zmm3,%3"
			     :
			     : "m" (q[d]), "m" (q[d+64]), "m" (p[d]),
			       "m" (p[d+64]));
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}

const struct raid6_calls raid6_avx512x2 = {
	raid6_avx5122_gen_syndrome,
	raid6_avx5122_xor_syndrome,
	raid6_have_avx512,
	"avx512x2",
	1                       /* Has cache hints */
};

#ifdef CONFIG_X86_64

/*
 * Unrolled-by-4 AVX512 implementation
 */
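/*
 * 64-bit only: this four-way unroll uses zmm10-zmm15 as the extra P/Q
 * accumulators, and those registers are not architecturally available
 * outside 64-bit mode.
 */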
static void raid6_avx5124_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = disks - 3;         /* Highest data disk */
	p = dptr[z0+1];         /* XOR parity */
	q = dptr[z0+2];         /* RS syndrome */

	kernel_fpu_begin();

	asm volatile("vmovdqa64 %0,%%zmm0\n\t"
		     "vpxorq %%zmm1,%%zmm1,%%zmm1\n\t"       /* Zero temp */
		     "vpxorq %%zmm2,%%zmm2,%%zmm2\n\t"       /* P[0] */
		     "vpxorq %%zmm3,%%zmm3,%%zmm3\n\t"       /* P[1] */
		     "vpxorq %%zmm4,%%zmm4,%%zmm4\n\t"       /* Q[0] */
		     "vpxorq %%zmm6,%%zmm6,%%zmm6\n\t"       /* Q[1] */
		     "vpxorq %%zmm10,%%zmm10,%%zmm10\n\t"    /* P[2] */
		     "vpxorq %%zmm11,%%zmm11,%%zmm11\n\t"    /* P[3] */
		     "vpxorq %%zmm12,%%zmm12,%%zmm12\n\t"    /* Q[2] */
		     "vpxorq %%zmm14,%%zmm14,%%zmm14"        /* Q[3] */
		     :
		     : "m" (raid6_avx512_constants.x1d[0]));

	for (d = 0; d < bytes; d += 256) {
		for (z = z0; z >= 0; z--) {
		asm volatile("prefetchnta %0\n\t"
			     "prefetchnta %1\n\t"
			     "prefetchnta %2\n\t"
			     "prefetchnta %3\n\t"
			     "vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t"
			     "vpcmpgtb %%zmm6,%%zmm1,%%k2\n\t"
			     "vpcmpgtb %%zmm12,%%zmm1,%%k3\n\t"
			     "vpcmpgtb %%zmm14,%%zmm1,%%k4\n\t"
			     "vpmovm2b %%k1,%%zmm5\n\t"
			     "vpmovm2b %%k2,%%zmm7\n\t"
			     "vpmovm2b %%k3,%%zmm13\n\t"
			     "vpmovm2b %%k4,%%zmm15\n\t"
			     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
			     "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
			     "vpaddb %%zmm12,%%zmm12,%%zmm12\n\t"
			     "vpaddb %%zmm14,%%zmm14,%%zmm14\n\t"
			     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
			     "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
			     "vpandq %%zmm0,%%zmm13,%%zmm13\n\t"
			     "vpandq %%zmm0,%%zmm15,%%zmm15\n\t"
			     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
			     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
			     "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
			     "vpxorq %%zmm15,%%zmm14,%%zmm14\n\t"
			     "vmovdqa64 %0,%%zmm5\n\t"
			     "vmovdqa64 %1,%%zmm7\n\t"
			     "vmovdqa64 %2,%%zmm13\n\t"
			     "vmovdqa64 %3,%%zmm15\n\t"
			     "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t"
			     "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t"
			     "vpxorq %%zmm13,%%zmm10,%%zmm10\n\t"
			     "vpxorq %%zmm15,%%zmm11,%%zmm11\n\t"
			     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
			     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
			     "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
			     "vpxorq %%zmm15,%%zmm14,%%zmm14"
			     :
			     : "m" (dptr[z][d]), "m" (dptr[z][d+64]),
			       "m" (dptr[z][d+128]), "m" (dptr[z][d+192]));
		}
		asm volatile("vmovntdq %%zmm2,%0\n\t"
			     "vpxorq %%zmm2,%%zmm2,%%zmm2\n\t"
			     "vmovntdq %%zmm3,%1\n\t"
			     "vpxorq %%zmm3,%%zmm3,%%zmm3\n\t"
			     "vmovntdq %%zmm10,%2\n\t"
			     "vpxorq %%zmm10,%%zmm10,%%zmm10\n\t"
			     "vmovntdq %%zmm11,%3\n\t"
			     "vpxorq %%zmm11,%%zmm11,%%zmm11\n\t"
			     "vmovntdq %%zmm4,%4\n\t"
			     "vpxorq %%zmm4,%%zmm4,%%zmm4\n\t"
			     "vmovntdq %%zmm6,%5\n\t"
			     "vpxorq %%zmm6,%%zmm6,%%zmm6\n\t"
			     "vmovntdq %%zmm12,%6\n\t"
			     "vpxorq %%zmm12,%%zmm12,%%zmm12\n\t"
			     "vmovntdq %%zmm14,%7\n\t"
			     "vpxorq %%zmm14,%%zmm14,%%zmm14"
			     :
			     : "m" (p[d]), "m" (p[d+64]), "m" (p[d+128]),
			       "m" (p[d+192]), "m" (q[d]), "m" (q[d+64]),
			       "m" (q[d+128]), "m" (q[d+192]));
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}

static void raid6_avx5124_xor_syndrome(int disks, int start, int stop,
				       size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = stop;		/* P/Q right side optimization */
	p = dptr[disks-2];	/* XOR parity */
	q = dptr[disks-1];	/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("vmovdqa64 %0,%%zmm0"
		     :: "m" (raid6_avx512_constants.x1d[0]));

	for (d = 0 ; d < bytes ; d += 256) {
		asm volatile("vmovdqa64 %0,%%zmm4\n\t"
			     "vmovdqa64 %1,%%zmm6\n\t"
			     "vmovdqa64 %2,%%zmm12\n\t"
			     "vmovdqa64 %3,%%zmm14\n\t"
			     "vmovdqa64 %4,%%zmm2\n\t"
			     "vmovdqa64 %5,%%zmm3\n\t"
			     "vmovdqa64 %6,%%zmm10\n\t"
			     "vmovdqa64 %7,%%zmm11\n\t"
			     "vpxorq %%zmm4,%%zmm2,%%zmm2\n\t"
			     "vpxorq %%zmm6,%%zmm3,%%zmm3\n\t"
			     "vpxorq %%zmm12,%%zmm10,%%zmm10\n\t"
			     "vpxorq %%zmm14,%%zmm11,%%zmm11"
			     :
			     : "m" (dptr[z0][d]), "m" (dptr[z0][d+64]),
			       "m" (dptr[z0][d+128]), "m" (dptr[z0][d+192]),
			       "m" (p[d]), "m" (p[d+64]), "m" (p[d+128]),
			       "m" (p[d+192]));
		/* P/Q data pages */
		for (z = z0-1 ; z >= start ; z--) {
			asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
				     "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t"
				     "vpxorq %%zmm13,%%zmm13,%%zmm13\n\t"
				     "vpxorq %%zmm15,%%zmm15,%%zmm15\n\t"
				     "prefetchnta %0\n\t"
				     "prefetchnta %2\n\t"
				     "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
				     "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t"
				     "vpcmpgtb %%zmm12,%%zmm13,%%k3\n\t"
				     "vpcmpgtb %%zmm14,%%zmm15,%%k4\n\t"
				     "vpmovm2b %%k1,%%zmm5\n\t"
				     "vpmovm2b %%k2,%%zmm7\n\t"
				     "vpmovm2b %%k3,%%zmm13\n\t"
				     "vpmovm2b %%k4,%%zmm15\n\t"
				     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
				     "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
				     "vpaddb %%zmm12,%%zmm12,%%zmm12\n\t"
				     "vpaddb %%zmm14,%%zmm14,%%zmm14\n\t"
				     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
				     "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
				     "vpandq %%zmm0,%%zmm13,%%zmm13\n\t"
				     "vpandq %%zmm0,%%zmm15,%%zmm15\n\t"
				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
				     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
				     "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
				     "vpxorq %%zmm15,%%zmm14,%%zmm14\n\t"
				     "vmovdqa64 %0,%%zmm5\n\t"
				     "vmovdqa64 %1,%%zmm7\n\t"
				     "vmovdqa64 %2,%%zmm13\n\t"
				     "vmovdqa64 %3,%%zmm15\n\t"
				     "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t"
				     "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t"
				     "vpxorq %%zmm13,%%zmm10,%%zmm10\n\t"
				     "vpxorq %%zmm15,%%zmm11,%%zmm11\n\t"
				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
				     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
				     "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
				     "vpxorq %%zmm15,%%zmm14,%%zmm14"
				     :
				     : "m" (dptr[z][d]), "m" (dptr[z][d+64]),
				       "m" (dptr[z][d+128]),
				       "m" (dptr[z][d+192]));
		}
		asm volatile("prefetchnta %0\n\t"
			     "prefetchnta %1\n\t"
			     :
			     : "m" (q[d]), "m" (q[d+128]));
		/* P/Q left side optimization */
		for (z = start-1 ; z >= 0 ; z--) {
			asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
				     "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t"
				     "vpxorq %%zmm13,%%zmm13,%%zmm13\n\t"
				     "vpxorq %%zmm15,%%zmm15,%%zmm15\n\t"
				     "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
				     "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t"
				     "vpcmpgtb %%zmm12,%%zmm13,%%k3\n\t"
				     "vpcmpgtb %%zmm14,%%zmm15,%%k4\n\t"
				     "vpmovm2b %%k1,%%zmm5\n\t"
				     "vpmovm2b %%k2,%%zmm7\n\t"
				     "vpmovm2b %%k3,%%zmm13\n\t"
				     "vpmovm2b %%k4,%%zmm15\n\t"
				     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
				     "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
				     "vpaddb %%zmm12,%%zmm12,%%zmm12\n\t"
				     "vpaddb %%zmm14,%%zmm14,%%zmm14\n\t"
				     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
				     "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
				     "vpandq %%zmm0,%%zmm13,%%zmm13\n\t"
				     "vpandq %%zmm0,%%zmm15,%%zmm15\n\t"
				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
				     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
				     "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
				     "vpxorq %%zmm15,%%zmm14,%%zmm14"
				     :
				     : );
		}
		asm volatile("vmovntdq %%zmm2,%0\n\t"
			     "vmovntdq %%zmm3,%1\n\t"
			     "vmovntdq %%zmm10,%2\n\t"
			     "vmovntdq %%zmm11,%3\n\t"
			     "vpxorq %4,%%zmm4,%%zmm4\n\t"
			     "vpxorq %5,%%zmm6,%%zmm6\n\t"
			     "vpxorq %6,%%zmm12,%%zmm12\n\t"
			     "vpxorq %7,%%zmm14,%%zmm14\n\t"
			     "vmovntdq %%zmm4,%4\n\t"
			     "vmovntdq %%zmm6,%5\n\t"
			     "vmovntdq %%zmm12,%6\n\t"
			     "vmovntdq %%zmm14,%7"
			     :
			     : "m" (p[d]),  "m" (p[d+64]), "m" (p[d+128]),
			       "m" (p[d+192]), "m" (q[d]),  "m" (q[d+64]),
			       "m" (q[d+128]), "m" (q[d+192]));
	}
	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}

const struct raid6_calls raid6_avx512x4 = {
	raid6_avx5124_gen_syndrome,
	raid6_avx5124_xor_syndrome,
	raid6_have_avx512,
	"avx512x4",
	1                       /* Has cache hints */
};

#endif /* CONFIG_X86_64 */

#endif /* CONFIG_AS_AVX512 */