// SPDX-License-Identifier: GPL-2.0-or-later
/* -*- linux-c -*- ------------------------------------------------------- *
 *
 *   Copyright (C) 2012 Intel Corporation
 *   Author: Yuanhan Liu <yuanhan.liu@linux.intel.com>
 *
 *   Based on sse2.c: Copyright 2002 H. Peter Anvin - All Rights Reserved
 *
 * ----------------------------------------------------------------------- */

/*
 * AVX2 implementation of RAID-6 syndrome functions
 *
 */

#include <linux/raid/pq.h>
#include "x86.h"

static const struct raid6_avx2_constants {
	u64 x1d[4];
} raid6_avx2_constants __aligned(32) = {
	{ 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,
	  0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,},
};
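/*
 * 0x1d is the low byte of the RAID-6 Galois-field generator polynomial
 * (x^8 + x^4 + x^3 + x^2 + 1, i.e. 0x11d).  Broadcast across a 32-byte
 * register, it drives the SIMD multiply-by-2 used below: every byte is
 * shifted left by one, and lanes whose old top bit was set are reduced by
 * XORing in 0x1d; per byte that is roughly (v << 1) ^ ((v & 0x80) ? 0x1d : 0).
 */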

static int raid6_have_avx2(void)
{
	return boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_AVX);
}

/*
 * Plain AVX2 implementation
 */
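/*
 * gen_syndrome computes both parities in one pass over the stripe.  P is
 * the plain XOR of all data blocks; Q is evaluated by Horner's rule,
 * starting with the highest data disk and, for each lower disk, doing
 * Q = 2*Q + D_z in GF(2^8).  The multiply-by-2 is the four-instruction
 * sequence in the loops: vpcmpgtb against the zero register builds a mask
 * of bytes whose top bit is set, vpaddb doubles every byte, vpand selects
 * 0x1d for the masked lanes, and vpxor folds the reduction in.  Results
 * are written with non-temporal stores (vmovntdq) and fenced with sfence,
 * so the freshly generated P/Q pages do not displace cached data.
 */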
static void raid6_avx21_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = disks - 3;		/* Highest data disk */
	p = dptr[z0+1];		/* XOR parity */
	q = dptr[z0+2];		/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));
	asm volatile("vpxor %ymm3,%ymm3,%ymm3");	/* Zero temp */

	for (d = 0; d < bytes; d += 32) {
		asm volatile("prefetchnta %0" : : "m" (dptr[z0][d]));
		asm volatile("vmovdqa %0,%%ymm2" : : "m" (dptr[z0][d]));/* P[0] */
		asm volatile("prefetchnta %0" : : "m" (dptr[z0-1][d]));
		asm volatile("vmovdqa %ymm2,%ymm4");/* Q[0] */
		asm volatile("vmovdqa %0,%%ymm6" : : "m" (dptr[z0-1][d]));
		for (z = z0-2; z >= 0; z--) {
			asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
			asm volatile("vpcmpgtb %ymm4,%ymm3,%ymm5");
			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
			asm volatile("vpand %ymm0,%ymm5,%ymm5");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vpxor %ymm6,%ymm2,%ymm2");
			asm volatile("vpxor %ymm6,%ymm4,%ymm4");
			asm volatile("vmovdqa %0,%%ymm6" : : "m" (dptr[z][d]));
		}
		asm volatile("vpcmpgtb %ymm4,%ymm3,%ymm5");
		asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
		asm volatile("vpand %ymm0,%ymm5,%ymm5");
		asm volatile("vpxor %ymm5,%ymm4,%ymm4");
		asm volatile("vpxor %ymm6,%ymm2,%ymm2");
		asm volatile("vpxor %ymm6,%ymm4,%ymm4");

		asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d]));
		asm volatile("vpxor %ymm2,%ymm2,%ymm2");
		asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d]));
		asm volatile("vpxor %ymm4,%ymm4,%ymm4");
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}

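/*
 * xor_syndrome folds the contribution of data disks [start, stop] into
 * P/Q blocks that already exist in memory rather than regenerating the
 * whole stripe; since XOR is its own inverse, the same routine can strip
 * old data out and fold new data in, e.g. for a partial-stripe
 * read-modify-write.  The "right side" optimization starts the Horner
 * evaluation at z0 = stop, because untouched higher disks contribute
 * nothing to the delta; the "left side" loop below 'start' only keeps
 * multiplying the accumulated Q term by 2, which shifts it to the power
 * of the generator matching its disk position.
 */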
static void raid6_avx21_xor_syndrome(int disks, int start, int stop,
				     size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = stop;		/* P/Q right side optimization */
	p = dptr[disks-2];	/* XOR parity */
	q = dptr[disks-1];	/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));

	for (d = 0 ; d < bytes ; d += 32) {
		asm volatile("vmovdqa %0,%%ymm4" :: "m" (dptr[z0][d]));
		asm volatile("vmovdqa %0,%%ymm2" : : "m" (p[d]));
		asm volatile("vpxor %ymm4,%ymm2,%ymm2");
		/* P/Q data pages */
		for (z = z0-1 ; z >= start ; z--) {
			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
			asm volatile("vpand %ymm0,%ymm5,%ymm5");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vmovdqa %0,%%ymm5" :: "m" (dptr[z][d]));
			asm volatile("vpxor %ymm5,%ymm2,%ymm2");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
		}
		/* P/Q left side optimization */
		for (z = start-1 ; z >= 0 ; z--) {
			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
			asm volatile("vpand %ymm0,%ymm5,%ymm5");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
		}
		asm volatile("vpxor %0,%%ymm4,%%ymm4" : : "m" (q[d]));
		/* Don't use movntdq for r/w memory area < cache line */
		asm volatile("vmovdqa %%ymm4,%0" : "=m" (q[d]));
		asm volatile("vmovdqa %%ymm2,%0" : "=m" (p[d]));
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}

const struct raid6_calls raid6_avx2x1 = {
	raid6_avx21_gen_syndrome,
	raid6_avx21_xor_syndrome,
	raid6_have_avx2,
	"avx2x1",
	.priority = 2		/* Prefer AVX2 over priority 1 (SSE2 and others) */
};
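/*
 * raid6_avx2x1 (and the avx2x2/avx2x4 variants below) are entries for the
 * RAID-6 library's algorithm table in lib/raid6/algos.c.  At initialization
 * the library checks each valid() hook, raid6_have_avx2() here, benchmarks
 * the usable gen_syndrome() routines and picks a winner, with the priority
 * field biasing the choice towards AVX2 over the priority-1 SSE2 variants.
 */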

/*
 * Unrolled-by-2 AVX2 implementation
 */
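/*
 * Same algorithm as the plain version, but each pass works on two
 * independent 32-byte P/Q chains (64 bytes of every block per iteration).
 * Interleaving two chains gives the CPU independent instructions with
 * which to overlap the latency of the multiply-by-2 dependency chain.
 */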
static void raid6_avx22_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = disks - 3;		/* Highest data disk */
	p = dptr[z0+1];		/* XOR parity */
	q = dptr[z0+2];		/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));
	asm volatile("vpxor %ymm1,%ymm1,%ymm1"); /* Zero temp */

	/* We uniformly assume a single prefetch covers at least 32 bytes */
	for (d = 0; d < bytes; d += 64) {
		asm volatile("prefetchnta %0" : : "m" (dptr[z0][d]));
		asm volatile("prefetchnta %0" : : "m" (dptr[z0][d+32]));
		asm volatile("vmovdqa %0,%%ymm2" : : "m" (dptr[z0][d]));/* P[0] */
		asm volatile("vmovdqa %0,%%ymm3" : : "m" (dptr[z0][d+32]));/* P[1] */
		asm volatile("vmovdqa %ymm2,%ymm4"); /* Q[0] */
		asm volatile("vmovdqa %ymm3,%ymm6"); /* Q[1] */
		for (z = z0-1; z >= 0; z--) {
			asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
			asm volatile("prefetchnta %0" : : "m" (dptr[z][d+32]));
			asm volatile("vpcmpgtb %ymm4,%ymm1,%ymm5");
			asm volatile("vpcmpgtb %ymm6,%ymm1,%ymm7");
			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
			asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
			asm volatile("vpand %ymm0,%ymm5,%ymm5");
			asm volatile("vpand %ymm0,%ymm7,%ymm7");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
			asm volatile("vmovdqa %0,%%ymm5" : : "m" (dptr[z][d]));
			asm volatile("vmovdqa %0,%%ymm7" : : "m" (dptr[z][d+32]));
			asm volatile("vpxor %ymm5,%ymm2,%ymm2");
			asm volatile("vpxor %ymm7,%ymm3,%ymm3");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
		}
		asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d]));
		asm volatile("vmovntdq %%ymm3,%0" : "=m" (p[d+32]));
		asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d]));
		asm volatile("vmovntdq %%ymm6,%0" : "=m" (q[d+32]));
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}

static void raid6_avx22_xor_syndrome(int disks, int start, int stop,
				     size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = stop;		/* P/Q right side optimization */
	p = dptr[disks-2];	/* XOR parity */
	q = dptr[disks-1];	/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));

	for (d = 0 ; d < bytes ; d += 64) {
		asm volatile("vmovdqa %0,%%ymm4" :: "m" (dptr[z0][d]));
		asm volatile("vmovdqa %0,%%ymm6" :: "m" (dptr[z0][d+32]));
		asm volatile("vmovdqa %0,%%ymm2" : : "m" (p[d]));
		asm volatile("vmovdqa %0,%%ymm3" : : "m" (p[d+32]));
		asm volatile("vpxor %ymm4,%ymm2,%ymm2");
		asm volatile("vpxor %ymm6,%ymm3,%ymm3");
		/* P/Q data pages */
		for (z = z0-1 ; z >= start ; z--) {
			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
			asm volatile("vpxor %ymm7,%ymm7,%ymm7");
			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
			asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
			asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
			asm volatile("vpand %ymm0,%ymm5,%ymm5");
			asm volatile("vpand %ymm0,%ymm7,%ymm7");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
			asm volatile("vmovdqa %0,%%ymm5" :: "m" (dptr[z][d]));
			asm volatile("vmovdqa %0,%%ymm7"
				     :: "m" (dptr[z][d+32]));
			asm volatile("vpxor %ymm5,%ymm2,%ymm2");
			asm volatile("vpxor %ymm7,%ymm3,%ymm3");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
		}
		/* P/Q left side optimization */
		for (z = start-1 ; z >= 0 ; z--) {
			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
			asm volatile("vpxor %ymm7,%ymm7,%ymm7");
			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
			asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
			asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
			asm volatile("vpand %ymm0,%ymm5,%ymm5");
			asm volatile("vpand %ymm0,%ymm7,%ymm7");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
		}
		asm volatile("vpxor %0,%%ymm4,%%ymm4" : : "m" (q[d]));
		asm volatile("vpxor %0,%%ymm6,%%ymm6" : : "m" (q[d+32]));
		/* Don't use movntdq for r/w memory area < cache line */
		asm volatile("vmovdqa %%ymm4,%0" : "=m" (q[d]));
		asm volatile("vmovdqa %%ymm6,%0" : "=m" (q[d+32]));
		asm volatile("vmovdqa %%ymm2,%0" : "=m" (p[d]));
		asm volatile("vmovdqa %%ymm3,%0" : "=m" (p[d+32]));
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}

const struct raid6_calls raid6_avx2x2 = {
	raid6_avx22_gen_syndrome,
	raid6_avx22_xor_syndrome,
	raid6_have_avx2,
	"avx2x2",
	.priority = 2		/* Prefer AVX2 over priority 1 (SSE2 and others) */
};

#ifdef CONFIG_X86_64

/*
 * Unrolled-by-4 AVX2 implementation
 */
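/*
 * Four interleaved 32-byte chains per pass (128 bytes of every block per
 * iteration).  This variant needs registers above ymm7 (ymm10-ymm15),
 * which only exist in 64-bit mode, hence the CONFIG_X86_64 guard.
 */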
static void raid6_avx24_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = disks - 3;		/* Highest data disk */
	p = dptr[z0+1];		/* XOR parity */
	q = dptr[z0+2];		/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));
	asm volatile("vpxor %ymm1,%ymm1,%ymm1");	/* Zero temp */
	asm volatile("vpxor %ymm2,%ymm2,%ymm2");	/* P[0] */
	asm volatile("vpxor %ymm3,%ymm3,%ymm3");	/* P[1] */
	asm volatile("vpxor %ymm4,%ymm4,%ymm4");	/* Q[0] */
	asm volatile("vpxor %ymm6,%ymm6,%ymm6");	/* Q[1] */
	asm volatile("vpxor %ymm10,%ymm10,%ymm10");	/* P[2] */
	asm volatile("vpxor %ymm11,%ymm11,%ymm11");	/* P[3] */
	asm volatile("vpxor %ymm12,%ymm12,%ymm12");	/* Q[2] */
	asm volatile("vpxor %ymm14,%ymm14,%ymm14");	/* Q[3] */

	for (d = 0; d < bytes; d += 128) {
		for (z = z0; z >= 0; z--) {
			asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
			asm volatile("prefetchnta %0" : : "m" (dptr[z][d+32]));
			asm volatile("prefetchnta %0" : : "m" (dptr[z][d+64]));
			asm volatile("prefetchnta %0" : : "m" (dptr[z][d+96]));
			asm volatile("vpcmpgtb %ymm4,%ymm1,%ymm5");
			asm volatile("vpcmpgtb %ymm6,%ymm1,%ymm7");
			asm volatile("vpcmpgtb %ymm12,%ymm1,%ymm13");
			asm volatile("vpcmpgtb %ymm14,%ymm1,%ymm15");
			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
			asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
			asm volatile("vpaddb %ymm12,%ymm12,%ymm12");
			asm volatile("vpaddb %ymm14,%ymm14,%ymm14");
			asm volatile("vpand %ymm0,%ymm5,%ymm5");
			asm volatile("vpand %ymm0,%ymm7,%ymm7");
			asm volatile("vpand %ymm0,%ymm13,%ymm13");
			asm volatile("vpand %ymm0,%ymm15,%ymm15");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
			asm volatile("vpxor %ymm13,%ymm12,%ymm12");
			asm volatile("vpxor %ymm15,%ymm14,%ymm14");
			asm volatile("vmovdqa %0,%%ymm5" : : "m" (dptr[z][d]));
			asm volatile("vmovdqa %0,%%ymm7" : : "m" (dptr[z][d+32]));
			asm volatile("vmovdqa %0,%%ymm13" : : "m" (dptr[z][d+64]));
			asm volatile("vmovdqa %0,%%ymm15" : : "m" (dptr[z][d+96]));
			asm volatile("vpxor %ymm5,%ymm2,%ymm2");
			asm volatile("vpxor %ymm7,%ymm3,%ymm3");
			asm volatile("vpxor %ymm13,%ymm10,%ymm10");
			asm volatile("vpxor %ymm15,%ymm11,%ymm11");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
			asm volatile("vpxor %ymm13,%ymm12,%ymm12");
			asm volatile("vpxor %ymm15,%ymm14,%ymm14");
		}
		asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d]));
		asm volatile("vpxor %ymm2,%ymm2,%ymm2");
		asm volatile("vmovntdq %%ymm3,%0" : "=m" (p[d+32]));
		asm volatile("vpxor %ymm3,%ymm3,%ymm3");
		asm volatile("vmovntdq %%ymm10,%0" : "=m" (p[d+64]));
		asm volatile("vpxor %ymm10,%ymm10,%ymm10");
		asm volatile("vmovntdq %%ymm11,%0" : "=m" (p[d+96]));
		asm volatile("vpxor %ymm11,%ymm11,%ymm11");
		asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d]));
		asm volatile("vpxor %ymm4,%ymm4,%ymm4");
		asm volatile("vmovntdq %%ymm6,%0" : "=m" (q[d+32]));
		asm volatile("vpxor %ymm6,%ymm6,%ymm6");
		asm volatile("vmovntdq %%ymm12,%0" : "=m" (q[d+64]));
		asm volatile("vpxor %ymm12,%ymm12,%ymm12");
		asm volatile("vmovntdq %%ymm14,%0" : "=m" (q[d+96]));
		asm volatile("vpxor %ymm14,%ymm14,%ymm14");
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}

static void raid6_avx24_xor_syndrome(int disks, int start, int stop,
				     size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = stop;		/* P/Q right side optimization */
	p = dptr[disks-2];	/* XOR parity */
	q = dptr[disks-1];	/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("vmovdqa %0,%%ymm0" :: "m" (raid6_avx2_constants.x1d[0]));

	for (d = 0 ; d < bytes ; d += 128) {
		asm volatile("vmovdqa %0,%%ymm4" :: "m" (dptr[z0][d]));
		asm volatile("vmovdqa %0,%%ymm6" :: "m" (dptr[z0][d+32]));
		asm volatile("vmovdqa %0,%%ymm12" :: "m" (dptr[z0][d+64]));
		asm volatile("vmovdqa %0,%%ymm14" :: "m" (dptr[z0][d+96]));
		asm volatile("vmovdqa %0,%%ymm2" : : "m" (p[d]));
		asm volatile("vmovdqa %0,%%ymm3" : : "m" (p[d+32]));
		asm volatile("vmovdqa %0,%%ymm10" : : "m" (p[d+64]));
		asm volatile("vmovdqa %0,%%ymm11" : : "m" (p[d+96]));
		asm volatile("vpxor %ymm4,%ymm2,%ymm2");
		asm volatile("vpxor %ymm6,%ymm3,%ymm3");
		asm volatile("vpxor %ymm12,%ymm10,%ymm10");
		asm volatile("vpxor %ymm14,%ymm11,%ymm11");
		/* P/Q data pages */
		for (z = z0-1 ; z >= start ; z--) {
			asm volatile("prefetchnta %0" :: "m" (dptr[z][d]));
			asm volatile("prefetchnta %0" :: "m" (dptr[z][d+64]));
			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
			asm volatile("vpxor %ymm7,%ymm7,%ymm7");
			asm volatile("vpxor %ymm13,%ymm13,%ymm13");
			asm volatile("vpxor %ymm15,%ymm15,%ymm15");
			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
			asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
			asm volatile("vpcmpgtb %ymm12,%ymm13,%ymm13");
			asm volatile("vpcmpgtb %ymm14,%ymm15,%ymm15");
			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
			asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
			asm volatile("vpaddb %ymm12,%ymm12,%ymm12");
			asm volatile("vpaddb %ymm14,%ymm14,%ymm14");
			asm volatile("vpand %ymm0,%ymm5,%ymm5");
			asm volatile("vpand %ymm0,%ymm7,%ymm7");
			asm volatile("vpand %ymm0,%ymm13,%ymm13");
			asm volatile("vpand %ymm0,%ymm15,%ymm15");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
			asm volatile("vpxor %ymm13,%ymm12,%ymm12");
			asm volatile("vpxor %ymm15,%ymm14,%ymm14");
			asm volatile("vmovdqa %0,%%ymm5" :: "m" (dptr[z][d]));
			asm volatile("vmovdqa %0,%%ymm7"
				     :: "m" (dptr[z][d+32]));
			asm volatile("vmovdqa %0,%%ymm13"
				     :: "m" (dptr[z][d+64]));
			asm volatile("vmovdqa %0,%%ymm15"
				     :: "m" (dptr[z][d+96]));
			asm volatile("vpxor %ymm5,%ymm2,%ymm2");
			asm volatile("vpxor %ymm7,%ymm3,%ymm3");
			asm volatile("vpxor %ymm13,%ymm10,%ymm10");
			asm volatile("vpxor %ymm15,%ymm11,%ymm11");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
			asm volatile("vpxor %ymm13,%ymm12,%ymm12");
			asm volatile("vpxor %ymm15,%ymm14,%ymm14");
		}
		asm volatile("prefetchnta %0" :: "m" (q[d]));
		asm volatile("prefetchnta %0" :: "m" (q[d+64]));
		/* P/Q left side optimization */
		for (z = start-1 ; z >= 0 ; z--) {
			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
			asm volatile("vpxor %ymm7,%ymm7,%ymm7");
			asm volatile("vpxor %ymm13,%ymm13,%ymm13");
			asm volatile("vpxor %ymm15,%ymm15,%ymm15");
			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
			asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
			asm volatile("vpcmpgtb %ymm12,%ymm13,%ymm13");
			asm volatile("vpcmpgtb %ymm14,%ymm15,%ymm15");
			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
			asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
			asm volatile("vpaddb %ymm12,%ymm12,%ymm12");
			asm volatile("vpaddb %ymm14,%ymm14,%ymm14");
			asm volatile("vpand %ymm0,%ymm5,%ymm5");
			asm volatile("vpand %ymm0,%ymm7,%ymm7");
			asm volatile("vpand %ymm0,%ymm13,%ymm13");
			asm volatile("vpand %ymm0,%ymm15,%ymm15");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
			asm volatile("vpxor %ymm13,%ymm12,%ymm12");
			asm volatile("vpxor %ymm15,%ymm14,%ymm14");
		}
		asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d]));
		asm volatile("vmovntdq %%ymm3,%0" : "=m" (p[d+32]));
		asm volatile("vmovntdq %%ymm10,%0" : "=m" (p[d+64]));
		asm volatile("vmovntdq %%ymm11,%0" : "=m" (p[d+96]));
		asm volatile("vpxor %0,%%ymm4,%%ymm4" : : "m" (q[d]));
		asm volatile("vpxor %0,%%ymm6,%%ymm6" : : "m" (q[d+32]));
		asm volatile("vpxor %0,%%ymm12,%%ymm12" : : "m" (q[d+64]));
		asm volatile("vpxor %0,%%ymm14,%%ymm14" : : "m" (q[d+96]));
		asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d]));
		asm volatile("vmovntdq %%ymm6,%0" : "=m" (q[d+32]));
		asm volatile("vmovntdq %%ymm12,%0" : "=m" (q[d+64]));
		asm volatile("vmovntdq %%ymm14,%0" : "=m" (q[d+96]));
	}
	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}

const struct raid6_calls raid6_avx2x4 = {
	raid6_avx24_gen_syndrome,
	raid6_avx24_xor_syndrome,
	raid6_have_avx2,
	"avx2x4",
	.priority = 2		/* Prefer AVX2 over priority 1 (SSE2 and others) */
};
#endif /* CONFIG_X86_64 */