lib/raid6/avx2.c as of Linux v4.17:

/* -*- linux-c -*- ------------------------------------------------------- *
 *
 *   Copyright (C) 2012 Intel Corporation
 *   Author: Yuanhan Liu <yuanhan.liu@linux.intel.com>
 *
 *   Based on sse2.c: Copyright 2002 H. Peter Anvin - All Rights Reserved
 *
 *
 *   This program is free software; you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, Inc., 53 Temple Place Ste 330,
 *   Boston MA 02111-1307, USA; either version 2 of the License, or
 *   (at your option) any later version; incorporated herein by reference.
 *
 * ----------------------------------------------------------------------- */

/*
 * AVX2 implementation of RAID-6 syndrome functions
 *
 */
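
For orientation (not part of the kernel file): for each byte position across the data disks D_0 .. D_z0, gen_syndrome() produces the XOR parity P = D_0 ^ D_1 ^ ... ^ D_z0 and the Reed-Solomon syndrome Q = D_0 ^ 2·D_1 ^ 4·D_2 ^ ... ^ 2^z0·D_z0, where the multiplications are in GF(2^8) modulo the polynomial 0x11d (the source of the 0x1d constant used below). A minimal scalar sketch of that computation, assuming the same argument layout of data-disk pointers followed by the P and Q pages:

/* Illustrative scalar reference, not part of lib/raid6/avx2.c */
#include <stddef.h>
#include <stdint.h>

/* Multiply a GF(2^8) element by 2 (the RAID-6 generator), polynomial 0x11d */
static uint8_t gf_mul2(uint8_t v)
{
	return (v << 1) ^ ((v & 0x80) ? 0x1d : 0);
}

static void gen_syndrome_ref(int disks, size_t bytes, void **ptrs)
{
	uint8_t **dp = (uint8_t **)ptrs;
	int z0 = disks - 3;		/* highest data disk */
	uint8_t *p = dp[z0 + 1];	/* XOR parity */
	uint8_t *q = dp[z0 + 2];	/* RS syndrome */

	for (size_t i = 0; i < bytes; i++) {
		uint8_t wp = dp[z0][i];	/* running P */
		uint8_t wq = dp[z0][i];	/* running Q (Horner form) */

		for (int z = z0 - 1; z >= 0; z--) {
			wp ^= dp[z][i];			/* P: plain XOR */
			wq = gf_mul2(wq) ^ dp[z][i];	/* Q: multiply, then add */
		}
		p[i] = wp;
		q[i] = wq;
	}
}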

#ifdef CONFIG_AS_AVX2

#include <linux/raid/pq.h>
#include "x86.h"

static const struct raid6_avx2_constants {
	u64 x1d[4];
} raid6_avx2_constants __aligned(32) = {
	{ 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,
	  0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,},
};

static int raid6_have_avx2(void)
{
	return boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_AVX);
}

/*
 * Plain AVX2 implementation
 */
static void raid6_avx21_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = disks - 3;		/* Highest data disk */
	p = dptr[z0+1];		/* XOR parity */
	q = dptr[z0+2];		/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));
	asm volatile("vpxor %ymm3,%ymm3,%ymm3");	/* Zero temp */

	for (d = 0; d < bytes; d += 32) {
		asm volatile("prefetchnta %0" : : "m" (dptr[z0][d]));
		asm volatile("vmovdqa %0,%%ymm2" : : "m" (dptr[z0][d]));/* P[0] */
		asm volatile("prefetchnta %0" : : "m" (dptr[z0-1][d]));
		asm volatile("vmovdqa %ymm2,%ymm4");/* Q[0] */
		asm volatile("vmovdqa %0,%%ymm6" : : "m" (dptr[z0-1][d]));
		for (z = z0-2; z >= 0; z--) {
			asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
			asm volatile("vpcmpgtb %ymm4,%ymm3,%ymm5");
			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
			asm volatile("vpand %ymm0,%ymm5,%ymm5");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vpxor %ymm6,%ymm2,%ymm2");
			asm volatile("vpxor %ymm6,%ymm4,%ymm4");
			asm volatile("vmovdqa %0,%%ymm6" : : "m" (dptr[z][d]));
		}
		asm volatile("vpcmpgtb %ymm4,%ymm3,%ymm5");
		asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
		asm volatile("vpand %ymm0,%ymm5,%ymm5");
		asm volatile("vpxor %ymm5,%ymm4,%ymm4");
		asm volatile("vpxor %ymm6,%ymm2,%ymm2");
		asm volatile("vpxor %ymm6,%ymm4,%ymm4");

		asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d]));
		asm volatile("vpxor %ymm2,%ymm2,%ymm2");
		asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d]));
		asm volatile("vpxor %ymm4,%ymm4,%ymm4");
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}
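
The four-instruction vpcmpgtb/vpaddb/vpand/vpxor group in the loop above is the SIMD form of the GF(2^8) multiply-by-2 step: vpcmpgtb against the zero register yields 0xff in every byte whose top bit is set (a signed compare, so "negative" bytes), vpaddb doubles each byte (a per-byte left shift), and the masked XOR with 0x1d applies the polynomial reduction. Per byte it matches the earlier gf_mul2() sketch:

/* One byte of the asm sequence, spelled out (illustrative) */
static inline uint8_t mul2_step(uint8_t q_byte)
{
	uint8_t mask = (q_byte & 0x80) ? 0xff : 0x00;	/* vpcmpgtb vs. zero      */
	q_byte = (uint8_t)(q_byte + q_byte);		/* vpaddb: shift left by 1 */
	return q_byte ^ (mask & 0x1d);			/* vpand + vpxor: reduce   */
}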

static void raid6_avx21_xor_syndrome(int disks, int start, int stop,
				     size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = stop;		/* P/Q right side optimization */
	p = dptr[disks-2];	/* XOR parity */
	q = dptr[disks-1];	/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));

	for (d = 0 ; d < bytes ; d += 32) {
		asm volatile("vmovdqa %0,%%ymm4" :: "m" (dptr[z0][d]));
		asm volatile("vmovdqa %0,%%ymm2" : : "m" (p[d]));
		asm volatile("vpxor %ymm4,%ymm2,%ymm2");
		/* P/Q data pages */
		for (z = z0-1 ; z >= start ; z--) {
			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
			asm volatile("vpand %ymm0,%ymm5,%ymm5");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vmovdqa %0,%%ymm5" :: "m" (dptr[z][d]));
			asm volatile("vpxor %ymm5,%ymm2,%ymm2");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
		}
		/* P/Q left side optimization */
		for (z = start-1 ; z >= 0 ; z--) {
			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
			asm volatile("vpand %ymm0,%ymm5,%ymm5");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
		}
		asm volatile("vpxor %0,%%ymm4,%%ymm4" : : "m" (q[d]));
		/* Don't use movntdq for r/w memory area < cache line */
		asm volatile("vmovdqa %%ymm4,%0" : "=m" (q[d]));
		asm volatile("vmovdqa %%ymm2,%0" : "=m" (p[d]));
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}
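
As the two inner loops suggest, the xor_syndrome() variant folds the contribution of data disks start..stop into an already existing P/Q pair: P is XORed with those data blocks, Q is XORed with their GF(2^8)-weighted sum, and the "left side" loop merely keeps doubling the accumulator for the lower-numbered disks, which contribute no new data. A scalar sketch of that update, reusing the illustrative gf_mul2() helper from above:

/* Illustrative scalar equivalent of the AVX2 xor_syndrome routines */
static void xor_syndrome_ref(int disks, int start, int stop,
			     size_t bytes, void **ptrs)
{
	uint8_t **dp = (uint8_t **)ptrs;
	uint8_t *p = dp[disks - 2];	/* existing XOR parity */
	uint8_t *q = dp[disks - 1];	/* existing RS syndrome */

	for (size_t i = 0; i < bytes; i++) {
		uint8_t wp = dp[stop][i], wq = dp[stop][i];

		/* Disks inside [start, stop] feed both P and Q */
		for (int z = stop - 1; z >= start; z--) {
			wq = gf_mul2(wq) ^ dp[z][i];
			wp ^= dp[z][i];
		}
		/* Disks below start only shift Q's accumulated weight */
		for (int z = start - 1; z >= 0; z--)
			wq = gf_mul2(wq);

		p[i] ^= wp;
		q[i] ^= wq;
	}
}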

const struct raid6_calls raid6_avx2x1 = {
	raid6_avx21_gen_syndrome,
	raid6_avx21_xor_syndrome,
	raid6_have_avx2,
	"avx2x1",
	1			/* Has cache hints */
};
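
A raid6_calls entry like this is normally benchmarked and selected at boot by the RAID-6 core (lib/raid6/algos.c) rather than called by hand; still, as a sketch of the calling convention, assuming the members are named gen_syndrome, xor_syndrome, valid and name in the order of the initializer above, and 32-byte-aligned buffers laid out as data disks followed by P and Q:

/* Hypothetical direct caller, for illustration only */
static void compute_stripe(void **ptrs, int disks, size_t bytes)
{
	if (!raid6_avx2x1.valid())		/* raid6_have_avx2() */
		return;				/* caller must pick another set */

	/* ptrs[0..disks-3] = data, ptrs[disks-2] = P, ptrs[disks-1] = Q */
	raid6_avx2x1.gen_syndrome(disks, bytes, ptrs);
}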

/*
 * Unrolled-by-2 AVX2 implementation
 */
static void raid6_avx22_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = disks - 3;		/* Highest data disk */
	p = dptr[z0+1];		/* XOR parity */
	q = dptr[z0+2];		/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));
	asm volatile("vpxor %ymm1,%ymm1,%ymm1"); /* Zero temp */

	/* We uniformly assume a single prefetch covers at least 32 bytes */
	for (d = 0; d < bytes; d += 64) {
		asm volatile("prefetchnta %0" : : "m" (dptr[z0][d]));
		asm volatile("prefetchnta %0" : : "m" (dptr[z0][d+32]));
		asm volatile("vmovdqa %0,%%ymm2" : : "m" (dptr[z0][d]));/* P[0] */
		asm volatile("vmovdqa %0,%%ymm3" : : "m" (dptr[z0][d+32]));/* P[1] */
		asm volatile("vmovdqa %ymm2,%ymm4"); /* Q[0] */
		asm volatile("vmovdqa %ymm3,%ymm6"); /* Q[1] */
		for (z = z0-1; z >= 0; z--) {
			asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
			asm volatile("prefetchnta %0" : : "m" (dptr[z][d+32]));
			asm volatile("vpcmpgtb %ymm4,%ymm1,%ymm5");
			asm volatile("vpcmpgtb %ymm6,%ymm1,%ymm7");
			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
			asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
			asm volatile("vpand %ymm0,%ymm5,%ymm5");
			asm volatile("vpand %ymm0,%ymm7,%ymm7");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
			asm volatile("vmovdqa %0,%%ymm5" : : "m" (dptr[z][d]));
			asm volatile("vmovdqa %0,%%ymm7" : : "m" (dptr[z][d+32]));
			asm volatile("vpxor %ymm5,%ymm2,%ymm2");
			asm volatile("vpxor %ymm7,%ymm3,%ymm3");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
		}
		asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d]));
		asm volatile("vmovntdq %%ymm3,%0" : "=m" (p[d+32]));
		asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d]));
		asm volatile("vmovntdq %%ymm6,%0" : "=m" (q[d+32]));
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}

static void raid6_avx22_xor_syndrome(int disks, int start, int stop,
				     size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = stop;		/* P/Q right side optimization */
	p = dptr[disks-2];	/* XOR parity */
	q = dptr[disks-1];	/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));

	for (d = 0 ; d < bytes ; d += 64) {
		asm volatile("vmovdqa %0,%%ymm4" :: "m" (dptr[z0][d]));
		asm volatile("vmovdqa %0,%%ymm6" :: "m" (dptr[z0][d+32]));
		asm volatile("vmovdqa %0,%%ymm2" : : "m" (p[d]));
		asm volatile("vmovdqa %0,%%ymm3" : : "m" (p[d+32]));
		asm volatile("vpxor %ymm4,%ymm2,%ymm2");
		asm volatile("vpxor %ymm6,%ymm3,%ymm3");
		/* P/Q data pages */
		for (z = z0-1 ; z >= start ; z--) {
			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
			asm volatile("vpxor %ymm7,%ymm7,%ymm7");
			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
			asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
			asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
			asm volatile("vpand %ymm0,%ymm5,%ymm5");
			asm volatile("vpand %ymm0,%ymm7,%ymm7");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
			asm volatile("vmovdqa %0,%%ymm5" :: "m" (dptr[z][d]));
			asm volatile("vmovdqa %0,%%ymm7"
				     :: "m" (dptr[z][d+32]));
			asm volatile("vpxor %ymm5,%ymm2,%ymm2");
			asm volatile("vpxor %ymm7,%ymm3,%ymm3");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
		}
		/* P/Q left side optimization */
		for (z = start-1 ; z >= 0 ; z--) {
			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
			asm volatile("vpxor %ymm7,%ymm7,%ymm7");
			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
			asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
			asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
			asm volatile("vpand %ymm0,%ymm5,%ymm5");
			asm volatile("vpand %ymm0,%ymm7,%ymm7");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
		}
		asm volatile("vpxor %0,%%ymm4,%%ymm4" : : "m" (q[d]));
		asm volatile("vpxor %0,%%ymm6,%%ymm6" : : "m" (q[d+32]));
		/* Don't use movntdq for r/w memory area < cache line */
		asm volatile("vmovdqa %%ymm4,%0" : "=m" (q[d]));
		asm volatile("vmovdqa %%ymm6,%0" : "=m" (q[d+32]));
		asm volatile("vmovdqa %%ymm2,%0" : "=m" (p[d]));
		asm volatile("vmovdqa %%ymm3,%0" : "=m" (p[d+32]));
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}

const struct raid6_calls raid6_avx2x2 = {
	raid6_avx22_gen_syndrome,
	raid6_avx22_xor_syndrome,
	raid6_have_avx2,
	"avx2x2",
	1			/* Has cache hints */
};

#ifdef CONFIG_X86_64

/*
 * Unrolled-by-4 AVX2 implementation
 */
static void raid6_avx24_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = disks - 3;		/* Highest data disk */
	p = dptr[z0+1];		/* XOR parity */
	q = dptr[z0+2];		/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));
	asm volatile("vpxor %ymm1,%ymm1,%ymm1");	/* Zero temp */
	asm volatile("vpxor %ymm2,%ymm2,%ymm2");	/* P[0] */
	asm volatile("vpxor %ymm3,%ymm3,%ymm3");	/* P[1] */
	asm volatile("vpxor %ymm4,%ymm4,%ymm4");	/* Q[0] */
	asm volatile("vpxor %ymm6,%ymm6,%ymm6");	/* Q[1] */
	asm volatile("vpxor %ymm10,%ymm10,%ymm10");	/* P[2] */
	asm volatile("vpxor %ymm11,%ymm11,%ymm11");	/* P[3] */
	asm volatile("vpxor %ymm12,%ymm12,%ymm12");	/* Q[2] */
	asm volatile("vpxor %ymm14,%ymm14,%ymm14");	/* Q[3] */

	for (d = 0; d < bytes; d += 128) {
		for (z = z0; z >= 0; z--) {
			asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
			asm volatile("prefetchnta %0" : : "m" (dptr[z][d+32]));
			asm volatile("prefetchnta %0" : : "m" (dptr[z][d+64]));
			asm volatile("prefetchnta %0" : : "m" (dptr[z][d+96]));
			asm volatile("vpcmpgtb %ymm4,%ymm1,%ymm5");
			asm volatile("vpcmpgtb %ymm6,%ymm1,%ymm7");
			asm volatile("vpcmpgtb %ymm12,%ymm1,%ymm13");
			asm volatile("vpcmpgtb %ymm14,%ymm1,%ymm15");
			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
			asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
			asm volatile("vpaddb %ymm12,%ymm12,%ymm12");
			asm volatile("vpaddb %ymm14,%ymm14,%ymm14");
			asm volatile("vpand %ymm0,%ymm5,%ymm5");
			asm volatile("vpand %ymm0,%ymm7,%ymm7");
			asm volatile("vpand %ymm0,%ymm13,%ymm13");
			asm volatile("vpand %ymm0,%ymm15,%ymm15");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
			asm volatile("vpxor %ymm13,%ymm12,%ymm12");
			asm volatile("vpxor %ymm15,%ymm14,%ymm14");
			asm volatile("vmovdqa %0,%%ymm5" : : "m" (dptr[z][d]));
			asm volatile("vmovdqa %0,%%ymm7" : : "m" (dptr[z][d+32]));
			asm volatile("vmovdqa %0,%%ymm13" : : "m" (dptr[z][d+64]));
			asm volatile("vmovdqa %0,%%ymm15" : : "m" (dptr[z][d+96]));
			asm volatile("vpxor %ymm5,%ymm2,%ymm2");
			asm volatile("vpxor %ymm7,%ymm3,%ymm3");
			asm volatile("vpxor %ymm13,%ymm10,%ymm10");
			asm volatile("vpxor %ymm15,%ymm11,%ymm11");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
			asm volatile("vpxor %ymm13,%ymm12,%ymm12");
			asm volatile("vpxor %ymm15,%ymm14,%ymm14");
		}
		asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d]));
		asm volatile("vpxor %ymm2,%ymm2,%ymm2");
		asm volatile("vmovntdq %%ymm3,%0" : "=m" (p[d+32]));
		asm volatile("vpxor %ymm3,%ymm3,%ymm3");
		asm volatile("vmovntdq %%ymm10,%0" : "=m" (p[d+64]));
		asm volatile("vpxor %ymm10,%ymm10,%ymm10");
		asm volatile("vmovntdq %%ymm11,%0" : "=m" (p[d+96]));
		asm volatile("vpxor %ymm11,%ymm11,%ymm11");
		asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d]));
		asm volatile("vpxor %ymm4,%ymm4,%ymm4");
		asm volatile("vmovntdq %%ymm6,%0" : "=m" (q[d+32]));
		asm volatile("vpxor %ymm6,%ymm6,%ymm6");
		asm volatile("vmovntdq %%ymm12,%0" : "=m" (q[d+64]));
		asm volatile("vpxor %ymm12,%ymm12,%ymm12");
		asm volatile("vmovntdq %%ymm14,%0" : "=m" (q[d+96]));
		asm volatile("vpxor %ymm14,%ymm14,%ymm14");
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}

static void raid6_avx24_xor_syndrome(int disks, int start, int stop,
				     size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = stop;		/* P/Q right side optimization */
	p = dptr[disks-2];	/* XOR parity */
	q = dptr[disks-1];	/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("vmovdqa %0,%%ymm0" :: "m" (raid6_avx2_constants.x1d[0]));

	for (d = 0 ; d < bytes ; d += 128) {
		asm volatile("vmovdqa %0,%%ymm4" :: "m" (dptr[z0][d]));
		asm volatile("vmovdqa %0,%%ymm6" :: "m" (dptr[z0][d+32]));
		asm volatile("vmovdqa %0,%%ymm12" :: "m" (dptr[z0][d+64]));
		asm volatile("vmovdqa %0,%%ymm14" :: "m" (dptr[z0][d+96]));
		asm volatile("vmovdqa %0,%%ymm2" : : "m" (p[d]));
		asm volatile("vmovdqa %0,%%ymm3" : : "m" (p[d+32]));
		asm volatile("vmovdqa %0,%%ymm10" : : "m" (p[d+64]));
		asm volatile("vmovdqa %0,%%ymm11" : : "m" (p[d+96]));
		asm volatile("vpxor %ymm4,%ymm2,%ymm2");
		asm volatile("vpxor %ymm6,%ymm3,%ymm3");
		asm volatile("vpxor %ymm12,%ymm10,%ymm10");
		asm volatile("vpxor %ymm14,%ymm11,%ymm11");
		/* P/Q data pages */
		for (z = z0-1 ; z >= start ; z--) {
			asm volatile("prefetchnta %0" :: "m" (dptr[z][d]));
			asm volatile("prefetchnta %0" :: "m" (dptr[z][d+64]));
			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
			asm volatile("vpxor %ymm7,%ymm7,%ymm7");
			asm volatile("vpxor %ymm13,%ymm13,%ymm13");
			asm volatile("vpxor %ymm15,%ymm15,%ymm15");
			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
			asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
			asm volatile("vpcmpgtb %ymm12,%ymm13,%ymm13");
			asm volatile("vpcmpgtb %ymm14,%ymm15,%ymm15");
			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
			asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
			asm volatile("vpaddb %ymm12,%ymm12,%ymm12");
			asm volatile("vpaddb %ymm14,%ymm14,%ymm14");
			asm volatile("vpand %ymm0,%ymm5,%ymm5");
			asm volatile("vpand %ymm0,%ymm7,%ymm7");
			asm volatile("vpand %ymm0,%ymm13,%ymm13");
			asm volatile("vpand %ymm0,%ymm15,%ymm15");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
			asm volatile("vpxor %ymm13,%ymm12,%ymm12");
			asm volatile("vpxor %ymm15,%ymm14,%ymm14");
			asm volatile("vmovdqa %0,%%ymm5" :: "m" (dptr[z][d]));
			asm volatile("vmovdqa %0,%%ymm7"
				     :: "m" (dptr[z][d+32]));
			asm volatile("vmovdqa %0,%%ymm13"
				     :: "m" (dptr[z][d+64]));
			asm volatile("vmovdqa %0,%%ymm15"
				     :: "m" (dptr[z][d+96]));
			asm volatile("vpxor %ymm5,%ymm2,%ymm2");
			asm volatile("vpxor %ymm7,%ymm3,%ymm3");
			asm volatile("vpxor %ymm13,%ymm10,%ymm10");
			asm volatile("vpxor %ymm15,%ymm11,%ymm11");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
			asm volatile("vpxor %ymm13,%ymm12,%ymm12");
			asm volatile("vpxor %ymm15,%ymm14,%ymm14");
		}
		asm volatile("prefetchnta %0" :: "m" (q[d]));
		asm volatile("prefetchnta %0" :: "m" (q[d+64]));
		/* P/Q left side optimization */
		for (z = start-1 ; z >= 0 ; z--) {
			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
			asm volatile("vpxor %ymm7,%ymm7,%ymm7");
			asm volatile("vpxor %ymm13,%ymm13,%ymm13");
			asm volatile("vpxor %ymm15,%ymm15,%ymm15");
			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
			asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
			asm volatile("vpcmpgtb %ymm12,%ymm13,%ymm13");
			asm volatile("vpcmpgtb %ymm14,%ymm15,%ymm15");
			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
			asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
			asm volatile("vpaddb %ymm12,%ymm12,%ymm12");
			asm volatile("vpaddb %ymm14,%ymm14,%ymm14");
			asm volatile("vpand %ymm0,%ymm5,%ymm5");
			asm volatile("vpand %ymm0,%ymm7,%ymm7");
			asm volatile("vpand %ymm0,%ymm13,%ymm13");
			asm volatile("vpand %ymm0,%ymm15,%ymm15");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
			asm volatile("vpxor %ymm13,%ymm12,%ymm12");
			asm volatile("vpxor %ymm15,%ymm14,%ymm14");
		}
		asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d]));
		asm volatile("vmovntdq %%ymm3,%0" : "=m" (p[d+32]));
		asm volatile("vmovntdq %%ymm10,%0" : "=m" (p[d+64]));
		asm volatile("vmovntdq %%ymm11,%0" : "=m" (p[d+96]));
		asm volatile("vpxor %0,%%ymm4,%%ymm4" : : "m" (q[d]));
		asm volatile("vpxor %0,%%ymm6,%%ymm6" : : "m" (q[d+32]));
		asm volatile("vpxor %0,%%ymm12,%%ymm12" : : "m" (q[d+64]));
		asm volatile("vpxor %0,%%ymm14,%%ymm14" : : "m" (q[d+96]));
		asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d]));
		asm volatile("vmovntdq %%ymm6,%0" : "=m" (q[d+32]));
		asm volatile("vmovntdq %%ymm12,%0" : "=m" (q[d+64]));
		asm volatile("vmovntdq %%ymm14,%0" : "=m" (q[d+96]));
	}
	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}

const struct raid6_calls raid6_avx2x4 = {
	raid6_avx24_gen_syndrome,
	raid6_avx24_xor_syndrome,
	raid6_have_avx2,
	"avx2x4",
	1			/* Has cache hints */
};
#endif /* CONFIG_X86_64 */

#endif /* CONFIG_AS_AVX2 */

The same file (lib/raid6/avx2.c) as of Linux v6.2:

// SPDX-License-Identifier: GPL-2.0-or-later
/* -*- linux-c -*- ------------------------------------------------------- *
 *
 *   Copyright (C) 2012 Intel Corporation
 *   Author: Yuanhan Liu <yuanhan.liu@linux.intel.com>
 *
 *   Based on sse2.c: Copyright 2002 H. Peter Anvin - All Rights Reserved
 *
 * ----------------------------------------------------------------------- */

/*
 * AVX2 implementation of RAID-6 syndrome functions
 *
 */

#include <linux/raid/pq.h>
#include "x86.h"

static const struct raid6_avx2_constants {
	u64 x1d[4];
} raid6_avx2_constants __aligned(32) = {
	{ 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,
	  0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,},
};

static int raid6_have_avx2(void)
{
	return boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_AVX);
}

/*
 * Plain AVX2 implementation
 */
static void raid6_avx21_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = disks - 3;		/* Highest data disk */
	p = dptr[z0+1];		/* XOR parity */
	q = dptr[z0+2];		/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));
	asm volatile("vpxor %ymm3,%ymm3,%ymm3");	/* Zero temp */

	for (d = 0; d < bytes; d += 32) {
		asm volatile("prefetchnta %0" : : "m" (dptr[z0][d]));
		asm volatile("vmovdqa %0,%%ymm2" : : "m" (dptr[z0][d]));/* P[0] */
		asm volatile("prefetchnta %0" : : "m" (dptr[z0-1][d]));
		asm volatile("vmovdqa %ymm2,%ymm4");/* Q[0] */
		asm volatile("vmovdqa %0,%%ymm6" : : "m" (dptr[z0-1][d]));
		for (z = z0-2; z >= 0; z--) {
			asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
			asm volatile("vpcmpgtb %ymm4,%ymm3,%ymm5");
			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
			asm volatile("vpand %ymm0,%ymm5,%ymm5");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vpxor %ymm6,%ymm2,%ymm2");
			asm volatile("vpxor %ymm6,%ymm4,%ymm4");
			asm volatile("vmovdqa %0,%%ymm6" : : "m" (dptr[z][d]));
		}
		asm volatile("vpcmpgtb %ymm4,%ymm3,%ymm5");
		asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
		asm volatile("vpand %ymm0,%ymm5,%ymm5");
		asm volatile("vpxor %ymm5,%ymm4,%ymm4");
		asm volatile("vpxor %ymm6,%ymm2,%ymm2");
		asm volatile("vpxor %ymm6,%ymm4,%ymm4");

		asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d]));
		asm volatile("vpxor %ymm2,%ymm2,%ymm2");
		asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d]));
		asm volatile("vpxor %ymm4,%ymm4,%ymm4");
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}

static void raid6_avx21_xor_syndrome(int disks, int start, int stop,
				     size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = stop;		/* P/Q right side optimization */
	p = dptr[disks-2];	/* XOR parity */
	q = dptr[disks-1];	/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));

	for (d = 0 ; d < bytes ; d += 32) {
		asm volatile("vmovdqa %0,%%ymm4" :: "m" (dptr[z0][d]));
		asm volatile("vmovdqa %0,%%ymm2" : : "m" (p[d]));
		asm volatile("vpxor %ymm4,%ymm2,%ymm2");
		/* P/Q data pages */
		for (z = z0-1 ; z >= start ; z--) {
			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
			asm volatile("vpand %ymm0,%ymm5,%ymm5");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vmovdqa %0,%%ymm5" :: "m" (dptr[z][d]));
			asm volatile("vpxor %ymm5,%ymm2,%ymm2");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
		}
		/* P/Q left side optimization */
		for (z = start-1 ; z >= 0 ; z--) {
			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
			asm volatile("vpand %ymm0,%ymm5,%ymm5");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
		}
		asm volatile("vpxor %0,%%ymm4,%%ymm4" : : "m" (q[d]));
		/* Don't use movntdq for r/w memory area < cache line */
		asm volatile("vmovdqa %%ymm4,%0" : "=m" (q[d]));
		asm volatile("vmovdqa %%ymm2,%0" : "=m" (p[d]));
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}

const struct raid6_calls raid6_avx2x1 = {
	raid6_avx21_gen_syndrome,
	raid6_avx21_xor_syndrome,
	raid6_have_avx2,
	"avx2x1",
	.priority = 2		/* Prefer AVX2 over priority 1 (SSE2 and others) */
};
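
Compared with the v4.17 table above, the last member is now set with a designated initializer. A fully designated form of the same table is sketched below, assuming the remaining struct raid6_calls members are named gen_syndrome, xor_syndrome, valid and name, as the positional order suggests:

/* Equivalent, fully designated initializer (illustrative only) */
const struct raid6_calls raid6_avx2x1 = {
	.gen_syndrome	= raid6_avx21_gen_syndrome,
	.xor_syndrome	= raid6_avx21_xor_syndrome,
	.valid		= raid6_have_avx2,
	.name		= "avx2x1",
	.priority	= 2,	/* prefer over priority 1 (SSE2 and others) */
};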

/*
 * Unrolled-by-2 AVX2 implementation
 */
static void raid6_avx22_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = disks - 3;		/* Highest data disk */
	p = dptr[z0+1];		/* XOR parity */
	q = dptr[z0+2];		/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));
	asm volatile("vpxor %ymm1,%ymm1,%ymm1"); /* Zero temp */

	/* We uniformly assume a single prefetch covers at least 32 bytes */
	for (d = 0; d < bytes; d += 64) {
		asm volatile("prefetchnta %0" : : "m" (dptr[z0][d]));
		asm volatile("prefetchnta %0" : : "m" (dptr[z0][d+32]));
		asm volatile("vmovdqa %0,%%ymm2" : : "m" (dptr[z0][d]));/* P[0] */
		asm volatile("vmovdqa %0,%%ymm3" : : "m" (dptr[z0][d+32]));/* P[1] */
		asm volatile("vmovdqa %ymm2,%ymm4"); /* Q[0] */
		asm volatile("vmovdqa %ymm3,%ymm6"); /* Q[1] */
		for (z = z0-1; z >= 0; z--) {
			asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
			asm volatile("prefetchnta %0" : : "m" (dptr[z][d+32]));
			asm volatile("vpcmpgtb %ymm4,%ymm1,%ymm5");
			asm volatile("vpcmpgtb %ymm6,%ymm1,%ymm7");
			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
			asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
			asm volatile("vpand %ymm0,%ymm5,%ymm5");
			asm volatile("vpand %ymm0,%ymm7,%ymm7");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
			asm volatile("vmovdqa %0,%%ymm5" : : "m" (dptr[z][d]));
			asm volatile("vmovdqa %0,%%ymm7" : : "m" (dptr[z][d+32]));
			asm volatile("vpxor %ymm5,%ymm2,%ymm2");
			asm volatile("vpxor %ymm7,%ymm3,%ymm3");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
		}
		asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d]));
		asm volatile("vmovntdq %%ymm3,%0" : "=m" (p[d+32]));
		asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d]));
		asm volatile("vmovntdq %%ymm6,%0" : "=m" (q[d+32]));
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}

static void raid6_avx22_xor_syndrome(int disks, int start, int stop,
				     size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = stop;		/* P/Q right side optimization */
	p = dptr[disks-2];	/* XOR parity */
	q = dptr[disks-1];	/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));

	for (d = 0 ; d < bytes ; d += 64) {
		asm volatile("vmovdqa %0,%%ymm4" :: "m" (dptr[z0][d]));
		asm volatile("vmovdqa %0,%%ymm6" :: "m" (dptr[z0][d+32]));
		asm volatile("vmovdqa %0,%%ymm2" : : "m" (p[d]));
		asm volatile("vmovdqa %0,%%ymm3" : : "m" (p[d+32]));
		asm volatile("vpxor %ymm4,%ymm2,%ymm2");
		asm volatile("vpxor %ymm6,%ymm3,%ymm3");
		/* P/Q data pages */
		for (z = z0-1 ; z >= start ; z--) {
			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
			asm volatile("vpxor %ymm7,%ymm7,%ymm7");
			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
			asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
			asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
			asm volatile("vpand %ymm0,%ymm5,%ymm5");
			asm volatile("vpand %ymm0,%ymm7,%ymm7");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
			asm volatile("vmovdqa %0,%%ymm5" :: "m" (dptr[z][d]));
			asm volatile("vmovdqa %0,%%ymm7"
				     :: "m" (dptr[z][d+32]));
			asm volatile("vpxor %ymm5,%ymm2,%ymm2");
			asm volatile("vpxor %ymm7,%ymm3,%ymm3");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
		}
		/* P/Q left side optimization */
		for (z = start-1 ; z >= 0 ; z--) {
			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
			asm volatile("vpxor %ymm7,%ymm7,%ymm7");
			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
			asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
			asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
			asm volatile("vpand %ymm0,%ymm5,%ymm5");
			asm volatile("vpand %ymm0,%ymm7,%ymm7");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
		}
		asm volatile("vpxor %0,%%ymm4,%%ymm4" : : "m" (q[d]));
		asm volatile("vpxor %0,%%ymm6,%%ymm6" : : "m" (q[d+32]));
		/* Don't use movntdq for r/w memory area < cache line */
		asm volatile("vmovdqa %%ymm4,%0" : "=m" (q[d]));
		asm volatile("vmovdqa %%ymm6,%0" : "=m" (q[d+32]));
		asm volatile("vmovdqa %%ymm2,%0" : "=m" (p[d]));
		asm volatile("vmovdqa %%ymm3,%0" : "=m" (p[d+32]));
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}

const struct raid6_calls raid6_avx2x2 = {
	raid6_avx22_gen_syndrome,
	raid6_avx22_xor_syndrome,
	raid6_have_avx2,
	"avx2x2",
	.priority = 2		/* Prefer AVX2 over priority 1 (SSE2 and others) */
};

#ifdef CONFIG_X86_64

/*
 * Unrolled-by-4 AVX2 implementation
 */
static void raid6_avx24_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = disks - 3;		/* Highest data disk */
	p = dptr[z0+1];		/* XOR parity */
	q = dptr[z0+2];		/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));
	asm volatile("vpxor %ymm1,%ymm1,%ymm1");	/* Zero temp */
	asm volatile("vpxor %ymm2,%ymm2,%ymm2");	/* P[0] */
	asm volatile("vpxor %ymm3,%ymm3,%ymm3");	/* P[1] */
	asm volatile("vpxor %ymm4,%ymm4,%ymm4");	/* Q[0] */
	asm volatile("vpxor %ymm6,%ymm6,%ymm6");	/* Q[1] */
	asm volatile("vpxor %ymm10,%ymm10,%ymm10");	/* P[2] */
	asm volatile("vpxor %ymm11,%ymm11,%ymm11");	/* P[3] */
	asm volatile("vpxor %ymm12,%ymm12,%ymm12");	/* Q[2] */
	asm volatile("vpxor %ymm14,%ymm14,%ymm14");	/* Q[3] */

	for (d = 0; d < bytes; d += 128) {
		for (z = z0; z >= 0; z--) {
			asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
			asm volatile("prefetchnta %0" : : "m" (dptr[z][d+32]));
			asm volatile("prefetchnta %0" : : "m" (dptr[z][d+64]));
			asm volatile("prefetchnta %0" : : "m" (dptr[z][d+96]));
			asm volatile("vpcmpgtb %ymm4,%ymm1,%ymm5");
			asm volatile("vpcmpgtb %ymm6,%ymm1,%ymm7");
			asm volatile("vpcmpgtb %ymm12,%ymm1,%ymm13");
			asm volatile("vpcmpgtb %ymm14,%ymm1,%ymm15");
			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
			asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
			asm volatile("vpaddb %ymm12,%ymm12,%ymm12");
			asm volatile("vpaddb %ymm14,%ymm14,%ymm14");
			asm volatile("vpand %ymm0,%ymm5,%ymm5");
			asm volatile("vpand %ymm0,%ymm7,%ymm7");
			asm volatile("vpand %ymm0,%ymm13,%ymm13");
			asm volatile("vpand %ymm0,%ymm15,%ymm15");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
			asm volatile("vpxor %ymm13,%ymm12,%ymm12");
			asm volatile("vpxor %ymm15,%ymm14,%ymm14");
			asm volatile("vmovdqa %0,%%ymm5" : : "m" (dptr[z][d]));
			asm volatile("vmovdqa %0,%%ymm7" : : "m" (dptr[z][d+32]));
			asm volatile("vmovdqa %0,%%ymm13" : : "m" (dptr[z][d+64]));
			asm volatile("vmovdqa %0,%%ymm15" : : "m" (dptr[z][d+96]));
			asm volatile("vpxor %ymm5,%ymm2,%ymm2");
			asm volatile("vpxor %ymm7,%ymm3,%ymm3");
			asm volatile("vpxor %ymm13,%ymm10,%ymm10");
			asm volatile("vpxor %ymm15,%ymm11,%ymm11");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
			asm volatile("vpxor %ymm13,%ymm12,%ymm12");
			asm volatile("vpxor %ymm15,%ymm14,%ymm14");
		}
		asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d]));
		asm volatile("vpxor %ymm2,%ymm2,%ymm2");
		asm volatile("vmovntdq %%ymm3,%0" : "=m" (p[d+32]));
		asm volatile("vpxor %ymm3,%ymm3,%ymm3");
		asm volatile("vmovntdq %%ymm10,%0" : "=m" (p[d+64]));
		asm volatile("vpxor %ymm10,%ymm10,%ymm10");
		asm volatile("vmovntdq %%ymm11,%0" : "=m" (p[d+96]));
		asm volatile("vpxor %ymm11,%ymm11,%ymm11");
		asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d]));
		asm volatile("vpxor %ymm4,%ymm4,%ymm4");
		asm volatile("vmovntdq %%ymm6,%0" : "=m" (q[d+32]));
		asm volatile("vpxor %ymm6,%ymm6,%ymm6");
		asm volatile("vmovntdq %%ymm12,%0" : "=m" (q[d+64]));
		asm volatile("vpxor %ymm12,%ymm12,%ymm12");
		asm volatile("vmovntdq %%ymm14,%0" : "=m" (q[d+96]));
		asm volatile("vpxor %ymm14,%ymm14,%ymm14");
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}

static void raid6_avx24_xor_syndrome(int disks, int start, int stop,
				     size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = stop;		/* P/Q right side optimization */
	p = dptr[disks-2];	/* XOR parity */
	q = dptr[disks-1];	/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("vmovdqa %0,%%ymm0" :: "m" (raid6_avx2_constants.x1d[0]));

	for (d = 0 ; d < bytes ; d += 128) {
		asm volatile("vmovdqa %0,%%ymm4" :: "m" (dptr[z0][d]));
		asm volatile("vmovdqa %0,%%ymm6" :: "m" (dptr[z0][d+32]));
		asm volatile("vmovdqa %0,%%ymm12" :: "m" (dptr[z0][d+64]));
		asm volatile("vmovdqa %0,%%ymm14" :: "m" (dptr[z0][d+96]));
		asm volatile("vmovdqa %0,%%ymm2" : : "m" (p[d]));
		asm volatile("vmovdqa %0,%%ymm3" : : "m" (p[d+32]));
		asm volatile("vmovdqa %0,%%ymm10" : : "m" (p[d+64]));
		asm volatile("vmovdqa %0,%%ymm11" : : "m" (p[d+96]));
		asm volatile("vpxor %ymm4,%ymm2,%ymm2");
		asm volatile("vpxor %ymm6,%ymm3,%ymm3");
		asm volatile("vpxor %ymm12,%ymm10,%ymm10");
		asm volatile("vpxor %ymm14,%ymm11,%ymm11");
		/* P/Q data pages */
		for (z = z0-1 ; z >= start ; z--) {
			asm volatile("prefetchnta %0" :: "m" (dptr[z][d]));
			asm volatile("prefetchnta %0" :: "m" (dptr[z][d+64]));
			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
			asm volatile("vpxor %ymm7,%ymm7,%ymm7");
			asm volatile("vpxor %ymm13,%ymm13,%ymm13");
			asm volatile("vpxor %ymm15,%ymm15,%ymm15");
			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
			asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
			asm volatile("vpcmpgtb %ymm12,%ymm13,%ymm13");
			asm volatile("vpcmpgtb %ymm14,%ymm15,%ymm15");
			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
			asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
			asm volatile("vpaddb %ymm12,%ymm12,%ymm12");
			asm volatile("vpaddb %ymm14,%ymm14,%ymm14");
			asm volatile("vpand %ymm0,%ymm5,%ymm5");
			asm volatile("vpand %ymm0,%ymm7,%ymm7");
			asm volatile("vpand %ymm0,%ymm13,%ymm13");
			asm volatile("vpand %ymm0,%ymm15,%ymm15");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
			asm volatile("vpxor %ymm13,%ymm12,%ymm12");
			asm volatile("vpxor %ymm15,%ymm14,%ymm14");
			asm volatile("vmovdqa %0,%%ymm5" :: "m" (dptr[z][d]));
			asm volatile("vmovdqa %0,%%ymm7"
				     :: "m" (dptr[z][d+32]));
			asm volatile("vmovdqa %0,%%ymm13"
				     :: "m" (dptr[z][d+64]));
			asm volatile("vmovdqa %0,%%ymm15"
				     :: "m" (dptr[z][d+96]));
			asm volatile("vpxor %ymm5,%ymm2,%ymm2");
			asm volatile("vpxor %ymm7,%ymm3,%ymm3");
			asm volatile("vpxor %ymm13,%ymm10,%ymm10");
			asm volatile("vpxor %ymm15,%ymm11,%ymm11");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
			asm volatile("vpxor %ymm13,%ymm12,%ymm12");
			asm volatile("vpxor %ymm15,%ymm14,%ymm14");
		}
		asm volatile("prefetchnta %0" :: "m" (q[d]));
		asm volatile("prefetchnta %0" :: "m" (q[d+64]));
		/* P/Q left side optimization */
		for (z = start-1 ; z >= 0 ; z--) {
			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
			asm volatile("vpxor %ymm7,%ymm7,%ymm7");
			asm volatile("vpxor %ymm13,%ymm13,%ymm13");
			asm volatile("vpxor %ymm15,%ymm15,%ymm15");
			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
			asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
			asm volatile("vpcmpgtb %ymm12,%ymm13,%ymm13");
			asm volatile("vpcmpgtb %ymm14,%ymm15,%ymm15");
			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
			asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
			asm volatile("vpaddb %ymm12,%ymm12,%ymm12");
			asm volatile("vpaddb %ymm14,%ymm14,%ymm14");
			asm volatile("vpand %ymm0,%ymm5,%ymm5");
			asm volatile("vpand %ymm0,%ymm7,%ymm7");
			asm volatile("vpand %ymm0,%ymm13,%ymm13");
			asm volatile("vpand %ymm0,%ymm15,%ymm15");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
			asm volatile("vpxor %ymm13,%ymm12,%ymm12");
			asm volatile("vpxor %ymm15,%ymm14,%ymm14");
		}
		asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d]));
		asm volatile("vmovntdq %%ymm3,%0" : "=m" (p[d+32]));
		asm volatile("vmovntdq %%ymm10,%0" : "=m" (p[d+64]));
		asm volatile("vmovntdq %%ymm11,%0" : "=m" (p[d+96]));
		asm volatile("vpxor %0,%%ymm4,%%ymm4" : : "m" (q[d]));
		asm volatile("vpxor %0,%%ymm6,%%ymm6" : : "m" (q[d+32]));
		asm volatile("vpxor %0,%%ymm12,%%ymm12" : : "m" (q[d+64]));
		asm volatile("vpxor %0,%%ymm14,%%ymm14" : : "m" (q[d+96]));
		asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d]));
		asm volatile("vmovntdq %%ymm6,%0" : "=m" (q[d+32]));
		asm volatile("vmovntdq %%ymm12,%0" : "=m" (q[d+64]));
		asm volatile("vmovntdq %%ymm14,%0" : "=m" (q[d+96]));
	}
	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}

const struct raid6_calls raid6_avx2x4 = {
	raid6_avx24_gen_syndrome,
	raid6_avx24_xor_syndrome,
	raid6_have_avx2,
	"avx2x4",
	.priority = 2		/* Prefer AVX2 over priority 1 (SSE2 and others) */
};
#endif /* CONFIG_X86_64 */