/* -*- linux-c -*- ------------------------------------------------------- *
 *
 *   Copyright (C) 2012 Intel Corporation
 *   Author: Yuanhan Liu <yuanhan.liu@linux.intel.com>
 *
 *   Based on sse2.c: Copyright 2002 H. Peter Anvin - All Rights Reserved
 *
 *
 *   This program is free software; you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, Inc., 53 Temple Place Ste 330,
 *   Boston MA 02111-1307, USA; either version 2 of the License, or
 *   (at your option) any later version; incorporated herein by reference.
 *
 * ----------------------------------------------------------------------- */

/*
 * AVX2 implementation of RAID-6 syndrome functions
 *
 */
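
/*
 * For every 32-byte lane, P is the plain XOR of all data blocks and Q
 * is the Reed-Solomon syndrome: the GF(2^8) sum of each data block
 * multiplied by successive powers of 2, with arithmetic modulo the
 * polynomial x^8 + x^4 + x^3 + x^2 + 1 (0x11d).
 *
 * Every variant below multiplies the running Q by 2 with the same
 * four-instruction sequence: vpcmpgtb against zero yields a 0xff mask
 * for bytes whose top bit is set, vpaddb doubles each byte (shift left
 * by one), vpand turns the mask into 0x1d, and vpxor folds the
 * reduction back into Q.
 */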

#ifdef CONFIG_AS_AVX2

#include <linux/raid/pq.h>
#include "x86.h"

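/*
 * 0x1d is the low byte of the GF(2^8) reduction polynomial 0x11d,
 * replicated to fill a 32-byte ymm register.
 */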
static const struct raid6_avx2_constants {
	u64 x1d[4];
} raid6_avx2_constants __aligned(32) = {
	{ 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,
	  0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,},
};

static int raid6_have_avx2(void)
{
	return boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_AVX);
}

/*
 * Plain AVX2 implementation
 */
static void raid6_avx21_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = disks - 3;		/* Highest data disk */
	p = dptr[z0+1];		/* XOR parity */
	q = dptr[z0+2];		/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));
	asm volatile("vpxor %ymm3,%ymm3,%ymm3");	/* Zero temp */

	for (d = 0; d < bytes; d += 32) {
		asm volatile("prefetchnta %0" : : "m" (dptr[z0][d]));
		asm volatile("vmovdqa %0,%%ymm2" : : "m" (dptr[z0][d]));/* P[0] */
		asm volatile("prefetchnta %0" : : "m" (dptr[z0-1][d]));
		asm volatile("vmovdqa %ymm2,%ymm4");/* Q[0] */
		asm volatile("vmovdqa %0,%%ymm6" : : "m" (dptr[z0-1][d]));
		for (z = z0-2; z >= 0; z--) {
			asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
			asm volatile("vpcmpgtb %ymm4,%ymm3,%ymm5");
			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
			asm volatile("vpand %ymm0,%ymm5,%ymm5");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vpxor %ymm6,%ymm2,%ymm2");
			asm volatile("vpxor %ymm6,%ymm4,%ymm4");
			asm volatile("vmovdqa %0,%%ymm6" : : "m" (dptr[z][d]));
		}
		asm volatile("vpcmpgtb %ymm4,%ymm3,%ymm5");
		asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
		asm volatile("vpand %ymm0,%ymm5,%ymm5");
		asm volatile("vpxor %ymm5,%ymm4,%ymm4");
		asm volatile("vpxor %ymm6,%ymm2,%ymm2");
		asm volatile("vpxor %ymm6,%ymm4,%ymm4");

		asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d]));
		asm volatile("vpxor %ymm2,%ymm2,%ymm2");
		asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d]));
		asm volatile("vpxor %ymm4,%ymm4,%ymm4");
	}

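	/*
	 * The vmovntdq stores above are non-temporal (they bypass the
	 * cache) and weakly ordered; the sfence below makes them
	 * globally visible before returning.
	 */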
	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}

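/*
 * Partial P/Q update for data disks start..stop.  The running Q is
 * seeded from the highest changed disk and P from the parity block in
 * memory; blocks in [start, stop] are folded into both accumulators,
 * while for the disks below start the running Q is only multiplied by
 * 2 per step (the "left side" optimization, no data is read).  The
 * accumulated Q is finally XORed with the stored Q before both P and Q
 * are written back.
 */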
static void raid6_avx21_xor_syndrome(int disks, int start, int stop,
				     size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = stop;		/* P/Q right side optimization */
	p = dptr[disks-2];	/* XOR parity */
	q = dptr[disks-1];	/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));

	for (d = 0 ; d < bytes ; d += 32) {
		asm volatile("vmovdqa %0,%%ymm4" :: "m" (dptr[z0][d]));
		asm volatile("vmovdqa %0,%%ymm2" : : "m" (p[d]));
		asm volatile("vpxor %ymm4,%ymm2,%ymm2");
		/* P/Q data pages */
		for (z = z0-1 ; z >= start ; z--) {
			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
			asm volatile("vpand %ymm0,%ymm5,%ymm5");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vmovdqa %0,%%ymm5" :: "m" (dptr[z][d]));
			asm volatile("vpxor %ymm5,%ymm2,%ymm2");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
		}
		/* P/Q left side optimization */
		for (z = start-1 ; z >= 0 ; z--) {
			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
			asm volatile("vpand %ymm0,%ymm5,%ymm5");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
		}
		asm volatile("vpxor %0,%%ymm4,%%ymm4" : : "m" (q[d]));
		/* Don't use movntdq for r/w memory area < cache line */
		asm volatile("vmovdqa %%ymm4,%0" : "=m" (q[d]));
		asm volatile("vmovdqa %%ymm2,%0" : "=m" (p[d]));
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}

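/*
 * Algorithm descriptor consumed by the raid6 core (lib/raid6/algos.c),
 * which benchmarks every implementation whose validity check passes
 * and picks the fastest one.
 */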
const struct raid6_calls raid6_avx2x1 = {
	raid6_avx21_gen_syndrome,
	raid6_avx21_xor_syndrome,
	raid6_have_avx2,
	"avx2x1",
	1			/* Has cache hints */
};

/*
 * Unrolled-by-2 AVX2 implementation
 */
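/*
 * Two independent P/Q accumulator pairs (ymm2/ymm4 and ymm3/ymm6) are
 * kept in flight so the two 32-byte lanes form separate dependency
 * chains.
 */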
static void raid6_avx22_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = disks - 3;		/* Highest data disk */
	p = dptr[z0+1];		/* XOR parity */
	q = dptr[z0+2];		/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));
	asm volatile("vpxor %ymm1,%ymm1,%ymm1"); /* Zero temp */

	/* We uniformly assume a single prefetch covers at least 32 bytes */
	for (d = 0; d < bytes; d += 64) {
		asm volatile("prefetchnta %0" : : "m" (dptr[z0][d]));
		asm volatile("prefetchnta %0" : : "m" (dptr[z0][d+32]));
		asm volatile("vmovdqa %0,%%ymm2" : : "m" (dptr[z0][d]));/* P[0] */
		asm volatile("vmovdqa %0,%%ymm3" : : "m" (dptr[z0][d+32]));/* P[1] */
		asm volatile("vmovdqa %ymm2,%ymm4"); /* Q[0] */
		asm volatile("vmovdqa %ymm3,%ymm6"); /* Q[1] */
		for (z = z0-1; z >= 0; z--) {
			asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
			asm volatile("prefetchnta %0" : : "m" (dptr[z][d+32]));
			asm volatile("vpcmpgtb %ymm4,%ymm1,%ymm5");
			asm volatile("vpcmpgtb %ymm6,%ymm1,%ymm7");
			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
			asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
			asm volatile("vpand %ymm0,%ymm5,%ymm5");
			asm volatile("vpand %ymm0,%ymm7,%ymm7");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
			asm volatile("vmovdqa %0,%%ymm5" : : "m" (dptr[z][d]));
			asm volatile("vmovdqa %0,%%ymm7" : : "m" (dptr[z][d+32]));
			asm volatile("vpxor %ymm5,%ymm2,%ymm2");
			asm volatile("vpxor %ymm7,%ymm3,%ymm3");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
		}
		asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d]));
		asm volatile("vmovntdq %%ymm3,%0" : "=m" (p[d+32]));
		asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d]));
		asm volatile("vmovntdq %%ymm6,%0" : "=m" (q[d+32]));
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}

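/*
 * Partial P/Q update, same scheme as raid6_avx21_xor_syndrome but
 * processing two 32-byte lanes per pass.
 */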
static void raid6_avx22_xor_syndrome(int disks, int start, int stop,
				     size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = stop;		/* P/Q right side optimization */
	p = dptr[disks-2];	/* XOR parity */
	q = dptr[disks-1];	/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));

	for (d = 0 ; d < bytes ; d += 64) {
		asm volatile("vmovdqa %0,%%ymm4" :: "m" (dptr[z0][d]));
		asm volatile("vmovdqa %0,%%ymm6" :: "m" (dptr[z0][d+32]));
		asm volatile("vmovdqa %0,%%ymm2" : : "m" (p[d]));
		asm volatile("vmovdqa %0,%%ymm3" : : "m" (p[d+32]));
		asm volatile("vpxor %ymm4,%ymm2,%ymm2");
		asm volatile("vpxor %ymm6,%ymm3,%ymm3");
		/* P/Q data pages */
		for (z = z0-1 ; z >= start ; z--) {
			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
			asm volatile("vpxor %ymm7,%ymm7,%ymm7");
			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
			asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
			asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
			asm volatile("vpand %ymm0,%ymm5,%ymm5");
			asm volatile("vpand %ymm0,%ymm7,%ymm7");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
			asm volatile("vmovdqa %0,%%ymm5" :: "m" (dptr[z][d]));
			asm volatile("vmovdqa %0,%%ymm7"
				     :: "m" (dptr[z][d+32]));
			asm volatile("vpxor %ymm5,%ymm2,%ymm2");
			asm volatile("vpxor %ymm7,%ymm3,%ymm3");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
		}
		/* P/Q left side optimization */
		for (z = start-1 ; z >= 0 ; z--) {
			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
			asm volatile("vpxor %ymm7,%ymm7,%ymm7");
			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
			asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
			asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
			asm volatile("vpand %ymm0,%ymm5,%ymm5");
			asm volatile("vpand %ymm0,%ymm7,%ymm7");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
		}
		asm volatile("vpxor %0,%%ymm4,%%ymm4" : : "m" (q[d]));
		asm volatile("vpxor %0,%%ymm6,%%ymm6" : : "m" (q[d+32]));
		/* Don't use movntdq for r/w memory area < cache line */
		asm volatile("vmovdqa %%ymm4,%0" : "=m" (q[d]));
		asm volatile("vmovdqa %%ymm6,%0" : "=m" (q[d+32]));
		asm volatile("vmovdqa %%ymm2,%0" : "=m" (p[d]));
		asm volatile("vmovdqa %%ymm3,%0" : "=m" (p[d+32]));
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}

const struct raid6_calls raid6_avx2x2 = {
	raid6_avx22_gen_syndrome,
	raid6_avx22_xor_syndrome,
	raid6_have_avx2,
	"avx2x2",
	1			/* Has cache hints */
};

#ifdef CONFIG_X86_64

/*
 * Unrolled-by-4 AVX2 implementation
 */
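/*
 * Four lanes are kept in flight, which needs ymm8-ymm15; those
 * registers only exist in 64-bit mode, hence the CONFIG_X86_64 guard.
 */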
static void raid6_avx24_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = disks - 3;		/* Highest data disk */
	p = dptr[z0+1];		/* XOR parity */
	q = dptr[z0+2];		/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));
	asm volatile("vpxor %ymm1,%ymm1,%ymm1");	/* Zero temp */
	asm volatile("vpxor %ymm2,%ymm2,%ymm2");	/* P[0] */
	asm volatile("vpxor %ymm3,%ymm3,%ymm3");	/* P[1] */
	asm volatile("vpxor %ymm4,%ymm4,%ymm4");	/* Q[0] */
	asm volatile("vpxor %ymm6,%ymm6,%ymm6");	/* Q[1] */
	asm volatile("vpxor %ymm10,%ymm10,%ymm10");	/* P[2] */
	asm volatile("vpxor %ymm11,%ymm11,%ymm11");	/* P[3] */
	asm volatile("vpxor %ymm12,%ymm12,%ymm12");	/* Q[2] */
	asm volatile("vpxor %ymm14,%ymm14,%ymm14");	/* Q[3] */

	for (d = 0; d < bytes; d += 128) {
		for (z = z0; z >= 0; z--) {
			asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
			asm volatile("prefetchnta %0" : : "m" (dptr[z][d+32]));
			asm volatile("prefetchnta %0" : : "m" (dptr[z][d+64]));
			asm volatile("prefetchnta %0" : : "m" (dptr[z][d+96]));
			asm volatile("vpcmpgtb %ymm4,%ymm1,%ymm5");
			asm volatile("vpcmpgtb %ymm6,%ymm1,%ymm7");
			asm volatile("vpcmpgtb %ymm12,%ymm1,%ymm13");
			asm volatile("vpcmpgtb %ymm14,%ymm1,%ymm15");
			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
			asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
			asm volatile("vpaddb %ymm12,%ymm12,%ymm12");
			asm volatile("vpaddb %ymm14,%ymm14,%ymm14");
			asm volatile("vpand %ymm0,%ymm5,%ymm5");
			asm volatile("vpand %ymm0,%ymm7,%ymm7");
			asm volatile("vpand %ymm0,%ymm13,%ymm13");
			asm volatile("vpand %ymm0,%ymm15,%ymm15");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
			asm volatile("vpxor %ymm13,%ymm12,%ymm12");
			asm volatile("vpxor %ymm15,%ymm14,%ymm14");
			asm volatile("vmovdqa %0,%%ymm5" : : "m" (dptr[z][d]));
			asm volatile("vmovdqa %0,%%ymm7" : : "m" (dptr[z][d+32]));
			asm volatile("vmovdqa %0,%%ymm13" : : "m" (dptr[z][d+64]));
			asm volatile("vmovdqa %0,%%ymm15" : : "m" (dptr[z][d+96]));
			asm volatile("vpxor %ymm5,%ymm2,%ymm2");
			asm volatile("vpxor %ymm7,%ymm3,%ymm3");
			asm volatile("vpxor %ymm13,%ymm10,%ymm10");
			asm volatile("vpxor %ymm15,%ymm11,%ymm11");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
			asm volatile("vpxor %ymm13,%ymm12,%ymm12");
			asm volatile("vpxor %ymm15,%ymm14,%ymm14");
		}
		asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d]));
		asm volatile("vpxor %ymm2,%ymm2,%ymm2");
		asm volatile("vmovntdq %%ymm3,%0" : "=m" (p[d+32]));
		asm volatile("vpxor %ymm3,%ymm3,%ymm3");
		asm volatile("vmovntdq %%ymm10,%0" : "=m" (p[d+64]));
		asm volatile("vpxor %ymm10,%ymm10,%ymm10");
		asm volatile("vmovntdq %%ymm11,%0" : "=m" (p[d+96]));
		asm volatile("vpxor %ymm11,%ymm11,%ymm11");
		asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d]));
		asm volatile("vpxor %ymm4,%ymm4,%ymm4");
		asm volatile("vmovntdq %%ymm6,%0" : "=m" (q[d+32]));
		asm volatile("vpxor %ymm6,%ymm6,%ymm6");
		asm volatile("vmovntdq %%ymm12,%0" : "=m" (q[d+64]));
		asm volatile("vpxor %ymm12,%ymm12,%ymm12");
		asm volatile("vmovntdq %%ymm14,%0" : "=m" (q[d+96]));
		asm volatile("vpxor %ymm14,%ymm14,%ymm14");
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}

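/*
 * Partial P/Q update, same scheme as raid6_avx21_xor_syndrome but
 * processing four 32-byte lanes per pass and prefetching the data and
 * Q blocks.
 */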
static void raid6_avx24_xor_syndrome(int disks, int start, int stop,
				     size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = stop;		/* P/Q right side optimization */
	p = dptr[disks-2];	/* XOR parity */
	q = dptr[disks-1];	/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("vmovdqa %0,%%ymm0" :: "m" (raid6_avx2_constants.x1d[0]));

	for (d = 0 ; d < bytes ; d += 128) {
		asm volatile("vmovdqa %0,%%ymm4" :: "m" (dptr[z0][d]));
		asm volatile("vmovdqa %0,%%ymm6" :: "m" (dptr[z0][d+32]));
		asm volatile("vmovdqa %0,%%ymm12" :: "m" (dptr[z0][d+64]));
		asm volatile("vmovdqa %0,%%ymm14" :: "m" (dptr[z0][d+96]));
		asm volatile("vmovdqa %0,%%ymm2" : : "m" (p[d]));
		asm volatile("vmovdqa %0,%%ymm3" : : "m" (p[d+32]));
		asm volatile("vmovdqa %0,%%ymm10" : : "m" (p[d+64]));
		asm volatile("vmovdqa %0,%%ymm11" : : "m" (p[d+96]));
		asm volatile("vpxor %ymm4,%ymm2,%ymm2");
		asm volatile("vpxor %ymm6,%ymm3,%ymm3");
		asm volatile("vpxor %ymm12,%ymm10,%ymm10");
		asm volatile("vpxor %ymm14,%ymm11,%ymm11");
		/* P/Q data pages */
		for (z = z0-1 ; z >= start ; z--) {
			asm volatile("prefetchnta %0" :: "m" (dptr[z][d]));
			asm volatile("prefetchnta %0" :: "m" (dptr[z][d+64]));
			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
			asm volatile("vpxor %ymm7,%ymm7,%ymm7");
			asm volatile("vpxor %ymm13,%ymm13,%ymm13");
			asm volatile("vpxor %ymm15,%ymm15,%ymm15");
			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
			asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
			asm volatile("vpcmpgtb %ymm12,%ymm13,%ymm13");
			asm volatile("vpcmpgtb %ymm14,%ymm15,%ymm15");
			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
			asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
			asm volatile("vpaddb %ymm12,%ymm12,%ymm12");
			asm volatile("vpaddb %ymm14,%ymm14,%ymm14");
			asm volatile("vpand %ymm0,%ymm5,%ymm5");
			asm volatile("vpand %ymm0,%ymm7,%ymm7");
			asm volatile("vpand %ymm0,%ymm13,%ymm13");
			asm volatile("vpand %ymm0,%ymm15,%ymm15");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
			asm volatile("vpxor %ymm13,%ymm12,%ymm12");
			asm volatile("vpxor %ymm15,%ymm14,%ymm14");
			asm volatile("vmovdqa %0,%%ymm5" :: "m" (dptr[z][d]));
			asm volatile("vmovdqa %0,%%ymm7"
				     :: "m" (dptr[z][d+32]));
			asm volatile("vmovdqa %0,%%ymm13"
				     :: "m" (dptr[z][d+64]));
			asm volatile("vmovdqa %0,%%ymm15"
				     :: "m" (dptr[z][d+96]));
			asm volatile("vpxor %ymm5,%ymm2,%ymm2");
			asm volatile("vpxor %ymm7,%ymm3,%ymm3");
			asm volatile("vpxor %ymm13,%ymm10,%ymm10");
			asm volatile("vpxor %ymm15,%ymm11,%ymm11");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
			asm volatile("vpxor %ymm13,%ymm12,%ymm12");
			asm volatile("vpxor %ymm15,%ymm14,%ymm14");
		}
		asm volatile("prefetchnta %0" :: "m" (q[d]));
		asm volatile("prefetchnta %0" :: "m" (q[d+64]));
		/* P/Q left side optimization */
		for (z = start-1 ; z >= 0 ; z--) {
			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
			asm volatile("vpxor %ymm7,%ymm7,%ymm7");
			asm volatile("vpxor %ymm13,%ymm13,%ymm13");
			asm volatile("vpxor %ymm15,%ymm15,%ymm15");
			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
			asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
			asm volatile("vpcmpgtb %ymm12,%ymm13,%ymm13");
			asm volatile("vpcmpgtb %ymm14,%ymm15,%ymm15");
			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
			asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
			asm volatile("vpaddb %ymm12,%ymm12,%ymm12");
			asm volatile("vpaddb %ymm14,%ymm14,%ymm14");
			asm volatile("vpand %ymm0,%ymm5,%ymm5");
			asm volatile("vpand %ymm0,%ymm7,%ymm7");
			asm volatile("vpand %ymm0,%ymm13,%ymm13");
			asm volatile("vpand %ymm0,%ymm15,%ymm15");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
			asm volatile("vpxor %ymm13,%ymm12,%ymm12");
			asm volatile("vpxor %ymm15,%ymm14,%ymm14");
		}
		asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d]));
		asm volatile("vmovntdq %%ymm3,%0" : "=m" (p[d+32]));
		asm volatile("vmovntdq %%ymm10,%0" : "=m" (p[d+64]));
		asm volatile("vmovntdq %%ymm11,%0" : "=m" (p[d+96]));
		asm volatile("vpxor %0,%%ymm4,%%ymm4" : : "m" (q[d]));
		asm volatile("vpxor %0,%%ymm6,%%ymm6" : : "m" (q[d+32]));
		asm volatile("vpxor %0,%%ymm12,%%ymm12" : : "m" (q[d+64]));
		asm volatile("vpxor %0,%%ymm14,%%ymm14" : : "m" (q[d+96]));
		asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d]));
		asm volatile("vmovntdq %%ymm6,%0" : "=m" (q[d+32]));
		asm volatile("vmovntdq %%ymm12,%0" : "=m" (q[d+64]));
		asm volatile("vmovntdq %%ymm14,%0" : "=m" (q[d+96]));
	}
	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}

const struct raid6_calls raid6_avx2x4 = {
	raid6_avx24_gen_syndrome,
	raid6_avx24_xor_syndrome,
	raid6_have_avx2,
	"avx2x4",
	1			/* Has cache hints */
};
#endif

#endif /* CONFIG_AS_AVX2 */