lib/raid6/avx2.c (Linux v4.17)
/* -*- linux-c -*- ------------------------------------------------------- *
 *
 *   Copyright (C) 2012 Intel Corporation
 *   Author: Yuanhan Liu <yuanhan.liu@linux.intel.com>
 *
 *   Based on sse2.c: Copyright 2002 H. Peter Anvin - All Rights Reserved
 *
 *
 *   This program is free software; you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, Inc., 53 Temple Place Ste 330,
 *   Boston MA 02111-1307, USA; either version 2 of the License, or
 *   (at your option) any later version; incorporated herein by reference.
 *
 * ----------------------------------------------------------------------- */

/*
 * AVX2 implementation of RAID-6 syndrome functions
 *
 */
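/*
 * P is the plain XOR parity of the data blocks.  Q is the Reed-Solomon
 * syndrome over GF(2^8): Q = sum over data disks z of g^z * D_z, with
 * generator g = {02} and field polynomial x^8 + x^4 + x^3 + x^2 + 1
 * (0x11d), which is why the reduction constant loaded below is the
 * all-0x1d byte vector.
 */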

#ifdef CONFIG_AS_AVX2

#include <linux/raid/pq.h>
#include "x86.h"

static const struct raid6_avx2_constants {
	u64 x1d[4];
} raid6_avx2_constants __aligned(32) = {
	{ 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,
	  0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,},
};

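/*
 * Both the AVX and AVX2 feature flags are tested: the kernel clears
 * these flags early in boot when XSAVE-managed YMM state cannot be
 * enabled, so passing this check should also mean the YMM registers
 * are actually usable here.
 */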
static int raid6_have_avx2(void)
{
	return boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_AVX);
}

/*
 * Plain AVX2 implementation
 */
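/*
 * gen_syndrome walks the data disks from the highest index down and
 * evaluates Q by Horner's rule, Q = (...((D_z0 * g) ^ D_{z0-1}) * g ...) ^ D_0,
 * while P accumulates the plain XOR of all data blocks.  The multiply-by-g
 * step is the vpcmpgtb/vpaddb/vpand/vpxor sequence in the loops below; a
 * scalar sketch of that step, for illustration only, is:
 *
 *	u8 gf_mul2(u8 v)
 *	{
 *		return (v << 1) ^ ((v & 0x80) ? 0x1d : 0);
 *	}
 *
 * vpcmpgtb against zero yields 0xff for bytes whose top bit is set (they
 * compare as negative), vpaddb doubles each byte without inter-byte
 * carry, and the masked XOR with 0x1d folds the overflow back in modulo
 * the field polynomial.
 */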
static void raid6_avx21_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = disks - 3;		/* Highest data disk */
	p = dptr[z0+1];		/* XOR parity */
	q = dptr[z0+2];		/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));
	asm volatile("vpxor %ymm3,%ymm3,%ymm3");	/* Zero temp */

	for (d = 0; d < bytes; d += 32) {
		asm volatile("prefetchnta %0" : : "m" (dptr[z0][d]));
		asm volatile("vmovdqa %0,%%ymm2" : : "m" (dptr[z0][d]));/* P[0] */
		asm volatile("prefetchnta %0" : : "m" (dptr[z0-1][d]));
		asm volatile("vmovdqa %ymm2,%ymm4");/* Q[0] */
		asm volatile("vmovdqa %0,%%ymm6" : : "m" (dptr[z0-1][d]));
		for (z = z0-2; z >= 0; z--) {
			asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
			asm volatile("vpcmpgtb %ymm4,%ymm3,%ymm5");
			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
			asm volatile("vpand %ymm0,%ymm5,%ymm5");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vpxor %ymm6,%ymm2,%ymm2");
			asm volatile("vpxor %ymm6,%ymm4,%ymm4");
			asm volatile("vmovdqa %0,%%ymm6" : : "m" (dptr[z][d]));
		}
		asm volatile("vpcmpgtb %ymm4,%ymm3,%ymm5");
		asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
		asm volatile("vpand %ymm0,%ymm5,%ymm5");
		asm volatile("vpxor %ymm5,%ymm4,%ymm4");
		asm volatile("vpxor %ymm6,%ymm2,%ymm2");
		asm volatile("vpxor %ymm6,%ymm4,%ymm4");

		asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d]));
		asm volatile("vpxor %ymm2,%ymm2,%ymm2");
		asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d]));
		asm volatile("vpxor %ymm4,%ymm4,%ymm4");
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}

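/*
 * The xor_syndrome variants update existing P/Q when only data disks
 * start..stop have changed (the read-modify-write path).  Horner's rule
 * is entered at z0 = stop rather than at the top disk ("right side
 * optimization"), and once the loop has passed 'start' the remaining
 * iterations only keep multiplying the Q accumulator by g, since the
 * lower disks contribute no new data ("left side optimization").  The
 * result is XORed into the P and Q blocks supplied by the caller.
 */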
static void raid6_avx21_xor_syndrome(int disks, int start, int stop,
				     size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = stop;		/* P/Q right side optimization */
	p = dptr[disks-2];	/* XOR parity */
	q = dptr[disks-1];	/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));

	for (d = 0 ; d < bytes ; d += 32) {
		asm volatile("vmovdqa %0,%%ymm4" :: "m" (dptr[z0][d]));
		asm volatile("vmovdqa %0,%%ymm2" : : "m" (p[d]));
		asm volatile("vpxor %ymm4,%ymm2,%ymm2");
		/* P/Q data pages */
		for (z = z0-1 ; z >= start ; z--) {
			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
			asm volatile("vpand %ymm0,%ymm5,%ymm5");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vmovdqa %0,%%ymm5" :: "m" (dptr[z][d]));
			asm volatile("vpxor %ymm5,%ymm2,%ymm2");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
		}
		/* P/Q left side optimization */
		for (z = start-1 ; z >= 0 ; z--) {
			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
			asm volatile("vpand %ymm0,%ymm5,%ymm5");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
		}
		asm volatile("vpxor %0,%%ymm4,%%ymm4" : : "m" (q[d]));
		/* Don't use movntdq for r/w memory area < cache line */
		asm volatile("vmovdqa %%ymm4,%0" : "=m" (q[d]));
		asm volatile("vmovdqa %%ymm2,%0" : "=m" (p[d]));
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}

const struct raid6_calls raid6_avx2x1 = {
	raid6_avx21_gen_syndrome,
	raid6_avx21_xor_syndrome,
	raid6_have_avx2,
	"avx2x1",
	1			/* Has cache hints */
};
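/*
 * Each raid6_calls entry provides a gen_syndrome/xor_syndrome pair, a
 * validity check and a name; the RAID-6 core benchmarks every valid
 * implementation at boot and selects the fastest one.
 */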

/*
 * Unrolled-by-2 AVX2 implementation
 */
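/*
 * The unrolled variants use the same algorithm but process two (or, on
 * x86-64, four) independent 32-byte lanes per iteration, giving the CPU
 * more independent work to overlap and amortizing the prefetches over
 * larger blocks.
 */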
static void raid6_avx22_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = disks - 3;		/* Highest data disk */
	p = dptr[z0+1];		/* XOR parity */
	q = dptr[z0+2];		/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));
	asm volatile("vpxor %ymm1,%ymm1,%ymm1"); /* Zero temp */

	/* We uniformly assume a single prefetch covers at least 32 bytes */
	for (d = 0; d < bytes; d += 64) {
		asm volatile("prefetchnta %0" : : "m" (dptr[z0][d]));
		asm volatile("prefetchnta %0" : : "m" (dptr[z0][d+32]));
		asm volatile("vmovdqa %0,%%ymm2" : : "m" (dptr[z0][d]));/* P[0] */
		asm volatile("vmovdqa %0,%%ymm3" : : "m" (dptr[z0][d+32]));/* P[1] */
		asm volatile("vmovdqa %ymm2,%ymm4"); /* Q[0] */
		asm volatile("vmovdqa %ymm3,%ymm6"); /* Q[1] */
		for (z = z0-1; z >= 0; z--) {
			asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
			asm volatile("prefetchnta %0" : : "m" (dptr[z][d+32]));
			asm volatile("vpcmpgtb %ymm4,%ymm1,%ymm5");
			asm volatile("vpcmpgtb %ymm6,%ymm1,%ymm7");
			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
			asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
			asm volatile("vpand %ymm0,%ymm5,%ymm5");
			asm volatile("vpand %ymm0,%ymm7,%ymm7");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
			asm volatile("vmovdqa %0,%%ymm5" : : "m" (dptr[z][d]));
			asm volatile("vmovdqa %0,%%ymm7" : : "m" (dptr[z][d+32]));
			asm volatile("vpxor %ymm5,%ymm2,%ymm2");
			asm volatile("vpxor %ymm7,%ymm3,%ymm3");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
		}
		asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d]));
		asm volatile("vmovntdq %%ymm3,%0" : "=m" (p[d+32]));
		asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d]));
		asm volatile("vmovntdq %%ymm6,%0" : "=m" (q[d+32]));
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}

static void raid6_avx22_xor_syndrome(int disks, int start, int stop,
				     size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = stop;		/* P/Q right side optimization */
	p = dptr[disks-2];	/* XOR parity */
	q = dptr[disks-1];	/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));

	for (d = 0 ; d < bytes ; d += 64) {
		asm volatile("vmovdqa %0,%%ymm4" :: "m" (dptr[z0][d]));
		asm volatile("vmovdqa %0,%%ymm6" :: "m" (dptr[z0][d+32]));
		asm volatile("vmovdqa %0,%%ymm2" : : "m" (p[d]));
		asm volatile("vmovdqa %0,%%ymm3" : : "m" (p[d+32]));
		asm volatile("vpxor %ymm4,%ymm2,%ymm2");
		asm volatile("vpxor %ymm6,%ymm3,%ymm3");
		/* P/Q data pages */
		for (z = z0-1 ; z >= start ; z--) {
			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
			asm volatile("vpxor %ymm7,%ymm7,%ymm7");
			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
			asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
			asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
			asm volatile("vpand %ymm0,%ymm5,%ymm5");
			asm volatile("vpand %ymm0,%ymm7,%ymm7");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
			asm volatile("vmovdqa %0,%%ymm5" :: "m" (dptr[z][d]));
			asm volatile("vmovdqa %0,%%ymm7"
				     :: "m" (dptr[z][d+32]));
			asm volatile("vpxor %ymm5,%ymm2,%ymm2");
			asm volatile("vpxor %ymm7,%ymm3,%ymm3");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
		}
		/* P/Q left side optimization */
		for (z = start-1 ; z >= 0 ; z--) {
			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
			asm volatile("vpxor %ymm7,%ymm7,%ymm7");
			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
			asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
			asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
			asm volatile("vpand %ymm0,%ymm5,%ymm5");
			asm volatile("vpand %ymm0,%ymm7,%ymm7");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
		}
		asm volatile("vpxor %0,%%ymm4,%%ymm4" : : "m" (q[d]));
		asm volatile("vpxor %0,%%ymm6,%%ymm6" : : "m" (q[d+32]));
		/* Don't use movntdq for r/w memory area < cache line */
		asm volatile("vmovdqa %%ymm4,%0" : "=m" (q[d]));
		asm volatile("vmovdqa %%ymm6,%0" : "=m" (q[d+32]));
		asm volatile("vmovdqa %%ymm2,%0" : "=m" (p[d]));
		asm volatile("vmovdqa %%ymm3,%0" : "=m" (p[d+32]));
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}

const struct raid6_calls raid6_avx2x2 = {
	raid6_avx22_gen_syndrome,
	raid6_avx22_xor_syndrome,
	raid6_have_avx2,
	"avx2x2",
	1			/* Has cache hints */
};

#ifdef CONFIG_X86_64

/*
 * Unrolled-by-4 AVX2 implementation
 */
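/*
 * The unrolled-by-4 variant also needs ymm8-ymm15, which exist only in
 * 64-bit mode, hence the CONFIG_X86_64 guard.  Unlike the smaller
 * variants, it starts each 128-byte block from zeroed accumulators, runs
 * the Horner loop over every data disk, and re-zeroes the accumulators
 * after the non-temporal stores.
 */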
static void raid6_avx24_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = disks - 3;		/* Highest data disk */
	p = dptr[z0+1];		/* XOR parity */
	q = dptr[z0+2];		/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));
	asm volatile("vpxor %ymm1,%ymm1,%ymm1");	/* Zero temp */
	asm volatile("vpxor %ymm2,%ymm2,%ymm2");	/* P[0] */
	asm volatile("vpxor %ymm3,%ymm3,%ymm3");	/* P[1] */
	asm volatile("vpxor %ymm4,%ymm4,%ymm4");	/* Q[0] */
	asm volatile("vpxor %ymm6,%ymm6,%ymm6");	/* Q[1] */
	asm volatile("vpxor %ymm10,%ymm10,%ymm10");	/* P[2] */
	asm volatile("vpxor %ymm11,%ymm11,%ymm11");	/* P[3] */
	asm volatile("vpxor %ymm12,%ymm12,%ymm12");	/* Q[2] */
	asm volatile("vpxor %ymm14,%ymm14,%ymm14");	/* Q[3] */

	for (d = 0; d < bytes; d += 128) {
		for (z = z0; z >= 0; z--) {
			asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
			asm volatile("prefetchnta %0" : : "m" (dptr[z][d+32]));
			asm volatile("prefetchnta %0" : : "m" (dptr[z][d+64]));
			asm volatile("prefetchnta %0" : : "m" (dptr[z][d+96]));
			asm volatile("vpcmpgtb %ymm4,%ymm1,%ymm5");
			asm volatile("vpcmpgtb %ymm6,%ymm1,%ymm7");
			asm volatile("vpcmpgtb %ymm12,%ymm1,%ymm13");
			asm volatile("vpcmpgtb %ymm14,%ymm1,%ymm15");
			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
			asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
			asm volatile("vpaddb %ymm12,%ymm12,%ymm12");
			asm volatile("vpaddb %ymm14,%ymm14,%ymm14");
			asm volatile("vpand %ymm0,%ymm5,%ymm5");
			asm volatile("vpand %ymm0,%ymm7,%ymm7");
			asm volatile("vpand %ymm0,%ymm13,%ymm13");
			asm volatile("vpand %ymm0,%ymm15,%ymm15");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
			asm volatile("vpxor %ymm13,%ymm12,%ymm12");
			asm volatile("vpxor %ymm15,%ymm14,%ymm14");
			asm volatile("vmovdqa %0,%%ymm5" : : "m" (dptr[z][d]));
			asm volatile("vmovdqa %0,%%ymm7" : : "m" (dptr[z][d+32]));
			asm volatile("vmovdqa %0,%%ymm13" : : "m" (dptr[z][d+64]));
			asm volatile("vmovdqa %0,%%ymm15" : : "m" (dptr[z][d+96]));
			asm volatile("vpxor %ymm5,%ymm2,%ymm2");
			asm volatile("vpxor %ymm7,%ymm3,%ymm3");
			asm volatile("vpxor %ymm13,%ymm10,%ymm10");
			asm volatile("vpxor %ymm15,%ymm11,%ymm11");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
			asm volatile("vpxor %ymm13,%ymm12,%ymm12");
			asm volatile("vpxor %ymm15,%ymm14,%ymm14");
		}
		asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d]));
		asm volatile("vpxor %ymm2,%ymm2,%ymm2");
		asm volatile("vmovntdq %%ymm3,%0" : "=m" (p[d+32]));
		asm volatile("vpxor %ymm3,%ymm3,%ymm3");
		asm volatile("vmovntdq %%ymm10,%0" : "=m" (p[d+64]));
		asm volatile("vpxor %ymm10,%ymm10,%ymm10");
		asm volatile("vmovntdq %%ymm11,%0" : "=m" (p[d+96]));
		asm volatile("vpxor %ymm11,%ymm11,%ymm11");
		asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d]));
		asm volatile("vpxor %ymm4,%ymm4,%ymm4");
		asm volatile("vmovntdq %%ymm6,%0" : "=m" (q[d+32]));
		asm volatile("vpxor %ymm6,%ymm6,%ymm6");
		asm volatile("vmovntdq %%ymm12,%0" : "=m" (q[d+64]));
		asm volatile("vpxor %ymm12,%ymm12,%ymm12");
		asm volatile("vmovntdq %%ymm14,%0" : "=m" (q[d+96]));
		asm volatile("vpxor %ymm14,%ymm14,%ymm14");
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}

static void raid6_avx24_xor_syndrome(int disks, int start, int stop,
				     size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = stop;		/* P/Q right side optimization */
	p = dptr[disks-2];	/* XOR parity */
	q = dptr[disks-1];	/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("vmovdqa %0,%%ymm0" :: "m" (raid6_avx2_constants.x1d[0]));

	for (d = 0 ; d < bytes ; d += 128) {
		asm volatile("vmovdqa %0,%%ymm4" :: "m" (dptr[z0][d]));
		asm volatile("vmovdqa %0,%%ymm6" :: "m" (dptr[z0][d+32]));
		asm volatile("vmovdqa %0,%%ymm12" :: "m" (dptr[z0][d+64]));
		asm volatile("vmovdqa %0,%%ymm14" :: "m" (dptr[z0][d+96]));
		asm volatile("vmovdqa %0,%%ymm2" : : "m" (p[d]));
		asm volatile("vmovdqa %0,%%ymm3" : : "m" (p[d+32]));
		asm volatile("vmovdqa %0,%%ymm10" : : "m" (p[d+64]));
		asm volatile("vmovdqa %0,%%ymm11" : : "m" (p[d+96]));
		asm volatile("vpxor %ymm4,%ymm2,%ymm2");
		asm volatile("vpxor %ymm6,%ymm3,%ymm3");
		asm volatile("vpxor %ymm12,%ymm10,%ymm10");
		asm volatile("vpxor %ymm14,%ymm11,%ymm11");
		/* P/Q data pages */
		for (z = z0-1 ; z >= start ; z--) {
			asm volatile("prefetchnta %0" :: "m" (dptr[z][d]));
			asm volatile("prefetchnta %0" :: "m" (dptr[z][d+64]));
			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
			asm volatile("vpxor %ymm7,%ymm7,%ymm7");
			asm volatile("vpxor %ymm13,%ymm13,%ymm13");
			asm volatile("vpxor %ymm15,%ymm15,%ymm15");
			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
			asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
			asm volatile("vpcmpgtb %ymm12,%ymm13,%ymm13");
			asm volatile("vpcmpgtb %ymm14,%ymm15,%ymm15");
			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
			asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
			asm volatile("vpaddb %ymm12,%ymm12,%ymm12");
			asm volatile("vpaddb %ymm14,%ymm14,%ymm14");
			asm volatile("vpand %ymm0,%ymm5,%ymm5");
			asm volatile("vpand %ymm0,%ymm7,%ymm7");
			asm volatile("vpand %ymm0,%ymm13,%ymm13");
			asm volatile("vpand %ymm0,%ymm15,%ymm15");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
			asm volatile("vpxor %ymm13,%ymm12,%ymm12");
			asm volatile("vpxor %ymm15,%ymm14,%ymm14");
			asm volatile("vmovdqa %0,%%ymm5" :: "m" (dptr[z][d]));
			asm volatile("vmovdqa %0,%%ymm7"
				     :: "m" (dptr[z][d+32]));
			asm volatile("vmovdqa %0,%%ymm13"
				     :: "m" (dptr[z][d+64]));
			asm volatile("vmovdqa %0,%%ymm15"
				     :: "m" (dptr[z][d+96]));
			asm volatile("vpxor %ymm5,%ymm2,%ymm2");
			asm volatile("vpxor %ymm7,%ymm3,%ymm3");
			asm volatile("vpxor %ymm13,%ymm10,%ymm10");
			asm volatile("vpxor %ymm15,%ymm11,%ymm11");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
			asm volatile("vpxor %ymm13,%ymm12,%ymm12");
			asm volatile("vpxor %ymm15,%ymm14,%ymm14");
		}
		asm volatile("prefetchnta %0" :: "m" (q[d]));
		asm volatile("prefetchnta %0" :: "m" (q[d+64]));
		/* P/Q left side optimization */
		for (z = start-1 ; z >= 0 ; z--) {
			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
			asm volatile("vpxor %ymm7,%ymm7,%ymm7");
			asm volatile("vpxor %ymm13,%ymm13,%ymm13");
			asm volatile("vpxor %ymm15,%ymm15,%ymm15");
			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
			asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
			asm volatile("vpcmpgtb %ymm12,%ymm13,%ymm13");
			asm volatile("vpcmpgtb %ymm14,%ymm15,%ymm15");
			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
			asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
			asm volatile("vpaddb %ymm12,%ymm12,%ymm12");
			asm volatile("vpaddb %ymm14,%ymm14,%ymm14");
			asm volatile("vpand %ymm0,%ymm5,%ymm5");
			asm volatile("vpand %ymm0,%ymm7,%ymm7");
			asm volatile("vpand %ymm0,%ymm13,%ymm13");
			asm volatile("vpand %ymm0,%ymm15,%ymm15");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
			asm volatile("vpxor %ymm13,%ymm12,%ymm12");
			asm volatile("vpxor %ymm15,%ymm14,%ymm14");
		}
		asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d]));
		asm volatile("vmovntdq %%ymm3,%0" : "=m" (p[d+32]));
		asm volatile("vmovntdq %%ymm10,%0" : "=m" (p[d+64]));
		asm volatile("vmovntdq %%ymm11,%0" : "=m" (p[d+96]));
		asm volatile("vpxor %0,%%ymm4,%%ymm4" : : "m" (q[d]));
		asm volatile("vpxor %0,%%ymm6,%%ymm6" : : "m" (q[d+32]));
		asm volatile("vpxor %0,%%ymm12,%%ymm12" : : "m" (q[d+64]));
		asm volatile("vpxor %0,%%ymm14,%%ymm14" : : "m" (q[d+96]));
		asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d]));
		asm volatile("vmovntdq %%ymm6,%0" : "=m" (q[d+32]));
		asm volatile("vmovntdq %%ymm12,%0" : "=m" (q[d+64]));
		asm volatile("vmovntdq %%ymm14,%0" : "=m" (q[d+96]));
	}
	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}

const struct raid6_calls raid6_avx2x4 = {
	raid6_avx24_gen_syndrome,
	raid6_avx24_xor_syndrome,
	raid6_have_avx2,
	"avx2x4",
	1			/* Has cache hints */
};
#endif

#endif /* CONFIG_AS_AVX2 */