/* -*- linux-c -*- ------------------------------------------------------- *
 *
 *   Copyright 2002 H. Peter Anvin - All Rights Reserved
 *
 *   This program is free software; you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, Inc., 53 Temple Place Ste 330,
 *   Boston MA 02111-1307, USA; either version 2 of the License, or
 *   (at your option) any later version; incorporated herein by reference.
 *
 * ----------------------------------------------------------------------- */

/*
 * raid6/sse2.c
 *
 * SSE-2 implementation of RAID-6 syndrome functions
 *
 */

#include <linux/raid/pq.h>
#include "x86.h"

static const struct raid6_sse_constants {
	u64 x1d[2];
} raid6_sse_constants  __attribute__((aligned(16))) = {
	{ 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL },
};
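
/*
 * 0x1d is the low byte of the RAID-6 generator polynomial
 * x^8 + x^4 + x^3 + x^2 + 1 over GF(2^8).  Multiplying a byte by 2 in
 * this field is a left shift plus a conditional XOR with 0x1d when the
 * top bit was set; the scalar equivalent of the SIMD sequence used
 * below is essentially:
 *
 *	q = (q << 1) ^ ((q & 0x80) ? 0x1d : 0);
 *
 * The SSE2 routines apply this to 16 bytes per register at a time.
 */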

static int raid6_have_sse2(void)
{
	/* Not really boot_cpu but "all_cpus" */
	return boot_cpu_has(X86_FEATURE_MMX) &&
		boot_cpu_has(X86_FEATURE_FXSR) &&
		boot_cpu_has(X86_FEATURE_XMM) &&
		boot_cpu_has(X86_FEATURE_XMM2);
}

/*
 * Plain SSE2 implementation
 */
static void raid6_sse21_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = disks - 3;		/* Highest data disk */
	p = dptr[z0+1];		/* XOR parity */
	q = dptr[z0+2];		/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("movdqa %0,%%xmm0" : : "m" (raid6_sse_constants.x1d[0]));
	asm volatile("pxor %xmm5,%xmm5");	/* Zero temp */

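	/*
	 * Per 16-byte chunk: xmm2 accumulates P (the plain XOR of all
	 * data disks) and xmm4 accumulates Q by Horner's rule: multiply
	 * the running value by 2 in GF(2^8), then XOR in the next lower
	 * data disk.  The pcmpgtb against the zeroed xmm5 yields 0xff in
	 * each byte of xmm4 whose top bit is set, paddb doubles every
	 * byte, and the masked XOR with 0x1d performs the polynomial
	 * reduction.
	 */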
	for ( d = 0 ; d < bytes ; d += 16 ) {
		asm volatile("prefetchnta %0" : : "m" (dptr[z0][d]));
		asm volatile("movdqa %0,%%xmm2" : : "m" (dptr[z0][d])); /* P[0] */
		asm volatile("prefetchnta %0" : : "m" (dptr[z0-1][d]));
		asm volatile("movdqa %xmm2,%xmm4"); /* Q[0] */
		asm volatile("movdqa %0,%%xmm6" : : "m" (dptr[z0-1][d]));
		for ( z = z0-2 ; z >= 0 ; z-- ) {
			asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
			asm volatile("pcmpgtb %xmm4,%xmm5");
			asm volatile("paddb %xmm4,%xmm4");
			asm volatile("pand %xmm0,%xmm5");
			asm volatile("pxor %xmm5,%xmm4");
			asm volatile("pxor %xmm5,%xmm5");
			asm volatile("pxor %xmm6,%xmm2");
			asm volatile("pxor %xmm6,%xmm4");
			asm volatile("movdqa %0,%%xmm6" : : "m" (dptr[z][d]));
		}
		asm volatile("pcmpgtb %xmm4,%xmm5");
		asm volatile("paddb %xmm4,%xmm4");
		asm volatile("pand %xmm0,%xmm5");
		asm volatile("pxor %xmm5,%xmm4");
		asm volatile("pxor %xmm5,%xmm5");
		asm volatile("pxor %xmm6,%xmm2");
		asm volatile("pxor %xmm6,%xmm4");

		asm volatile("movntdq %%xmm2,%0" : "=m" (p[d]));
		asm volatile("pxor %xmm2,%xmm2");
		asm volatile("movntdq %%xmm4,%0" : "=m" (q[d]));
		asm volatile("pxor %xmm4,%xmm4");
	}

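	/*
	 * The movntdq non-temporal stores above are weakly ordered; the
	 * sfence below makes sure they are globally visible before any
	 * later stores.
	 */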
	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}

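/*
 * xor_syndrome() folds the P/Q contribution of data disks start..stop
 * into the existing parity pages, as used by the read-modify-write
 * path.  Disks above 'stop' are untouched, so accumulation begins at
 * z0 = stop ("right side" optimization); disks below 'start' supply no
 * new data, so the "left side" loop only keeps multiplying the
 * accumulated Q value by 2 so that each disk's term ends up weighted
 * by the correct power of the generator.
 */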
static void raid6_sse21_xor_syndrome(int disks, int start, int stop,
				     size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = stop;		/* P/Q right side optimization */
	p = dptr[disks-2];	/* XOR parity */
	q = dptr[disks-1];	/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("movdqa %0,%%xmm0" : : "m" (raid6_sse_constants.x1d[0]));

	for ( d = 0 ; d < bytes ; d += 16 ) {
		asm volatile("movdqa %0,%%xmm4" :: "m" (dptr[z0][d]));
		asm volatile("movdqa %0,%%xmm2" : : "m" (p[d]));
		asm volatile("pxor %xmm4,%xmm2");
		/* P/Q data pages */
		for ( z = z0-1 ; z >= start ; z-- ) {
			asm volatile("pxor %xmm5,%xmm5");
			asm volatile("pcmpgtb %xmm4,%xmm5");
			asm volatile("paddb %xmm4,%xmm4");
			asm volatile("pand %xmm0,%xmm5");
			asm volatile("pxor %xmm5,%xmm4");
			asm volatile("movdqa %0,%%xmm5" :: "m" (dptr[z][d]));
			asm volatile("pxor %xmm5,%xmm2");
			asm volatile("pxor %xmm5,%xmm4");
		}
		/* P/Q left side optimization */
		for ( z = start-1 ; z >= 0 ; z-- ) {
			asm volatile("pxor %xmm5,%xmm5");
			asm volatile("pcmpgtb %xmm4,%xmm5");
			asm volatile("paddb %xmm4,%xmm4");
			asm volatile("pand %xmm0,%xmm5");
			asm volatile("pxor %xmm5,%xmm4");
		}
		asm volatile("pxor %0,%%xmm4" : : "m" (q[d]));
		/* Don't use movntdq for r/w memory area < cache line */
		asm volatile("movdqa %%xmm4,%0" : "=m" (q[d]));
		asm volatile("movdqa %%xmm2,%0" : "=m" (p[d]));
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}

const struct raid6_calls raid6_sse2x1 = {
	raid6_sse21_gen_syndrome,
	raid6_sse21_xor_syndrome,
	raid6_have_sse2,
	"sse2x1",
	1			/* Has cache hints */
};
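
/*
 * The initializers above follow the field order of struct raid6_calls:
 * gen_syndrome, xor_syndrome, a validity check, the algorithm name and
 * a preference flag (the cache-hint marker noted above).  The raid6
 * library benchmarks the usable implementations at initialization and
 * selects the fastest one.
 */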

/*
 * Unrolled-by-2 SSE2 implementation
 */
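/*
 * Two independent 16-byte lanes are processed per iteration:
 * xmm2/xmm4 cover bytes d..d+15, xmm3/xmm6 cover bytes d+16..d+31.
 */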
static void raid6_sse22_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = disks - 3;		/* Highest data disk */
	p = dptr[z0+1];		/* XOR parity */
	q = dptr[z0+2];		/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("movdqa %0,%%xmm0" : : "m" (raid6_sse_constants.x1d[0]));
	asm volatile("pxor %xmm5,%xmm5"); /* Zero temp */
	asm volatile("pxor %xmm7,%xmm7"); /* Zero temp */

	/* We uniformly assume a single prefetch covers at least 32 bytes */
	for ( d = 0 ; d < bytes ; d += 32 ) {
		asm volatile("prefetchnta %0" : : "m" (dptr[z0][d]));
		asm volatile("movdqa %0,%%xmm2" : : "m" (dptr[z0][d]));    /* P[0] */
		asm volatile("movdqa %0,%%xmm3" : : "m" (dptr[z0][d+16])); /* P[1] */
		asm volatile("movdqa %xmm2,%xmm4"); /* Q[0] */
		asm volatile("movdqa %xmm3,%xmm6"); /* Q[1] */
		for ( z = z0-1 ; z >= 0 ; z-- ) {
			asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
			asm volatile("pcmpgtb %xmm4,%xmm5");
			asm volatile("pcmpgtb %xmm6,%xmm7");
			asm volatile("paddb %xmm4,%xmm4");
			asm volatile("paddb %xmm6,%xmm6");
			asm volatile("pand %xmm0,%xmm5");
			asm volatile("pand %xmm0,%xmm7");
			asm volatile("pxor %xmm5,%xmm4");
			asm volatile("pxor %xmm7,%xmm6");
			asm volatile("movdqa %0,%%xmm5" : : "m" (dptr[z][d]));
			asm volatile("movdqa %0,%%xmm7" : : "m" (dptr[z][d+16]));
			asm volatile("pxor %xmm5,%xmm2");
			asm volatile("pxor %xmm7,%xmm3");
			asm volatile("pxor %xmm5,%xmm4");
			asm volatile("pxor %xmm7,%xmm6");
			asm volatile("pxor %xmm5,%xmm5");
			asm volatile("pxor %xmm7,%xmm7");
		}
		asm volatile("movntdq %%xmm2,%0" : "=m" (p[d]));
		asm volatile("movntdq %%xmm3,%0" : "=m" (p[d+16]));
		asm volatile("movntdq %%xmm4,%0" : "=m" (q[d]));
		asm volatile("movntdq %%xmm6,%0" : "=m" (q[d+16]));
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}

static void raid6_sse22_xor_syndrome(int disks, int start, int stop,
				     size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = stop;		/* P/Q right side optimization */
	p = dptr[disks-2];	/* XOR parity */
	q = dptr[disks-1];	/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("movdqa %0,%%xmm0" : : "m" (raid6_sse_constants.x1d[0]));

	for ( d = 0 ; d < bytes ; d += 32 ) {
		asm volatile("movdqa %0,%%xmm4" :: "m" (dptr[z0][d]));
		asm volatile("movdqa %0,%%xmm6" :: "m" (dptr[z0][d+16]));
		asm volatile("movdqa %0,%%xmm2" : : "m" (p[d]));
		asm volatile("movdqa %0,%%xmm3" : : "m" (p[d+16]));
		asm volatile("pxor %xmm4,%xmm2");
		asm volatile("pxor %xmm6,%xmm3");
		/* P/Q data pages */
		for ( z = z0-1 ; z >= start ; z-- ) {
			asm volatile("pxor %xmm5,%xmm5");
			asm volatile("pxor %xmm7,%xmm7");
			asm volatile("pcmpgtb %xmm4,%xmm5");
			asm volatile("pcmpgtb %xmm6,%xmm7");
			asm volatile("paddb %xmm4,%xmm4");
			asm volatile("paddb %xmm6,%xmm6");
			asm volatile("pand %xmm0,%xmm5");
			asm volatile("pand %xmm0,%xmm7");
			asm volatile("pxor %xmm5,%xmm4");
			asm volatile("pxor %xmm7,%xmm6");
			asm volatile("movdqa %0,%%xmm5" :: "m" (dptr[z][d]));
			asm volatile("movdqa %0,%%xmm7" :: "m" (dptr[z][d+16]));
			asm volatile("pxor %xmm5,%xmm2");
			asm volatile("pxor %xmm7,%xmm3");
			asm volatile("pxor %xmm5,%xmm4");
			asm volatile("pxor %xmm7,%xmm6");
		}
		/* P/Q left side optimization */
		for ( z = start-1 ; z >= 0 ; z-- ) {
			asm volatile("pxor %xmm5,%xmm5");
			asm volatile("pxor %xmm7,%xmm7");
			asm volatile("pcmpgtb %xmm4,%xmm5");
			asm volatile("pcmpgtb %xmm6,%xmm7");
			asm volatile("paddb %xmm4,%xmm4");
			asm volatile("paddb %xmm6,%xmm6");
			asm volatile("pand %xmm0,%xmm5");
			asm volatile("pand %xmm0,%xmm7");
			asm volatile("pxor %xmm5,%xmm4");
			asm volatile("pxor %xmm7,%xmm6");
		}
		asm volatile("pxor %0,%%xmm4" : : "m" (q[d]));
		asm volatile("pxor %0,%%xmm6" : : "m" (q[d+16]));
		/* Don't use movntdq for r/w memory area < cache line */
		asm volatile("movdqa %%xmm4,%0" : "=m" (q[d]));
		asm volatile("movdqa %%xmm6,%0" : "=m" (q[d+16]));
		asm volatile("movdqa %%xmm2,%0" : "=m" (p[d]));
		asm volatile("movdqa %%xmm3,%0" : "=m" (p[d+16]));
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}

const struct raid6_calls raid6_sse2x2 = {
	raid6_sse22_gen_syndrome,
	raid6_sse22_xor_syndrome,
	raid6_have_sse2,
	"sse2x2",
	1			/* Has cache hints */
};

#ifdef CONFIG_X86_64

/*
 * Unrolled-by-4 SSE2 implementation
 */
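/*
 * This variant uses xmm10-xmm15 in addition to xmm0-xmm7; the upper
 * eight XMM registers exist only in 64-bit mode, hence the
 * CONFIG_X86_64 guard.
 */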
static void raid6_sse24_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = disks - 3;		/* Highest data disk */
	p = dptr[z0+1];		/* XOR parity */
	q = dptr[z0+2];		/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("movdqa %0,%%xmm0" :: "m" (raid6_sse_constants.x1d[0]));
	asm volatile("pxor %xmm2,%xmm2");	/* P[0] */
	asm volatile("pxor %xmm3,%xmm3");	/* P[1] */
	asm volatile("pxor %xmm4,%xmm4"); 	/* Q[0] */
	asm volatile("pxor %xmm5,%xmm5");	/* Zero temp */
	asm volatile("pxor %xmm6,%xmm6"); 	/* Q[1] */
	asm volatile("pxor %xmm7,%xmm7"); 	/* Zero temp */
	asm volatile("pxor %xmm10,%xmm10");	/* P[2] */
	asm volatile("pxor %xmm11,%xmm11");	/* P[3] */
	asm volatile("pxor %xmm12,%xmm12"); 	/* Q[2] */
	asm volatile("pxor %xmm13,%xmm13");	/* Zero temp */
	asm volatile("pxor %xmm14,%xmm14"); 	/* Q[3] */
	asm volatile("pxor %xmm15,%xmm15"); 	/* Zero temp */

	for ( d = 0 ; d < bytes ; d += 64 ) {
		for ( z = z0 ; z >= 0 ; z-- ) {
			/* The second prefetch seems to improve performance... */
			asm volatile("prefetchnta %0" :: "m" (dptr[z][d]));
			asm volatile("prefetchnta %0" :: "m" (dptr[z][d+32]));
			asm volatile("pcmpgtb %xmm4,%xmm5");
			asm volatile("pcmpgtb %xmm6,%xmm7");
			asm volatile("pcmpgtb %xmm12,%xmm13");
			asm volatile("pcmpgtb %xmm14,%xmm15");
			asm volatile("paddb %xmm4,%xmm4");
			asm volatile("paddb %xmm6,%xmm6");
			asm volatile("paddb %xmm12,%xmm12");
			asm volatile("paddb %xmm14,%xmm14");
			asm volatile("pand %xmm0,%xmm5");
			asm volatile("pand %xmm0,%xmm7");
			asm volatile("pand %xmm0,%xmm13");
			asm volatile("pand %xmm0,%xmm15");
			asm volatile("pxor %xmm5,%xmm4");
			asm volatile("pxor %xmm7,%xmm6");
			asm volatile("pxor %xmm13,%xmm12");
			asm volatile("pxor %xmm15,%xmm14");
			asm volatile("movdqa %0,%%xmm5" :: "m" (dptr[z][d]));
			asm volatile("movdqa %0,%%xmm7" :: "m" (dptr[z][d+16]));
			asm volatile("movdqa %0,%%xmm13" :: "m" (dptr[z][d+32]));
			asm volatile("movdqa %0,%%xmm15" :: "m" (dptr[z][d+48]));
			asm volatile("pxor %xmm5,%xmm2");
			asm volatile("pxor %xmm7,%xmm3");
			asm volatile("pxor %xmm13,%xmm10");
			asm volatile("pxor %xmm15,%xmm11");
			asm volatile("pxor %xmm5,%xmm4");
			asm volatile("pxor %xmm7,%xmm6");
			asm volatile("pxor %xmm13,%xmm12");
			asm volatile("pxor %xmm15,%xmm14");
			asm volatile("pxor %xmm5,%xmm5");
			asm volatile("pxor %xmm7,%xmm7");
			asm volatile("pxor %xmm13,%xmm13");
			asm volatile("pxor %xmm15,%xmm15");
		}
		asm volatile("movntdq %%xmm2,%0" : "=m" (p[d]));
		asm volatile("pxor %xmm2,%xmm2");
		asm volatile("movntdq %%xmm3,%0" : "=m" (p[d+16]));
		asm volatile("pxor %xmm3,%xmm3");
		asm volatile("movntdq %%xmm10,%0" : "=m" (p[d+32]));
		asm volatile("pxor %xmm10,%xmm10");
		asm volatile("movntdq %%xmm11,%0" : "=m" (p[d+48]));
		asm volatile("pxor %xmm11,%xmm11");
		asm volatile("movntdq %%xmm4,%0" : "=m" (q[d]));
		asm volatile("pxor %xmm4,%xmm4");
		asm volatile("movntdq %%xmm6,%0" : "=m" (q[d+16]));
		asm volatile("pxor %xmm6,%xmm6");
		asm volatile("movntdq %%xmm12,%0" : "=m" (q[d+32]));
		asm volatile("pxor %xmm12,%xmm12");
		asm volatile("movntdq %%xmm14,%0" : "=m" (q[d+48]));
		asm volatile("pxor %xmm14,%xmm14");
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}

static void raid6_sse24_xor_syndrome(int disks, int start, int stop,
				     size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = stop;		/* P/Q right side optimization */
	p = dptr[disks-2];	/* XOR parity */
	q = dptr[disks-1];	/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("movdqa %0,%%xmm0" :: "m" (raid6_sse_constants.x1d[0]));

	for ( d = 0 ; d < bytes ; d += 64 ) {
		asm volatile("movdqa %0,%%xmm4" :: "m" (dptr[z0][d]));
		asm volatile("movdqa %0,%%xmm6" :: "m" (dptr[z0][d+16]));
		asm volatile("movdqa %0,%%xmm12" :: "m" (dptr[z0][d+32]));
		asm volatile("movdqa %0,%%xmm14" :: "m" (dptr[z0][d+48]));
		asm volatile("movdqa %0,%%xmm2" : : "m" (p[d]));
		asm volatile("movdqa %0,%%xmm3" : : "m" (p[d+16]));
		asm volatile("movdqa %0,%%xmm10" : : "m" (p[d+32]));
		asm volatile("movdqa %0,%%xmm11" : : "m" (p[d+48]));
		asm volatile("pxor %xmm4,%xmm2");
		asm volatile("pxor %xmm6,%xmm3");
		asm volatile("pxor %xmm12,%xmm10");
		asm volatile("pxor %xmm14,%xmm11");
		/* P/Q data pages */
		for ( z = z0-1 ; z >= start ; z-- ) {
			asm volatile("prefetchnta %0" :: "m" (dptr[z][d]));
			asm volatile("prefetchnta %0" :: "m" (dptr[z][d+32]));
			asm volatile("pxor %xmm5,%xmm5");
			asm volatile("pxor %xmm7,%xmm7");
			asm volatile("pxor %xmm13,%xmm13");
			asm volatile("pxor %xmm15,%xmm15");
			asm volatile("pcmpgtb %xmm4,%xmm5");
			asm volatile("pcmpgtb %xmm6,%xmm7");
			asm volatile("pcmpgtb %xmm12,%xmm13");
			asm volatile("pcmpgtb %xmm14,%xmm15");
			asm volatile("paddb %xmm4,%xmm4");
			asm volatile("paddb %xmm6,%xmm6");
			asm volatile("paddb %xmm12,%xmm12");
			asm volatile("paddb %xmm14,%xmm14");
			asm volatile("pand %xmm0,%xmm5");
			asm volatile("pand %xmm0,%xmm7");
			asm volatile("pand %xmm0,%xmm13");
			asm volatile("pand %xmm0,%xmm15");
			asm volatile("pxor %xmm5,%xmm4");
			asm volatile("pxor %xmm7,%xmm6");
			asm volatile("pxor %xmm13,%xmm12");
			asm volatile("pxor %xmm15,%xmm14");
			asm volatile("movdqa %0,%%xmm5" :: "m" (dptr[z][d]));
			asm volatile("movdqa %0,%%xmm7" :: "m" (dptr[z][d+16]));
			asm volatile("movdqa %0,%%xmm13" :: "m" (dptr[z][d+32]));
			asm volatile("movdqa %0,%%xmm15" :: "m" (dptr[z][d+48]));
			asm volatile("pxor %xmm5,%xmm2");
			asm volatile("pxor %xmm7,%xmm3");
			asm volatile("pxor %xmm13,%xmm10");
			asm volatile("pxor %xmm15,%xmm11");
			asm volatile("pxor %xmm5,%xmm4");
			asm volatile("pxor %xmm7,%xmm6");
			asm volatile("pxor %xmm13,%xmm12");
			asm volatile("pxor %xmm15,%xmm14");
		}
		asm volatile("prefetchnta %0" :: "m" (q[d]));
		asm volatile("prefetchnta %0" :: "m" (q[d+32]));
		/* P/Q left side optimization */
		for ( z = start-1 ; z >= 0 ; z-- ) {
			asm volatile("pxor %xmm5,%xmm5");
			asm volatile("pxor %xmm7,%xmm7");
			asm volatile("pxor %xmm13,%xmm13");
			asm volatile("pxor %xmm15,%xmm15");
			asm volatile("pcmpgtb %xmm4,%xmm5");
			asm volatile("pcmpgtb %xmm6,%xmm7");
			asm volatile("pcmpgtb %xmm12,%xmm13");
			asm volatile("pcmpgtb %xmm14,%xmm15");
			asm volatile("paddb %xmm4,%xmm4");
			asm volatile("paddb %xmm6,%xmm6");
			asm volatile("paddb %xmm12,%xmm12");
			asm volatile("paddb %xmm14,%xmm14");
			asm volatile("pand %xmm0,%xmm5");
			asm volatile("pand %xmm0,%xmm7");
			asm volatile("pand %xmm0,%xmm13");
			asm volatile("pand %xmm0,%xmm15");
			asm volatile("pxor %xmm5,%xmm4");
			asm volatile("pxor %xmm7,%xmm6");
			asm volatile("pxor %xmm13,%xmm12");
			asm volatile("pxor %xmm15,%xmm14");
		}
		asm volatile("movntdq %%xmm2,%0" : "=m" (p[d]));
		asm volatile("movntdq %%xmm3,%0" : "=m" (p[d+16]));
		asm volatile("movntdq %%xmm10,%0" : "=m" (p[d+32]));
		asm volatile("movntdq %%xmm11,%0" : "=m" (p[d+48]));
		asm volatile("pxor %0,%%xmm4" : : "m" (q[d]));
		asm volatile("pxor %0,%%xmm6" : : "m" (q[d+16]));
		asm volatile("pxor %0,%%xmm12" : : "m" (q[d+32]));
		asm volatile("pxor %0,%%xmm14" : : "m" (q[d+48]));
		asm volatile("movntdq %%xmm4,%0" : "=m" (q[d]));
		asm volatile("movntdq %%xmm6,%0" : "=m" (q[d+16]));
		asm volatile("movntdq %%xmm12,%0" : "=m" (q[d+32]));
		asm volatile("movntdq %%xmm14,%0" : "=m" (q[d+48]));
	}
	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}


const struct raid6_calls raid6_sse2x4 = {
	raid6_sse24_gen_syndrome,
	raid6_sse24_xor_syndrome,
	raid6_have_sse2,
	"sse2x4",
	1			/* Has cache hints */
};

#endif /* CONFIG_X86_64 */