v3.5.6
 
/* -*- linux-c -*- ------------------------------------------------------- *
 *
 *   Copyright 2002 H. Peter Anvin - All Rights Reserved
 *
 *   This program is free software; you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, Inc., 53 Temple Place Ste 330,
 *   Boston MA 02111-1307, USA; either version 2 of the License, or
 *   (at your option) any later version; incorporated herein by reference.
 *
 * ----------------------------------------------------------------------- */

/*
 * raid6/sse2.c
 *
 * SSE-2 implementation of RAID-6 syndrome functions
 *
 */

#if (defined(__i386__) || defined(__x86_64__)) && !defined(__arch_um__)

#include <linux/raid/pq.h>
#include "x86.h"

static const struct raid6_sse_constants {
	u64 x1d[2];
} raid6_sse_constants  __attribute__((aligned(16))) = {
	{ 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL },
};

static int raid6_have_sse2(void)
{
	/* Not really boot_cpu but "all_cpus" */
	return boot_cpu_has(X86_FEATURE_MMX) &&
		boot_cpu_has(X86_FEATURE_FXSR) &&
		boot_cpu_has(X86_FEATURE_XMM) &&
		boot_cpu_has(X86_FEATURE_XMM2);
}
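
/*
 * In the x86 cpufeature naming used above, X86_FEATURE_XMM is SSE and
 * X86_FEATURE_XMM2 is SSE2; FXSR (fxsave/fxrstor) is checked because
 * saving and restoring the XMM register state around
 * kernel_fpu_begin()/kernel_fpu_end() relies on it.
 */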

/*
 * Plain SSE2 implementation
 */
static void raid6_sse21_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = disks - 3;		/* Highest data disk */
	p = dptr[z0+1];		/* XOR parity */
	q = dptr[z0+2];		/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("movdqa %0,%%xmm0" : : "m" (raid6_sse_constants.x1d[0]));
	asm volatile("pxor %xmm5,%xmm5");	/* Zero temp */

	for ( d = 0 ; d < bytes ; d += 16 ) {
		asm volatile("prefetchnta %0" : : "m" (dptr[z0][d]));
		asm volatile("movdqa %0,%%xmm2" : : "m" (dptr[z0][d])); /* P[0] */
		asm volatile("prefetchnta %0" : : "m" (dptr[z0-1][d]));
		asm volatile("movdqa %xmm2,%xmm4"); /* Q[0] */
		asm volatile("movdqa %0,%%xmm6" : : "m" (dptr[z0-1][d]));
		for ( z = z0-2 ; z >= 0 ; z-- ) {
			asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
			asm volatile("pcmpgtb %xmm4,%xmm5");
			asm volatile("paddb %xmm4,%xmm4");
			asm volatile("pand %xmm0,%xmm5");
			asm volatile("pxor %xmm5,%xmm4");
			asm volatile("pxor %xmm5,%xmm5");
			asm volatile("pxor %xmm6,%xmm2");
			asm volatile("pxor %xmm6,%xmm4");
			asm volatile("movdqa %0,%%xmm6" : : "m" (dptr[z][d]));
		}
		asm volatile("pcmpgtb %xmm4,%xmm5");
		asm volatile("paddb %xmm4,%xmm4");
		asm volatile("pand %xmm0,%xmm5");
		asm volatile("pxor %xmm5,%xmm4");
		asm volatile("pxor %xmm5,%xmm5");
		asm volatile("pxor %xmm6,%xmm2");
		asm volatile("pxor %xmm6,%xmm4");

		asm volatile("movntdq %%xmm2,%0" : "=m" (p[d]));
		asm volatile("pxor %xmm2,%xmm2");
		asm volatile("movntdq %%xmm4,%0" : "=m" (q[d]));
		asm volatile("pxor %xmm4,%xmm4");
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}
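
/*
 * For reference, a scalar sketch of what the SIMD loop above computes per
 * byte lane (illustrative only, not part of the kernel file).  xmm0 holds
 * sixteen copies of 0x1d; the pcmpgtb/paddb/pand/pxor sequence is a
 * branch-free GF(2^8) multiply-by-2 of the running Q value, and the data
 * blocks are folded in Horner fashion from the highest data disk down.
 */
#if 0	/* illustrative sketch */
static void raid6_scalar_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	int z0 = disks - 3;		/* highest data disk */
	u8 *p = dptr[z0+1];		/* XOR parity */
	u8 *q = dptr[z0+2];		/* RS syndrome */
	size_t d;
	int z;

	for (d = 0; d < bytes; d++) {
		u8 pv = dptr[z0][d];
		u8 qv = pv;

		for (z = z0-1; z >= 0; z--) {
			/* qv *= 2 in GF(2^8), reduced by x^8+x^4+x^3+x^2+1 */
			qv = (qv << 1) ^ ((qv & 0x80) ? 0x1d : 0);
			pv ^= dptr[z][d];
			qv ^= dptr[z][d];
		}
		p[d] = pv;
		q[d] = qv;
	}
}
#endif
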
const struct raid6_calls raid6_sse2x1 = {
	raid6_sse21_gen_syndrome,
	raid6_have_sse2,
	"sse2x1",
	1			/* Has cache hints */
};
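
/*
 * The initializer above is positional.  In this kernel version struct
 * raid6_calls carries, in order, the gen_syndrome() hook, the valid()
 * availability probe, the algorithm name, and a preference flag (set here
 * because the routine uses non-temporal cache hints); see
 * <linux/raid/pq.h> for the authoritative declaration.
 */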

/*
 * Unrolled-by-2 SSE2 implementation
 */
static void raid6_sse22_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = disks - 3;		/* Highest data disk */
	p = dptr[z0+1];		/* XOR parity */
	q = dptr[z0+2];		/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("movdqa %0,%%xmm0" : : "m" (raid6_sse_constants.x1d[0]));
	asm volatile("pxor %xmm5,%xmm5"); /* Zero temp */
	asm volatile("pxor %xmm7,%xmm7"); /* Zero temp */

	/* We uniformly assume a single prefetch covers at least 32 bytes */
	for ( d = 0 ; d < bytes ; d += 32 ) {
		asm volatile("prefetchnta %0" : : "m" (dptr[z0][d]));
		asm volatile("movdqa %0,%%xmm2" : : "m" (dptr[z0][d]));    /* P[0] */
		asm volatile("movdqa %0,%%xmm3" : : "m" (dptr[z0][d+16])); /* P[1] */
		asm volatile("movdqa %xmm2,%xmm4"); /* Q[0] */
		asm volatile("movdqa %xmm3,%xmm6"); /* Q[1] */
		for ( z = z0-1 ; z >= 0 ; z-- ) {
			asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
			asm volatile("pcmpgtb %xmm4,%xmm5");
			asm volatile("pcmpgtb %xmm6,%xmm7");
			asm volatile("paddb %xmm4,%xmm4");
			asm volatile("paddb %xmm6,%xmm6");
			asm volatile("pand %xmm0,%xmm5");
			asm volatile("pand %xmm0,%xmm7");
			asm volatile("pxor %xmm5,%xmm4");
			asm volatile("pxor %xmm7,%xmm6");
			asm volatile("movdqa %0,%%xmm5" : : "m" (dptr[z][d]));
			asm volatile("movdqa %0,%%xmm7" : : "m" (dptr[z][d+16]));
			asm volatile("pxor %xmm5,%xmm2");
			asm volatile("pxor %xmm7,%xmm3");
			asm volatile("pxor %xmm5,%xmm4");
			asm volatile("pxor %xmm7,%xmm6");
			asm volatile("pxor %xmm5,%xmm5");
			asm volatile("pxor %xmm7,%xmm7");
		}
		asm volatile("movntdq %%xmm2,%0" : "=m" (p[d]));
		asm volatile("movntdq %%xmm3,%0" : "=m" (p[d+16]));
		asm volatile("movntdq %%xmm4,%0" : "=m" (q[d]));
		asm volatile("movntdq %%xmm6,%0" : "=m" (q[d+16]));
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}

const struct raid6_calls raid6_sse2x2 = {
	raid6_sse22_gen_syndrome,
	raid6_have_sse2,
	"sse2x2",
	1			/* Has cache hints */
};

#endif

#if defined(__x86_64__) && !defined(__arch_um__)

/*
 * Unrolled-by-4 SSE2 implementation
 */
static void raid6_sse24_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = disks - 3;		/* Highest data disk */
	p = dptr[z0+1];		/* XOR parity */
	q = dptr[z0+2];		/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("movdqa %0,%%xmm0" :: "m" (raid6_sse_constants.x1d[0]));
	asm volatile("pxor %xmm2,%xmm2");	/* P[0] */
	asm volatile("pxor %xmm3,%xmm3");	/* P[1] */
	asm volatile("pxor %xmm4,%xmm4"); 	/* Q[0] */
	asm volatile("pxor %xmm5,%xmm5");	/* Zero temp */
	asm volatile("pxor %xmm6,%xmm6"); 	/* Q[1] */
	asm volatile("pxor %xmm7,%xmm7"); 	/* Zero temp */
	asm volatile("pxor %xmm10,%xmm10");	/* P[2] */
	asm volatile("pxor %xmm11,%xmm11");	/* P[3] */
	asm volatile("pxor %xmm12,%xmm12"); 	/* Q[2] */
	asm volatile("pxor %xmm13,%xmm13");	/* Zero temp */
	asm volatile("pxor %xmm14,%xmm14"); 	/* Q[3] */
	asm volatile("pxor %xmm15,%xmm15"); 	/* Zero temp */

	for ( d = 0 ; d < bytes ; d += 64 ) {
		for ( z = z0 ; z >= 0 ; z-- ) {
			/* The second prefetch seems to improve performance... */
			asm volatile("prefetchnta %0" :: "m" (dptr[z][d]));
			asm volatile("prefetchnta %0" :: "m" (dptr[z][d+32]));
			asm volatile("pcmpgtb %xmm4,%xmm5");
			asm volatile("pcmpgtb %xmm6,%xmm7");
			asm volatile("pcmpgtb %xmm12,%xmm13");
			asm volatile("pcmpgtb %xmm14,%xmm15");
			asm volatile("paddb %xmm4,%xmm4");
			asm volatile("paddb %xmm6,%xmm6");
			asm volatile("paddb %xmm12,%xmm12");
			asm volatile("paddb %xmm14,%xmm14");
			asm volatile("pand %xmm0,%xmm5");
			asm volatile("pand %xmm0,%xmm7");
			asm volatile("pand %xmm0,%xmm13");
			asm volatile("pand %xmm0,%xmm15");
			asm volatile("pxor %xmm5,%xmm4");
			asm volatile("pxor %xmm7,%xmm6");
			asm volatile("pxor %xmm13,%xmm12");
			asm volatile("pxor %xmm15,%xmm14");
			asm volatile("movdqa %0,%%xmm5" :: "m" (dptr[z][d]));
			asm volatile("movdqa %0,%%xmm7" :: "m" (dptr[z][d+16]));
			asm volatile("movdqa %0,%%xmm13" :: "m" (dptr[z][d+32]));
			asm volatile("movdqa %0,%%xmm15" :: "m" (dptr[z][d+48]));
			asm volatile("pxor %xmm5,%xmm2");
			asm volatile("pxor %xmm7,%xmm3");
			asm volatile("pxor %xmm13,%xmm10");
			asm volatile("pxor %xmm15,%xmm11");
			asm volatile("pxor %xmm5,%xmm4");
			asm volatile("pxor %xmm7,%xmm6");
			asm volatile("pxor %xmm13,%xmm12");
			asm volatile("pxor %xmm15,%xmm14");
			asm volatile("pxor %xmm5,%xmm5");
			asm volatile("pxor %xmm7,%xmm7");
			asm volatile("pxor %xmm13,%xmm13");
			asm volatile("pxor %xmm15,%xmm15");
		}
		asm volatile("movntdq %%xmm2,%0" : "=m" (p[d]));
		asm volatile("pxor %xmm2,%xmm2");
		asm volatile("movntdq %%xmm3,%0" : "=m" (p[d+16]));
		asm volatile("pxor %xmm3,%xmm3");
		asm volatile("movntdq %%xmm10,%0" : "=m" (p[d+32]));
		asm volatile("pxor %xmm10,%xmm10");
		asm volatile("movntdq %%xmm11,%0" : "=m" (p[d+48]));
		asm volatile("pxor %xmm11,%xmm11");
		asm volatile("movntdq %%xmm4,%0" : "=m" (q[d]));
		asm volatile("pxor %xmm4,%xmm4");
		asm volatile("movntdq %%xmm6,%0" : "=m" (q[d+16]));
		asm volatile("pxor %xmm6,%xmm6");
		asm volatile("movntdq %%xmm12,%0" : "=m" (q[d+32]));
		asm volatile("pxor %xmm12,%xmm12");
		asm volatile("movntdq %%xmm14,%0" : "=m" (q[d+48]));
		asm volatile("pxor %xmm14,%xmm14");
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}

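/*
 * The unrolled-by-4 variant above keeps four P and four Q accumulators plus
 * four scratch registers live at once, which requires xmm8-xmm15; those
 * registers exist only in 64-bit mode, hence the __x86_64__ guard around
 * this implementation.
 */
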
const struct raid6_calls raid6_sse2x4 = {
	raid6_sse24_gen_syndrome,
	raid6_have_sse2,
	"sse2x4",
	1			/* Has cache hints */
};

#endif
v6.8
// SPDX-License-Identifier: GPL-2.0-or-later
/* -*- linux-c -*- ------------------------------------------------------- *
 *
 *   Copyright 2002 H. Peter Anvin - All Rights Reserved
 *
 * ----------------------------------------------------------------------- */

/*
 * raid6/sse2.c
 *
 * SSE-2 implementation of RAID-6 syndrome functions
 *
 */

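/*
 * Background: for data blocks D_0..D_{n-1} the two RAID-6 syndromes
 * computed in this file are
 *
 *	P = D_0 ^ D_1 ^ ... ^ D_{n-1}
 *	Q = g^0*D_0 ^ g^1*D_1 ^ ... ^ g^{n-1}*D_{n-1}
 *
 * evaluated bytewise in GF(2^8) with generator g = 0x02 and the reduction
 * polynomial x^8 + x^4 + x^3 + x^2 + 1 (0x11d); the 0x1d constant loaded
 * into xmm0 below is the low byte of that polynomial, used to fold the
 * carry of a multiply-by-2 back into the field.
 */
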
#include <linux/raid/pq.h>
#include "x86.h"

static const struct raid6_sse_constants {
	u64 x1d[2];
} raid6_sse_constants  __attribute__((aligned(16))) = {
	{ 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL },
};

static int raid6_have_sse2(void)
{
	/* Not really boot_cpu but "all_cpus" */
	return boot_cpu_has(X86_FEATURE_MMX) &&
		boot_cpu_has(X86_FEATURE_FXSR) &&
		boot_cpu_has(X86_FEATURE_XMM) &&
		boot_cpu_has(X86_FEATURE_XMM2);
}

/*
 * Plain SSE2 implementation
 */
static void raid6_sse21_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = disks - 3;		/* Highest data disk */
	p = dptr[z0+1];		/* XOR parity */
	q = dptr[z0+2];		/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("movdqa %0,%%xmm0" : : "m" (raid6_sse_constants.x1d[0]));
	asm volatile("pxor %xmm5,%xmm5");	/* Zero temp */

	for ( d = 0 ; d < bytes ; d += 16 ) {
		asm volatile("prefetchnta %0" : : "m" (dptr[z0][d]));
		asm volatile("movdqa %0,%%xmm2" : : "m" (dptr[z0][d])); /* P[0] */
		asm volatile("prefetchnta %0" : : "m" (dptr[z0-1][d]));
		asm volatile("movdqa %xmm2,%xmm4"); /* Q[0] */
		asm volatile("movdqa %0,%%xmm6" : : "m" (dptr[z0-1][d]));
		for ( z = z0-2 ; z >= 0 ; z-- ) {
			asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
			asm volatile("pcmpgtb %xmm4,%xmm5");
			asm volatile("paddb %xmm4,%xmm4");
			asm volatile("pand %xmm0,%xmm5");
			asm volatile("pxor %xmm5,%xmm4");
			asm volatile("pxor %xmm5,%xmm5");
			asm volatile("pxor %xmm6,%xmm2");
			asm volatile("pxor %xmm6,%xmm4");
			asm volatile("movdqa %0,%%xmm6" : : "m" (dptr[z][d]));
		}
		asm volatile("pcmpgtb %xmm4,%xmm5");
		asm volatile("paddb %xmm4,%xmm4");
		asm volatile("pand %xmm0,%xmm5");
		asm volatile("pxor %xmm5,%xmm4");
		asm volatile("pxor %xmm5,%xmm5");
		asm volatile("pxor %xmm6,%xmm2");
		asm volatile("pxor %xmm6,%xmm4");

		asm volatile("movntdq %%xmm2,%0" : "=m" (p[d]));
		asm volatile("pxor %xmm2,%xmm2");
		asm volatile("movntdq %%xmm4,%0" : "=m" (q[d]));
		asm volatile("pxor %xmm4,%xmm4");
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}


static void raid6_sse21_xor_syndrome(int disks, int start, int stop,
				     size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = stop;		/* P/Q right side optimization */
	p = dptr[disks-2];	/* XOR parity */
	q = dptr[disks-1];	/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("movdqa %0,%%xmm0" : : "m" (raid6_sse_constants.x1d[0]));

	for ( d = 0 ; d < bytes ; d += 16 ) {
		asm volatile("movdqa %0,%%xmm4" :: "m" (dptr[z0][d]));
		asm volatile("movdqa %0,%%xmm2" : : "m" (p[d]));
		asm volatile("pxor %xmm4,%xmm2");
		/* P/Q data pages */
		for ( z = z0-1 ; z >= start ; z-- ) {
			asm volatile("pxor %xmm5,%xmm5");
			asm volatile("pcmpgtb %xmm4,%xmm5");
			asm volatile("paddb %xmm4,%xmm4");
			asm volatile("pand %xmm0,%xmm5");
			asm volatile("pxor %xmm5,%xmm4");
			asm volatile("movdqa %0,%%xmm5" :: "m" (dptr[z][d]));
			asm volatile("pxor %xmm5,%xmm2");
			asm volatile("pxor %xmm5,%xmm4");
		}
		/* P/Q left side optimization */
		for ( z = start-1 ; z >= 0 ; z-- ) {
			asm volatile("pxor %xmm5,%xmm5");
			asm volatile("pcmpgtb %xmm4,%xmm5");
			asm volatile("paddb %xmm4,%xmm4");
			asm volatile("pand %xmm0,%xmm5");
			asm volatile("pxor %xmm5,%xmm4");
		}
		asm volatile("pxor %0,%%xmm4" : : "m" (q[d]));
		/* Don't use movntdq for r/w memory area < cache line */
		asm volatile("movdqa %%xmm4,%0" : "=m" (q[d]));
		asm volatile("movdqa %%xmm2,%0" : "=m" (p[d]));
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}
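
/*
 * What the xor_syndrome variant computes: for the data disks in the range
 * [start, stop] it folds their contribution into existing P and Q pages,
 * i.e. P ^= D_start ^ ... ^ D_stop and Q ^= g^start*D_start ^ ... ^
 * g^stop*D_stop.  Disks above stop are skipped entirely (the "right side"
 * optimization), while disks below start carry no data but still shift the
 * accumulated Q term, so the second inner loop only multiplies by g
 * without touching memory (the "left side" optimization).  A scalar sketch
 * of the per-byte computation (illustrative only, not part of the file):
 */
#if 0	/* illustrative sketch */
static void raid6_scalar_xor_syndrome(int disks, int start, int stop,
				      size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p = dptr[disks-2];	/* XOR parity */
	u8 *q = dptr[disks-1];	/* RS syndrome */
	size_t d;
	int z;

	for (d = 0; d < bytes; d++) {
		u8 qd = dptr[stop][d];
		u8 pd = p[d] ^ qd;

		for (z = stop-1; z >= start; z--) {
			qd = (qd << 1) ^ ((qd & 0x80) ? 0x1d : 0);
			pd ^= dptr[z][d];
			qd ^= dptr[z][d];
		}
		for (z = start-1; z >= 0; z--)
			qd = (qd << 1) ^ ((qd & 0x80) ? 0x1d : 0);
		p[d] = pd;
		q[d] ^= qd;
	}
}
#endif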

const struct raid6_calls raid6_sse2x1 = {
	raid6_sse21_gen_syndrome,
	raid6_sse21_xor_syndrome,
	raid6_have_sse2,
	"sse2x1",
	1			/* Has cache hints */
};
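
/*
 * The initializers above and below are positional; struct raid6_calls in
 * <linux/raid/pq.h> lists, in order, the gen_syndrome() hook, the
 * xor_syndrome() hook used for partial-stripe (read-modify-write) updates,
 * a valid() probe that must succeed for the routine set to be considered,
 * the algorithm name, and a priority/preference value.  See the header for
 * the authoritative declaration.
 */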

/*
 * Unrolled-by-2 SSE2 implementation
 */
static void raid6_sse22_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = disks - 3;		/* Highest data disk */
	p = dptr[z0+1];		/* XOR parity */
	q = dptr[z0+2];		/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("movdqa %0,%%xmm0" : : "m" (raid6_sse_constants.x1d[0]));
	asm volatile("pxor %xmm5,%xmm5"); /* Zero temp */
	asm volatile("pxor %xmm7,%xmm7"); /* Zero temp */

	/* We uniformly assume a single prefetch covers at least 32 bytes */
	for ( d = 0 ; d < bytes ; d += 32 ) {
		asm volatile("prefetchnta %0" : : "m" (dptr[z0][d]));
		asm volatile("movdqa %0,%%xmm2" : : "m" (dptr[z0][d]));    /* P[0] */
		asm volatile("movdqa %0,%%xmm3" : : "m" (dptr[z0][d+16])); /* P[1] */
		asm volatile("movdqa %xmm2,%xmm4"); /* Q[0] */
		asm volatile("movdqa %xmm3,%xmm6"); /* Q[1] */
		for ( z = z0-1 ; z >= 0 ; z-- ) {
			asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
			asm volatile("pcmpgtb %xmm4,%xmm5");
			asm volatile("pcmpgtb %xmm6,%xmm7");
			asm volatile("paddb %xmm4,%xmm4");
			asm volatile("paddb %xmm6,%xmm6");
			asm volatile("pand %xmm0,%xmm5");
			asm volatile("pand %xmm0,%xmm7");
			asm volatile("pxor %xmm5,%xmm4");
			asm volatile("pxor %xmm7,%xmm6");
			asm volatile("movdqa %0,%%xmm5" : : "m" (dptr[z][d]));
			asm volatile("movdqa %0,%%xmm7" : : "m" (dptr[z][d+16]));
			asm volatile("pxor %xmm5,%xmm2");
			asm volatile("pxor %xmm7,%xmm3");
			asm volatile("pxor %xmm5,%xmm4");
			asm volatile("pxor %xmm7,%xmm6");
			asm volatile("pxor %xmm5,%xmm5");
			asm volatile("pxor %xmm7,%xmm7");
		}
		asm volatile("movntdq %%xmm2,%0" : "=m" (p[d]));
		asm volatile("movntdq %%xmm3,%0" : "=m" (p[d+16]));
		asm volatile("movntdq %%xmm4,%0" : "=m" (q[d]));
		asm volatile("movntdq %%xmm6,%0" : "=m" (q[d+16]));
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}

static void raid6_sse22_xor_syndrome(int disks, int start, int stop,
				     size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = stop;		/* P/Q right side optimization */
	p = dptr[disks-2];	/* XOR parity */
	q = dptr[disks-1];	/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("movdqa %0,%%xmm0" : : "m" (raid6_sse_constants.x1d[0]));

	for ( d = 0 ; d < bytes ; d += 32 ) {
		asm volatile("movdqa %0,%%xmm4" :: "m" (dptr[z0][d]));
		asm volatile("movdqa %0,%%xmm6" :: "m" (dptr[z0][d+16]));
		asm volatile("movdqa %0,%%xmm2" : : "m" (p[d]));
		asm volatile("movdqa %0,%%xmm3" : : "m" (p[d+16]));
		asm volatile("pxor %xmm4,%xmm2");
		asm volatile("pxor %xmm6,%xmm3");
		/* P/Q data pages */
		for ( z = z0-1 ; z >= start ; z-- ) {
			asm volatile("pxor %xmm5,%xmm5");
			asm volatile("pxor %xmm7,%xmm7");
			asm volatile("pcmpgtb %xmm4,%xmm5");
			asm volatile("pcmpgtb %xmm6,%xmm7");
			asm volatile("paddb %xmm4,%xmm4");
			asm volatile("paddb %xmm6,%xmm6");
			asm volatile("pand %xmm0,%xmm5");
			asm volatile("pand %xmm0,%xmm7");
			asm volatile("pxor %xmm5,%xmm4");
			asm volatile("pxor %xmm7,%xmm6");
			asm volatile("movdqa %0,%%xmm5" :: "m" (dptr[z][d]));
			asm volatile("movdqa %0,%%xmm7" :: "m" (dptr[z][d+16]));
			asm volatile("pxor %xmm5,%xmm2");
			asm volatile("pxor %xmm7,%xmm3");
			asm volatile("pxor %xmm5,%xmm4");
			asm volatile("pxor %xmm7,%xmm6");
		}
		/* P/Q left side optimization */
		for ( z = start-1 ; z >= 0 ; z-- ) {
			asm volatile("pxor %xmm5,%xmm5");
			asm volatile("pxor %xmm7,%xmm7");
			asm volatile("pcmpgtb %xmm4,%xmm5");
			asm volatile("pcmpgtb %xmm6,%xmm7");
			asm volatile("paddb %xmm4,%xmm4");
			asm volatile("paddb %xmm6,%xmm6");
			asm volatile("pand %xmm0,%xmm5");
			asm volatile("pand %xmm0,%xmm7");
			asm volatile("pxor %xmm5,%xmm4");
			asm volatile("pxor %xmm7,%xmm6");
		}
		asm volatile("pxor %0,%%xmm4" : : "m" (q[d]));
		asm volatile("pxor %0,%%xmm6" : : "m" (q[d+16]));
		/* Don't use movntdq for r/w memory area < cache line */
		asm volatile("movdqa %%xmm4,%0" : "=m" (q[d]));
		asm volatile("movdqa %%xmm6,%0" : "=m" (q[d+16]));
		asm volatile("movdqa %%xmm2,%0" : "=m" (p[d]));
		asm volatile("movdqa %%xmm3,%0" : "=m" (p[d+16]));
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}

const struct raid6_calls raid6_sse2x2 = {
	raid6_sse22_gen_syndrome,
	raid6_sse22_xor_syndrome,
	raid6_have_sse2,
	"sse2x2",
	1			/* Has cache hints */
};

#ifdef CONFIG_X86_64

/*
 * Unrolled-by-4 SSE2 implementation
 */
static void raid6_sse24_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = disks - 3;		/* Highest data disk */
	p = dptr[z0+1];		/* XOR parity */
	q = dptr[z0+2];		/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("movdqa %0,%%xmm0" :: "m" (raid6_sse_constants.x1d[0]));
	asm volatile("pxor %xmm2,%xmm2");	/* P[0] */
	asm volatile("pxor %xmm3,%xmm3");	/* P[1] */
	asm volatile("pxor %xmm4,%xmm4"); 	/* Q[0] */
	asm volatile("pxor %xmm5,%xmm5");	/* Zero temp */
	asm volatile("pxor %xmm6,%xmm6"); 	/* Q[1] */
	asm volatile("pxor %xmm7,%xmm7"); 	/* Zero temp */
	asm volatile("pxor %xmm10,%xmm10");	/* P[2] */
	asm volatile("pxor %xmm11,%xmm11");	/* P[3] */
	asm volatile("pxor %xmm12,%xmm12"); 	/* Q[2] */
	asm volatile("pxor %xmm13,%xmm13");	/* Zero temp */
	asm volatile("pxor %xmm14,%xmm14"); 	/* Q[3] */
	asm volatile("pxor %xmm15,%xmm15"); 	/* Zero temp */

	for ( d = 0 ; d < bytes ; d += 64 ) {
		for ( z = z0 ; z >= 0 ; z-- ) {
			/* The second prefetch seems to improve performance... */
			asm volatile("prefetchnta %0" :: "m" (dptr[z][d]));
			asm volatile("prefetchnta %0" :: "m" (dptr[z][d+32]));
			asm volatile("pcmpgtb %xmm4,%xmm5");
			asm volatile("pcmpgtb %xmm6,%xmm7");
			asm volatile("pcmpgtb %xmm12,%xmm13");
			asm volatile("pcmpgtb %xmm14,%xmm15");
			asm volatile("paddb %xmm4,%xmm4");
			asm volatile("paddb %xmm6,%xmm6");
			asm volatile("paddb %xmm12,%xmm12");
			asm volatile("paddb %xmm14,%xmm14");
			asm volatile("pand %xmm0,%xmm5");
			asm volatile("pand %xmm0,%xmm7");
			asm volatile("pand %xmm0,%xmm13");
			asm volatile("pand %xmm0,%xmm15");
			asm volatile("pxor %xmm5,%xmm4");
			asm volatile("pxor %xmm7,%xmm6");
			asm volatile("pxor %xmm13,%xmm12");
			asm volatile("pxor %xmm15,%xmm14");
			asm volatile("movdqa %0,%%xmm5" :: "m" (dptr[z][d]));
			asm volatile("movdqa %0,%%xmm7" :: "m" (dptr[z][d+16]));
			asm volatile("movdqa %0,%%xmm13" :: "m" (dptr[z][d+32]));
			asm volatile("movdqa %0,%%xmm15" :: "m" (dptr[z][d+48]));
			asm volatile("pxor %xmm5,%xmm2");
			asm volatile("pxor %xmm7,%xmm3");
			asm volatile("pxor %xmm13,%xmm10");
			asm volatile("pxor %xmm15,%xmm11");
			asm volatile("pxor %xmm5,%xmm4");
			asm volatile("pxor %xmm7,%xmm6");
			asm volatile("pxor %xmm13,%xmm12");
			asm volatile("pxor %xmm15,%xmm14");
			asm volatile("pxor %xmm5,%xmm5");
			asm volatile("pxor %xmm7,%xmm7");
			asm volatile("pxor %xmm13,%xmm13");
			asm volatile("pxor %xmm15,%xmm15");
		}
		asm volatile("movntdq %%xmm2,%0" : "=m" (p[d]));
		asm volatile("pxor %xmm2,%xmm2");
		asm volatile("movntdq %%xmm3,%0" : "=m" (p[d+16]));
		asm volatile("pxor %xmm3,%xmm3");
		asm volatile("movntdq %%xmm10,%0" : "=m" (p[d+32]));
		asm volatile("pxor %xmm10,%xmm10");
		asm volatile("movntdq %%xmm11,%0" : "=m" (p[d+48]));
		asm volatile("pxor %xmm11,%xmm11");
		asm volatile("movntdq %%xmm4,%0" : "=m" (q[d]));
		asm volatile("pxor %xmm4,%xmm4");
		asm volatile("movntdq %%xmm6,%0" : "=m" (q[d+16]));
		asm volatile("pxor %xmm6,%xmm6");
		asm volatile("movntdq %%xmm12,%0" : "=m" (q[d+32]));
		asm volatile("pxor %xmm12,%xmm12");
		asm volatile("movntdq %%xmm14,%0" : "=m" (q[d+48]));
		asm volatile("pxor %xmm14,%xmm14");
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}

static void raid6_sse24_xor_syndrome(int disks, int start, int stop,
				     size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = stop;		/* P/Q right side optimization */
	p = dptr[disks-2];	/* XOR parity */
	q = dptr[disks-1];	/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("movdqa %0,%%xmm0" :: "m" (raid6_sse_constants.x1d[0]));

	for ( d = 0 ; d < bytes ; d += 64 ) {
		asm volatile("movdqa %0,%%xmm4" :: "m" (dptr[z0][d]));
		asm volatile("movdqa %0,%%xmm6" :: "m" (dptr[z0][d+16]));
		asm volatile("movdqa %0,%%xmm12" :: "m" (dptr[z0][d+32]));
		asm volatile("movdqa %0,%%xmm14" :: "m" (dptr[z0][d+48]));
		asm volatile("movdqa %0,%%xmm2" : : "m" (p[d]));
		asm volatile("movdqa %0,%%xmm3" : : "m" (p[d+16]));
		asm volatile("movdqa %0,%%xmm10" : : "m" (p[d+32]));
		asm volatile("movdqa %0,%%xmm11" : : "m" (p[d+48]));
		asm volatile("pxor %xmm4,%xmm2");
		asm volatile("pxor %xmm6,%xmm3");
		asm volatile("pxor %xmm12,%xmm10");
		asm volatile("pxor %xmm14,%xmm11");
		/* P/Q data pages */
		for ( z = z0-1 ; z >= start ; z-- ) {
			asm volatile("prefetchnta %0" :: "m" (dptr[z][d]));
			asm volatile("prefetchnta %0" :: "m" (dptr[z][d+32]));
			asm volatile("pxor %xmm5,%xmm5");
			asm volatile("pxor %xmm7,%xmm7");
			asm volatile("pxor %xmm13,%xmm13");
			asm volatile("pxor %xmm15,%xmm15");
			asm volatile("pcmpgtb %xmm4,%xmm5");
			asm volatile("pcmpgtb %xmm6,%xmm7");
			asm volatile("pcmpgtb %xmm12,%xmm13");
			asm volatile("pcmpgtb %xmm14,%xmm15");
			asm volatile("paddb %xmm4,%xmm4");
			asm volatile("paddb %xmm6,%xmm6");
			asm volatile("paddb %xmm12,%xmm12");
			asm volatile("paddb %xmm14,%xmm14");
			asm volatile("pand %xmm0,%xmm5");
			asm volatile("pand %xmm0,%xmm7");
			asm volatile("pand %xmm0,%xmm13");
			asm volatile("pand %xmm0,%xmm15");
			asm volatile("pxor %xmm5,%xmm4");
			asm volatile("pxor %xmm7,%xmm6");
			asm volatile("pxor %xmm13,%xmm12");
			asm volatile("pxor %xmm15,%xmm14");
			asm volatile("movdqa %0,%%xmm5" :: "m" (dptr[z][d]));
			asm volatile("movdqa %0,%%xmm7" :: "m" (dptr[z][d+16]));
			asm volatile("movdqa %0,%%xmm13" :: "m" (dptr[z][d+32]));
			asm volatile("movdqa %0,%%xmm15" :: "m" (dptr[z][d+48]));
			asm volatile("pxor %xmm5,%xmm2");
			asm volatile("pxor %xmm7,%xmm3");
			asm volatile("pxor %xmm13,%xmm10");
			asm volatile("pxor %xmm15,%xmm11");
			asm volatile("pxor %xmm5,%xmm4");
			asm volatile("pxor %xmm7,%xmm6");
			asm volatile("pxor %xmm13,%xmm12");
			asm volatile("pxor %xmm15,%xmm14");
		}
		asm volatile("prefetchnta %0" :: "m" (q[d]));
		asm volatile("prefetchnta %0" :: "m" (q[d+32]));
		/* P/Q left side optimization */
		for ( z = start-1 ; z >= 0 ; z-- ) {
			asm volatile("pxor %xmm5,%xmm5");
			asm volatile("pxor %xmm7,%xmm7");
			asm volatile("pxor %xmm13,%xmm13");
			asm volatile("pxor %xmm15,%xmm15");
			asm volatile("pcmpgtb %xmm4,%xmm5");
			asm volatile("pcmpgtb %xmm6,%xmm7");
			asm volatile("pcmpgtb %xmm12,%xmm13");
			asm volatile("pcmpgtb %xmm14,%xmm15");
			asm volatile("paddb %xmm4,%xmm4");
			asm volatile("paddb %xmm6,%xmm6");
			asm volatile("paddb %xmm12,%xmm12");
			asm volatile("paddb %xmm14,%xmm14");
			asm volatile("pand %xmm0,%xmm5");
			asm volatile("pand %xmm0,%xmm7");
			asm volatile("pand %xmm0,%xmm13");
			asm volatile("pand %xmm0,%xmm15");
			asm volatile("pxor %xmm5,%xmm4");
			asm volatile("pxor %xmm7,%xmm6");
			asm volatile("pxor %xmm13,%xmm12");
			asm volatile("pxor %xmm15,%xmm14");
		}
		asm volatile("movntdq %%xmm2,%0" : "=m" (p[d]));
		asm volatile("movntdq %%xmm3,%0" : "=m" (p[d+16]));
		asm volatile("movntdq %%xmm10,%0" : "=m" (p[d+32]));
		asm volatile("movntdq %%xmm11,%0" : "=m" (p[d+48]));
		asm volatile("pxor %0,%%xmm4" : : "m" (q[d]));
		asm volatile("pxor %0,%%xmm6" : : "m" (q[d+16]));
		asm volatile("pxor %0,%%xmm12" : : "m" (q[d+32]));
		asm volatile("pxor %0,%%xmm14" : : "m" (q[d+48]));
		asm volatile("movntdq %%xmm4,%0" : "=m" (q[d]));
		asm volatile("movntdq %%xmm6,%0" : "=m" (q[d+16]));
		asm volatile("movntdq %%xmm12,%0" : "=m" (q[d+32]));
		asm volatile("movntdq %%xmm14,%0" : "=m" (q[d+48]));
	}
	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}


const struct raid6_calls raid6_sse2x4 = {
	raid6_sse24_gen_syndrome,
	raid6_sse24_xor_syndrome,
	raid6_have_sse2,
	"sse2x4",
	1			/* Has cache hints */
};
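
/*
 * None of these raid6_calls tables is used directly; at boot or module
 * load, lib/raid6/algos.c probes each entry's valid() hook, benchmarks the
 * candidates, and installs the fastest as the global raid6_call, which
 * callers then invoke as, for example,
 * raid6_call.gen_syndrome(disks, PAGE_SIZE, ptrs).
 */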

#endif /* CONFIG_X86_64 */