Loading...
1/* -*- linux-c -*- ------------------------------------------------------- *
2 *
3 * Copyright (C) 2012 Intel Corporation
4 * Author: Yuanhan Liu <yuanhan.liu@linux.intel.com>
5 *
6 * Based on sse2.c: Copyright 2002 H. Peter Anvin - All Rights Reserved
7 *
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation, Inc., 53 Temple Place Ste 330,
12 * Boston MA 02111-1307, USA; either version 2 of the License, or
13 * (at your option) any later version; incorporated herein by reference.
14 *
15 * ----------------------------------------------------------------------- */
16
17/*
18 * AVX2 implementation of RAID-6 syndrome functions
19 *
20 */
21
22#ifdef CONFIG_AS_AVX2
23
24#include <linux/raid/pq.h>
25#include "x86.h"
26
27static const struct raid6_avx2_constants {
28 u64 x1d[4];
29} raid6_avx2_constants __aligned(32) = {
30 { 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,
31 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,},
32};
33
34static int raid6_have_avx2(void)
35{
36 return boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_AVX);
37}
38
39/*
40 * Plain AVX2 implementation
41 */
42static void raid6_avx21_gen_syndrome(int disks, size_t bytes, void **ptrs)
43{
44 u8 **dptr = (u8 **)ptrs;
45 u8 *p, *q;
46 int d, z, z0;
47
48 z0 = disks - 3; /* Highest data disk */
49 p = dptr[z0+1]; /* XOR parity */
50 q = dptr[z0+2]; /* RS syndrome */
51
52 kernel_fpu_begin();
53
54 asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));
55 asm volatile("vpxor %ymm3,%ymm3,%ymm3"); /* Zero temp */
56
57 for (d = 0; d < bytes; d += 32) {
58 asm volatile("prefetchnta %0" : : "m" (dptr[z0][d]));
59 asm volatile("vmovdqa %0,%%ymm2" : : "m" (dptr[z0][d]));/* P[0] */
60 asm volatile("prefetchnta %0" : : "m" (dptr[z0-1][d]));
61 asm volatile("vmovdqa %ymm2,%ymm4");/* Q[0] */
62 asm volatile("vmovdqa %0,%%ymm6" : : "m" (dptr[z0-1][d]));
63 for (z = z0-2; z >= 0; z--) {
64 asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
65 asm volatile("vpcmpgtb %ymm4,%ymm3,%ymm5");
66 asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
67 asm volatile("vpand %ymm0,%ymm5,%ymm5");
68 asm volatile("vpxor %ymm5,%ymm4,%ymm4");
69 asm volatile("vpxor %ymm6,%ymm2,%ymm2");
70 asm volatile("vpxor %ymm6,%ymm4,%ymm4");
71 asm volatile("vmovdqa %0,%%ymm6" : : "m" (dptr[z][d]));
72 }
73 asm volatile("vpcmpgtb %ymm4,%ymm3,%ymm5");
74 asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
75 asm volatile("vpand %ymm0,%ymm5,%ymm5");
76 asm volatile("vpxor %ymm5,%ymm4,%ymm4");
77 asm volatile("vpxor %ymm6,%ymm2,%ymm2");
78 asm volatile("vpxor %ymm6,%ymm4,%ymm4");
79
80 asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d]));
81 asm volatile("vpxor %ymm2,%ymm2,%ymm2");
82 asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d]));
83 asm volatile("vpxor %ymm4,%ymm4,%ymm4");
84 }
85
86 asm volatile("sfence" : : : "memory");
87 kernel_fpu_end();
88}
89
90static void raid6_avx21_xor_syndrome(int disks, int start, int stop,
91 size_t bytes, void **ptrs)
92{
93 u8 **dptr = (u8 **)ptrs;
94 u8 *p, *q;
95 int d, z, z0;
96
97 z0 = stop; /* P/Q right side optimization */
98 p = dptr[disks-2]; /* XOR parity */
99 q = dptr[disks-1]; /* RS syndrome */
100
101 kernel_fpu_begin();
102
103 asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));
104
105 for (d = 0 ; d < bytes ; d += 32) {
106 asm volatile("vmovdqa %0,%%ymm4" :: "m" (dptr[z0][d]));
107 asm volatile("vmovdqa %0,%%ymm2" : : "m" (p[d]));
108 asm volatile("vpxor %ymm4,%ymm2,%ymm2");
109 /* P/Q data pages */
110 for (z = z0-1 ; z >= start ; z--) {
111 asm volatile("vpxor %ymm5,%ymm5,%ymm5");
112 asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
113 asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
114 asm volatile("vpand %ymm0,%ymm5,%ymm5");
115 asm volatile("vpxor %ymm5,%ymm4,%ymm4");
116 asm volatile("vmovdqa %0,%%ymm5" :: "m" (dptr[z][d]));
117 asm volatile("vpxor %ymm5,%ymm2,%ymm2");
118 asm volatile("vpxor %ymm5,%ymm4,%ymm4");
119 }
120 /* P/Q left side optimization */
121 for (z = start-1 ; z >= 0 ; z--) {
122 asm volatile("vpxor %ymm5,%ymm5,%ymm5");
123 asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
124 asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
125 asm volatile("vpand %ymm0,%ymm5,%ymm5");
126 asm volatile("vpxor %ymm5,%ymm4,%ymm4");
127 }
128 asm volatile("vpxor %0,%%ymm4,%%ymm4" : : "m" (q[d]));
129 /* Don't use movntdq for r/w memory area < cache line */
130 asm volatile("vmovdqa %%ymm4,%0" : "=m" (q[d]));
131 asm volatile("vmovdqa %%ymm2,%0" : "=m" (p[d]));
132 }
133
134 asm volatile("sfence" : : : "memory");
135 kernel_fpu_end();
136}
137
138const struct raid6_calls raid6_avx2x1 = {
139 raid6_avx21_gen_syndrome,
140 raid6_avx21_xor_syndrome,
141 raid6_have_avx2,
142 "avx2x1",
143 1 /* Has cache hints */
144};
145
146/*
147 * Unrolled-by-2 AVX2 implementation
148 */
149static void raid6_avx22_gen_syndrome(int disks, size_t bytes, void **ptrs)
150{
151 u8 **dptr = (u8 **)ptrs;
152 u8 *p, *q;
153 int d, z, z0;
154
155 z0 = disks - 3; /* Highest data disk */
156 p = dptr[z0+1]; /* XOR parity */
157 q = dptr[z0+2]; /* RS syndrome */
158
159 kernel_fpu_begin();
160
161 asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));
162 asm volatile("vpxor %ymm1,%ymm1,%ymm1"); /* Zero temp */
163
164 /* We uniformly assume a single prefetch covers at least 32 bytes */
165 for (d = 0; d < bytes; d += 64) {
166 asm volatile("prefetchnta %0" : : "m" (dptr[z0][d]));
167 asm volatile("prefetchnta %0" : : "m" (dptr[z0][d+32]));
168 asm volatile("vmovdqa %0,%%ymm2" : : "m" (dptr[z0][d]));/* P[0] */
169 asm volatile("vmovdqa %0,%%ymm3" : : "m" (dptr[z0][d+32]));/* P[1] */
170 asm volatile("vmovdqa %ymm2,%ymm4"); /* Q[0] */
171 asm volatile("vmovdqa %ymm3,%ymm6"); /* Q[1] */
172 for (z = z0-1; z >= 0; z--) {
173 asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
174 asm volatile("prefetchnta %0" : : "m" (dptr[z][d+32]));
175 asm volatile("vpcmpgtb %ymm4,%ymm1,%ymm5");
176 asm volatile("vpcmpgtb %ymm6,%ymm1,%ymm7");
177 asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
178 asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
179 asm volatile("vpand %ymm0,%ymm5,%ymm5");
180 asm volatile("vpand %ymm0,%ymm7,%ymm7");
181 asm volatile("vpxor %ymm5,%ymm4,%ymm4");
182 asm volatile("vpxor %ymm7,%ymm6,%ymm6");
183 asm volatile("vmovdqa %0,%%ymm5" : : "m" (dptr[z][d]));
184 asm volatile("vmovdqa %0,%%ymm7" : : "m" (dptr[z][d+32]));
185 asm volatile("vpxor %ymm5,%ymm2,%ymm2");
186 asm volatile("vpxor %ymm7,%ymm3,%ymm3");
187 asm volatile("vpxor %ymm5,%ymm4,%ymm4");
188 asm volatile("vpxor %ymm7,%ymm6,%ymm6");
189 }
190 asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d]));
191 asm volatile("vmovntdq %%ymm3,%0" : "=m" (p[d+32]));
192 asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d]));
193 asm volatile("vmovntdq %%ymm6,%0" : "=m" (q[d+32]));
194 }
195
196 asm volatile("sfence" : : : "memory");
197 kernel_fpu_end();
198}
199
200static void raid6_avx22_xor_syndrome(int disks, int start, int stop,
201 size_t bytes, void **ptrs)
202{
203 u8 **dptr = (u8 **)ptrs;
204 u8 *p, *q;
205 int d, z, z0;
206
207 z0 = stop; /* P/Q right side optimization */
208 p = dptr[disks-2]; /* XOR parity */
209 q = dptr[disks-1]; /* RS syndrome */
210
211 kernel_fpu_begin();
212
213 asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));
214
215 for (d = 0 ; d < bytes ; d += 64) {
216 asm volatile("vmovdqa %0,%%ymm4" :: "m" (dptr[z0][d]));
217 asm volatile("vmovdqa %0,%%ymm6" :: "m" (dptr[z0][d+32]));
218 asm volatile("vmovdqa %0,%%ymm2" : : "m" (p[d]));
219 asm volatile("vmovdqa %0,%%ymm3" : : "m" (p[d+32]));
220 asm volatile("vpxor %ymm4,%ymm2,%ymm2");
221 asm volatile("vpxor %ymm6,%ymm3,%ymm3");
222 /* P/Q data pages */
223 for (z = z0-1 ; z >= start ; z--) {
224 asm volatile("vpxor %ymm5,%ymm5,%ymm5");
225 asm volatile("vpxor %ymm7,%ymm7,%ymm7");
226 asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
227 asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
228 asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
229 asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
230 asm volatile("vpand %ymm0,%ymm5,%ymm5");
231 asm volatile("vpand %ymm0,%ymm7,%ymm7");
232 asm volatile("vpxor %ymm5,%ymm4,%ymm4");
233 asm volatile("vpxor %ymm7,%ymm6,%ymm6");
234 asm volatile("vmovdqa %0,%%ymm5" :: "m" (dptr[z][d]));
235 asm volatile("vmovdqa %0,%%ymm7"
236 :: "m" (dptr[z][d+32]));
237 asm volatile("vpxor %ymm5,%ymm2,%ymm2");
238 asm volatile("vpxor %ymm7,%ymm3,%ymm3");
239 asm volatile("vpxor %ymm5,%ymm4,%ymm4");
240 asm volatile("vpxor %ymm7,%ymm6,%ymm6");
241 }
242 /* P/Q left side optimization */
243 for (z = start-1 ; z >= 0 ; z--) {
244 asm volatile("vpxor %ymm5,%ymm5,%ymm5");
245 asm volatile("vpxor %ymm7,%ymm7,%ymm7");
246 asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
247 asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
248 asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
249 asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
250 asm volatile("vpand %ymm0,%ymm5,%ymm5");
251 asm volatile("vpand %ymm0,%ymm7,%ymm7");
252 asm volatile("vpxor %ymm5,%ymm4,%ymm4");
253 asm volatile("vpxor %ymm7,%ymm6,%ymm6");
254 }
255 asm volatile("vpxor %0,%%ymm4,%%ymm4" : : "m" (q[d]));
256 asm volatile("vpxor %0,%%ymm6,%%ymm6" : : "m" (q[d+32]));
257 /* Don't use movntdq for r/w memory area < cache line */
258 asm volatile("vmovdqa %%ymm4,%0" : "=m" (q[d]));
259 asm volatile("vmovdqa %%ymm6,%0" : "=m" (q[d+32]));
260 asm volatile("vmovdqa %%ymm2,%0" : "=m" (p[d]));
261 asm volatile("vmovdqa %%ymm3,%0" : "=m" (p[d+32]));
262 }
263
264 asm volatile("sfence" : : : "memory");
265 kernel_fpu_end();
266}
267
268const struct raid6_calls raid6_avx2x2 = {
269 raid6_avx22_gen_syndrome,
270 raid6_avx22_xor_syndrome,
271 raid6_have_avx2,
272 "avx2x2",
273 1 /* Has cache hints */
274};
275
276#ifdef CONFIG_X86_64
277
278/*
279 * Unrolled-by-4 AVX2 implementation
280 */
281static void raid6_avx24_gen_syndrome(int disks, size_t bytes, void **ptrs)
282{
283 u8 **dptr = (u8 **)ptrs;
284 u8 *p, *q;
285 int d, z, z0;
286
287 z0 = disks - 3; /* Highest data disk */
288 p = dptr[z0+1]; /* XOR parity */
289 q = dptr[z0+2]; /* RS syndrome */
290
291 kernel_fpu_begin();
292
293 asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));
294 asm volatile("vpxor %ymm1,%ymm1,%ymm1"); /* Zero temp */
295 asm volatile("vpxor %ymm2,%ymm2,%ymm2"); /* P[0] */
296 asm volatile("vpxor %ymm3,%ymm3,%ymm3"); /* P[1] */
297 asm volatile("vpxor %ymm4,%ymm4,%ymm4"); /* Q[0] */
298 asm volatile("vpxor %ymm6,%ymm6,%ymm6"); /* Q[1] */
299 asm volatile("vpxor %ymm10,%ymm10,%ymm10"); /* P[2] */
300 asm volatile("vpxor %ymm11,%ymm11,%ymm11"); /* P[3] */
301 asm volatile("vpxor %ymm12,%ymm12,%ymm12"); /* Q[2] */
302 asm volatile("vpxor %ymm14,%ymm14,%ymm14"); /* Q[3] */
303
304 for (d = 0; d < bytes; d += 128) {
305 for (z = z0; z >= 0; z--) {
306 asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
307 asm volatile("prefetchnta %0" : : "m" (dptr[z][d+32]));
308 asm volatile("prefetchnta %0" : : "m" (dptr[z][d+64]));
309 asm volatile("prefetchnta %0" : : "m" (dptr[z][d+96]));
310 asm volatile("vpcmpgtb %ymm4,%ymm1,%ymm5");
311 asm volatile("vpcmpgtb %ymm6,%ymm1,%ymm7");
312 asm volatile("vpcmpgtb %ymm12,%ymm1,%ymm13");
313 asm volatile("vpcmpgtb %ymm14,%ymm1,%ymm15");
314 asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
315 asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
316 asm volatile("vpaddb %ymm12,%ymm12,%ymm12");
317 asm volatile("vpaddb %ymm14,%ymm14,%ymm14");
318 asm volatile("vpand %ymm0,%ymm5,%ymm5");
319 asm volatile("vpand %ymm0,%ymm7,%ymm7");
320 asm volatile("vpand %ymm0,%ymm13,%ymm13");
321 asm volatile("vpand %ymm0,%ymm15,%ymm15");
322 asm volatile("vpxor %ymm5,%ymm4,%ymm4");
323 asm volatile("vpxor %ymm7,%ymm6,%ymm6");
324 asm volatile("vpxor %ymm13,%ymm12,%ymm12");
325 asm volatile("vpxor %ymm15,%ymm14,%ymm14");
326 asm volatile("vmovdqa %0,%%ymm5" : : "m" (dptr[z][d]));
327 asm volatile("vmovdqa %0,%%ymm7" : : "m" (dptr[z][d+32]));
328 asm volatile("vmovdqa %0,%%ymm13" : : "m" (dptr[z][d+64]));
329 asm volatile("vmovdqa %0,%%ymm15" : : "m" (dptr[z][d+96]));
330 asm volatile("vpxor %ymm5,%ymm2,%ymm2");
331 asm volatile("vpxor %ymm7,%ymm3,%ymm3");
332 asm volatile("vpxor %ymm13,%ymm10,%ymm10");
333 asm volatile("vpxor %ymm15,%ymm11,%ymm11");
334 asm volatile("vpxor %ymm5,%ymm4,%ymm4");
335 asm volatile("vpxor %ymm7,%ymm6,%ymm6");
336 asm volatile("vpxor %ymm13,%ymm12,%ymm12");
337 asm volatile("vpxor %ymm15,%ymm14,%ymm14");
338 }
339 asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d]));
340 asm volatile("vpxor %ymm2,%ymm2,%ymm2");
341 asm volatile("vmovntdq %%ymm3,%0" : "=m" (p[d+32]));
342 asm volatile("vpxor %ymm3,%ymm3,%ymm3");
343 asm volatile("vmovntdq %%ymm10,%0" : "=m" (p[d+64]));
344 asm volatile("vpxor %ymm10,%ymm10,%ymm10");
345 asm volatile("vmovntdq %%ymm11,%0" : "=m" (p[d+96]));
346 asm volatile("vpxor %ymm11,%ymm11,%ymm11");
347 asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d]));
348 asm volatile("vpxor %ymm4,%ymm4,%ymm4");
349 asm volatile("vmovntdq %%ymm6,%0" : "=m" (q[d+32]));
350 asm volatile("vpxor %ymm6,%ymm6,%ymm6");
351 asm volatile("vmovntdq %%ymm12,%0" : "=m" (q[d+64]));
352 asm volatile("vpxor %ymm12,%ymm12,%ymm12");
353 asm volatile("vmovntdq %%ymm14,%0" : "=m" (q[d+96]));
354 asm volatile("vpxor %ymm14,%ymm14,%ymm14");
355 }
356
357 asm volatile("sfence" : : : "memory");
358 kernel_fpu_end();
359}
360
361static void raid6_avx24_xor_syndrome(int disks, int start, int stop,
362 size_t bytes, void **ptrs)
363{
364 u8 **dptr = (u8 **)ptrs;
365 u8 *p, *q;
366 int d, z, z0;
367
368 z0 = stop; /* P/Q right side optimization */
369 p = dptr[disks-2]; /* XOR parity */
370 q = dptr[disks-1]; /* RS syndrome */
371
372 kernel_fpu_begin();
373
374 asm volatile("vmovdqa %0,%%ymm0" :: "m" (raid6_avx2_constants.x1d[0]));
375
376 for (d = 0 ; d < bytes ; d += 128) {
377 asm volatile("vmovdqa %0,%%ymm4" :: "m" (dptr[z0][d]));
378 asm volatile("vmovdqa %0,%%ymm6" :: "m" (dptr[z0][d+32]));
379 asm volatile("vmovdqa %0,%%ymm12" :: "m" (dptr[z0][d+64]));
380 asm volatile("vmovdqa %0,%%ymm14" :: "m" (dptr[z0][d+96]));
381 asm volatile("vmovdqa %0,%%ymm2" : : "m" (p[d]));
382 asm volatile("vmovdqa %0,%%ymm3" : : "m" (p[d+32]));
383 asm volatile("vmovdqa %0,%%ymm10" : : "m" (p[d+64]));
384 asm volatile("vmovdqa %0,%%ymm11" : : "m" (p[d+96]));
385 asm volatile("vpxor %ymm4,%ymm2,%ymm2");
386 asm volatile("vpxor %ymm6,%ymm3,%ymm3");
387 asm volatile("vpxor %ymm12,%ymm10,%ymm10");
388 asm volatile("vpxor %ymm14,%ymm11,%ymm11");
389 /* P/Q data pages */
390 for (z = z0-1 ; z >= start ; z--) {
391 asm volatile("prefetchnta %0" :: "m" (dptr[z][d]));
392 asm volatile("prefetchnta %0" :: "m" (dptr[z][d+64]));
393 asm volatile("vpxor %ymm5,%ymm5,%ymm5");
394 asm volatile("vpxor %ymm7,%ymm7,%ymm7");
395 asm volatile("vpxor %ymm13,%ymm13,%ymm13");
396 asm volatile("vpxor %ymm15,%ymm15,%ymm15");
397 asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
398 asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
399 asm volatile("vpcmpgtb %ymm12,%ymm13,%ymm13");
400 asm volatile("vpcmpgtb %ymm14,%ymm15,%ymm15");
401 asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
402 asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
403 asm volatile("vpaddb %ymm12,%ymm12,%ymm12");
404 asm volatile("vpaddb %ymm14,%ymm14,%ymm14");
405 asm volatile("vpand %ymm0,%ymm5,%ymm5");
406 asm volatile("vpand %ymm0,%ymm7,%ymm7");
407 asm volatile("vpand %ymm0,%ymm13,%ymm13");
408 asm volatile("vpand %ymm0,%ymm15,%ymm15");
409 asm volatile("vpxor %ymm5,%ymm4,%ymm4");
410 asm volatile("vpxor %ymm7,%ymm6,%ymm6");
411 asm volatile("vpxor %ymm13,%ymm12,%ymm12");
412 asm volatile("vpxor %ymm15,%ymm14,%ymm14");
413 asm volatile("vmovdqa %0,%%ymm5" :: "m" (dptr[z][d]));
414 asm volatile("vmovdqa %0,%%ymm7"
415 :: "m" (dptr[z][d+32]));
416 asm volatile("vmovdqa %0,%%ymm13"
417 :: "m" (dptr[z][d+64]));
418 asm volatile("vmovdqa %0,%%ymm15"
419 :: "m" (dptr[z][d+96]));
420 asm volatile("vpxor %ymm5,%ymm2,%ymm2");
421 asm volatile("vpxor %ymm7,%ymm3,%ymm3");
422 asm volatile("vpxor %ymm13,%ymm10,%ymm10");
423 asm volatile("vpxor %ymm15,%ymm11,%ymm11");
424 asm volatile("vpxor %ymm5,%ymm4,%ymm4");
425 asm volatile("vpxor %ymm7,%ymm6,%ymm6");
426 asm volatile("vpxor %ymm13,%ymm12,%ymm12");
427 asm volatile("vpxor %ymm15,%ymm14,%ymm14");
428 }
429 asm volatile("prefetchnta %0" :: "m" (q[d]));
430 asm volatile("prefetchnta %0" :: "m" (q[d+64]));
431 /* P/Q left side optimization */
432 for (z = start-1 ; z >= 0 ; z--) {
433 asm volatile("vpxor %ymm5,%ymm5,%ymm5");
434 asm volatile("vpxor %ymm7,%ymm7,%ymm7");
435 asm volatile("vpxor %ymm13,%ymm13,%ymm13");
436 asm volatile("vpxor %ymm15,%ymm15,%ymm15");
437 asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
438 asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
439 asm volatile("vpcmpgtb %ymm12,%ymm13,%ymm13");
440 asm volatile("vpcmpgtb %ymm14,%ymm15,%ymm15");
441 asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
442 asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
443 asm volatile("vpaddb %ymm12,%ymm12,%ymm12");
444 asm volatile("vpaddb %ymm14,%ymm14,%ymm14");
445 asm volatile("vpand %ymm0,%ymm5,%ymm5");
446 asm volatile("vpand %ymm0,%ymm7,%ymm7");
447 asm volatile("vpand %ymm0,%ymm13,%ymm13");
448 asm volatile("vpand %ymm0,%ymm15,%ymm15");
449 asm volatile("vpxor %ymm5,%ymm4,%ymm4");
450 asm volatile("vpxor %ymm7,%ymm6,%ymm6");
451 asm volatile("vpxor %ymm13,%ymm12,%ymm12");
452 asm volatile("vpxor %ymm15,%ymm14,%ymm14");
453 }
454 asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d]));
455 asm volatile("vmovntdq %%ymm3,%0" : "=m" (p[d+32]));
456 asm volatile("vmovntdq %%ymm10,%0" : "=m" (p[d+64]));
457 asm volatile("vmovntdq %%ymm11,%0" : "=m" (p[d+96]));
458 asm volatile("vpxor %0,%%ymm4,%%ymm4" : : "m" (q[d]));
459 asm volatile("vpxor %0,%%ymm6,%%ymm6" : : "m" (q[d+32]));
460 asm volatile("vpxor %0,%%ymm12,%%ymm12" : : "m" (q[d+64]));
461 asm volatile("vpxor %0,%%ymm14,%%ymm14" : : "m" (q[d+96]));
462 asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d]));
463 asm volatile("vmovntdq %%ymm6,%0" : "=m" (q[d+32]));
464 asm volatile("vmovntdq %%ymm12,%0" : "=m" (q[d+64]));
465 asm volatile("vmovntdq %%ymm14,%0" : "=m" (q[d+96]));
466 }
467 asm volatile("sfence" : : : "memory");
468 kernel_fpu_end();
469}
470
471const struct raid6_calls raid6_avx2x4 = {
472 raid6_avx24_gen_syndrome,
473 raid6_avx24_xor_syndrome,
474 raid6_have_avx2,
475 "avx2x4",
476 1 /* Has cache hints */
477};
478#endif
479
480#endif /* CONFIG_AS_AVX2 */
1// SPDX-License-Identifier: GPL-2.0-or-later
2/* -*- linux-c -*- ------------------------------------------------------- *
3 *
4 * Copyright (C) 2012 Intel Corporation
5 * Author: Yuanhan Liu <yuanhan.liu@linux.intel.com>
6 *
7 * Based on sse2.c: Copyright 2002 H. Peter Anvin - All Rights Reserved
8 *
9 * ----------------------------------------------------------------------- */
10
11/*
12 * AVX2 implementation of RAID-6 syndrome functions
13 *
14 */
15
16#ifdef CONFIG_AS_AVX2
17
18#include <linux/raid/pq.h>
19#include "x86.h"
20
21static const struct raid6_avx2_constants {
22 u64 x1d[4];
23} raid6_avx2_constants __aligned(32) = {
24 { 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,
25 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,},
26};
27
28static int raid6_have_avx2(void)
29{
30 return boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_AVX);
31}
32
33/*
34 * Plain AVX2 implementation
35 */
36static void raid6_avx21_gen_syndrome(int disks, size_t bytes, void **ptrs)
37{
38 u8 **dptr = (u8 **)ptrs;
39 u8 *p, *q;
40 int d, z, z0;
41
42 z0 = disks - 3; /* Highest data disk */
43 p = dptr[z0+1]; /* XOR parity */
44 q = dptr[z0+2]; /* RS syndrome */
45
46 kernel_fpu_begin();
47
48 asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));
49 asm volatile("vpxor %ymm3,%ymm3,%ymm3"); /* Zero temp */
50
51 for (d = 0; d < bytes; d += 32) {
52 asm volatile("prefetchnta %0" : : "m" (dptr[z0][d]));
53 asm volatile("vmovdqa %0,%%ymm2" : : "m" (dptr[z0][d]));/* P[0] */
54 asm volatile("prefetchnta %0" : : "m" (dptr[z0-1][d]));
55 asm volatile("vmovdqa %ymm2,%ymm4");/* Q[0] */
56 asm volatile("vmovdqa %0,%%ymm6" : : "m" (dptr[z0-1][d]));
57 for (z = z0-2; z >= 0; z--) {
58 asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
59 asm volatile("vpcmpgtb %ymm4,%ymm3,%ymm5");
60 asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
61 asm volatile("vpand %ymm0,%ymm5,%ymm5");
62 asm volatile("vpxor %ymm5,%ymm4,%ymm4");
63 asm volatile("vpxor %ymm6,%ymm2,%ymm2");
64 asm volatile("vpxor %ymm6,%ymm4,%ymm4");
65 asm volatile("vmovdqa %0,%%ymm6" : : "m" (dptr[z][d]));
66 }
67 asm volatile("vpcmpgtb %ymm4,%ymm3,%ymm5");
68 asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
69 asm volatile("vpand %ymm0,%ymm5,%ymm5");
70 asm volatile("vpxor %ymm5,%ymm4,%ymm4");
71 asm volatile("vpxor %ymm6,%ymm2,%ymm2");
72 asm volatile("vpxor %ymm6,%ymm4,%ymm4");
73
74 asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d]));
75 asm volatile("vpxor %ymm2,%ymm2,%ymm2");
76 asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d]));
77 asm volatile("vpxor %ymm4,%ymm4,%ymm4");
78 }
79
80 asm volatile("sfence" : : : "memory");
81 kernel_fpu_end();
82}
83
84static void raid6_avx21_xor_syndrome(int disks, int start, int stop,
85 size_t bytes, void **ptrs)
86{
87 u8 **dptr = (u8 **)ptrs;
88 u8 *p, *q;
89 int d, z, z0;
90
91 z0 = stop; /* P/Q right side optimization */
92 p = dptr[disks-2]; /* XOR parity */
93 q = dptr[disks-1]; /* RS syndrome */
94
95 kernel_fpu_begin();
96
97 asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));
98
99 for (d = 0 ; d < bytes ; d += 32) {
100 asm volatile("vmovdqa %0,%%ymm4" :: "m" (dptr[z0][d]));
101 asm volatile("vmovdqa %0,%%ymm2" : : "m" (p[d]));
102 asm volatile("vpxor %ymm4,%ymm2,%ymm2");
103 /* P/Q data pages */
104 for (z = z0-1 ; z >= start ; z--) {
105 asm volatile("vpxor %ymm5,%ymm5,%ymm5");
106 asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
107 asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
108 asm volatile("vpand %ymm0,%ymm5,%ymm5");
109 asm volatile("vpxor %ymm5,%ymm4,%ymm4");
110 asm volatile("vmovdqa %0,%%ymm5" :: "m" (dptr[z][d]));
111 asm volatile("vpxor %ymm5,%ymm2,%ymm2");
112 asm volatile("vpxor %ymm5,%ymm4,%ymm4");
113 }
114 /* P/Q left side optimization */
115 for (z = start-1 ; z >= 0 ; z--) {
116 asm volatile("vpxor %ymm5,%ymm5,%ymm5");
117 asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
118 asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
119 asm volatile("vpand %ymm0,%ymm5,%ymm5");
120 asm volatile("vpxor %ymm5,%ymm4,%ymm4");
121 }
122 asm volatile("vpxor %0,%%ymm4,%%ymm4" : : "m" (q[d]));
123 /* Don't use movntdq for r/w memory area < cache line */
124 asm volatile("vmovdqa %%ymm4,%0" : "=m" (q[d]));
125 asm volatile("vmovdqa %%ymm2,%0" : "=m" (p[d]));
126 }
127
128 asm volatile("sfence" : : : "memory");
129 kernel_fpu_end();
130}
131
132const struct raid6_calls raid6_avx2x1 = {
133 raid6_avx21_gen_syndrome,
134 raid6_avx21_xor_syndrome,
135 raid6_have_avx2,
136 "avx2x1",
137 1 /* Has cache hints */
138};
139
140/*
141 * Unrolled-by-2 AVX2 implementation
142 */
143static void raid6_avx22_gen_syndrome(int disks, size_t bytes, void **ptrs)
144{
145 u8 **dptr = (u8 **)ptrs;
146 u8 *p, *q;
147 int d, z, z0;
148
149 z0 = disks - 3; /* Highest data disk */
150 p = dptr[z0+1]; /* XOR parity */
151 q = dptr[z0+2]; /* RS syndrome */
152
153 kernel_fpu_begin();
154
155 asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));
156 asm volatile("vpxor %ymm1,%ymm1,%ymm1"); /* Zero temp */
157
158 /* We uniformly assume a single prefetch covers at least 32 bytes */
159 for (d = 0; d < bytes; d += 64) {
160 asm volatile("prefetchnta %0" : : "m" (dptr[z0][d]));
161 asm volatile("prefetchnta %0" : : "m" (dptr[z0][d+32]));
162 asm volatile("vmovdqa %0,%%ymm2" : : "m" (dptr[z0][d]));/* P[0] */
163 asm volatile("vmovdqa %0,%%ymm3" : : "m" (dptr[z0][d+32]));/* P[1] */
164 asm volatile("vmovdqa %ymm2,%ymm4"); /* Q[0] */
165 asm volatile("vmovdqa %ymm3,%ymm6"); /* Q[1] */
166 for (z = z0-1; z >= 0; z--) {
167 asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
168 asm volatile("prefetchnta %0" : : "m" (dptr[z][d+32]));
169 asm volatile("vpcmpgtb %ymm4,%ymm1,%ymm5");
170 asm volatile("vpcmpgtb %ymm6,%ymm1,%ymm7");
171 asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
172 asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
173 asm volatile("vpand %ymm0,%ymm5,%ymm5");
174 asm volatile("vpand %ymm0,%ymm7,%ymm7");
175 asm volatile("vpxor %ymm5,%ymm4,%ymm4");
176 asm volatile("vpxor %ymm7,%ymm6,%ymm6");
177 asm volatile("vmovdqa %0,%%ymm5" : : "m" (dptr[z][d]));
178 asm volatile("vmovdqa %0,%%ymm7" : : "m" (dptr[z][d+32]));
179 asm volatile("vpxor %ymm5,%ymm2,%ymm2");
180 asm volatile("vpxor %ymm7,%ymm3,%ymm3");
181 asm volatile("vpxor %ymm5,%ymm4,%ymm4");
182 asm volatile("vpxor %ymm7,%ymm6,%ymm6");
183 }
184 asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d]));
185 asm volatile("vmovntdq %%ymm3,%0" : "=m" (p[d+32]));
186 asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d]));
187 asm volatile("vmovntdq %%ymm6,%0" : "=m" (q[d+32]));
188 }
189
190 asm volatile("sfence" : : : "memory");
191 kernel_fpu_end();
192}
193
194static void raid6_avx22_xor_syndrome(int disks, int start, int stop,
195 size_t bytes, void **ptrs)
196{
197 u8 **dptr = (u8 **)ptrs;
198 u8 *p, *q;
199 int d, z, z0;
200
201 z0 = stop; /* P/Q right side optimization */
202 p = dptr[disks-2]; /* XOR parity */
203 q = dptr[disks-1]; /* RS syndrome */
204
205 kernel_fpu_begin();
206
207 asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));
208
209 for (d = 0 ; d < bytes ; d += 64) {
210 asm volatile("vmovdqa %0,%%ymm4" :: "m" (dptr[z0][d]));
211 asm volatile("vmovdqa %0,%%ymm6" :: "m" (dptr[z0][d+32]));
212 asm volatile("vmovdqa %0,%%ymm2" : : "m" (p[d]));
213 asm volatile("vmovdqa %0,%%ymm3" : : "m" (p[d+32]));
214 asm volatile("vpxor %ymm4,%ymm2,%ymm2");
215 asm volatile("vpxor %ymm6,%ymm3,%ymm3");
216 /* P/Q data pages */
217 for (z = z0-1 ; z >= start ; z--) {
218 asm volatile("vpxor %ymm5,%ymm5,%ymm5");
219 asm volatile("vpxor %ymm7,%ymm7,%ymm7");
220 asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
221 asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
222 asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
223 asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
224 asm volatile("vpand %ymm0,%ymm5,%ymm5");
225 asm volatile("vpand %ymm0,%ymm7,%ymm7");
226 asm volatile("vpxor %ymm5,%ymm4,%ymm4");
227 asm volatile("vpxor %ymm7,%ymm6,%ymm6");
228 asm volatile("vmovdqa %0,%%ymm5" :: "m" (dptr[z][d]));
229 asm volatile("vmovdqa %0,%%ymm7"
230 :: "m" (dptr[z][d+32]));
231 asm volatile("vpxor %ymm5,%ymm2,%ymm2");
232 asm volatile("vpxor %ymm7,%ymm3,%ymm3");
233 asm volatile("vpxor %ymm5,%ymm4,%ymm4");
234 asm volatile("vpxor %ymm7,%ymm6,%ymm6");
235 }
236 /* P/Q left side optimization */
237 for (z = start-1 ; z >= 0 ; z--) {
238 asm volatile("vpxor %ymm5,%ymm5,%ymm5");
239 asm volatile("vpxor %ymm7,%ymm7,%ymm7");
240 asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
241 asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
242 asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
243 asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
244 asm volatile("vpand %ymm0,%ymm5,%ymm5");
245 asm volatile("vpand %ymm0,%ymm7,%ymm7");
246 asm volatile("vpxor %ymm5,%ymm4,%ymm4");
247 asm volatile("vpxor %ymm7,%ymm6,%ymm6");
248 }
249 asm volatile("vpxor %0,%%ymm4,%%ymm4" : : "m" (q[d]));
250 asm volatile("vpxor %0,%%ymm6,%%ymm6" : : "m" (q[d+32]));
251 /* Don't use movntdq for r/w memory area < cache line */
252 asm volatile("vmovdqa %%ymm4,%0" : "=m" (q[d]));
253 asm volatile("vmovdqa %%ymm6,%0" : "=m" (q[d+32]));
254 asm volatile("vmovdqa %%ymm2,%0" : "=m" (p[d]));
255 asm volatile("vmovdqa %%ymm3,%0" : "=m" (p[d+32]));
256 }
257
258 asm volatile("sfence" : : : "memory");
259 kernel_fpu_end();
260}
261
262const struct raid6_calls raid6_avx2x2 = {
263 raid6_avx22_gen_syndrome,
264 raid6_avx22_xor_syndrome,
265 raid6_have_avx2,
266 "avx2x2",
267 1 /* Has cache hints */
268};
269
270#ifdef CONFIG_X86_64
271
272/*
273 * Unrolled-by-4 AVX2 implementation
274 */
275static void raid6_avx24_gen_syndrome(int disks, size_t bytes, void **ptrs)
276{
277 u8 **dptr = (u8 **)ptrs;
278 u8 *p, *q;
279 int d, z, z0;
280
281 z0 = disks - 3; /* Highest data disk */
282 p = dptr[z0+1]; /* XOR parity */
283 q = dptr[z0+2]; /* RS syndrome */
284
285 kernel_fpu_begin();
286
287 asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));
288 asm volatile("vpxor %ymm1,%ymm1,%ymm1"); /* Zero temp */
289 asm volatile("vpxor %ymm2,%ymm2,%ymm2"); /* P[0] */
290 asm volatile("vpxor %ymm3,%ymm3,%ymm3"); /* P[1] */
291 asm volatile("vpxor %ymm4,%ymm4,%ymm4"); /* Q[0] */
292 asm volatile("vpxor %ymm6,%ymm6,%ymm6"); /* Q[1] */
293 asm volatile("vpxor %ymm10,%ymm10,%ymm10"); /* P[2] */
294 asm volatile("vpxor %ymm11,%ymm11,%ymm11"); /* P[3] */
295 asm volatile("vpxor %ymm12,%ymm12,%ymm12"); /* Q[2] */
296 asm volatile("vpxor %ymm14,%ymm14,%ymm14"); /* Q[3] */
297
298 for (d = 0; d < bytes; d += 128) {
299 for (z = z0; z >= 0; z--) {
300 asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
301 asm volatile("prefetchnta %0" : : "m" (dptr[z][d+32]));
302 asm volatile("prefetchnta %0" : : "m" (dptr[z][d+64]));
303 asm volatile("prefetchnta %0" : : "m" (dptr[z][d+96]));
304 asm volatile("vpcmpgtb %ymm4,%ymm1,%ymm5");
305 asm volatile("vpcmpgtb %ymm6,%ymm1,%ymm7");
306 asm volatile("vpcmpgtb %ymm12,%ymm1,%ymm13");
307 asm volatile("vpcmpgtb %ymm14,%ymm1,%ymm15");
308 asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
309 asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
310 asm volatile("vpaddb %ymm12,%ymm12,%ymm12");
311 asm volatile("vpaddb %ymm14,%ymm14,%ymm14");
312 asm volatile("vpand %ymm0,%ymm5,%ymm5");
313 asm volatile("vpand %ymm0,%ymm7,%ymm7");
314 asm volatile("vpand %ymm0,%ymm13,%ymm13");
315 asm volatile("vpand %ymm0,%ymm15,%ymm15");
316 asm volatile("vpxor %ymm5,%ymm4,%ymm4");
317 asm volatile("vpxor %ymm7,%ymm6,%ymm6");
318 asm volatile("vpxor %ymm13,%ymm12,%ymm12");
319 asm volatile("vpxor %ymm15,%ymm14,%ymm14");
320 asm volatile("vmovdqa %0,%%ymm5" : : "m" (dptr[z][d]));
321 asm volatile("vmovdqa %0,%%ymm7" : : "m" (dptr[z][d+32]));
322 asm volatile("vmovdqa %0,%%ymm13" : : "m" (dptr[z][d+64]));
323 asm volatile("vmovdqa %0,%%ymm15" : : "m" (dptr[z][d+96]));
324 asm volatile("vpxor %ymm5,%ymm2,%ymm2");
325 asm volatile("vpxor %ymm7,%ymm3,%ymm3");
326 asm volatile("vpxor %ymm13,%ymm10,%ymm10");
327 asm volatile("vpxor %ymm15,%ymm11,%ymm11");
328 asm volatile("vpxor %ymm5,%ymm4,%ymm4");
329 asm volatile("vpxor %ymm7,%ymm6,%ymm6");
330 asm volatile("vpxor %ymm13,%ymm12,%ymm12");
331 asm volatile("vpxor %ymm15,%ymm14,%ymm14");
332 }
333 asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d]));
334 asm volatile("vpxor %ymm2,%ymm2,%ymm2");
335 asm volatile("vmovntdq %%ymm3,%0" : "=m" (p[d+32]));
336 asm volatile("vpxor %ymm3,%ymm3,%ymm3");
337 asm volatile("vmovntdq %%ymm10,%0" : "=m" (p[d+64]));
338 asm volatile("vpxor %ymm10,%ymm10,%ymm10");
339 asm volatile("vmovntdq %%ymm11,%0" : "=m" (p[d+96]));
340 asm volatile("vpxor %ymm11,%ymm11,%ymm11");
341 asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d]));
342 asm volatile("vpxor %ymm4,%ymm4,%ymm4");
343 asm volatile("vmovntdq %%ymm6,%0" : "=m" (q[d+32]));
344 asm volatile("vpxor %ymm6,%ymm6,%ymm6");
345 asm volatile("vmovntdq %%ymm12,%0" : "=m" (q[d+64]));
346 asm volatile("vpxor %ymm12,%ymm12,%ymm12");
347 asm volatile("vmovntdq %%ymm14,%0" : "=m" (q[d+96]));
348 asm volatile("vpxor %ymm14,%ymm14,%ymm14");
349 }
350
351 asm volatile("sfence" : : : "memory");
352 kernel_fpu_end();
353}
354
/*
 * raid6_avx24_xor_syndrome - XOR the contribution of data disks
 * start..stop into the existing P and Q buffers, 128 bytes (four 32-byte
 * YMM registers) per outer-loop iteration.
 *
 * disks: number of entries in ptrs; ptrs[disks-2] is used as P and
 *        ptrs[disks-1] as Q.
 * start: lowest disk index whose data is read.
 * stop:  highest disk index whose data is read.
 * bytes: length of each buffer.  Every iteration touches [d, d+128), so
 *        this is presumably a multiple of 128 - TODO confirm with callers.
 * ptrs:  array of buffer pointers indexed by disk number.
 *
 * P and Q are updated in place: the old contents are read, XORed with the
 * newly computed values and written back.
 *
 * Register usage inside the loop:
 *   ymm0                      0x1d in every byte (raid6_avx2_constants.x1d)
 *   ymm2, ymm3, ymm10, ymm11  P accumulators for offsets d, +32, +64, +96
 *   ymm4, ymm6, ymm12, ymm14  Q accumulators for offsets d, +32, +64, +96
 *   ymm5, ymm7, ymm13, ymm15  scratch (carry mask, then loaded data)
 *
 * The asm statements list no register operands or clobbers; values are
 * handed from one statement to the next purely through the YMM registers,
 * so the statements must stay in exactly this order.
 *
 * vmovdqa and vmovntdq require 32-byte aligned memory operands, so the
 * buffers are expected to be 32-byte aligned.
 */
355static void raid6_avx24_xor_syndrome(int disks, int start, int stop,
356 size_t bytes, void **ptrs)
357{
358 u8 **dptr = (u8 **)ptrs;
359 u8 *p, *q;
360 int d, z, z0;
361
	/* Disks above 'stop' are never visited: the walk begins at z0 = stop. */
362 z0 = stop; /* P/Q right side optimization */
363 p = dptr[disks-2]; /* XOR parity */
364 q = dptr[disks-1]; /* RS syndrome */
365
	/* All YMM register use happens between kernel_fpu_begin()/end(). */
366 kernel_fpu_begin();
367
	/* ymm0 = 0x1d replicated into all 32 bytes. */
368 asm volatile("vmovdqa %0,%%ymm0" :: "m" (raid6_avx2_constants.x1d[0]));
369
	/*
	 * NOTE(review): d is int while bytes is size_t, so d is converted to
	 * unsigned for this comparison.
	 */
370 for (d = 0 ; d < bytes ; d += 128) {
	/* Seed the Q accumulators with the data of the highest disk (z0). */
371 asm volatile("vmovdqa %0,%%ymm4" :: "m" (dptr[z0][d]));
372 asm volatile("vmovdqa %0,%%ymm6" :: "m" (dptr[z0][d+32]));
373 asm volatile("vmovdqa %0,%%ymm12" :: "m" (dptr[z0][d+64]));
374 asm volatile("vmovdqa %0,%%ymm14" :: "m" (dptr[z0][d+96]));
	/* Load the current P contents ... */
375 asm volatile("vmovdqa %0,%%ymm2" : : "m" (p[d]));
376 asm volatile("vmovdqa %0,%%ymm3" : : "m" (p[d+32]));
377 asm volatile("vmovdqa %0,%%ymm10" : : "m" (p[d+64]));
378 asm volatile("vmovdqa %0,%%ymm11" : : "m" (p[d+96]));
	/* ... and XOR the data of disk z0 into them. */
379 asm volatile("vpxor %ymm4,%ymm2,%ymm2");
380 asm volatile("vpxor %ymm6,%ymm3,%ymm3");
381 asm volatile("vpxor %ymm12,%ymm10,%ymm10");
382 asm volatile("vpxor %ymm14,%ymm11,%ymm11");
	/*
	 * Walk the remaining disks that carry data, from z0-1 down to start.
	 * For each one: multiply Q by 2, then XOR the disk's data into both
	 * P and Q.
	 */
383 /* P/Q data pages */
384 for (z = z0-1 ; z >= start ; z--) {
385 asm volatile("prefetchnta %0" :: "m" (dptr[z][d]));
386 asm volatile("prefetchnta %0" :: "m" (dptr[z][d+64]));
	/* Zero the scratch registers. */
387 asm volatile("vpxor %ymm5,%ymm5,%ymm5");
388 asm volatile("vpxor %ymm7,%ymm7,%ymm7");
389 asm volatile("vpxor %ymm13,%ymm13,%ymm13");
390 asm volatile("vpxor %ymm15,%ymm15,%ymm15");
	/*
	 * Signed compare 0 > Q per byte: scratch becomes 0xff exactly where
	 * the Q byte has its top bit set.
	 */
391 asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
392 asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
393 asm volatile("vpcmpgtb %ymm12,%ymm13,%ymm13");
394 asm volatile("vpcmpgtb %ymm14,%ymm15,%ymm15");
	/* Q + Q per byte, i.e. shift every byte left by one bit. */
395 asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
396 asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
397 asm volatile("vpaddb %ymm12,%ymm12,%ymm12");
398 asm volatile("vpaddb %ymm14,%ymm14,%ymm14");
	/* Reduce the mask to 0x1d in the bytes that overflowed. */
399 asm volatile("vpand %ymm0,%ymm5,%ymm5");
400 asm volatile("vpand %ymm0,%ymm7,%ymm7");
401 asm volatile("vpand %ymm0,%ymm13,%ymm13");
402 asm volatile("vpand %ymm0,%ymm15,%ymm15");
	/*
	 * XOR 0x1d into the overflowed bytes; together with the shift this
	 * is the GF(2^8) multiply-by-2 used for the RAID-6 Q syndrome.
	 */
403 asm volatile("vpxor %ymm5,%ymm4,%ymm4");
404 asm volatile("vpxor %ymm7,%ymm6,%ymm6");
405 asm volatile("vpxor %ymm13,%ymm12,%ymm12");
406 asm volatile("vpxor %ymm15,%ymm14,%ymm14");
	/* Reuse the scratch registers for the data of disk z. */
407 asm volatile("vmovdqa %0,%%ymm5" :: "m" (dptr[z][d]));
408 asm volatile("vmovdqa %0,%%ymm7"
409 :: "m" (dptr[z][d+32]));
410 asm volatile("vmovdqa %0,%%ymm13"
411 :: "m" (dptr[z][d+64]));
412 asm volatile("vmovdqa %0,%%ymm15"
413 :: "m" (dptr[z][d+96]));
	/* P ^= data */
414 asm volatile("vpxor %ymm5,%ymm2,%ymm2");
415 asm volatile("vpxor %ymm7,%ymm3,%ymm3");
416 asm volatile("vpxor %ymm13,%ymm10,%ymm10");
417 asm volatile("vpxor %ymm15,%ymm11,%ymm11");
	/* Q ^= data */
418 asm volatile("vpxor %ymm5,%ymm4,%ymm4");
419 asm volatile("vpxor %ymm7,%ymm6,%ymm6");
420 asm volatile("vpxor %ymm13,%ymm12,%ymm12");
421 asm volatile("vpxor %ymm15,%ymm14,%ymm14");
422 }
	/* Start fetching the old Q while the remaining multiplies run. */
423 asm volatile("prefetchnta %0" :: "m" (q[d]));
424 asm volatile("prefetchnta %0" :: "m" (q[d+64]));
	/*
	 * Disks below 'start' contribute no data, so nothing is loaded here;
	 * Q is only multiplied by 2 once per skipped disk ('start' times).
	 */
425 /* P/Q left side optimization */
426 for (z = start-1 ; z >= 0 ; z--) {
	/* Same multiply-by-2 sequence as in the data loop above. */
427 asm volatile("vpxor %ymm5,%ymm5,%ymm5");
428 asm volatile("vpxor %ymm7,%ymm7,%ymm7");
429 asm volatile("vpxor %ymm13,%ymm13,%ymm13");
430 asm volatile("vpxor %ymm15,%ymm15,%ymm15");
431 asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
432 asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
433 asm volatile("vpcmpgtb %ymm12,%ymm13,%ymm13");
434 asm volatile("vpcmpgtb %ymm14,%ymm15,%ymm15");
435 asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
436 asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
437 asm volatile("vpaddb %ymm12,%ymm12,%ymm12");
438 asm volatile("vpaddb %ymm14,%ymm14,%ymm14");
439 asm volatile("vpand %ymm0,%ymm5,%ymm5");
440 asm volatile("vpand %ymm0,%ymm7,%ymm7");
441 asm volatile("vpand %ymm0,%ymm13,%ymm13");
442 asm volatile("vpand %ymm0,%ymm15,%ymm15");
443 asm volatile("vpxor %ymm5,%ymm4,%ymm4");
444 asm volatile("vpxor %ymm7,%ymm6,%ymm6");
445 asm volatile("vpxor %ymm13,%ymm12,%ymm12");
446 asm volatile("vpxor %ymm15,%ymm14,%ymm14");
447 }
	/* Write the updated P back with non-temporal stores. */
448 asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d]));
449 asm volatile("vmovntdq %%ymm3,%0" : "=m" (p[d+32]));
450 asm volatile("vmovntdq %%ymm10,%0" : "=m" (p[d+64]));
451 asm volatile("vmovntdq %%ymm11,%0" : "=m" (p[d+96]));
	/* Fold the old Q contents from memory into the computed value ... */
452 asm volatile("vpxor %0,%%ymm4,%%ymm4" : : "m" (q[d]));
453 asm volatile("vpxor %0,%%ymm6,%%ymm6" : : "m" (q[d+32]));
454 asm volatile("vpxor %0,%%ymm12,%%ymm12" : : "m" (q[d+64]));
455 asm volatile("vpxor %0,%%ymm14,%%ymm14" : : "m" (q[d+96]));
	/* ... and write the updated Q back with non-temporal stores. */
456 asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d]));
457 asm volatile("vmovntdq %%ymm6,%0" : "=m" (q[d+32]));
458 asm volatile("vmovntdq %%ymm12,%0" : "=m" (q[d+64]));
459 asm volatile("vmovntdq %%ymm14,%0" : "=m" (q[d+96]));
460 }
	/*
	 * Store fence after the non-temporal stores; the "memory" clobber
	 * also keeps the compiler from moving memory accesses across it.
	 */
461 asm volatile("sfence" : : : "memory");
462 kernel_fpu_end();
463}
464
/*
 * Routine table for the four-way unrolled ("avx2x4") AVX2 variant.
 *
 * The initializer is positional, so the entries must stay in the field
 * order of struct raid6_calls.  That struct is not declared in the visible
 * part of this file; presumably it comes from <linux/raid/pq.h>, which the
 * file includes - confirm the field order there before reordering.
 */
465const struct raid6_calls raid6_avx2x4 = {
466 raid6_avx24_gen_syndrome,
467 raid6_avx24_xor_syndrome, /* in-place P/Q update for disks start..stop */
468 raid6_have_avx2, /* checks X86_FEATURE_AVX2 and X86_FEATURE_AVX */
469 "avx2x4",
	/* Both routines use prefetchnta loads and vmovntdq stores. */
470 1 /* Has cache hints */
471};
472#endif
473
474#endif /* CONFIG_AS_AVX2 */