/* SPDX-License-Identifier: GPL-2.0-only */
#ifndef _ASM_X86_XOR_AVX_H
#define _ASM_X86_XOR_AVX_H

/*
 * Optimized RAID-5 checksumming functions for AVX
 *
 * Copyright (C) 2012 Intel Corporation
 * Author: Jim Kukunas <james.t.kukunas@linux.intel.com>
 *
 * Based on Ingo Molnar and Zach Brown's respective MMX and SSE routines
 */

#include <linux/compiler.h>
#include <asm/fpu/api.h>

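/*
 * BLOCK(offset, reg) is (re)defined by each routine below to process one
 * 32-byte chunk in YMM register 'reg'.  BLOCK4() and BLOCK16() unroll it
 * across ymm0-ymm3, so a single BLOCK16() pass covers 16 * 32 = 512 bytes,
 * which is why the loops advance through the buffers in 512-byte lines.
 */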
#define BLOCK4(i) \
		BLOCK(32 * i, 0) \
		BLOCK(32 * (i + 1), 1) \
		BLOCK(32 * (i + 2), 2) \
		BLOCK(32 * (i + 3), 3)

#define BLOCK16() \
		BLOCK4(0) \
		BLOCK4(4) \
		BLOCK4(8) \
		BLOCK4(12)

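/*
 * xor_avx_2() XORs p1 into p0, one 512-byte line per loop iteration.
 * 'bytes >> 9' is the number of whole 512-byte lines; any remainder is
 * ignored.  The vmovdqa loads/stores require 32-byte-aligned buffers,
 * and all YMM usage is bracketed by kernel_fpu_begin()/kernel_fpu_end().
 */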
static void xor_avx_2(unsigned long bytes, unsigned long * __restrict p0,
		      const unsigned long * __restrict p1)
{
	unsigned long lines = bytes >> 9;

	kernel_fpu_begin();

	while (lines--) {
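/*
 * Per 32-byte chunk: load p1 into ymm<reg>, XOR in the corresponding
 * chunk of p0, and write the result back to p0 (i.e. p0 ^= p1).
 */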
#undef BLOCK
#define BLOCK(i, reg) \
do { \
	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p1[i / sizeof(*p1)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p0[i / sizeof(*p0)])); \
	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
		"=m" (p0[i / sizeof(*p0)])); \
} while (0);

		BLOCK16()

		p0 = (unsigned long *)((uintptr_t)p0 + 512);
		p1 = (unsigned long *)((uintptr_t)p1 + 512);
	}

	kernel_fpu_end();
}

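/*
 * The 3-, 4- and 5-source variants below follow the same pattern as
 * xor_avx_2(): load the last source, fold each remaining source in with
 * one more vxorps, then store the accumulated result to p0.
 */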
static void xor_avx_3(unsigned long bytes, unsigned long * __restrict p0,
		      const unsigned long * __restrict p1,
		      const unsigned long * __restrict p2)
{
	unsigned long lines = bytes >> 9;

	kernel_fpu_begin();

	while (lines--) {
#undef BLOCK
#define BLOCK(i, reg) \
do { \
	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p2[i / sizeof(*p2)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p1[i / sizeof(*p1)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p0[i / sizeof(*p0)])); \
	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
		"=m" (p0[i / sizeof(*p0)])); \
} while (0);

		BLOCK16()

		p0 = (unsigned long *)((uintptr_t)p0 + 512);
		p1 = (unsigned long *)((uintptr_t)p1 + 512);
		p2 = (unsigned long *)((uintptr_t)p2 + 512);
	}

	kernel_fpu_end();
}

static void xor_avx_4(unsigned long bytes, unsigned long * __restrict p0,
		      const unsigned long * __restrict p1,
		      const unsigned long * __restrict p2,
		      const unsigned long * __restrict p3)
{
	unsigned long lines = bytes >> 9;

	kernel_fpu_begin();

	while (lines--) {
#undef BLOCK
#define BLOCK(i, reg) \
do { \
	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p3[i / sizeof(*p3)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p2[i / sizeof(*p2)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p1[i / sizeof(*p1)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p0[i / sizeof(*p0)])); \
	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
		"=m" (p0[i / sizeof(*p0)])); \
} while (0);

		BLOCK16();

		p0 = (unsigned long *)((uintptr_t)p0 + 512);
		p1 = (unsigned long *)((uintptr_t)p1 + 512);
		p2 = (unsigned long *)((uintptr_t)p2 + 512);
		p3 = (unsigned long *)((uintptr_t)p3 + 512);
	}

	kernel_fpu_end();
}

static void xor_avx_5(unsigned long bytes, unsigned long * __restrict p0,
		      const unsigned long * __restrict p1,
		      const unsigned long * __restrict p2,
		      const unsigned long * __restrict p3,
		      const unsigned long * __restrict p4)
{
	unsigned long lines = bytes >> 9;

	kernel_fpu_begin();

	while (lines--) {
#undef BLOCK
#define BLOCK(i, reg) \
do { \
	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p4[i / sizeof(*p4)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p3[i / sizeof(*p3)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p2[i / sizeof(*p2)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p1[i / sizeof(*p1)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p0[i / sizeof(*p0)])); \
	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
		"=m" (p0[i / sizeof(*p0)])); \
} while (0);

		BLOCK16()

		p0 = (unsigned long *)((uintptr_t)p0 + 512);
		p1 = (unsigned long *)((uintptr_t)p1 + 512);
		p2 = (unsigned long *)((uintptr_t)p2 + 512);
		p3 = (unsigned long *)((uintptr_t)p3 + 512);
		p4 = (unsigned long *)((uintptr_t)p4 + 512);
	}

	kernel_fpu_end();
}

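/*
 * Describe these routines to the kernel's generic XOR framework
 * (struct xor_block_template): do_2 .. do_5 are the 2- to 5-source
 * entry points that xor_speed() benchmarks during XOR calibration.
 */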
static struct xor_block_template xor_block_avx = {
	.name = "avx",
	.do_2 = xor_avx_2,
	.do_3 = xor_avx_3,
	.do_4 = xor_avx_4,
	.do_5 = xor_avx_5,
};

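/*
 * Both helpers require X86_FEATURE_OSXSAVE in addition to
 * X86_FEATURE_AVX: AVX is only usable once the OS has enabled saving of
 * the extended (YMM) register state via XSAVE.
 *
 * Rough usage sketch (how a caller such as <asm/xor.h> is expected to
 * hook these macros into its template selection; exact details live in
 * that header):
 *
 *	#define XOR_TRY_TEMPLATES	\
 *	do {				\
 *		AVX_XOR_SPEED;		\
 *		...			\
 *	} while (0)
 *
 *	#define XOR_SELECT_TEMPLATE(FASTEST)	AVX_SELECT(FASTEST)
 */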
#define AVX_XOR_SPEED \
do { \
	if (boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_OSXSAVE)) \
		xor_speed(&xor_block_avx); \
} while (0)

#define AVX_SELECT(FASTEST) \
	(boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_OSXSAVE) ? &xor_block_avx : FASTEST)

#endif