/* SPDX-License-Identifier: GPL-2.0-only */
#ifndef _ASM_X86_XOR_AVX_H
#define _ASM_X86_XOR_AVX_H

/*
 * Optimized RAID-5 checksumming functions for AVX
 *
 * Copyright (C) 2012 Intel Corporation
 * Author: Jim Kukunas <james.t.kukunas@linux.intel.com>
 *
 * Based on Ingo Molnar and Zach Brown's respective MMX and SSE routines
 */

#include <linux/compiler.h>
#include <asm/fpu/api.h>

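/*
 * BLOCK(offset, reg) is (re)defined by each routine below to process one
 * 32-byte chunk in YMM register 'reg'.  BLOCK4() and BLOCK16() unroll it
 * across ymm0-ymm3, so a single BLOCK16() pass covers 16 * 32 = 512 bytes,
 * which is why the loops advance through the buffers in 512-byte lines.
 */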
#define BLOCK4(i) \
		BLOCK(32 * i, 0) \
		BLOCK(32 * (i + 1), 1) \
		BLOCK(32 * (i + 2), 2) \
		BLOCK(32 * (i + 3), 3)

#define BLOCK16() \
		BLOCK4(0) \
		BLOCK4(4) \
		BLOCK4(8) \
		BLOCK4(12)

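/*
 * xor_avx_2() XORs p1 into p0, one 512-byte line per loop iteration.
 * 'bytes >> 9' is the number of whole 512-byte lines; any remainder is
 * ignored.  The vmovdqa loads/stores require 32-byte-aligned buffers,
 * and all YMM usage is bracketed by kernel_fpu_begin()/kernel_fpu_end().
 */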
static void xor_avx_2(unsigned long bytes, unsigned long * __restrict p0,
		      const unsigned long * __restrict p1)
{
	unsigned long lines = bytes >> 9;

	kernel_fpu_begin();

	while (lines--) {
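/*
 * Per 32-byte chunk: load p1 into ymm<reg>, XOR in the corresponding
 * chunk of p0, and write the result back to p0 (i.e. p0 ^= p1).
 */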
#undef BLOCK
#define BLOCK(i, reg) \
do { \
	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p1[i / sizeof(*p1)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p0[i / sizeof(*p0)])); \
	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
		"=m" (p0[i / sizeof(*p0)])); \
} while (0);

		BLOCK16()

		p0 = (unsigned long *)((uintptr_t)p0 + 512);
		p1 = (unsigned long *)((uintptr_t)p1 + 512);
	}

	kernel_fpu_end();
}

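/*
 * The 3-, 4- and 5-source variants below follow the same pattern as
 * xor_avx_2(): load the last source, fold each remaining source in with
 * one more vxorps, then store the accumulated result to p0.
 */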
static void xor_avx_3(unsigned long bytes, unsigned long * __restrict p0,
		      const unsigned long * __restrict p1,
		      const unsigned long * __restrict p2)
{
	unsigned long lines = bytes >> 9;

	kernel_fpu_begin();

	while (lines--) {
#undef BLOCK
#define BLOCK(i, reg) \
do { \
	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p2[i / sizeof(*p2)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p1[i / sizeof(*p1)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p0[i / sizeof(*p0)])); \
	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
		"=m" (p0[i / sizeof(*p0)])); \
} while (0);

		BLOCK16()

		p0 = (unsigned long *)((uintptr_t)p0 + 512);
		p1 = (unsigned long *)((uintptr_t)p1 + 512);
		p2 = (unsigned long *)((uintptr_t)p2 + 512);
	}

	kernel_fpu_end();
}

static void xor_avx_4(unsigned long bytes, unsigned long * __restrict p0,
		      const unsigned long * __restrict p1,
		      const unsigned long * __restrict p2,
		      const unsigned long * __restrict p3)
{
	unsigned long lines = bytes >> 9;

	kernel_fpu_begin();

	while (lines--) {
#undef BLOCK
#define BLOCK(i, reg) \
do { \
	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p3[i / sizeof(*p3)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p2[i / sizeof(*p2)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p1[i / sizeof(*p1)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p0[i / sizeof(*p0)])); \
	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
		"=m" (p0[i / sizeof(*p0)])); \
} while (0);

		BLOCK16();

		p0 = (unsigned long *)((uintptr_t)p0 + 512);
		p1 = (unsigned long *)((uintptr_t)p1 + 512);
		p2 = (unsigned long *)((uintptr_t)p2 + 512);
		p3 = (unsigned long *)((uintptr_t)p3 + 512);
	}

	kernel_fpu_end();
}

static void xor_avx_5(unsigned long bytes, unsigned long * __restrict p0,
		      const unsigned long * __restrict p1,
		      const unsigned long * __restrict p2,
		      const unsigned long * __restrict p3,
		      const unsigned long * __restrict p4)
{
	unsigned long lines = bytes >> 9;

	kernel_fpu_begin();

	while (lines--) {
#undef BLOCK
#define BLOCK(i, reg) \
do { \
	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p4[i / sizeof(*p4)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p3[i / sizeof(*p3)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p2[i / sizeof(*p2)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p1[i / sizeof(*p1)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p0[i / sizeof(*p0)])); \
	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
		"=m" (p0[i / sizeof(*p0)])); \
} while (0);

		BLOCK16()

		p0 = (unsigned long *)((uintptr_t)p0 + 512);
		p1 = (unsigned long *)((uintptr_t)p1 + 512);
		p2 = (unsigned long *)((uintptr_t)p2 + 512);
		p3 = (unsigned long *)((uintptr_t)p3 + 512);
		p4 = (unsigned long *)((uintptr_t)p4 + 512);
	}

	kernel_fpu_end();
}

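/*
 * Describe these routines to the kernel's generic XOR framework
 * (struct xor_block_template): do_2 .. do_5 are the 2- to 5-source
 * entry points that xor_speed() benchmarks during XOR calibration.
 */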
static struct xor_block_template xor_block_avx = {
	.name = "avx",
	.do_2 = xor_avx_2,
	.do_3 = xor_avx_3,
	.do_4 = xor_avx_4,
	.do_5 = xor_avx_5,
};

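/*
 * Both helpers require X86_FEATURE_OSXSAVE in addition to
 * X86_FEATURE_AVX: AVX is only usable once the OS has enabled saving of
 * the extended (YMM) register state via XSAVE.
 *
 * Rough usage sketch (how a caller such as <asm/xor.h> is expected to
 * hook these macros into its template selection; exact details live in
 * that header):
 *
 *	#define XOR_TRY_TEMPLATES	\
 *	do {				\
 *		AVX_XOR_SPEED;		\
 *		...			\
 *	} while (0)
 *
 *	#define XOR_SELECT_TEMPLATE(FASTEST)	AVX_SELECT(FASTEST)
 */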
#define AVX_XOR_SPEED \
do { \
	if (boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_OSXSAVE)) \
		xor_speed(&xor_block_avx); \
} while (0)

#define AVX_SELECT(FASTEST) \
	(boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_OSXSAVE) ? &xor_block_avx : FASTEST)

#endif