arch/x86/lib/memcpy_64.S (v6.8)
  1/* SPDX-License-Identifier: GPL-2.0-only */
  2/* Copyright 2002 Andi Kleen */
  3
  4#include <linux/export.h>
  5#include <linux/linkage.h>
  6#include <linux/cfi_types.h>
  7#include <asm/errno.h>
  8#include <asm/cpufeatures.h>
  9#include <asm/alternative.h>
 10
 11.section .noinstr.text, "ax"
 12
 13/*
 14 * memcpy - Copy a memory block.
 15 *
 16 * Input:
 17 *  rdi destination
 18 *  rsi source
 19 *  rdx count
 20 *
 21 * Output:
 22 * rax original destination
 23 *
 24 * The FSRM alternative should be done inline (avoiding the call and
 25 * the disgusting return handling), but that would require some help
 26 * from the compiler for better calling conventions.
 27 *
 28 * The 'rep movsb' itself is small enough to replace the call, but the
 29 * two register moves blow up the code. And one of them is "needed"
 30 * only for the return value that is the same as the source input,
 31 * which the compiler could/should do much better anyway.
 32 */
 33SYM_TYPED_FUNC_START(__memcpy)
 34	ALTERNATIVE "jmp memcpy_orig", "", X86_FEATURE_FSRM
 35
 36	movq %rdi, %rax
 37	movq %rdx, %rcx
 38	rep movsb
 39	RET
 40SYM_FUNC_END(__memcpy)
 41EXPORT_SYMBOL(__memcpy)
 42
 43SYM_FUNC_ALIAS_MEMFUNC(memcpy, __memcpy)
 44EXPORT_SYMBOL(memcpy)
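The FSRM path above is deliberately tiny: it saves the original destination (the return value) and hands the whole count to a single "rep movsb". As a rough illustration, here is a minimal userspace C sketch of the same idea using inline asm; memcpy_fsrm_demo is a made-up name and this is not the kernel implementation.

#include <stddef.h>

/* Minimal sketch of the FSRM path: preserve the destination for the
 * return value, then let the CPU copy everything with one rep movsb. */
static void *memcpy_fsrm_demo(void *dst, const void *src, size_t len)
{
	void *ret = dst;			/* movq %rdi, %rax */

	asm volatile("rep movsb"		/* copies %rcx bytes from (%rsi) to (%rdi) */
		     : "+D" (dst), "+S" (src), "+c" (len)
		     : : "memory");
	return ret;
}

On CPUs without X86_FEATURE_FSRM the ALTERNATIVE leaves the "jmp memcpy_orig" in place and the fallback below runs instead.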
 45
 46SYM_FUNC_START_LOCAL(memcpy_orig)
 47	movq %rdi, %rax
 48
 49	cmpq $0x20, %rdx
 50	jb .Lhandle_tail
 51
 52	/*
 53	 * We check whether a memory false dependence could occur,
 54	 * then jump to the corresponding copy mode.
 55	 */
 56	cmp  %dil, %sil
 57	jl .Lcopy_backward
 58	subq $0x20, %rdx
 59.Lcopy_forward_loop:
 60	subq $0x20,	%rdx
 61
 62	/*
 63	 * Move in blocks of 4x8 bytes:
 64	 */
 65	movq 0*8(%rsi),	%r8
 66	movq 1*8(%rsi),	%r9
 67	movq 2*8(%rsi),	%r10
 68	movq 3*8(%rsi),	%r11
 69	leaq 4*8(%rsi),	%rsi
 70
 71	movq %r8,	0*8(%rdi)
 72	movq %r9,	1*8(%rdi)
 73	movq %r10,	2*8(%rdi)
 74	movq %r11,	3*8(%rdi)
 75	leaq 4*8(%rdi),	%rdi
 76	jae  .Lcopy_forward_loop
 77	addl $0x20,	%edx
 78	jmp  .Lhandle_tail
 79
 80.Lcopy_backward:
 81	/*
 82	 * Calculate copy position to tail.
 83	 */
 84	addq %rdx,	%rsi
 85	addq %rdx,	%rdi
 86	subq $0x20,	%rdx
 87	/*
 88	 * At most 3 ALU operations in one cycle,
 89	 * so append NOPs in the same 16-byte chunk.
 90	 */
 91	.p2align 4
 92.Lcopy_backward_loop:
 93	subq $0x20,	%rdx
 94	movq -1*8(%rsi),	%r8
 95	movq -2*8(%rsi),	%r9
 96	movq -3*8(%rsi),	%r10
 97	movq -4*8(%rsi),	%r11
 98	leaq -4*8(%rsi),	%rsi
 99	movq %r8,		-1*8(%rdi)
100	movq %r9,		-2*8(%rdi)
101	movq %r10,		-3*8(%rdi)
102	movq %r11,		-4*8(%rdi)
103	leaq -4*8(%rdi),	%rdi
104	jae  .Lcopy_backward_loop
105
106	/*
107	 * Calculate copy position to head.
108	 */
109	addl $0x20,	%edx
110	subq %rdx,	%rsi
111	subq %rdx,	%rdi
112.Lhandle_tail:
113	cmpl $16,	%edx
114	jb   .Lless_16bytes
115
116	/*
117	 * Move data from 16 bytes to 31 bytes.
118	 */
119	movq 0*8(%rsi), %r8
120	movq 1*8(%rsi),	%r9
121	movq -2*8(%rsi, %rdx),	%r10
122	movq -1*8(%rsi, %rdx),	%r11
123	movq %r8,	0*8(%rdi)
124	movq %r9,	1*8(%rdi)
125	movq %r10,	-2*8(%rdi, %rdx)
126	movq %r11,	-1*8(%rdi, %rdx)
127	RET
128	.p2align 4
129.Lless_16bytes:
130	cmpl $8,	%edx
131	jb   .Lless_8bytes
132	/*
133	 * Move data from 8 bytes to 15 bytes.
134	 */
135	movq 0*8(%rsi),	%r8
136	movq -1*8(%rsi, %rdx),	%r9
137	movq %r8,	0*8(%rdi)
138	movq %r9,	-1*8(%rdi, %rdx)
139	RET
140	.p2align 4
141.Lless_8bytes:
142	cmpl $4,	%edx
143	jb   .Lless_3bytes
144
145	/*
146	 * Move data from 4 bytes to 7 bytes.
147	 */
148	movl (%rsi), %ecx
149	movl -4(%rsi, %rdx), %r8d
150	movl %ecx, (%rdi)
151	movl %r8d, -4(%rdi, %rdx)
152	RET
153	.p2align 4
154.Lless_3bytes:
155	subl $1, %edx
156	jb .Lend
157	/*
158	 * Move data from 1 byte to 3 bytes.
159	 */
160	movzbl (%rsi), %ecx
161	jz .Lstore_1byte
162	movzbq 1(%rsi), %r8
163	movzbq (%rsi, %rdx), %r9
164	movb %r8b, 1(%rdi)
165	movb %r9b, (%rdi, %rdx)
166.Lstore_1byte:
167	movb %cl, (%rdi)
168
169.Lend:
170	RET
171SYM_FUNC_END(memcpy_orig)
172
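The memcpy_orig fallback above works in three steps: for copies under 32 bytes it goes straight to the tail code; otherwise it compares the low address bytes of source and destination (cmp %dil, %sil) as a cheap heuristic for the memory false dependence mentioned in the comment, then copies 32 bytes per iteration either forward or, starting from the end, backward, leaving the final 0-31 bytes to .Lhandle_tail. Below is a userspace C sketch of that control flow; memcpy_orig_demo is a made-up name, memcpy() stands in for the four movq load/store pairs, and the tail is also delegated to memcpy here (the actual tail trick is sketched after the v4.6 copy of memcpy_orig further down).

#include <stddef.h>
#include <stdint.h>
#include <string.h>

static void *memcpy_orig_demo(void *dst, const void *src, size_t len)
{
	unsigned char *d = dst;
	const unsigned char *s = src;

	if (len >= 0x20) {
		/* cmp %dil, %sil; jl .Lcopy_backward (signed low-byte compare) */
		if ((int8_t)(uintptr_t)s < (int8_t)(uintptr_t)d) {
			/* backward: 32-byte blocks from the end ... */
			d += len;
			s += len;
			while (len >= 0x20) {
				d -= 32; s -= 32; len -= 32;
				memcpy(d, s, 32);	/* the 4x8-byte movq block */
			}
			d -= len; s -= len;		/* ... then back to the head */
		} else {
			/* forward: 32-byte blocks from the head */
			while (len >= 0x20) {
				memcpy(d, s, 32);
				d += 32; s += 32; len -= 32;
			}
		}
	}
	memcpy(d, s, len);	/* .Lhandle_tail: remaining 0-31 bytes */
	return dst;
}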
arch/x86/lib/memcpy_64.S (v4.6)
 
  1/* Copyright 2002 Andi Kleen */
  2
  3#include <linux/linkage.h>
  4#include <asm/errno.h>
  5#include <asm/cpufeatures.h>
  6#include <asm/alternative-asm.h>
  7
  8/*
  9 * We build a jump to memcpy_orig by default which gets NOPped out on
 10 * the majority of x86 CPUs which set REP_GOOD. In addition, on CPUs which
 11 * have the enhanced REP MOVSB/STOSB feature (ERMS), those NOPs are changed
 12 * to a jmp to memcpy_erms, which does the REP; MOVSB mem copy.
 13 */
 14
 15.weak memcpy
 16
 17/*
 18 * memcpy - Copy a memory block.
 19 *
 20 * Input:
 21 *  rdi destination
 22 *  rsi source
 23 *  rdx count
 24 *
 25 * Output:
 26 * rax original destination
 27 */
 28ENTRY(__memcpy)
 29ENTRY(memcpy)
 30	ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \
 31		      "jmp memcpy_erms", X86_FEATURE_ERMS
 32
 33	movq %rdi, %rax
 34	movq %rdx, %rcx
 35	shrq $3, %rcx
 36	andl $7, %edx
 37	rep movsq
 38	movl %edx, %ecx
 39	rep movsb
 40	ret
 41ENDPROC(memcpy)
 42ENDPROC(__memcpy)
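For CPUs that set REP_GOOD but lack ERMS, the body above splits the count into qwords and a byte remainder: shrq $3 leaves the qword count in %rcx for "rep movsq", and andl $7 leaves the remainder in %edx for the trailing "rep movsb". A minimal userspace C sketch of that split (memcpy_rep_good_demo is a made-up name, not the kernel code):

#include <stddef.h>

static void *memcpy_rep_good_demo(void *dst, const void *src, size_t len)
{
	void *ret = dst;
	size_t qwords = len >> 3;	/* shrq $3, %rcx */
	size_t rest   = len & 7;	/* andl $7, %edx */

	asm volatile("rep movsq"	/* copy qwords 8 bytes at a time */
		     : "+D" (dst), "+S" (src), "+c" (qwords)
		     : : "memory");
	asm volatile("rep movsb"	/* then the 0-7 leftover bytes */
		     : "+D" (dst), "+S" (src), "+c" (rest)
		     : : "memory");
	return ret;
}

For a 23-byte copy, for example, that is two movsq iterations (16 bytes) followed by seven movsb iterations.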
 43
 44/*
 45 * memcpy_erms() - enhanced fast string memcpy. This is faster and
 46 * simpler than memcpy. Use memcpy_erms when possible.
 47 */
 48ENTRY(memcpy_erms)
 49	movq %rdi, %rax
 50	movq %rdx, %rcx
 51	rep movsb
 52	ret
 53ENDPROC(memcpy_erms)
 54
 55ENTRY(memcpy_orig)
 56	movq %rdi, %rax
 57
 58	cmpq $0x20, %rdx
 59	jb .Lhandle_tail
 60
 61	/*
 62	 * We check whether a memory false dependence could occur,
 63	 * then jump to the corresponding copy mode.
 64	 */
 65	cmp  %dil, %sil
 66	jl .Lcopy_backward
 67	subq $0x20, %rdx
 68.Lcopy_forward_loop:
 69	subq $0x20,	%rdx
 70
 71	/*
 72	 * Move in blocks of 4x8 bytes:
 73	 */
 74	movq 0*8(%rsi),	%r8
 75	movq 1*8(%rsi),	%r9
 76	movq 2*8(%rsi),	%r10
 77	movq 3*8(%rsi),	%r11
 78	leaq 4*8(%rsi),	%rsi
 79
 80	movq %r8,	0*8(%rdi)
 81	movq %r9,	1*8(%rdi)
 82	movq %r10,	2*8(%rdi)
 83	movq %r11,	3*8(%rdi)
 84	leaq 4*8(%rdi),	%rdi
 85	jae  .Lcopy_forward_loop
 86	addl $0x20,	%edx
 87	jmp  .Lhandle_tail
 88
 89.Lcopy_backward:
 90	/*
 91	 * Calculate copy position to tail.
 92	 */
 93	addq %rdx,	%rsi
 94	addq %rdx,	%rdi
 95	subq $0x20,	%rdx
 96	/*
 97	 * At most 3 ALU operations in one cycle,
 98	 * so append NOPs in the same 16-byte chunk.
 99	 */
100	.p2align 4
101.Lcopy_backward_loop:
102	subq $0x20,	%rdx
103	movq -1*8(%rsi),	%r8
104	movq -2*8(%rsi),	%r9
105	movq -3*8(%rsi),	%r10
106	movq -4*8(%rsi),	%r11
107	leaq -4*8(%rsi),	%rsi
108	movq %r8,		-1*8(%rdi)
109	movq %r9,		-2*8(%rdi)
110	movq %r10,		-3*8(%rdi)
111	movq %r11,		-4*8(%rdi)
112	leaq -4*8(%rdi),	%rdi
113	jae  .Lcopy_backward_loop
114
115	/*
116	 * Calculate copy position to head.
117	 */
118	addl $0x20,	%edx
119	subq %rdx,	%rsi
120	subq %rdx,	%rdi
121.Lhandle_tail:
122	cmpl $16,	%edx
123	jb   .Lless_16bytes
124
125	/*
126	 * Move data from 16 bytes to 31 bytes.
127	 */
128	movq 0*8(%rsi), %r8
129	movq 1*8(%rsi),	%r9
130	movq -2*8(%rsi, %rdx),	%r10
131	movq -1*8(%rsi, %rdx),	%r11
132	movq %r8,	0*8(%rdi)
133	movq %r9,	1*8(%rdi)
134	movq %r10,	-2*8(%rdi, %rdx)
135	movq %r11,	-1*8(%rdi, %rdx)
136	retq
137	.p2align 4
138.Lless_16bytes:
139	cmpl $8,	%edx
140	jb   .Lless_8bytes
141	/*
142	 * Move data from 8 bytes to 15 bytes.
143	 */
144	movq 0*8(%rsi),	%r8
145	movq -1*8(%rsi, %rdx),	%r9
146	movq %r8,	0*8(%rdi)
147	movq %r9,	-1*8(%rdi, %rdx)
148	retq
149	.p2align 4
150.Lless_8bytes:
151	cmpl $4,	%edx
152	jb   .Lless_3bytes
153
154	/*
155	 * Move data from 4 bytes to 7 bytes.
156	 */
157	movl (%rsi), %ecx
158	movl -4(%rsi, %rdx), %r8d
159	movl %ecx, (%rdi)
160	movl %r8d, -4(%rdi, %rdx)
161	retq
162	.p2align 4
163.Lless_3bytes:
164	subl $1, %edx
165	jb .Lend
166	/*
167	 * Move data from 1 byte to 3 bytes.
168	 */
169	movzbl (%rsi), %ecx
170	jz .Lstore_1byte
171	movzbq 1(%rsi), %r8
172	movzbq (%rsi, %rdx), %r9
173	movb %r8b, 1(%rdi)
174	movb %r9b, (%rdi, %rdx)
175.Lstore_1byte:
176	movb %cl, (%rdi)
177
178.Lend:
179	retq
180ENDPROC(memcpy_orig)
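The tail code shared by both versions of memcpy_orig (.Lhandle_tail down to .Lstore_1byte) finishes any remaining 0-31 bytes without a byte loop: it picks the largest word size that fits and does one copy anchored at the start and one anchored at the end, letting the two windows overlap. A userspace C sketch of that idea follows; copy_tail_demo is a made-up name, and memcpy() into local variables stands in for the register loads and stores.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

static void copy_tail_demo(void *dst, const void *src, size_t len)
{
	unsigned char *d = dst;
	const unsigned char *s = src;

	if (len >= 16) {			/* 16..31: first 16 and last 16 bytes */
		uint64_t a, b, c, e;
		memcpy(&a, s, 8);            memcpy(&b, s + 8, 8);
		memcpy(&c, s + len - 16, 8); memcpy(&e, s + len - 8, 8);
		memcpy(d, &a, 8);            memcpy(d + 8, &b, 8);
		memcpy(d + len - 16, &c, 8); memcpy(d + len - 8, &e, 8);
	} else if (len >= 8) {			/* 8..15: first and last 8 bytes */
		uint64_t a, b;
		memcpy(&a, s, 8);  memcpy(&b, s + len - 8, 8);
		memcpy(d, &a, 8);  memcpy(d + len - 8, &b, 8);
	} else if (len >= 4) {			/* 4..7: first and last 4 bytes */
		uint32_t a, b;
		memcpy(&a, s, 4);  memcpy(&b, s + len - 4, 4);
		memcpy(d, &a, 4);  memcpy(d + len - 4, &b, 4);
	} else if (len) {			/* 1..3: first, second and last byte */
		unsigned char first = s[0];
		if (len > 1) {
			d[1] = s[1];
			d[len - 1] = s[len - 1];
		}
		d[0] = first;
	}
}

A 13-byte tail, for instance, becomes one 8-byte copy of bytes 0-7 and one 8-byte copy of bytes 5-12; the 3-byte overlap is harmless because all loads happen before the overlapping stores.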
181
182#ifndef CONFIG_UML
183/*
184 * memcpy_mcsafe - memory copy with machine check exception handling
185 * Note that we only catch machine checks when reading the source addresses.
186 * Writes to target are posted and don't generate machine checks.
187 */
188ENTRY(memcpy_mcsafe)
189	cmpl $8, %edx
190	/* Less than 8 bytes? Go to byte copy loop */
191	jb .L_no_whole_words
192
193	/* Check for bad alignment of source */
194	testl $7, %esi
195	/* Already aligned */
196	jz .L_8byte_aligned
197
198	/* Copy one byte at a time until source is 8-byte aligned */
199	movl %esi, %ecx
200	andl $7, %ecx
201	subl $8, %ecx
202	negl %ecx
203	subl %ecx, %edx
204.L_copy_leading_bytes:
205	movb (%rsi), %al
206	movb %al, (%rdi)
207	incq %rsi
208	incq %rdi
209	decl %ecx
210	jnz .L_copy_leading_bytes
211
212.L_8byte_aligned:
213	/* Figure out how many whole cache lines (64-bytes) to copy */
214	movl %edx, %ecx
215	andl $63, %edx
216	shrl $6, %ecx
217	jz .L_no_whole_cache_lines
218
219	/* Loop copying whole cache lines */
220.L_cache_w0: movq (%rsi), %r8
221.L_cache_w1: movq 1*8(%rsi), %r9
222.L_cache_w2: movq 2*8(%rsi), %r10
223.L_cache_w3: movq 3*8(%rsi), %r11
224	movq %r8, (%rdi)
225	movq %r9, 1*8(%rdi)
226	movq %r10, 2*8(%rdi)
227	movq %r11, 3*8(%rdi)
228.L_cache_w4: movq 4*8(%rsi), %r8
229.L_cache_w5: movq 5*8(%rsi), %r9
230.L_cache_w6: movq 6*8(%rsi), %r10
231.L_cache_w7: movq 7*8(%rsi), %r11
232	movq %r8, 4*8(%rdi)
233	movq %r9, 5*8(%rdi)
234	movq %r10, 6*8(%rdi)
235	movq %r11, 7*8(%rdi)
236	leaq 64(%rsi), %rsi
237	leaq 64(%rdi), %rdi
238	decl %ecx
239	jnz .L_cache_w0
240
241	/* Are there any trailing 8-byte words? */
242.L_no_whole_cache_lines:
243	movl %edx, %ecx
244	andl $7, %edx
245	shrl $3, %ecx
246	jz .L_no_whole_words
247
248	/* Copy trailing words */
249.L_copy_trailing_words:
250	movq (%rsi), %r8
251	mov %r8, (%rdi)
252	leaq 8(%rsi), %rsi
253	leaq 8(%rdi), %rdi
254	decl %ecx
255	jnz .L_copy_trailing_words
256
257	/* Any trailing bytes? */
258.L_no_whole_words:
259	andl %edx, %edx
260	jz .L_done_memcpy_trap
261
262	/* Copy trailing bytes */
263	movl %edx, %ecx
264.L_copy_trailing_bytes:
265	movb (%rsi), %al
266	movb %al, (%rdi)
267	incq %rsi
268	incq %rdi
269	decl %ecx
270	jnz .L_copy_trailing_bytes
271
272	/* Copy successful. Return zero */
273.L_done_memcpy_trap:
274	xorq %rax, %rax
275	ret
276ENDPROC(memcpy_mcsafe)
277
278	.section .fixup, "ax"
279	/* Return -EFAULT for any failure */
280.L_memcpy_mcsafe_fail:
281	mov	$-EFAULT, %rax
282	ret
283
284	.previous
285
286	_ASM_EXTABLE_FAULT(.L_copy_leading_bytes, .L_memcpy_mcsafe_fail)
287	_ASM_EXTABLE_FAULT(.L_cache_w0, .L_memcpy_mcsafe_fail)
288	_ASM_EXTABLE_FAULT(.L_cache_w1, .L_memcpy_mcsafe_fail)
289	_ASM_EXTABLE_FAULT(.L_cache_w2, .L_memcpy_mcsafe_fail)
290	_ASM_EXTABLE_FAULT(.L_cache_w3, .L_memcpy_mcsafe_fail)
291	_ASM_EXTABLE_FAULT(.L_cache_w4, .L_memcpy_mcsafe_fail)
292	_ASM_EXTABLE_FAULT(.L_cache_w5, .L_memcpy_mcsafe_fail)
293	_ASM_EXTABLE_FAULT(.L_cache_w6, .L_memcpy_mcsafe_fail)
294	_ASM_EXTABLE_FAULT(.L_cache_w7, .L_memcpy_mcsafe_fail)
295	_ASM_EXTABLE_FAULT(.L_copy_trailing_words, .L_memcpy_mcsafe_fail)
296	_ASM_EXTABLE_FAULT(.L_copy_trailing_bytes, .L_memcpy_mcsafe_fail)
297#endif
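memcpy_mcsafe carves the copy into pieces so that every load from the (possibly poisoned) source is a separate, individually recoverable instruction covered by an _ASM_EXTABLE_FAULT entry: a few leading bytes to get the source 8-byte aligned, whole 64-byte cache lines, then trailing 8-byte words and trailing bytes, returning 0 on success or -EFAULT from the fixup if a read takes a machine check. The userspace C sketch below shows only the size arithmetic and loop structure; real machine-check recovery needs the exception table above, and memcpy_mcsafe_demo is a made-up name.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

static int memcpy_mcsafe_demo(void *dst, const void *src, size_t len)
{
	unsigned char *d = dst;
	const unsigned char *s = src;

	if (len >= 8 && ((uintptr_t)s & 7)) {
		/* 1..7 leading bytes until the source is 8-byte aligned:
		 * andl $7, %ecx; subl $8, %ecx; negl %ecx */
		size_t lead = 8 - ((uintptr_t)s & 7);

		len -= lead;
		while (lead--)
			*d++ = *s++;		/* each load may fault -> -EFAULT */
	}

	for (size_t lines = len >> 6; lines; lines--) {
		memcpy(d, s, 64);		/* whole cache lines, 8 movq loads each */
		d += 64; s += 64;
	}
	len &= 63;

	for (size_t words = len >> 3; words; words--) {
		memcpy(d, s, 8);		/* trailing 8-byte words */
		d += 8; s += 8;
	}
	len &= 7;

	while (len--)				/* trailing bytes */
		*d++ = *s++;

	return 0;	/* the assembly returns -EFAULT from .L_memcpy_mcsafe_fail on a fault */
}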