/* SPDX-License-Identifier: GPL-2.0-only */
/* Copyright 2002 Andi Kleen */

#include <linux/export.h>
#include <linux/linkage.h>
#include <linux/cfi_types.h>
#include <asm/errno.h>
#include <asm/cpufeatures.h>
#include <asm/alternative.h>

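/*
 * Note: this out-of-line memcpy is placed in .noinstr.text (below) so that
 * it can be called from code that must not be instrumented.
 */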
.section .noinstr.text, "ax"

/*
 * memcpy - Copy a memory block.
 *
 * Input:
 * rdi destination
 * rsi source
 * rdx count
 *
 * Output:
 * rax original destination
 *
 * The FSRM alternative should be done inline (avoiding the call and
 * the disgusting return handling), but that would require some help
 * from the compiler for better calling conventions.
 *
 * The 'rep movsb' itself is small enough to replace the call, but the
 * two register moves blow up the code. And one of them is "needed"
 * only for the return value that is the same as the destination input,
 * which the compiler could/should do much better anyway.
 */
SYM_TYPED_FUNC_START(__memcpy)
	ALTERNATIVE "jmp memcpy_orig", "", X86_FEATURE_FSRM

	movq %rdi, %rax
	movq %rdx, %rcx
	rep movsb
	RET
SYM_FUNC_END(__memcpy)
EXPORT_SYMBOL(__memcpy)

SYM_FUNC_ALIAS_MEMFUNC(memcpy, __memcpy)
EXPORT_SYMBOL(memcpy)
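
/*
 * For illustration only (not part of the kernel build; the helper name is
 * made up): a rough C sketch of what the FSRM path above does, and why
 * inlining it needs compiler help. The extra 'ret = dest' copy exists only
 * to provide the return value, and the "+c" constraint is the second
 * register move the comment above complains about.
 *
 *	static inline void *memcpy_fsrm_sketch(void *dest, const void *src,
 *					       size_t n)
 *	{
 *		void *ret = dest;	// keep original destination for return
 *
 *		asm volatile("rep movsb"
 *			     : "+D" (dest), "+S" (src), "+c" (n)
 *			     : : "memory");
 *		return ret;
 *	}
 */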

SYM_FUNC_START_LOCAL(memcpy_orig)
	movq %rdi, %rax

	cmpq $0x20, %rdx
	jb .Lhandle_tail

	/*
	 * Check whether a memory false dependence could occur,
	 * then jump to the corresponding copy mode.
	 */
	cmp %dil, %sil
	jl .Lcopy_backward
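	/*
	 * This is a heuristic, not an exact overlap check: only the low
	 * address bytes are compared. If the source sits "below" the
	 * destination in those low bits (e.g. src ending in ...04, dst in
	 * ...08, possibly pages apart), a forward copy keeps issuing loads
	 * whose low address bits partially match stores issued just before
	 * them. The CPU disambiguates loads against stores using only the
	 * low address bits, so it may stall such loads on a dependence that
	 * is not really there. Copying backward avoids that access pattern.
	 */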
	subq $0x20, %rdx
.Lcopy_forward_loop:
	subq $0x20, %rdx

	/*
	 * Move in blocks of 4x8 bytes. The pointers are advanced with lea
	 * so that the flags set by the subq above survive for the jae at
	 * the bottom of the loop.
	 */
	movq 0*8(%rsi), %r8
	movq 1*8(%rsi), %r9
	movq 2*8(%rsi), %r10
	movq 3*8(%rsi), %r11
	leaq 4*8(%rsi), %rsi

	movq %r8, 0*8(%rdi)
	movq %r9, 1*8(%rdi)
	movq %r10, 2*8(%rdi)
	movq %r11, 3*8(%rdi)
	leaq 4*8(%rdi), %rdi
	jae .Lcopy_forward_loop
	addl $0x20, %edx
	jmp .Lhandle_tail

.Lcopy_backward:
	/*
	 * Calculate copy position to tail.
	 */
	addq %rdx, %rsi
	addq %rdx, %rdi
	subq $0x20, %rdx
	/*
	 * At most 3 ALU operations in one cycle,
	 * so append NOPs in the same 16-byte chunk.
	 */
	.p2align 4
.Lcopy_backward_loop:
	subq $0x20, %rdx
	movq -1*8(%rsi), %r8
	movq -2*8(%rsi), %r9
	movq -3*8(%rsi), %r10
	movq -4*8(%rsi), %r11
	leaq -4*8(%rsi), %rsi
	movq %r8, -1*8(%rdi)
	movq %r9, -2*8(%rdi)
	movq %r10, -3*8(%rdi)
	movq %r11, -4*8(%rdi)
	leaq -4*8(%rdi), %rdi
	jae .Lcopy_backward_loop

	/*
	 * Calculate copy position to head.
	 */
	addl $0x20, %edx
	subq %rdx, %rsi
	subq %rdx, %rdi
.Lhandle_tail:
	cmpl $16, %edx
	jb .Lless_16bytes

	/*
	 * Move data from 16 bytes to 31 bytes: copy the first 16 and the
	 * last 16 bytes. For counts below 32 the two ranges overlap, which
	 * is harmless because all loads are done before any store.
	 */
	movq 0*8(%rsi), %r8
	movq 1*8(%rsi), %r9
	movq -2*8(%rsi, %rdx), %r10
	movq -1*8(%rsi, %rdx), %r11
	movq %r8, 0*8(%rdi)
	movq %r9, 1*8(%rdi)
	movq %r10, -2*8(%rdi, %rdx)
	movq %r11, -1*8(%rdi, %rdx)
	RET
	.p2align 4
.Lless_16bytes:
	cmpl $8, %edx
	jb .Lless_8bytes
	/*
	 * Move data from 8 bytes to 15 bytes.
	 */
	movq 0*8(%rsi), %r8
	movq -1*8(%rsi, %rdx), %r9
	movq %r8, 0*8(%rdi)
	movq %r9, -1*8(%rdi, %rdx)
	RET
	.p2align 4
.Lless_8bytes:
	cmpl $4, %edx
	jb .Lless_3bytes

	/*
	 * Move data from 4 bytes to 7 bytes.
	 */
	movl (%rsi), %ecx
	movl -4(%rsi, %rdx), %r8d
	movl %ecx, (%rdi)
	movl %r8d, -4(%rdi, %rdx)
	RET
	.p2align 4
.Lless_3bytes:
	subl $1, %edx
	jb .Lend
	/*
	 * Move data from 1 byte to 3 bytes.
	 */
	movzbl (%rsi), %ecx
	jz .Lstore_1byte
	movzbq 1(%rsi), %r8
	movzbq (%rsi, %rdx), %r9
	movb %r8b, 1(%rdi)
	movb %r9b, (%rdi, %rdx)
.Lstore_1byte:
	movb %cl, (%rdi)

.Lend:
	RET
SYM_FUNC_END(memcpy_orig)

/* Copyright 2002 Andi Kleen */

#include <linux/linkage.h>
#include <asm/errno.h>
#include <asm/cpufeatures.h>
#include <asm/alternative-asm.h>

/*
 * We build a jump to memcpy_orig by default which gets NOPped out on
 * the majority of x86 CPUs which set REP_GOOD. In addition, CPUs which
 * have the enhanced REP MOVSB/STOSB feature (ERMS) change those NOPs
 * to a jmp to memcpy_erms which does the REP; MOVSB mem copy.
 */
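
/*
 * In other words, after alternatives patching:
 *   - neither REP_GOOD nor ERMS: jmp memcpy_orig (unrolled copy below)
 *   - REP_GOOD:                  NOPs, fall through to the rep movsq body
 *   - ERMS:                      jmp memcpy_erms (single rep movsb)
 */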

.weak memcpy

/*
 * memcpy - Copy a memory block.
 *
 * Input:
 * rdi destination
 * rsi source
 * rdx count
 *
 * Output:
 * rax original destination
 */
ENTRY(__memcpy)
ENTRY(memcpy)
	ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \
		      "jmp memcpy_erms", X86_FEATURE_ERMS

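	/*
	 * rep movsq body (used when REP_GOOD is patched in): copy count/8
	 * qwords with rep movsq, then the remaining count%8 bytes with
	 * rep movsb.
	 */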
	movq %rdi, %rax
	movq %rdx, %rcx
	shrq $3, %rcx
	andl $7, %edx
	rep movsq
	movl %edx, %ecx
	rep movsb
	ret
ENDPROC(memcpy)
ENDPROC(__memcpy)

/*
 * memcpy_erms() - enhanced fast string memcpy. This is faster and
 * simpler than memcpy. Use memcpy_erms when possible.
 */
ENTRY(memcpy_erms)
	movq %rdi, %rax
	movq %rdx, %rcx
	rep movsb
	ret
ENDPROC(memcpy_erms)

ENTRY(memcpy_orig)
	movq %rdi, %rax

	cmpq $0x20, %rdx
	jb .Lhandle_tail

	/*
	 * Check whether a memory false dependence could occur,
	 * then jump to the corresponding copy mode.
	 */
	cmp %dil, %sil
	jl .Lcopy_backward
	subq $0x20, %rdx
.Lcopy_forward_loop:
	subq $0x20, %rdx

	/*
	 * Move in blocks of 4x8 bytes:
	 */
	movq 0*8(%rsi), %r8
	movq 1*8(%rsi), %r9
	movq 2*8(%rsi), %r10
	movq 3*8(%rsi), %r11
	leaq 4*8(%rsi), %rsi

	movq %r8, 0*8(%rdi)
	movq %r9, 1*8(%rdi)
	movq %r10, 2*8(%rdi)
	movq %r11, 3*8(%rdi)
	leaq 4*8(%rdi), %rdi
	jae .Lcopy_forward_loop
	addl $0x20, %edx
	jmp .Lhandle_tail

.Lcopy_backward:
	/*
	 * Calculate copy position to tail.
	 */
	addq %rdx, %rsi
	addq %rdx, %rdi
	subq $0x20, %rdx
	/*
	 * At most 3 ALU operations in one cycle,
	 * so append NOPs in the same 16-byte chunk.
	 */
	.p2align 4
.Lcopy_backward_loop:
	subq $0x20, %rdx
	movq -1*8(%rsi), %r8
	movq -2*8(%rsi), %r9
	movq -3*8(%rsi), %r10
	movq -4*8(%rsi), %r11
	leaq -4*8(%rsi), %rsi
	movq %r8, -1*8(%rdi)
	movq %r9, -2*8(%rdi)
	movq %r10, -3*8(%rdi)
	movq %r11, -4*8(%rdi)
	leaq -4*8(%rdi), %rdi
	jae .Lcopy_backward_loop

	/*
	 * Calculate copy position to head.
	 */
	addl $0x20, %edx
	subq %rdx, %rsi
	subq %rdx, %rdi
.Lhandle_tail:
	cmpl $16, %edx
	jb .Lless_16bytes

	/*
	 * Move data from 16 bytes to 31 bytes.
	 */
	movq 0*8(%rsi), %r8
	movq 1*8(%rsi), %r9
	movq -2*8(%rsi, %rdx), %r10
	movq -1*8(%rsi, %rdx), %r11
	movq %r8, 0*8(%rdi)
	movq %r9, 1*8(%rdi)
	movq %r10, -2*8(%rdi, %rdx)
	movq %r11, -1*8(%rdi, %rdx)
	retq
	.p2align 4
.Lless_16bytes:
	cmpl $8, %edx
	jb .Lless_8bytes
	/*
	 * Move data from 8 bytes to 15 bytes.
	 */
	movq 0*8(%rsi), %r8
	movq -1*8(%rsi, %rdx), %r9
	movq %r8, 0*8(%rdi)
	movq %r9, -1*8(%rdi, %rdx)
	retq
	.p2align 4
.Lless_8bytes:
	cmpl $4, %edx
	jb .Lless_3bytes

	/*
	 * Move data from 4 bytes to 7 bytes.
	 */
	movl (%rsi), %ecx
	movl -4(%rsi, %rdx), %r8d
	movl %ecx, (%rdi)
	movl %r8d, -4(%rdi, %rdx)
	retq
	.p2align 4
.Lless_3bytes:
	subl $1, %edx
	jb .Lend
	/*
	 * Move data from 1 byte to 3 bytes.
	 */
	movzbl (%rsi), %ecx
	jz .Lstore_1byte
	movzbq 1(%rsi), %r8
	movzbq (%rsi, %rdx), %r9
	movb %r8b, 1(%rdi)
	movb %r9b, (%rdi, %rdx)
.Lstore_1byte:
	movb %cl, (%rdi)

.Lend:
	retq
ENDPROC(memcpy_orig)

#ifndef CONFIG_UML
/*
 * memcpy_mcsafe - memory copy with machine check exception handling
 * Note that we only catch machine checks when reading the source addresses.
 * Writes to target are posted and don't generate machine checks.
 */
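/*
 * Caller-side sketch (illustrative; 'pmem_src' and the error handling are
 * made up, not taken from this file). The copy either completes and returns
 * zero, or returns -EFAULT if a machine check fired on a source read:
 *
 *	if (memcpy_mcsafe(dst, pmem_src, len))
 *		return -EIO;
 */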
ENTRY(memcpy_mcsafe)
	cmpl $8, %edx
	/* Less than 8 bytes? Go to byte copy loop */
	jb .L_no_whole_words

	/* Check for bad alignment of source */
	testl $7, %esi
	/* Already aligned */
	jz .L_8byte_aligned

	/* Copy one byte at a time until source is 8-byte aligned */
	movl %esi, %ecx
	andl $7, %ecx
	subl $8, %ecx
	negl %ecx
	subl %ecx, %edx
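	/*
	 * At this point ecx = 8 - (src & 7), the number of leading bytes
	 * needed to reach 8-byte source alignment, and edx has been reduced
	 * by that amount. For example, src & 7 == 3 gives ecx = 5.
	 */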
.L_copy_leading_bytes:
	movb (%rsi), %al
	movb %al, (%rdi)
	incq %rsi
	incq %rdi
	decl %ecx
	jnz .L_copy_leading_bytes

.L_8byte_aligned:
	/*
	 * Figure out how many whole 64-byte cache lines to copy:
	 * ecx = count / 64, edx = count % 64.
	 */
	movl %edx, %ecx
	andl $63, %edx
	shrl $6, %ecx
	jz .L_no_whole_cache_lines

	/* Loop copying whole cache lines */
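	/*
	 * Each load below carries its own label so it gets its own exception
	 * table entry (see the _ASM_EXTABLE_FAULT list at the end). Only the
	 * loads need entries: per the note above, the writes are posted and
	 * don't generate machine checks.
	 */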
.L_cache_w0: movq (%rsi), %r8
.L_cache_w1: movq 1*8(%rsi), %r9
.L_cache_w2: movq 2*8(%rsi), %r10
.L_cache_w3: movq 3*8(%rsi), %r11
	movq %r8, (%rdi)
	movq %r9, 1*8(%rdi)
	movq %r10, 2*8(%rdi)
	movq %r11, 3*8(%rdi)
.L_cache_w4: movq 4*8(%rsi), %r8
.L_cache_w5: movq 5*8(%rsi), %r9
.L_cache_w6: movq 6*8(%rsi), %r10
.L_cache_w7: movq 7*8(%rsi), %r11
	movq %r8, 4*8(%rdi)
	movq %r9, 5*8(%rdi)
	movq %r10, 6*8(%rdi)
	movq %r11, 7*8(%rdi)
	leaq 64(%rsi), %rsi
	leaq 64(%rdi), %rdi
	decl %ecx
	jnz .L_cache_w0

	/* Are there any trailing 8-byte words? */
.L_no_whole_cache_lines:
	movl %edx, %ecx
	andl $7, %edx
	shrl $3, %ecx
	jz .L_no_whole_words

	/* Copy trailing words */
.L_copy_trailing_words:
	movq (%rsi), %r8
	movq %r8, (%rdi)
	leaq 8(%rsi), %rsi
	leaq 8(%rdi), %rdi
	decl %ecx
	jnz .L_copy_trailing_words

	/* Any trailing bytes? */
.L_no_whole_words:
	andl %edx, %edx
	jz .L_done_memcpy_trap

	/* Copy trailing bytes */
	movl %edx, %ecx
.L_copy_trailing_bytes:
	movb (%rsi), %al
	movb %al, (%rdi)
	incq %rsi
	incq %rdi
	decl %ecx
	jnz .L_copy_trailing_bytes

	/* Copy successful. Return zero */
.L_done_memcpy_trap:
	xorq %rax, %rax
	ret
ENDPROC(memcpy_mcsafe)

	.section .fixup, "ax"
	/* Return -EFAULT for any failure */
.L_memcpy_mcsafe_fail:
	mov $-EFAULT, %rax
	ret

	.previous

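	/*
	 * Each entry below pairs one of the labelled source loads above with
	 * the -EFAULT fixup; the stores need no entries because, as noted
	 * above, writes to the target are posted and don't generate machine
	 * checks.
	 */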
	_ASM_EXTABLE_FAULT(.L_copy_leading_bytes, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_cache_w0, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_cache_w1, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_cache_w2, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_cache_w3, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_cache_w4, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_cache_w5, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_cache_w6, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_cache_w7, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_copy_trailing_words, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_copy_trailing_bytes, .L_memcpy_mcsafe_fail)
#endif