/* SPDX-License-Identifier: GPL-2.0-only */
/* Copyright 2002 Andi Kleen */

#include <linux/export.h>
#include <linux/linkage.h>
#include <linux/cfi_types.h>
#include <asm/errno.h>
#include <asm/cpufeatures.h>
#include <asm/alternative.h>

.section .noinstr.text, "ax"

/*
 * memcpy - Copy a memory block.
 *
 * Input:
 * rdi destination
 * rsi source
 * rdx count
 *
 * Output:
 * rax original destination
 *
 * The FSRM alternative should be done inline (avoiding the call and
 * the disgusting return handling), but that would require some help
 * from the compiler for better calling conventions.
 *
 * The 'rep movsb' itself is small enough to replace the call, but the
 * two register moves blow up the code. And one of them is "needed"
 * only for the return value that is the same as the source input,
 * which the compiler could/should do much better anyway.
 */
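/*
 * Illustrative only (not part of the build): a minimal C sketch of what
 * the three-instruction FSRM path below does; the function name is ours,
 * and the semantics are simply a byte copy that preserves the original
 * destination as the return value:
 *
 *	static void *memcpy_fsrm_sketch(void *dst, const void *src,
 *					unsigned long n)
 *	{
 *		unsigned char *d = dst;
 *		const unsigned char *s = src;
 *
 *		while (n--)		// 'rep movsb' with RCX = count
 *			*d++ = *s++;
 *		return dst;		// RAX holds the original RDI
 *	}
 */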
SYM_TYPED_FUNC_START(__memcpy)
	ALTERNATIVE "jmp memcpy_orig", "", X86_FEATURE_FSRM

	movq %rdi, %rax
	movq %rdx, %rcx
	rep movsb
	RET
SYM_FUNC_END(__memcpy)
EXPORT_SYMBOL(__memcpy)

SYM_FUNC_ALIAS_MEMFUNC(memcpy, __memcpy)
EXPORT_SYMBOL(memcpy)

SYM_FUNC_START_LOCAL(memcpy_orig)
	movq %rdi, %rax

	cmpq $0x20, %rdx
	jb .Lhandle_tail

	/*
	 * We check whether a memory false dependence could occur,
	 * then jump to the corresponding copy mode.
	 */
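	/*
	 * Illustrative sketch of the direction choice made by the
	 * 'cmp %dil, %sil' just below (our reading of the intent, not a
	 * kernel helper): only the low 8 address bits are compared, which
	 * is enough to spot the aliasing pattern that makes the forward
	 * loop prone to store-to-load false dependences:
	 *
	 *	static int need_backward_copy(const void *dst, const void *src)
	 *	{
	 *		// low byte of src below low byte of dst, signed compare
	 *		return (signed char)(unsigned long)src <
	 *		       (signed char)(unsigned long)dst;
	 *	}
	 */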
	cmp %dil, %sil
	jl .Lcopy_backward
	subq $0x20, %rdx
.Lcopy_forward_loop:
	subq $0x20, %rdx

	/*
	 * Move in blocks of 4x8 bytes:
	 */
	movq 0*8(%rsi), %r8
	movq 1*8(%rsi), %r9
	movq 2*8(%rsi), %r10
	movq 3*8(%rsi), %r11
	leaq 4*8(%rsi), %rsi

	movq %r8, 0*8(%rdi)
	movq %r9, 1*8(%rdi)
	movq %r10, 2*8(%rdi)
	movq %r11, 3*8(%rdi)
	leaq 4*8(%rdi), %rdi
	jae .Lcopy_forward_loop
	addl $0x20, %edx
	jmp .Lhandle_tail

.Lcopy_backward:
	/*
	 * Calculate copy position to tail.
	 */
	addq %rdx, %rsi
	addq %rdx, %rdi
	subq $0x20, %rdx
	/*
	 * At most 3 ALU operations in one cycle,
	 * so append NOPs in the same 16-byte chunk.
	 */
	.p2align 4
.Lcopy_backward_loop:
	subq $0x20, %rdx
	movq -1*8(%rsi), %r8
	movq -2*8(%rsi), %r9
	movq -3*8(%rsi), %r10
	movq -4*8(%rsi), %r11
	leaq -4*8(%rsi), %rsi
	movq %r8, -1*8(%rdi)
	movq %r9, -2*8(%rdi)
	movq %r10, -3*8(%rdi)
	movq %r11, -4*8(%rdi)
	leaq -4*8(%rdi), %rdi
	jae .Lcopy_backward_loop

	/*
	 * Calculate copy position to head.
	 */
	addl $0x20, %edx
	subq %rdx, %rsi
	subq %rdx, %rdi
.Lhandle_tail:
	cmpl $16, %edx
	jb .Lless_16bytes

	/*
	 * Move data from 16 bytes to 31 bytes.
	 */
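	/*
	 * Illustrative C sketch of the overlapping-window trick used for
	 * this and the following tail cases (not part of the build): for
	 * 16 <= n <= 31 the first 16 and the last 16 bytes together cover
	 * the whole range; the two windows simply overlap in the middle.
	 * The asm below loads all four qwords before storing any of them:
	 *
	 *	__builtin_memcpy(dst,          src,          16);	// head window
	 *	__builtin_memcpy(dst + n - 16, src + n - 16, 16);	// tail window
	 */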
	movq 0*8(%rsi), %r8
	movq 1*8(%rsi), %r9
	movq -2*8(%rsi, %rdx), %r10
	movq -1*8(%rsi, %rdx), %r11
	movq %r8, 0*8(%rdi)
	movq %r9, 1*8(%rdi)
	movq %r10, -2*8(%rdi, %rdx)
	movq %r11, -1*8(%rdi, %rdx)
	RET
	.p2align 4
.Lless_16bytes:
	cmpl $8, %edx
	jb .Lless_8bytes
	/*
	 * Move data from 8 bytes to 15 bytes.
	 */
	movq 0*8(%rsi), %r8
	movq -1*8(%rsi, %rdx), %r9
	movq %r8, 0*8(%rdi)
	movq %r9, -1*8(%rdi, %rdx)
	RET
	.p2align 4
.Lless_8bytes:
	cmpl $4, %edx
	jb .Lless_3bytes

	/*
	 * Move data from 4 bytes to 7 bytes.
	 */
	movl (%rsi), %ecx
	movl -4(%rsi, %rdx), %r8d
	movl %ecx, (%rdi)
	movl %r8d, -4(%rdi, %rdx)
	RET
	.p2align 4
.Lless_3bytes:
	subl $1, %edx
	jb .Lend
	/*
	 * Move data from 1 byte to 3 bytes.
	 */
	movzbl (%rsi), %ecx
	jz .Lstore_1byte
	movzbq 1(%rsi), %r8
	movzbq (%rsi, %rdx), %r9
	movb %r8b, 1(%rdi)
	movb %r9b, (%rdi, %rdx)
.Lstore_1byte:
	movb %cl, (%rdi)

.Lend:
	RET
SYM_FUNC_END(memcpy_orig)

/* SPDX-License-Identifier: GPL-2.0-only */
/* Copyright 2002 Andi Kleen */

#include <linux/linkage.h>
#include <asm/errno.h>
#include <asm/cpufeatures.h>
#include <asm/mcsafe_test.h>
#include <asm/alternative-asm.h>
#include <asm/export.h>

/*
 * We build a jump to memcpy_orig by default which gets NOPped out on
 * the majority of x86 CPUs which set REP_GOOD. In addition, on CPUs
 * which have the enhanced REP MOVSB/STOSB feature (ERMS), those NOPs
 * are changed to a jmp to memcpy_erms, which does the REP; MOVSB mem
 * copy.
 */
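/*
 * Illustrative dispatch sketch (not how it is implemented; the real
 * mechanism is boot-time binary patching via ALTERNATIVE_2 below, and
 * 'cpu_has()' here is just a stand-in name for the feature test):
 *
 *	if (cpu_has(X86_FEATURE_ERMS))
 *		memcpy_erms(dst, src, n);	// plain 'rep movsb'
 *	else if (cpu_has(X86_FEATURE_REP_GOOD))
 *		;				// fall through to 'rep movsq' below
 *	else
 *		memcpy_orig(dst, src, n);	// open-coded 4x8-byte copy loop
 */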

.weak memcpy

/*
 * memcpy - Copy a memory block.
 *
 * Input:
 * rdi destination
 * rsi source
 * rdx count
 *
 * Output:
 * rax original destination
 */
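/*
 * Illustrative only: a minimal C sketch of the REP_GOOD body below, which
 * copies whole 8-byte words with 'rep movsq' and the remaining 0-7 bytes
 * with 'rep movsb' (the function name is ours, not a kernel symbol):
 *
 *	static void *memcpy_rep_good_sketch(void *dst, const void *src,
 *					    unsigned long n)
 *	{
 *		unsigned long *dq = dst;
 *		const unsigned long *sq = src;
 *		unsigned long words = n >> 3;	// shrq $3, %rcx
 *		unsigned long tail = n & 7;	// andl $7, %edx
 *		unsigned char *db;
 *		const unsigned char *sb;
 *
 *		while (words--)			// rep movsq
 *			*dq++ = *sq++;
 *		db = (unsigned char *)dq;
 *		sb = (const unsigned char *)sq;
 *		while (tail--)			// rep movsb
 *			*db++ = *sb++;
 *		return dst;			// rax = original rdi
 *	}
 */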
ENTRY(__memcpy)
ENTRY(memcpy)
	ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \
		      "jmp memcpy_erms", X86_FEATURE_ERMS

	movq %rdi, %rax
	movq %rdx, %rcx
	shrq $3, %rcx
	andl $7, %edx
	rep movsq
	movl %edx, %ecx
	rep movsb
	ret
ENDPROC(memcpy)
ENDPROC(__memcpy)
EXPORT_SYMBOL(memcpy)
EXPORT_SYMBOL(__memcpy)

/*
 * memcpy_erms() - enhanced fast string memcpy. This is faster and
 * simpler than memcpy. Use memcpy_erms when possible.
 */
ENTRY(memcpy_erms)
	movq %rdi, %rax
	movq %rdx, %rcx
	rep movsb
	ret
ENDPROC(memcpy_erms)

ENTRY(memcpy_orig)
	movq %rdi, %rax

	cmpq $0x20, %rdx
	jb .Lhandle_tail

	/*
	 * We check whether a memory false dependence could occur,
	 * then jump to the corresponding copy mode.
	 */
	cmp %dil, %sil
	jl .Lcopy_backward
	subq $0x20, %rdx
.Lcopy_forward_loop:
	subq $0x20, %rdx

	/*
	 * Move in blocks of 4x8 bytes:
	 */
	movq 0*8(%rsi), %r8
	movq 1*8(%rsi), %r9
	movq 2*8(%rsi), %r10
	movq 3*8(%rsi), %r11
	leaq 4*8(%rsi), %rsi

	movq %r8, 0*8(%rdi)
	movq %r9, 1*8(%rdi)
	movq %r10, 2*8(%rdi)
	movq %r11, 3*8(%rdi)
	leaq 4*8(%rdi), %rdi
	jae .Lcopy_forward_loop
	addl $0x20, %edx
	jmp .Lhandle_tail

.Lcopy_backward:
	/*
	 * Calculate copy position to tail.
	 */
	addq %rdx, %rsi
	addq %rdx, %rdi
	subq $0x20, %rdx
	/*
	 * At most 3 ALU operations in one cycle,
	 * so append NOPs in the same 16-byte chunk.
	 */
	.p2align 4
.Lcopy_backward_loop:
	subq $0x20, %rdx
	movq -1*8(%rsi), %r8
	movq -2*8(%rsi), %r9
	movq -3*8(%rsi), %r10
	movq -4*8(%rsi), %r11
	leaq -4*8(%rsi), %rsi
	movq %r8, -1*8(%rdi)
	movq %r9, -2*8(%rdi)
	movq %r10, -3*8(%rdi)
	movq %r11, -4*8(%rdi)
	leaq -4*8(%rdi), %rdi
	jae .Lcopy_backward_loop

	/*
	 * Calculate copy position to head.
	 */
	addl $0x20, %edx
	subq %rdx, %rsi
	subq %rdx, %rdi
.Lhandle_tail:
	cmpl $16, %edx
	jb .Lless_16bytes

	/*
	 * Move data from 16 bytes to 31 bytes.
	 */
	movq 0*8(%rsi), %r8
	movq 1*8(%rsi), %r9
	movq -2*8(%rsi, %rdx), %r10
	movq -1*8(%rsi, %rdx), %r11
	movq %r8, 0*8(%rdi)
	movq %r9, 1*8(%rdi)
	movq %r10, -2*8(%rdi, %rdx)
	movq %r11, -1*8(%rdi, %rdx)
	retq
	.p2align 4
.Lless_16bytes:
	cmpl $8, %edx
	jb .Lless_8bytes
	/*
	 * Move data from 8 bytes to 15 bytes.
	 */
	movq 0*8(%rsi), %r8
	movq -1*8(%rsi, %rdx), %r9
	movq %r8, 0*8(%rdi)
	movq %r9, -1*8(%rdi, %rdx)
	retq
	.p2align 4
.Lless_8bytes:
	cmpl $4, %edx
	jb .Lless_3bytes

	/*
	 * Move data from 4 bytes to 7 bytes.
	 */
	movl (%rsi), %ecx
	movl -4(%rsi, %rdx), %r8d
	movl %ecx, (%rdi)
	movl %r8d, -4(%rdi, %rdx)
	retq
	.p2align 4
.Lless_3bytes:
	subl $1, %edx
	jb .Lend
	/*
	 * Move data from 1 byte to 3 bytes.
	 */
	movzbl (%rsi), %ecx
	jz .Lstore_1byte
	movzbq 1(%rsi), %r8
	movzbq (%rsi, %rdx), %r9
	movb %r8b, 1(%rdi)
	movb %r9b, (%rdi, %rdx)
.Lstore_1byte:
	movb %cl, (%rdi)

.Lend:
	retq
ENDPROC(memcpy_orig)

#ifndef CONFIG_UML

MCSAFE_TEST_CTL

/*
 * __memcpy_mcsafe - memory copy with machine check exception handling
 * Note that we only catch machine checks when reading the source addresses.
 * Writes to target are posted and don't generate machine checks.
 */
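/*
 * Illustrative caller sketch (an assumption about typical use, not taken
 * from this file; 'handle_partial_copy' is a made-up name): the return
 * value is 0 on success, or the number of bytes NOT copied when a machine
 * check on a read, or a fault on a write, stopped the copy early:
 *
 *	unsigned long rem = __memcpy_mcsafe(dst, src, len);
 *
 *	if (rem)
 *		handle_partial_copy(len - rem);	// leading bytes are valid
 */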
ENTRY(__memcpy_mcsafe)
	cmpl $8, %edx
	/* Less than 8 bytes? Go to byte copy loop */
	jb .L_no_whole_words

	/* Check for bad alignment of source */
	testl $7, %esi
	/* Already aligned */
	jz .L_8byte_aligned

	/* Copy one byte at a time until source is 8-byte aligned */
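	/*
	 * Sketch of the leading-byte count computed by the movl/andl/
	 * subl/negl sequence below (illustrative only):
	 *
	 *	leading = 8 - ((unsigned long)src & 7);
	 *	n -= leading;
	 *
	 * i.e. copy 'leading' single bytes so that the source pointer
	 * becomes 8-byte aligned before the word loop.
	 */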
	movl %esi, %ecx
	andl $7, %ecx
	subl $8, %ecx
	negl %ecx
	subl %ecx, %edx
.L_read_leading_bytes:
	movb (%rsi), %al
	MCSAFE_TEST_SRC %rsi 1 .E_leading_bytes
	MCSAFE_TEST_DST %rdi 1 .E_leading_bytes
.L_write_leading_bytes:
	movb %al, (%rdi)
	incq %rsi
	incq %rdi
	decl %ecx
	jnz .L_read_leading_bytes

.L_8byte_aligned:
	movl %edx, %ecx
	andl $7, %edx
	shrl $3, %ecx
	jz .L_no_whole_words

.L_read_words:
	movq (%rsi), %r8
	MCSAFE_TEST_SRC %rsi 8 .E_read_words
	MCSAFE_TEST_DST %rdi 8 .E_write_words
.L_write_words:
	movq %r8, (%rdi)
	addq $8, %rsi
	addq $8, %rdi
	decl %ecx
	jnz .L_read_words

	/* Any trailing bytes? */
.L_no_whole_words:
	andl %edx, %edx
	jz .L_done_memcpy_trap

	/* Copy trailing bytes */
	movl %edx, %ecx
.L_read_trailing_bytes:
	movb (%rsi), %al
	MCSAFE_TEST_SRC %rsi 1 .E_trailing_bytes
	MCSAFE_TEST_DST %rdi 1 .E_trailing_bytes
.L_write_trailing_bytes:
	movb %al, (%rdi)
	incq %rsi
	incq %rdi
	decl %ecx
	jnz .L_read_trailing_bytes

	/* Copy successful. Return zero */
.L_done_memcpy_trap:
	xorl %eax, %eax
.L_done:
	ret
ENDPROC(__memcpy_mcsafe)
EXPORT_SYMBOL_GPL(__memcpy_mcsafe)

	.section .fixup, "ax"
	/*
	 * Return number of bytes not copied for any failure. Note that
	 * there is no "tail" handling since the source buffer is 8-byte
	 * aligned and poison is cacheline aligned.
	 */
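	/*
	 * Sketch of the remaining-byte arithmetic below (illustrative): on
	 * a fault in the word loop ECX holds the words still to copy and
	 * EDX the trailing byte count, so
	 *
	 *	remaining = (words_left << 3) + trailing_bytes;
	 *
	 * A fault in the leading-byte loop adds the not-yet-started EDX
	 * remainder to the byte count in ECX, and a fault in the trailing
	 * loop returns the byte count in ECX directly.
	 */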
.E_read_words:
	shll $3, %ecx
.E_leading_bytes:
	addl %edx, %ecx
.E_trailing_bytes:
	mov %ecx, %eax
	jmp .L_done

	/*
	 * For write fault handling, given the destination is unaligned,
	 * we handle faults on multi-byte writes with a byte-by-byte
	 * copy up to the write-protected page.
	 */
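	/*
	 * Illustrative sketch (our reading of the contract described
	 * above): the write-fault path recomputes how much is left and
	 * hands it, together with the current positions in RDI/RSI, to
	 * mcsafe_handle_tail for a byte-by-byte retry:
	 *
	 *	remaining = (words_left << 3) + trailing_bytes;
	 *	return mcsafe_handle_tail(cur_dst, cur_src, remaining);
	 */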
.E_write_words:
	shll $3, %ecx
	addl %edx, %ecx
	movl %ecx, %edx
	jmp mcsafe_handle_tail

	.previous

	_ASM_EXTABLE_FAULT(.L_read_leading_bytes, .E_leading_bytes)
	_ASM_EXTABLE_FAULT(.L_read_words, .E_read_words)
	_ASM_EXTABLE_FAULT(.L_read_trailing_bytes, .E_trailing_bytes)
	_ASM_EXTABLE(.L_write_leading_bytes, .E_leading_bytes)
	_ASM_EXTABLE(.L_write_words, .E_write_words)
	_ASM_EXTABLE(.L_write_trailing_bytes, .E_trailing_bytes)
#endif