/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright 2023 Linus Torvalds <torvalds@linux-foundation.org>
 */

#include <linux/export.h>
#include <linux/linkage.h>
#include <asm/asm.h>

/*
 * copy_user_nocache - Uncached memory copy with exception handling
 *
 * This copies from user space into kernel space, but the kernel
 * space accesses can take a machine check exception, so they too
 * need exception handling.
 *
 * Note: only 32-bit and 64-bit stores have non-temporal versions,
 * and we only use aligned versions. Any unaligned parts at the
 * start or end of the copy will be done using normal cached stores.
 *
 * Input:
 * rdi destination
 * rsi source
 * edx count
 *
 * Output:
 * rax uncopied bytes or 0 if successful.
 */
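/*
 * For reference, a rough C-level view of the calling convention
 * described above (a sketch inferred from the register comments,
 * not necessarily the exact declaration used by the kernel headers):
 *
 *	unsigned long __copy_user_nocache(void *dst,
 *					  const void __user *src,
 *					  unsigned int count);
 *
 * where the return value is the number of bytes left uncopied.
 */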
SYM_FUNC_START(__copy_user_nocache)
	/* If destination is not 8-byte aligned, we'll have to align it */
	testb $7,%dil
	jne .Lalign

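/*
 * Destination is 8-byte aligned: use the 64-byte unrolled loop
 * while at least 64 bytes remain, then fall through to the
 * quadword loop for the rest.
 */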
.Lis_aligned:
	cmp $64,%edx
	jb .Lquadwords

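/*
 * Main loop: 64 bytes per iteration, as two groups of four
 * quadword loads followed by four non-temporal stores. Keeping
 * the loads and stores grouped like this is what lets the
 * exception fixups below distinguish "no stores done yet"
 * from "32 bytes already stored".
 */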
	.p2align 4,0x90
.Lunrolled:
10:	movq (%rsi),%r8
11:	movq 8(%rsi),%r9
12:	movq 16(%rsi),%r10
13:	movq 24(%rsi),%r11
20:	movnti %r8,(%rdi)
21:	movnti %r9,8(%rdi)
22:	movnti %r10,16(%rdi)
23:	movnti %r11,24(%rdi)
30:	movq 32(%rsi),%r8
31:	movq 40(%rsi),%r9
32:	movq 48(%rsi),%r10
33:	movq 56(%rsi),%r11
40:	movnti %r8,32(%rdi)
41:	movnti %r9,40(%rdi)
42:	movnti %r10,48(%rdi)
43:	movnti %r11,56(%rdi)

	addq $64,%rsi
	addq $64,%rdi
	sub $64,%edx
	cmp $64,%edx
	jae .Lunrolled

/*
 * First set of user mode loads have been done
 * without any stores, so if they fail, we can
 * just try the non-unrolled loop.
 */
_ASM_EXTABLE_UA(10b, .Lquadwords)
_ASM_EXTABLE_UA(11b, .Lquadwords)
_ASM_EXTABLE_UA(12b, .Lquadwords)
_ASM_EXTABLE_UA(13b, .Lquadwords)

/*
 * The second set of user mode loads have been
 * done with 32 bytes stored to the destination,
 * so we need to take that into account before
 * falling back to the non-unrolled loop.
 */
_ASM_EXTABLE_UA(30b, .Lfixup32)
_ASM_EXTABLE_UA(31b, .Lfixup32)
_ASM_EXTABLE_UA(32b, .Lfixup32)
_ASM_EXTABLE_UA(33b, .Lfixup32)

/*
 * An exception on a write means that we're
 * done, but we need to update the count
 * depending on where in the unrolled loop
 * we were.
 */
_ASM_EXTABLE_UA(20b, .Ldone0)
_ASM_EXTABLE_UA(21b, .Ldone8)
_ASM_EXTABLE_UA(22b, .Ldone16)
_ASM_EXTABLE_UA(23b, .Ldone24)
_ASM_EXTABLE_UA(40b, .Ldone32)
_ASM_EXTABLE_UA(41b, .Ldone40)
_ASM_EXTABLE_UA(42b, .Ldone48)
_ASM_EXTABLE_UA(43b, .Ldone56)

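/*
 * Copy the remaining full quadwords, 8 bytes at a time,
 * still using non-temporal stores.
 */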
.Lquadwords:
	cmp $8,%edx
	jb .Llong
50:	movq (%rsi),%rax
51:	movnti %rax,(%rdi)
	addq $8,%rsi
	addq $8,%rdi
	sub $8,%edx
	jmp .Lquadwords

/*
 * If we fail on the last full quadword, we will
 * not try to do any byte-wise cached accesses.
 * We will try to do one more 4-byte uncached
 * one, though.
 */
_ASM_EXTABLE_UA(50b, .Llast4)
_ASM_EXTABLE_UA(51b, .Ldone0)

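/*
 * Tail: fewer than 8 bytes left. A 4-byte chunk can still go
 * out with a non-temporal store; once that is done, sfence
 * orders the non-temporal stores and the final word/byte are
 * written with normal cached stores.
 */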
.Llong:
	test $4,%dl
	je .Lword
60:	movl (%rsi),%eax
61:	movnti %eax,(%rdi)
	addq $4,%rsi
	addq $4,%rdi
	sub $4,%edx
.Lword:
	sfence
	test $2,%dl
	je .Lbyte
70:	movw (%rsi),%ax
71:	movw %ax,(%rdi)
	addq $2,%rsi
	addq $2,%rdi
	sub $2,%edx
.Lbyte:
	test $1,%dl
	je .Ldone
80:	movb (%rsi),%al
81:	movb %al,(%rdi)
	dec %edx
.Ldone:
	mov %edx,%eax
	RET

/*
 * If we fail on the last four bytes, we won't
 * bother with any fixups. It's dead, Jim. Note
 * that there's no need for 'sfence' for any
 * of this, since the exception will have been
 * serializing.
 */
_ASM_EXTABLE_UA(60b, .Ldone)
_ASM_EXTABLE_UA(61b, .Ldone)
_ASM_EXTABLE_UA(70b, .Ldone)
_ASM_EXTABLE_UA(71b, .Ldone)
_ASM_EXTABLE_UA(80b, .Ldone)
_ASM_EXTABLE_UA(81b, .Ldone)

/*
 * This is the "head needs aligning" case when
 * the destination isn't 8-byte aligned. The
 * 4-byte case can be done uncached, but any
 * smaller alignment is done with regular stores.
 */
.Lalign:
	test $1,%dil
	je .Lalign_word
	test %edx,%edx
	je .Ldone
90:	movb (%rsi),%al
91:	movb %al,(%rdi)
	inc %rsi
	inc %rdi
	dec %edx
.Lalign_word:
	test $2,%dil
	je .Lalign_long
	cmp $2,%edx
	jb .Lbyte
92:	movw (%rsi),%ax
93:	movw %ax,(%rdi)
	addq $2,%rsi
	addq $2,%rdi
	sub $2,%edx
.Lalign_long:
	test $4,%dil
	je .Lis_aligned
	cmp $4,%edx
	jb .Lword
94:	movl (%rsi),%eax
95:	movnti %eax,(%rdi)
	addq $4,%rsi
	addq $4,%rdi
	sub $4,%edx
	jmp .Lis_aligned

/*
 * If we fail on the initial alignment accesses,
 * we're all done. Again, no point in trying to
 * do byte-by-byte probing if the 4-byte load
 * fails - we're not doing any uncached accesses
 * any more.
 */
_ASM_EXTABLE_UA(90b, .Ldone)
_ASM_EXTABLE_UA(91b, .Ldone)
_ASM_EXTABLE_UA(92b, .Ldone)
_ASM_EXTABLE_UA(93b, .Ldone)
_ASM_EXTABLE_UA(94b, .Ldone)
_ASM_EXTABLE_UA(95b, .Ldone)

/*
 * Exception table fixups for faults in the middle
 */
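/*
 * Each label below falls through into the next one, so the
 * count is reduced by 8 for every quadword that had already
 * been stored when the faulting store hit, leaving %edx (and
 * hence %eax) as the number of uncopied bytes.
 */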
.Ldone56: sub $8,%edx
.Ldone48: sub $8,%edx
.Ldone40: sub $8,%edx
.Ldone32: sub $8,%edx
.Ldone24: sub $8,%edx
.Ldone16: sub $8,%edx
.Ldone8: sub $8,%edx
.Ldone0:
	mov %edx,%eax
	RET

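/*
 * A load in the second half of the unrolled loop faulted:
 * account for the 32 bytes that were already copied, then
 * retry the rest with the non-unrolled quadword loop.
 */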
.Lfixup32:
	addq $32,%rsi
	addq $32,%rdi
	sub $32,%edx
	jmp .Lquadwords

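/*
 * A quadword load in the non-unrolled loop faulted: try one
 * final 4-byte uncached copy before reporting how much is left.
 */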
.Llast4:
52:	movl (%rsi),%eax
53:	movnti %eax,(%rdi)
	sfence
	sub $4,%edx
	mov %edx,%eax
	RET
_ASM_EXTABLE_UA(52b, .Ldone0)
_ASM_EXTABLE_UA(53b, .Ldone0)

SYM_FUNC_END(__copy_user_nocache)
EXPORT_SYMBOL(__copy_user_nocache)