/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 *
 * Copyright (C) IBM Corporation, 2011
 *
 * Author: Anton Blanchard <anton@au.ibm.com>
 */
#include <asm/ppc_asm.h>
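
/*
 * __copy_tofrom_user_power7(to, from, len)
 *
 * POWER7-optimised user space copy.  Copies shorter than 16 bytes go
 * straight to the scalar tail; copies larger than 3328 bytes may use
 * VMX when CONFIG_ALTIVEC is enabled and the CPU supports it.  If a
 * user access faults, the saved arguments are reloaded and the whole
 * copy is handed to __copy_tofrom_user_base, which does its own fault
 * handling.
 */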

#ifndef SELFTEST_CASE
/* 0 == don't use VMX, 1 == use VMX */
#define SELFTEST_CASE	0
#endif

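/*
 * LVS/VPERM hide the endian difference in the unaligned vector copy:
 * lvsl/lvsr builds the permute control vector and vperm merges two
 * adjacent aligned quadwords, with the source operands swapped on
 * little endian.
 */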
#ifdef __BIG_ENDIAN__
#define LVS(VRT,RA,RB)		lvsl	VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRA,VRB,VRC
#else
#define LVS(VRT,RA,RB)		lvsr	VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRB,VRA,VRC
#endif

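/*
 * The errN macros tag the next user access with an exception table
 * entry.  Which one is used depends on how much state the fault
 * handler must unwind: err1 when no stack frame is live, err2 inside
 * the scalar cacheline loop (r14-r22 saved), err3 in the VMX paths,
 * err4 in the VMX cacheline loops (r14-r16 also in use).
 */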
	.macro err1
100:
	EX_TABLE(100b,.Ldo_err1)
	.endm

	.macro err2
200:
	EX_TABLE(200b,.Ldo_err2)
	.endm

#ifdef CONFIG_ALTIVEC
	.macro err3
300:
	EX_TABLE(300b,.Ldo_err3)
	.endm

	.macro err4
400:
	EX_TABLE(400b,.Ldo_err4)
	.endm


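/*
 * Fault handlers: each restores whatever the faulting region had live
 * (extra non-volatile GPRs, the VMX context, the stack frame), reloads
 * the original destination/source/length and branches to
 * __copy_tofrom_user_base to redo the copy with its own fault handling.
 */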
.Ldo_err4:
	ld	r16,STK_REG(R16)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r14,STK_REG(R14)(r1)
.Ldo_err3:
	bl	exit_vmx_usercopy
	ld	r0,STACKFRAMESIZE+16(r1)
	mtlr	r0
	b	.Lexit
#endif /* CONFIG_ALTIVEC */

.Ldo_err2:
	ld	r22,STK_REG(R22)(r1)
	ld	r21,STK_REG(R21)(r1)
	ld	r20,STK_REG(R20)(r1)
	ld	r19,STK_REG(R19)(r1)
	ld	r18,STK_REG(R18)(r1)
	ld	r17,STK_REG(R17)(r1)
	ld	r16,STK_REG(R16)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r14,STK_REG(R14)(r1)
.Lexit:
	addi	r1,r1,STACKFRAMESIZE
.Ldo_err1:
	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
	ld	r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
	ld	r5,-STACKFRAMESIZE+STK_REG(R29)(r1)
	b	__copy_tofrom_user_base


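/*
 * Entry: r3 = destination, r4 = source, r5 = length.  The arguments
 * are saved straight away (at what will be their frame slots below r1)
 * so that the fault handlers above can reload them and restart the
 * copy.
 */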
_GLOBAL(__copy_tofrom_user_power7)
	cmpldi	r5,16
	cmpldi	cr1,r5,3328

	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
	std	r5,-STACKFRAMESIZE+STK_REG(R29)(r1)

	blt	.Lshort_copy

#ifdef CONFIG_ALTIVEC
test_feature = SELFTEST_CASE
BEGIN_FTR_SECTION
	bgt	cr1,.Lvmx_copy
END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
#endif

.Lnonvmx_copy:
	/* Get the source 8B aligned */
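	/*
	 * neg gives the number of bytes needed to reach 8B alignment; its
	 * low bits are copied into CR7 (mtocrf 0x01) so the bf tests
	 * below can pick out the 1, 2 and 4 byte head copies.
	 */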
	neg	r6,r4
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-3)

	bf	cr7*4+3,1f
err1;	lbz	r0,0(r4)
	addi	r4,r4,1
err1;	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
err1;	lhz	r0,0(r4)
	addi	r4,r4,2
err1;	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
err1;	lwz	r0,0(r4)
	addi	r4,r4,4
err1;	stw	r0,0(r3)
	addi	r3,r3,4

3:	sub	r5,r5,r6
	cmpldi	r5,128
	blt	5f

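	/*
	 * 128 bytes or more remain: create a stack frame and save LR and
	 * the non-volatile GPRs r14-r22 so they can be used by the
	 * unrolled cacheline loop.
	 */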
	mflr	r0
	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)
	std	r17,STK_REG(R17)(r1)
	std	r18,STK_REG(R18)(r1)
	std	r19,STK_REG(R19)(r1)
	std	r20,STK_REG(R20)(r1)
	std	r21,STK_REG(R21)(r1)
	std	r22,STK_REG(R22)(r1)
	std	r0,STACKFRAMESIZE+16(r1)

	srdi	r6,r5,7
	mtctr	r6

	/* Now do cacheline (128B) sized loads and stores. */
	.align	5
4:
err2;	ld	r0,0(r4)
err2;	ld	r6,8(r4)
err2;	ld	r7,16(r4)
err2;	ld	r8,24(r4)
err2;	ld	r9,32(r4)
err2;	ld	r10,40(r4)
err2;	ld	r11,48(r4)
err2;	ld	r12,56(r4)
err2;	ld	r14,64(r4)
err2;	ld	r15,72(r4)
err2;	ld	r16,80(r4)
err2;	ld	r17,88(r4)
err2;	ld	r18,96(r4)
err2;	ld	r19,104(r4)
err2;	ld	r20,112(r4)
err2;	ld	r21,120(r4)
	addi	r4,r4,128
err2;	std	r0,0(r3)
err2;	std	r6,8(r3)
err2;	std	r7,16(r3)
err2;	std	r8,24(r3)
err2;	std	r9,32(r3)
err2;	std	r10,40(r3)
err2;	std	r11,48(r3)
err2;	std	r12,56(r3)
err2;	std	r14,64(r3)
err2;	std	r15,72(r3)
err2;	std	r16,80(r3)
err2;	std	r17,88(r3)
err2;	std	r18,96(r3)
err2;	std	r19,104(r3)
err2;	std	r20,112(r3)
err2;	std	r21,120(r3)
	addi	r3,r3,128
	bdnz	4b

	clrldi	r5,r5,(64-7)

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	ld	r17,STK_REG(R17)(r1)
	ld	r18,STK_REG(R18)(r1)
	ld	r19,STK_REG(R19)(r1)
	ld	r20,STK_REG(R20)(r1)
	ld	r21,STK_REG(R21)(r1)
	ld	r22,STK_REG(R22)(r1)
	addi	r1,r1,STACKFRAMESIZE

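	/*
	 * The remainder (< 128B) is copied in power-of-two chunks: the
	 * length shifted right by 4 goes into CR7, and each bf test below
	 * selects a 64, 32 or 16 byte block.  .Lshort_copy then finishes
	 * the last 0-15 bytes the same way.
	 */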
	/* Up to 127B to go */
5:	srdi	r6,r5,4
	mtocrf	0x01,r6

6:	bf	cr7*4+1,7f
err1;	ld	r0,0(r4)
err1;	ld	r6,8(r4)
err1;	ld	r7,16(r4)
err1;	ld	r8,24(r4)
err1;	ld	r9,32(r4)
err1;	ld	r10,40(r4)
err1;	ld	r11,48(r4)
err1;	ld	r12,56(r4)
	addi	r4,r4,64
err1;	std	r0,0(r3)
err1;	std	r6,8(r3)
err1;	std	r7,16(r3)
err1;	std	r8,24(r3)
err1;	std	r9,32(r3)
err1;	std	r10,40(r3)
err1;	std	r11,48(r3)
err1;	std	r12,56(r3)
	addi	r3,r3,64

	/* Up to 63B to go */
7:	bf	cr7*4+2,8f
err1;	ld	r0,0(r4)
err1;	ld	r6,8(r4)
err1;	ld	r7,16(r4)
err1;	ld	r8,24(r4)
	addi	r4,r4,32
err1;	std	r0,0(r3)
err1;	std	r6,8(r3)
err1;	std	r7,16(r3)
err1;	std	r8,24(r3)
	addi	r3,r3,32

	/* Up to 31B to go */
8:	bf	cr7*4+3,9f
err1;	ld	r0,0(r4)
err1;	ld	r6,8(r4)
	addi	r4,r4,16
err1;	std	r0,0(r3)
err1;	std	r6,8(r3)
	addi	r3,r3,16

9:	clrldi	r5,r5,(64-4)

	/* Up to 15B to go */
.Lshort_copy:
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
err1;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
err1;	lwz	r6,4(r4)
	addi	r4,r4,8
err1;	stw	r0,0(r3)
err1;	stw	r6,4(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
err1;	lwz	r0,0(r4)
	addi	r4,r4,4
err1;	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
err1;	lhz	r0,0(r4)
	addi	r4,r4,2
err1;	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
err1;	lbz	r0,0(r4)
err1;	stb	r0,0(r3)

15:	li	r3,0
	blr

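/*
 * Reached when enter_vmx_usercopy returns 0 (VMX not usable in this
 * context): pop the frame created in .Lvmx_copy and fall back to the
 * scalar copy.
 */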
.Lunwind_stack_nonvmx_copy:
	addi	r1,r1,STACKFRAMESIZE
	b	.Lnonvmx_copy

.Lvmx_copy:
#ifdef CONFIG_ALTIVEC
	mflr	r0
	std	r0,16(r1)
	stdu	r1,-STACKFRAMESIZE(r1)
	bl	enter_vmx_usercopy
	cmpwi	cr1,r3,0
	ld	r0,STACKFRAMESIZE+16(r1)
	ld	r3,STK_REG(R31)(r1)
	ld	r4,STK_REG(R30)(r1)
	ld	r5,STK_REG(R29)(r1)
	mtlr	r0

	/*
	 * We prefetch both the source and destination using enhanced touch
	 * instructions. We use a stream ID of 0 for the load side and
	 * 1 for the store side.
	 */
	clrrdi	r6,r4,7
	clrrdi	r9,r3,7
	ori	r9,r9,1		/* stream=1 */

	srdi	r7,r5,7		/* length in cachelines, capped at 0x3FF */
	cmpldi	r7,0x3FF
	ble	1f
	li	r7,0x3FF
1:	lis	r0,0x0E00	/* depth=7 */
	sldi	r7,r7,7
	or	r7,r7,r0
	ori	r10,r7,1	/* stream=1 */

	lis	r8,0x8000	/* GO=1 */
	clrldi	r8,r8,32

	/* setup read stream 0 */
	dcbt	0,r6,0b01000	/* addr from */
	dcbt	0,r7,0b01010	/* length and depth from */
	/* setup write stream 1 */
	dcbtst	0,r9,0b01000	/* addr to */
	dcbtst	0,r10,0b01010	/* length and depth to */
	eieio
	dcbt	0,r8,0b01010	/* all streams GO */
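
	/*
	 * The prefetch streams are started before checking whether VMX is
	 * actually available, so the scalar fallback below benefits from
	 * them as well.
	 */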

	beq	cr1,.Lunwind_stack_nonvmx_copy

	/*
	 * If source and destination are not relatively aligned we use a
	 * slower permute loop.
	 */
	xor	r6,r4,r3
	rldicl.	r6,r6,0,(64-4)
	bne	.Lvmx_unaligned_copy

	/* Get the destination 16B aligned */
	neg	r6,r3
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-4)

	bf	cr7*4+3,1f
err3;	lbz	r0,0(r4)
	addi	r4,r4,1
err3;	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

3:	bf	cr7*4+0,4f
err3;	ld	r0,0(r4)
	addi	r4,r4,8
err3;	std	r0,0(r3)
	addi	r3,r3,8

4:	sub	r5,r5,r6

	/* Get the destination 128B aligned */
	neg	r6,r3
	srdi	r7,r6,4
	mtocrf	0x01,r7
	clrldi	r6,r6,(64-7)

	li	r9,16
	li	r10,32
	li	r11,48

	bf	cr7*4+3,5f
err3;	lvx	v1,0,r4
	addi	r4,r4,16
err3;	stvx	v1,0,r3
	addi	r3,r3,16

5:	bf	cr7*4+2,6f
err3;	lvx	v1,0,r4
err3;	lvx	v0,r4,r9
	addi	r4,r4,32
err3;	stvx	v1,0,r3
err3;	stvx	v0,r3,r9
	addi	r3,r3,32

6:	bf	cr7*4+1,7f
err3;	lvx	v3,0,r4
err3;	lvx	v2,r4,r9
err3;	lvx	v1,r4,r10
err3;	lvx	v0,r4,r11
	addi	r4,r4,64
err3;	stvx	v3,0,r3
err3;	stvx	v2,r3,r9
err3;	stvx	v1,r3,r10
err3;	stvx	v0,r3,r11
	addi	r3,r3,64

7:	sub	r5,r5,r6
	srdi	r6,r5,7

	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	li	r12,64
	li	r14,80
	li	r15,96
	li	r16,112

	mtctr	r6

	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */
	.align	5
8:
err4;	lvx	v7,0,r4
err4;	lvx	v6,r4,r9
err4;	lvx	v5,r4,r10
err4;	lvx	v4,r4,r11
err4;	lvx	v3,r4,r12
err4;	lvx	v2,r4,r14
err4;	lvx	v1,r4,r15
err4;	lvx	v0,r4,r16
	addi	r4,r4,128
err4;	stvx	v7,0,r3
err4;	stvx	v6,r3,r9
err4;	stvx	v5,r3,r10
err4;	stvx	v4,r3,r11
err4;	stvx	v3,r3,r12
err4;	stvx	v2,r3,r14
err4;	stvx	v1,r3,r15
err4;	stvx	v0,r3,r16
	addi	r3,r3,128
	bdnz	8b

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)

	/* Up to 127B to go */
	clrldi	r5,r5,(64-7)
	srdi	r6,r5,4
	mtocrf	0x01,r6

	bf	cr7*4+1,9f
err3;	lvx	v3,0,r4
err3;	lvx	v2,r4,r9
err3;	lvx	v1,r4,r10
err3;	lvx	v0,r4,r11
	addi	r4,r4,64
err3;	stvx	v3,0,r3
err3;	stvx	v2,r3,r9
err3;	stvx	v1,r3,r10
err3;	stvx	v0,r3,r11
	addi	r3,r3,64

9:	bf	cr7*4+2,10f
err3;	lvx	v1,0,r4
err3;	lvx	v0,r4,r9
	addi	r4,r4,32
err3;	stvx	v1,0,r3
err3;	stvx	v0,r3,r9
	addi	r3,r3,32

10:	bf	cr7*4+3,11f
err3;	lvx	v1,0,r4
	addi	r4,r4,16
err3;	stvx	v1,0,r3
	addi	r3,r3,16

	/* Up to 15B to go */
11:	clrldi	r5,r5,(64-4)
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
err3;	ld	r0,0(r4)
	addi	r4,r4,8
err3;	std	r0,0(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
err3;	lbz	r0,0(r4)
err3;	stb	r0,0(r3)

15:	addi	r1,r1,STACKFRAMESIZE
	b	exit_vmx_usercopy	/* tail call optimise */

.Lvmx_unaligned_copy:
	/* Get the destination 16B aligned */
	neg	r6,r3
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-4)

	bf	cr7*4+3,1f
err3;	lbz	r0,0(r4)
	addi	r4,r4,1
err3;	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

3:	bf	cr7*4+0,4f
err3;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
err3;	lwz	r7,4(r4)
	addi	r4,r4,8
err3;	stw	r0,0(r3)
err3;	stw	r7,4(r3)
	addi	r3,r3,8

4:	sub	r5,r5,r6

	/* Get the destination 128B aligned */
	neg	r6,r3
	srdi	r7,r6,4
	mtocrf	0x01,r7
	clrldi	r6,r6,(64-7)

	li	r9,16
	li	r10,32
	li	r11,48

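	/*
	 * Source is not 16B aligned: each aligned quadword is fetched
	 * with lvx and the 16 wanted bytes are extracted from two
	 * neighbouring loads with vperm, using the control vector built
	 * by LVS.  v0 carries the previous quadword between iterations,
	 * so the source pointer runs one quadword ahead and is wound back
	 * before the final scalar tail.
	 */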
	LVS(v16,0,r4)		/* Setup permute control vector */
err3;	lvx	v0,0,r4
	addi	r4,r4,16

	bf	cr7*4+3,5f
err3;	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
	addi	r4,r4,16
err3;	stvx	v8,0,r3
	addi	r3,r3,16
	vor	v0,v1,v1

5:	bf	cr7*4+2,6f
err3;	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
err3;	lvx	v0,r4,r9
	VPERM(v9,v1,v0,v16)
	addi	r4,r4,32
err3;	stvx	v8,0,r3
err3;	stvx	v9,r3,r9
	addi	r3,r3,32

6:	bf	cr7*4+1,7f
err3;	lvx	v3,0,r4
	VPERM(v8,v0,v3,v16)
err3;	lvx	v2,r4,r9
	VPERM(v9,v3,v2,v16)
err3;	lvx	v1,r4,r10
	VPERM(v10,v2,v1,v16)
err3;	lvx	v0,r4,r11
	VPERM(v11,v1,v0,v16)
	addi	r4,r4,64
err3;	stvx	v8,0,r3
err3;	stvx	v9,r3,r9
err3;	stvx	v10,r3,r10
err3;	stvx	v11,r3,r11
	addi	r3,r3,64

7:	sub	r5,r5,r6
	srdi	r6,r5,7

	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	li	r12,64
	li	r14,80
	li	r15,96
	li	r16,112

	mtctr	r6

	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */
	.align	5
8:
err4;	lvx	v7,0,r4
	VPERM(v8,v0,v7,v16)
err4;	lvx	v6,r4,r9
	VPERM(v9,v7,v6,v16)
err4;	lvx	v5,r4,r10
	VPERM(v10,v6,v5,v16)
err4;	lvx	v4,r4,r11
	VPERM(v11,v5,v4,v16)
err4;	lvx	v3,r4,r12
	VPERM(v12,v4,v3,v16)
err4;	lvx	v2,r4,r14
	VPERM(v13,v3,v2,v16)
err4;	lvx	v1,r4,r15
	VPERM(v14,v2,v1,v16)
err4;	lvx	v0,r4,r16
	VPERM(v15,v1,v0,v16)
	addi	r4,r4,128
err4;	stvx	v8,0,r3
err4;	stvx	v9,r3,r9
err4;	stvx	v10,r3,r10
err4;	stvx	v11,r3,r11
err4;	stvx	v12,r3,r12
err4;	stvx	v13,r3,r14
err4;	stvx	v14,r3,r15
err4;	stvx	v15,r3,r16
	addi	r3,r3,128
	bdnz	8b

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)

	/* Up to 127B to go */
	clrldi	r5,r5,(64-7)
	srdi	r6,r5,4
	mtocrf	0x01,r6

	bf	cr7*4+1,9f
err3;	lvx	v3,0,r4
	VPERM(v8,v0,v3,v16)
err3;	lvx	v2,r4,r9
	VPERM(v9,v3,v2,v16)
err3;	lvx	v1,r4,r10
	VPERM(v10,v2,v1,v16)
err3;	lvx	v0,r4,r11
	VPERM(v11,v1,v0,v16)
	addi	r4,r4,64
err3;	stvx	v8,0,r3
err3;	stvx	v9,r3,r9
err3;	stvx	v10,r3,r10
err3;	stvx	v11,r3,r11
	addi	r3,r3,64

9:	bf	cr7*4+2,10f
err3;	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
err3;	lvx	v0,r4,r9
	VPERM(v9,v1,v0,v16)
	addi	r4,r4,32
err3;	stvx	v8,0,r3
err3;	stvx	v9,r3,r9
	addi	r3,r3,32

10:	bf	cr7*4+3,11f
err3;	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
	addi	r4,r4,16
err3;	stvx	v8,0,r3
	addi	r3,r3,16

	/* Up to 15B to go */
11:	clrldi	r5,r5,(64-4)
	addi	r4,r4,-16	/* Unwind the +16 load offset */
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
err3;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
err3;	lwz	r6,4(r4)
	addi	r4,r4,8
err3;	stw	r0,0(r3)
err3;	stw	r6,4(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
err3;	lbz	r0,0(r4)
err3;	stb	r0,0(r3)

15:	addi	r1,r1,STACKFRAMESIZE
	b	exit_vmx_usercopy	/* tail call optimise */
#endif /* CONFIG_ALTIVEC */