/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 *
 * Copyright (C) IBM Corporation, 2011
 *
 * Author: Anton Blanchard <anton@au.ibm.com>
 */
#include <asm/ppc_asm.h>

#ifndef SELFTEST_CASE
/* 0 == don't use VMX, 1 == use VMX */
#define SELFTEST_CASE	0
#endif
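/*
 * SELFTEST_CASE is normally 0; the powerpc copyloops selftests are
 * expected to rebuild this file with it overridden so that both the
 * VMX and non-VMX paths get exercised.
 */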

#ifdef __BIG_ENDIAN__
#define LVS(VRT,RA,RB)		lvsl	VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRA,VRB,VRC
#else
#define LVS(VRT,RA,RB)		lvsr	VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRB,VRA,VRC
#endif
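/*
 * LVS/VPERM hide the endian difference in the unaligned-source loop:
 * lvsl (big endian) or lvsr (little endian) builds the permute control
 * vector from the source misalignment, and on little endian the vperm
 * inputs are swapped to match the reversed element ordering.
 */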

	.macro err1
100:
	EX_TABLE(100b,.Ldo_err1)
	.endm

	.macro err2
200:
	EX_TABLE(200b,.Ldo_err2)
	.endm

#ifdef CONFIG_ALTIVEC
	.macro err3
300:
	EX_TABLE(300b,.Ldo_err3)
	.endm

	.macro err4
400:
	EX_TABLE(400b,.Ldo_err4)
	.endm

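/*
 * The err1-err4 macros attach exception table entries to the user
 * accesses they label.  A fault in an err1 access lands in .Ldo_err1,
 * err2 in .Ldo_err2 (non-volatiles saved on our frame), and err3/err4
 * in the handlers below, which also shut VMX back down first.
 */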
.Ldo_err4:
	ld	r16,STK_REG(R16)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r14,STK_REG(R14)(r1)
.Ldo_err3:
	bl	exit_vmx_usercopy
	ld	r0,STACKFRAMESIZE+16(r1)
	mtlr	r0
	b	.Lexit
#endif /* CONFIG_ALTIVEC */

.Ldo_err2:
	ld	r22,STK_REG(R22)(r1)
	ld	r21,STK_REG(R21)(r1)
	ld	r20,STK_REG(R20)(r1)
	ld	r19,STK_REG(R19)(r1)
	ld	r18,STK_REG(R18)(r1)
	ld	r17,STK_REG(R17)(r1)
	ld	r16,STK_REG(R16)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r14,STK_REG(R14)(r1)
.Lexit:
	addi	r1,r1,STACKFRAMESIZE
.Ldo_err1:
	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
	ld	r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
	ld	r5,-STACKFRAMESIZE+STK_REG(R29)(r1)
	b	__copy_tofrom_user_base

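/*
 * __copy_tofrom_user_power7(to=r3, from=r4, n=r5)
 *
 * Like the other __copy_tofrom_user variants this is expected to return
 * the number of bytes NOT copied (0 on success).  On a fault the err
 * handlers above reload the original arguments and fall back to
 * __copy_tofrom_user_base, which works out the residual count.
 */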
_GLOBAL(__copy_tofrom_user_power7)
	cmpldi	r5,16
	cmpldi	cr1,r5,3328
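	/*
	 * cr0: anything under 16 bytes goes straight to the byte/word
	 * tail.  cr1: only copies larger than 3328 bytes take the VMX
	 * path, where the enter/exit_vmx_usercopy overhead is presumably
	 * worth paying.
	 */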

	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
	std	r5,-STACKFRAMESIZE+STK_REG(R29)(r1)

	blt	.Lshort_copy

#ifdef CONFIG_ALTIVEC
test_feature = SELFTEST_CASE
BEGIN_FTR_SECTION
	bgt	cr1,.Lvmx_copy
END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
#endif

.Lnonvmx_copy:
	/* Get the source 8B aligned */
	neg	r6,r4
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-3)
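	/*
	 * r6 = number of bytes needed to reach 8B alignment (low 3 bits
	 * of -src).  mtocrf copies those bits into cr7 so the bf tests
	 * below pick out the 1, 2 and 4 byte fix-up copies.
	 */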

	bf	cr7*4+3,1f
err1;	lbz	r0,0(r4)
	addi	r4,r4,1
err1;	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
err1;	lhz	r0,0(r4)
	addi	r4,r4,2
err1;	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
err1;	lwz	r0,0(r4)
	addi	r4,r4,4
err1;	stw	r0,0(r3)
	addi	r3,r3,4

3:	sub	r5,r5,r6
	cmpldi	r5,128
	blt	5f

	mflr	r0
	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)
	std	r17,STK_REG(R17)(r1)
	std	r18,STK_REG(R18)(r1)
	std	r19,STK_REG(R19)(r1)
	std	r20,STK_REG(R20)(r1)
	std	r21,STK_REG(R21)(r1)
	std	r22,STK_REG(R22)(r1)
	std	r0,STACKFRAMESIZE+16(r1)
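	/*
	 * LR and the non-volatile registers r14-r22 are saved in a new
	 * stack frame so the 128B loop below can use them as extra
	 * scratch; .Ldo_err2 restores them if a fault occurs.
	 */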

	srdi	r6,r5,7
	mtctr	r6

	/* Now do cacheline (128B) sized loads and stores. */
	.align	5
4:
err2;	ld	r0,0(r4)
err2;	ld	r6,8(r4)
err2;	ld	r7,16(r4)
err2;	ld	r8,24(r4)
err2;	ld	r9,32(r4)
err2;	ld	r10,40(r4)
err2;	ld	r11,48(r4)
err2;	ld	r12,56(r4)
err2;	ld	r14,64(r4)
err2;	ld	r15,72(r4)
err2;	ld	r16,80(r4)
err2;	ld	r17,88(r4)
err2;	ld	r18,96(r4)
err2;	ld	r19,104(r4)
err2;	ld	r20,112(r4)
err2;	ld	r21,120(r4)
	addi	r4,r4,128
err2;	std	r0,0(r3)
err2;	std	r6,8(r3)
err2;	std	r7,16(r3)
err2;	std	r8,24(r3)
err2;	std	r9,32(r3)
err2;	std	r10,40(r3)
err2;	std	r11,48(r3)
err2;	std	r12,56(r3)
err2;	std	r14,64(r3)
err2;	std	r15,72(r3)
err2;	std	r16,80(r3)
err2;	std	r17,88(r3)
err2;	std	r18,96(r3)
err2;	std	r19,104(r3)
err2;	std	r20,112(r3)
err2;	std	r21,120(r3)
	addi	r3,r3,128
	bdnz	4b

	clrldi	r5,r5,(64-7)

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	ld	r17,STK_REG(R17)(r1)
	ld	r18,STK_REG(R18)(r1)
	ld	r19,STK_REG(R19)(r1)
	ld	r20,STK_REG(R20)(r1)
	ld	r21,STK_REG(R21)(r1)
	ld	r22,STK_REG(R22)(r1)
	addi	r1,r1,STACKFRAMESIZE

	/* Up to 127B to go */
5:	srdi	r6,r5,4
	mtocrf	0x01,r6
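	/*
	 * cr7 now holds (remaining length) >> 4, so its bits select the
	 * optional 64B, 32B and 16B copies below.
	 */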

6:	bf	cr7*4+1,7f
err1;	ld	r0,0(r4)
err1;	ld	r6,8(r4)
err1;	ld	r7,16(r4)
err1;	ld	r8,24(r4)
err1;	ld	r9,32(r4)
err1;	ld	r10,40(r4)
err1;	ld	r11,48(r4)
err1;	ld	r12,56(r4)
	addi	r4,r4,64
err1;	std	r0,0(r3)
err1;	std	r6,8(r3)
err1;	std	r7,16(r3)
err1;	std	r8,24(r3)
err1;	std	r9,32(r3)
err1;	std	r10,40(r3)
err1;	std	r11,48(r3)
err1;	std	r12,56(r3)
	addi	r3,r3,64

	/* Up to 63B to go */
7:	bf	cr7*4+2,8f
err1;	ld	r0,0(r4)
err1;	ld	r6,8(r4)
err1;	ld	r7,16(r4)
err1;	ld	r8,24(r4)
	addi	r4,r4,32
err1;	std	r0,0(r3)
err1;	std	r6,8(r3)
err1;	std	r7,16(r3)
err1;	std	r8,24(r3)
	addi	r3,r3,32

	/* Up to 31B to go */
8:	bf	cr7*4+3,9f
err1;	ld	r0,0(r4)
err1;	ld	r6,8(r4)
	addi	r4,r4,16
err1;	std	r0,0(r3)
err1;	std	r6,8(r3)
	addi	r3,r3,16

9:	clrldi	r5,r5,(64-4)

	/* Up to 15B to go */
.Lshort_copy:
	mtocrf	0x01,r5
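	/*
	 * cr7 holds the last four bits of the length: 8, 4, 2 and
	 * finally 1 byte copies finish the job.
	 */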
	bf	cr7*4+0,12f
err1;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
err1;	lwz	r6,4(r4)
	addi	r4,r4,8
err1;	stw	r0,0(r3)
err1;	stw	r6,4(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
err1;	lwz	r0,0(r4)
	addi	r4,r4,4
err1;	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
err1;	lhz	r0,0(r4)
	addi	r4,r4,2
err1;	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
err1;	lbz	r0,0(r4)
err1;	stb	r0,0(r3)

15:	li	r3,0
	blr

.Lunwind_stack_nonvmx_copy:
	addi	r1,r1,STACKFRAMESIZE
	b	.Lnonvmx_copy

.Lvmx_copy:
#ifdef CONFIG_ALTIVEC
	mflr	r0
	std	r0,16(r1)
	stdu	r1,-STACKFRAMESIZE(r1)
	bl	enter_vmx_usercopy
	cmpwi	cr1,r3,0
	ld	r0,STACKFRAMESIZE+16(r1)
	ld	r3,STK_REG(R31)(r1)
	ld	r4,STK_REG(R30)(r1)
	ld	r5,STK_REG(R29)(r1)
	mtlr	r0

	/*
	 * We prefetch both the source and destination using enhanced touch
	 * instructions. We use a stream ID of 0 for the load side and
	 * 1 for the store side.
	 */
	clrrdi	r6,r4,7
	clrrdi	r9,r3,7
	ori	r9,r9,1		/* stream=1 */

	srdi	r7,r5,7		/* length in cachelines, capped at 0x3FF */
	cmpldi	r7,0x3FF
	ble	1f
	li	r7,0x3FF
1:	lis	r0,0x0E00	/* depth=7 */
	sldi	r7,r7,7
	or	r7,r7,r0
	ori	r10,r7,1	/* stream=1 */

	lis	r8,0x8000	/* GO=1 */
	clrldi	r8,r8,32
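	/*
	 * The dcbt/dcbtst pairs below program the two hardware prefetch
	 * streams: TH=0b01000 supplies the starting address, TH=0b01010
	 * the length/depth word, and the final dcbt with the GO bit set
	 * starts all streams.  The eieio orders the setup before GO.
	 */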

	/* setup read stream 0 */
	dcbt	0,r6,0b01000	/* addr from */
	dcbt	0,r7,0b01010	/* length and depth from */
	/* setup write stream 1 */
	dcbtst	0,r9,0b01000	/* addr to */
	dcbtst	0,r10,0b01010	/* length and depth to */
	eieio
	dcbt	0,r8,0b01010	/* all streams GO */

	beq	cr1,.Lunwind_stack_nonvmx_copy

	/*
	 * If source and destination are not relatively aligned we use a
	 * slower permute loop.
	 */
	xor	r6,r4,r3
	rldicl.	r6,r6,0,(64-4)
	bne	.Lvmx_unaligned_copy
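	/*
	 * "Relatively aligned" means source and destination share the
	 * same offset within a 16B quadword, so once the destination is
	 * aligned the source is too and plain lvx/stvx can be used.
	 */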

	/* Get the destination 16B aligned */
	neg	r6,r3
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-4)

	bf	cr7*4+3,1f
err3;	lbz	r0,0(r4)
	addi	r4,r4,1
err3;	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

3:	bf	cr7*4+0,4f
err3;	ld	r0,0(r4)
	addi	r4,r4,8
err3;	std	r0,0(r3)
	addi	r3,r3,8

4:	sub	r5,r5,r6

	/* Get the destination 128B aligned */
	neg	r6,r3
	srdi	r7,r6,4
	mtocrf	0x01,r7
	clrldi	r6,r6,(64-7)

	li	r9,16
	li	r10,32
	li	r11,48

	bf	cr7*4+3,5f
err3;	lvx	v1,0,r4
	addi	r4,r4,16
err3;	stvx	v1,0,r3
	addi	r3,r3,16

5:	bf	cr7*4+2,6f
err3;	lvx	v1,0,r4
err3;	lvx	v0,r4,r9
	addi	r4,r4,32
err3;	stvx	v1,0,r3
err3;	stvx	v0,r3,r9
	addi	r3,r3,32

6:	bf	cr7*4+1,7f
err3;	lvx	v3,0,r4
err3;	lvx	v2,r4,r9
err3;	lvx	v1,r4,r10
err3;	lvx	v0,r4,r11
	addi	r4,r4,64
err3;	stvx	v3,0,r3
err3;	stvx	v2,r3,r9
err3;	stvx	v1,r3,r10
err3;	stvx	v0,r3,r11
	addi	r3,r3,64

7:	sub	r5,r5,r6
	srdi	r6,r5,7

	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	li	r12,64
	li	r14,80
	li	r15,96
	li	r16,112

	mtctr	r6

	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */
	.align	5
8:
err4;	lvx	v7,0,r4
err4;	lvx	v6,r4,r9
err4;	lvx	v5,r4,r10
err4;	lvx	v4,r4,r11
err4;	lvx	v3,r4,r12
err4;	lvx	v2,r4,r14
err4;	lvx	v1,r4,r15
err4;	lvx	v0,r4,r16
	addi	r4,r4,128
err4;	stvx	v7,0,r3
err4;	stvx	v6,r3,r9
err4;	stvx	v5,r3,r10
err4;	stvx	v4,r3,r11
err4;	stvx	v3,r3,r12
err4;	stvx	v2,r3,r14
err4;	stvx	v1,r3,r15
err4;	stvx	v0,r3,r16
	addi	r3,r3,128
	bdnz	8b

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)

	/* Up to 127B to go */
	clrldi	r5,r5,(64-7)
	srdi	r6,r5,4
	mtocrf	0x01,r6

	bf	cr7*4+1,9f
err3;	lvx	v3,0,r4
err3;	lvx	v2,r4,r9
err3;	lvx	v1,r4,r10
err3;	lvx	v0,r4,r11
	addi	r4,r4,64
err3;	stvx	v3,0,r3
err3;	stvx	v2,r3,r9
err3;	stvx	v1,r3,r10
err3;	stvx	v0,r3,r11
	addi	r3,r3,64

9:	bf	cr7*4+2,10f
err3;	lvx	v1,0,r4
err3;	lvx	v0,r4,r9
	addi	r4,r4,32
err3;	stvx	v1,0,r3
err3;	stvx	v0,r3,r9
	addi	r3,r3,32

10:	bf	cr7*4+3,11f
err3;	lvx	v1,0,r4
	addi	r4,r4,16
err3;	stvx	v1,0,r3
	addi	r3,r3,16

	/* Up to 15B to go */
11:	clrldi	r5,r5,(64-4)
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
err3;	ld	r0,0(r4)
	addi	r4,r4,8
err3;	std	r0,0(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
err3;	lbz	r0,0(r4)
err3;	stb	r0,0(r3)

15:	addi	r1,r1,STACKFRAMESIZE
	b	exit_vmx_usercopy	/* tail call optimise */
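	/*
	 * exit_vmx_usercopy returns 0, which also serves as this
	 * routine's "0 bytes not copied" return value.
	 */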

.Lvmx_unaligned_copy:
	/* Get the destination 16B aligned */
	neg	r6,r3
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-4)

	bf	cr7*4+3,1f
err3;	lbz	r0,0(r4)
	addi	r4,r4,1
err3;	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

3:	bf	cr7*4+0,4f
err3;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
err3;	lwz	r7,4(r4)
	addi	r4,r4,8
err3;	stw	r0,0(r3)
err3;	stw	r7,4(r3)
	addi	r3,r3,8

4:	sub	r5,r5,r6

	/* Get the destination 128B aligned */
	neg	r6,r3
	srdi	r7,r6,4
	mtocrf	0x01,r7
	clrldi	r6,r6,(64-7)

	li	r9,16
	li	r10,32
	li	r11,48

	LVS(v16,0,r4)		/* Setup permute control vector */
err3;	lvx	v0,0,r4
	addi	r4,r4,16
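	/*
	 * v16 is the shift amount derived from the source misalignment
	 * and v0 is primed with the first aligned quadword; from here on
	 * every VPERM merges the previous quadword with the next one.
	 * Note r4 now runs one quadword ahead, which label 11 below
	 * unwinds before the scalar tail.
	 */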

	bf	cr7*4+3,5f
err3;	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
	addi	r4,r4,16
err3;	stvx	v8,0,r3
	addi	r3,r3,16
	vor	v0,v1,v1
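	/* vor copies v1 into v0 so the leftover bytes carry forward */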

5:	bf	cr7*4+2,6f
err3;	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
err3;	lvx	v0,r4,r9
	VPERM(v9,v1,v0,v16)
	addi	r4,r4,32
err3;	stvx	v8,0,r3
err3;	stvx	v9,r3,r9
	addi	r3,r3,32

6:	bf	cr7*4+1,7f
err3;	lvx	v3,0,r4
	VPERM(v8,v0,v3,v16)
err3;	lvx	v2,r4,r9
	VPERM(v9,v3,v2,v16)
err3;	lvx	v1,r4,r10
	VPERM(v10,v2,v1,v16)
err3;	lvx	v0,r4,r11
	VPERM(v11,v1,v0,v16)
	addi	r4,r4,64
err3;	stvx	v8,0,r3
err3;	stvx	v9,r3,r9
err3;	stvx	v10,r3,r10
err3;	stvx	v11,r3,r11
	addi	r3,r3,64

7:	sub	r5,r5,r6
	srdi	r6,r5,7

	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	li	r12,64
	li	r14,80
	li	r15,96
	li	r16,112

	mtctr	r6

	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */
	.align	5
8:
err4;	lvx	v7,0,r4
	VPERM(v8,v0,v7,v16)
err4;	lvx	v6,r4,r9
	VPERM(v9,v7,v6,v16)
err4;	lvx	v5,r4,r10
	VPERM(v10,v6,v5,v16)
err4;	lvx	v4,r4,r11
	VPERM(v11,v5,v4,v16)
err4;	lvx	v3,r4,r12
	VPERM(v12,v4,v3,v16)
err4;	lvx	v2,r4,r14
	VPERM(v13,v3,v2,v16)
err4;	lvx	v1,r4,r15
	VPERM(v14,v2,v1,v16)
err4;	lvx	v0,r4,r16
	VPERM(v15,v1,v0,v16)
	addi	r4,r4,128
err4;	stvx	v8,0,r3
err4;	stvx	v9,r3,r9
err4;	stvx	v10,r3,r10
err4;	stvx	v11,r3,r11
err4;	stvx	v12,r3,r12
err4;	stvx	v13,r3,r14
err4;	stvx	v14,r3,r15
err4;	stvx	v15,r3,r16
	addi	r3,r3,128
	bdnz	8b

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)

	/* Up to 127B to go */
	clrldi	r5,r5,(64-7)
	srdi	r6,r5,4
	mtocrf	0x01,r6

	bf	cr7*4+1,9f
err3;	lvx	v3,0,r4
	VPERM(v8,v0,v3,v16)
err3;	lvx	v2,r4,r9
	VPERM(v9,v3,v2,v16)
err3;	lvx	v1,r4,r10
	VPERM(v10,v2,v1,v16)
err3;	lvx	v0,r4,r11
	VPERM(v11,v1,v0,v16)
	addi	r4,r4,64
err3;	stvx	v8,0,r3
err3;	stvx	v9,r3,r9
err3;	stvx	v10,r3,r10
err3;	stvx	v11,r3,r11
	addi	r3,r3,64

9:	bf	cr7*4+2,10f
err3;	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
err3;	lvx	v0,r4,r9
	VPERM(v9,v1,v0,v16)
	addi	r4,r4,32
err3;	stvx	v8,0,r3
err3;	stvx	v9,r3,r9
	addi	r3,r3,32

10:	bf	cr7*4+3,11f
err3;	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
	addi	r4,r4,16
err3;	stvx	v8,0,r3
	addi	r3,r3,16

	/* Up to 15B to go */
11:	clrldi	r5,r5,(64-4)
	addi	r4,r4,-16	/* Unwind the +16 load offset */
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
err3;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
err3;	lwz	r6,4(r4)
	addi	r4,r4,8
err3;	stw	r0,0(r3)
err3;	stw	r6,4(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
err3;	lbz	r0,0(r4)
err3;	stb	r0,0(r3)

15:	addi	r1,r1,STACKFRAMESIZE
	b	exit_vmx_usercopy	/* tail call optimise */
#endif /* CONFIG_ALTIVEC */