/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 *
 * Copyright (C) IBM Corporation, 2012
 *
 * Author: Anton Blanchard <anton@au.ibm.com>
 */
#include <asm/ppc_asm.h>

#ifndef SELFTEST_CASE
/* 0 == don't use VMX, 1 == use VMX */
#define SELFTEST_CASE	0
#endif

#ifdef __BIG_ENDIAN__
#define LVS(VRT,RA,RB)		lvsl VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC)	vperm VRT,VRA,VRB,VRC
#else
#define LVS(VRT,RA,RB)		lvsr VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC)	vperm VRT,VRB,VRA,VRC
#endif
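/*
 * On big endian, lvsl generates the permute control vector that merges two
 * adjacent 16B loads of a misaligned source; on little endian the vector
 * element order is reversed, so lvsr is used and the vperm source operands
 * are swapped to get the equivalent realignment.
 */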

_GLOBAL(memcpy_power7)
	cmpldi r5,16
	cmpldi cr1,r5,4096
	std r3,-STACKFRAMESIZE+STK_REG(R31)(r1)	/* save dest for return value */
	blt .Lshort_copy

#ifdef CONFIG_ALTIVEC
test_feature = SELFTEST_CASE
BEGIN_FTR_SECTION
	bgt cr1,.Lvmx_copy
END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
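/*
 * In a kernel build the branch above is patched in when CPU_FTR_ALTIVEC is
 * set; the selftest build is assumed to drive the feature section from
 * test_feature (i.e. SELFTEST_CASE) instead.
 */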
#endif

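/*
 * Scalar (GPR only) copy path, used when the copy is at least 16B but the
 * VMX path is unavailable or not worthwhile.
 */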
.Lnonvmx_copy:
	/* Get the source 8B aligned */
	neg r6,r4
	mtocrf 0x01,r6
	clrldi r6,r6,(64-3)

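	/*
	 * r6 holds the 0-7 bytes needed to 8B-align the source and mtocrf
	 * copied its low bits into CR7, so each bf below either skips or
	 * performs a 1, 2 or 4 byte copy.
	 */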
	bf cr7*4+3,1f
	lbz r0,0(r4)
	addi r4,r4,1
	stb r0,0(r3)
	addi r3,r3,1

1:	bf cr7*4+2,2f
	lhz r0,0(r4)
	addi r4,r4,2
	sth r0,0(r3)
	addi r3,r3,2

2:	bf cr7*4+1,3f
	lwz r0,0(r4)
	addi r4,r4,4
	stw r0,0(r3)
	addi r3,r3,4

3:	sub r5,r5,r6
	cmpldi r5,128
	blt 5f

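	/*
	 * At least 128B remain: set up a stack frame and save LR plus the
	 * non-volatile GPRs (r14-r22) used by the unrolled copy loop.
	 */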
	mflr r0
	stdu r1,-STACKFRAMESIZE(r1)
	std r14,STK_REG(R14)(r1)
	std r15,STK_REG(R15)(r1)
	std r16,STK_REG(R16)(r1)
	std r17,STK_REG(R17)(r1)
	std r18,STK_REG(R18)(r1)
	std r19,STK_REG(R19)(r1)
	std r20,STK_REG(R20)(r1)
	std r21,STK_REG(R21)(r1)
	std r22,STK_REG(R22)(r1)
	std r0,STACKFRAMESIZE+16(r1)

	srdi r6,r5,7
	mtctr r6

	/* Now do cacheline (128B) sized loads and stores. */
	.align 5
4:
	ld r0,0(r4)
	ld r6,8(r4)
	ld r7,16(r4)
	ld r8,24(r4)
	ld r9,32(r4)
	ld r10,40(r4)
	ld r11,48(r4)
	ld r12,56(r4)
	ld r14,64(r4)
	ld r15,72(r4)
	ld r16,80(r4)
	ld r17,88(r4)
	ld r18,96(r4)
	ld r19,104(r4)
	ld r20,112(r4)
	ld r21,120(r4)
	addi r4,r4,128
	std r0,0(r3)
	std r6,8(r3)
	std r7,16(r3)
	std r8,24(r3)
	std r9,32(r3)
	std r10,40(r3)
	std r11,48(r3)
	std r12,56(r3)
	std r14,64(r3)
	std r15,72(r3)
	std r16,80(r3)
	std r17,88(r3)
	std r18,96(r3)
	std r19,104(r3)
	std r20,112(r3)
	std r21,120(r3)
	addi r3,r3,128
	bdnz 4b

	clrldi r5,r5,(64-7)

	ld r14,STK_REG(R14)(r1)
	ld r15,STK_REG(R15)(r1)
	ld r16,STK_REG(R16)(r1)
	ld r17,STK_REG(R17)(r1)
	ld r18,STK_REG(R18)(r1)
	ld r19,STK_REG(R19)(r1)
	ld r20,STK_REG(R20)(r1)
	ld r21,STK_REG(R21)(r1)
	ld r22,STK_REG(R22)(r1)
	addi r1,r1,STACKFRAMESIZE

	/* Up to 127B to go */
5:	srdi r6,r5,4
	mtocrf 0x01,r6
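	/*
	 * CR7 now holds the 64B/32B/16B bits of the remaining length, so the
	 * tests below copy an optional 64B, 32B and 16B block in turn.
	 */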

6:	bf cr7*4+1,7f
	ld r0,0(r4)
	ld r6,8(r4)
	ld r7,16(r4)
	ld r8,24(r4)
	ld r9,32(r4)
	ld r10,40(r4)
	ld r11,48(r4)
	ld r12,56(r4)
	addi r4,r4,64
	std r0,0(r3)
	std r6,8(r3)
	std r7,16(r3)
	std r8,24(r3)
	std r9,32(r3)
	std r10,40(r3)
	std r11,48(r3)
	std r12,56(r3)
	addi r3,r3,64

	/* Up to 63B to go */
7:	bf cr7*4+2,8f
	ld r0,0(r4)
	ld r6,8(r4)
	ld r7,16(r4)
	ld r8,24(r4)
	addi r4,r4,32
	std r0,0(r3)
	std r6,8(r3)
	std r7,16(r3)
	std r8,24(r3)
	addi r3,r3,32

	/* Up to 31B to go */
8:	bf cr7*4+3,9f
	ld r0,0(r4)
	ld r6,8(r4)
	addi r4,r4,16
	std r0,0(r3)
	std r6,8(r3)
	addi r3,r3,16

9:	clrldi r5,r5,(64-4)

	/* Up to 15B to go */
.Lshort_copy:
	mtocrf 0x01,r5
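	/* CR7 holds the 8/4/2/1 byte bits of the final remainder */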
	bf cr7*4+0,12f
	lwz r0,0(r4)	/* Less chance of a reject with word ops */
	lwz r6,4(r4)
	addi r4,r4,8
	stw r0,0(r3)
	stw r6,4(r3)
	addi r3,r3,8

12:	bf cr7*4+1,13f
	lwz r0,0(r4)
	addi r4,r4,4
	stw r0,0(r3)
	addi r3,r3,4

13:	bf cr7*4+2,14f
	lhz r0,0(r4)
	addi r4,r4,2
	sth r0,0(r3)
	addi r3,r3,2

14:	bf cr7*4+3,15f
	lbz r0,0(r4)
	stb r0,0(r3)

15:	ld r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
	blr

.Lunwind_stack_nonvmx_copy:
	addi r1,r1,STACKFRAMESIZE
	b .Lnonvmx_copy

.Lvmx_copy:
#ifdef CONFIG_ALTIVEC
	mflr r0
	std r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
	std r5,-STACKFRAMESIZE+STK_REG(R29)(r1)
	std r0,16(r1)
	stdu r1,-STACKFRAMESIZE(r1)
	bl CFUNC(enter_vmx_ops)
	cmpwi cr1,r3,0
	ld r0,STACKFRAMESIZE+16(r1)
	ld r3,STK_REG(R31)(r1)
	ld r4,STK_REG(R30)(r1)
	ld r5,STK_REG(R29)(r1)
	mtlr r0

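	/*
	 * cr1 records whether enter_vmx_ops() succeeded; a zero return means
	 * we must fall back to the scalar copy, which we do below once the
	 * prefetch streams have been set up.
	 */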
	/*
	 * We prefetch both the source and destination using enhanced touch
	 * instructions. We use a stream ID of 0 for the load side and
	 * 1 for the store side.
	 */
	clrrdi r6,r4,7
	clrrdi r9,r3,7
	ori r9,r9,1		/* stream=1 */

	srdi r7,r5,7		/* length in cachelines, capped at 0x3FF */
	cmpldi r7,0x3FF
	ble 1f
	li r7,0x3FF
1:	lis r0,0x0E00		/* depth=7 */
	sldi r7,r7,7
	or r7,r7,r0
	ori r10,r7,1		/* stream=1 */

	lis r8,0x8000		/* GO=1 */
	clrldi r8,r8,32

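	/*
	 * For each stream the first touch names its start address and the
	 * second programs its length and depth; the final dcbt with the GO
	 * bit set starts both streams.
	 */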
	dcbt 0,r6,0b01000
	dcbt 0,r7,0b01010
	dcbtst 0,r9,0b01000
	dcbtst 0,r10,0b01010
	eieio
	dcbt 0,r8,0b01010	/* GO */

	beq cr1,.Lunwind_stack_nonvmx_copy

	/*
	 * If source and destination are not relatively aligned we use a
	 * slower permute loop.
	 */
	xor r6,r4,r3
	rldicl. r6,r6,0,(64-4)
	bne .Lvmx_unaligned_copy

	/* Get the destination 16B aligned */
	neg r6,r3
	mtocrf 0x01,r6
	clrldi r6,r6,(64-4)

	bf cr7*4+3,1f
	lbz r0,0(r4)
	addi r4,r4,1
	stb r0,0(r3)
	addi r3,r3,1

1:	bf cr7*4+2,2f
	lhz r0,0(r4)
	addi r4,r4,2
	sth r0,0(r3)
	addi r3,r3,2

2:	bf cr7*4+1,3f
	lwz r0,0(r4)
	addi r4,r4,4
	stw r0,0(r3)
	addi r3,r3,4

3:	bf cr7*4+0,4f
	ld r0,0(r4)
	addi r4,r4,8
	std r0,0(r3)
	addi r3,r3,8

4:	sub r5,r5,r6

	/* Get the destination 128B aligned */
	neg r6,r3
	srdi r7,r6,4
	mtocrf 0x01,r7
	clrldi r6,r6,(64-7)
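	/*
	 * CR7 holds the 16B/32B/64B components of the distance to the next
	 * 128B boundary, consumed by the three optional vector copies below.
	 */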

	li r9,16
	li r10,32
	li r11,48

	bf cr7*4+3,5f
	lvx v1,0,r4
	addi r4,r4,16
	stvx v1,0,r3
	addi r3,r3,16

5:	bf cr7*4+2,6f
	lvx v1,0,r4
	lvx v0,r4,r9
	addi r4,r4,32
	stvx v1,0,r3
	stvx v0,r3,r9
	addi r3,r3,32

6:	bf cr7*4+1,7f
	lvx v3,0,r4
	lvx v2,r4,r9
	lvx v1,r4,r10
	lvx v0,r4,r11
	addi r4,r4,64
	stvx v3,0,r3
	stvx v2,r3,r9
	stvx v1,r3,r10
	stvx v0,r3,r11
	addi r3,r3,64

7:	sub r5,r5,r6
	srdi r6,r5,7

	std r14,STK_REG(R14)(r1)
	std r15,STK_REG(R15)(r1)
	std r16,STK_REG(R16)(r1)

	li r12,64
	li r14,80
	li r15,96
	li r16,112

	mtctr r6

	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */
	.align 5
8:
	lvx v7,0,r4
	lvx v6,r4,r9
	lvx v5,r4,r10
	lvx v4,r4,r11
	lvx v3,r4,r12
	lvx v2,r4,r14
	lvx v1,r4,r15
	lvx v0,r4,r16
	addi r4,r4,128
	stvx v7,0,r3
	stvx v6,r3,r9
	stvx v5,r3,r10
	stvx v4,r3,r11
	stvx v3,r3,r12
	stvx v2,r3,r14
	stvx v1,r3,r15
	stvx v0,r3,r16
	addi r3,r3,128
	bdnz 8b

	ld r14,STK_REG(R14)(r1)
	ld r15,STK_REG(R15)(r1)
	ld r16,STK_REG(R16)(r1)

	/* Up to 127B to go */
	clrldi r5,r5,(64-7)
	srdi r6,r5,4
	mtocrf 0x01,r6

	bf cr7*4+1,9f
	lvx v3,0,r4
	lvx v2,r4,r9
	lvx v1,r4,r10
	lvx v0,r4,r11
	addi r4,r4,64
	stvx v3,0,r3
	stvx v2,r3,r9
	stvx v1,r3,r10
	stvx v0,r3,r11
	addi r3,r3,64

9:	bf cr7*4+2,10f
	lvx v1,0,r4
	lvx v0,r4,r9
	addi r4,r4,32
	stvx v1,0,r3
	stvx v0,r3,r9
	addi r3,r3,32

10:	bf cr7*4+3,11f
	lvx v1,0,r4
	addi r4,r4,16
	stvx v1,0,r3
	addi r3,r3,16

	/* Up to 15B to go */
11:	clrldi r5,r5,(64-4)
	mtocrf 0x01,r5
	bf cr7*4+0,12f
	ld r0,0(r4)
	addi r4,r4,8
	std r0,0(r3)
	addi r3,r3,8

12:	bf cr7*4+1,13f
	lwz r0,0(r4)
	addi r4,r4,4
	stw r0,0(r3)
	addi r3,r3,4

13:	bf cr7*4+2,14f
	lhz r0,0(r4)
	addi r4,r4,2
	sth r0,0(r3)
	addi r3,r3,2

14:	bf cr7*4+3,15f
	lbz r0,0(r4)
	stb r0,0(r3)

15:	addi r1,r1,STACKFRAMESIZE
	ld r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
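	/*
	 * exit_vmx_ops() is expected to hand the dest pointer back in r3,
	 * so the tail call preserves memcpy's return value.
	 */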
	b CFUNC(exit_vmx_ops)	/* tail call optimise */

.Lvmx_unaligned_copy:
	/* Get the destination 16B aligned */
	neg r6,r3
	mtocrf 0x01,r6
	clrldi r6,r6,(64-4)

	bf cr7*4+3,1f
	lbz r0,0(r4)
	addi r4,r4,1
	stb r0,0(r3)
	addi r3,r3,1

1:	bf cr7*4+2,2f
	lhz r0,0(r4)
	addi r4,r4,2
	sth r0,0(r3)
	addi r3,r3,2

2:	bf cr7*4+1,3f
	lwz r0,0(r4)
	addi r4,r4,4
	stw r0,0(r3)
	addi r3,r3,4

3:	bf cr7*4+0,4f
	lwz r0,0(r4)	/* Less chance of a reject with word ops */
	lwz r7,4(r4)
	addi r4,r4,8
	stw r0,0(r3)
	stw r7,4(r3)
	addi r3,r3,8

4:	sub r5,r5,r6

	/* Get the destination 128B aligned */
	neg r6,r3
	srdi r7,r6,4
	mtocrf 0x01,r7
	clrldi r6,r6,(64-7)

	li r9,16
	li r10,32
	li r11,48

	LVS(v16,0,r4)		/* Setup permute control vector */
	lvx v0,0,r4
	addi r4,r4,16
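	/*
	 * v16 is the permute control for the source misalignment. From here
	 * on v0 carries the previous 16B load and each VPERM merges it with
	 * the next one, so r4 runs 16B ahead of the data actually stored
	 * (undone by the "addi r4,r4,-16" in the tail).
	 */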

	bf cr7*4+3,5f
	lvx v1,0,r4
	VPERM(v8,v0,v1,v16)
	addi r4,r4,16
	stvx v8,0,r3
	addi r3,r3,16
	vor v0,v1,v1

5:	bf cr7*4+2,6f
	lvx v1,0,r4
	VPERM(v8,v0,v1,v16)
	lvx v0,r4,r9
	VPERM(v9,v1,v0,v16)
	addi r4,r4,32
	stvx v8,0,r3
	stvx v9,r3,r9
	addi r3,r3,32

6:	bf cr7*4+1,7f
	lvx v3,0,r4
	VPERM(v8,v0,v3,v16)
	lvx v2,r4,r9
	VPERM(v9,v3,v2,v16)
	lvx v1,r4,r10
	VPERM(v10,v2,v1,v16)
	lvx v0,r4,r11
	VPERM(v11,v1,v0,v16)
	addi r4,r4,64
	stvx v8,0,r3
	stvx v9,r3,r9
	stvx v10,r3,r10
	stvx v11,r3,r11
	addi r3,r3,64

7:	sub r5,r5,r6
	srdi r6,r5,7

	std r14,STK_REG(R14)(r1)
	std r15,STK_REG(R15)(r1)
	std r16,STK_REG(R16)(r1)

	li r12,64
	li r14,80
	li r15,96
	li r16,112

	mtctr r6

	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */
	.align 5
8:
	lvx v7,0,r4
	VPERM(v8,v0,v7,v16)
	lvx v6,r4,r9
	VPERM(v9,v7,v6,v16)
	lvx v5,r4,r10
	VPERM(v10,v6,v5,v16)
	lvx v4,r4,r11
	VPERM(v11,v5,v4,v16)
	lvx v3,r4,r12
	VPERM(v12,v4,v3,v16)
	lvx v2,r4,r14
	VPERM(v13,v3,v2,v16)
	lvx v1,r4,r15
	VPERM(v14,v2,v1,v16)
	lvx v0,r4,r16
	VPERM(v15,v1,v0,v16)
	addi r4,r4,128
	stvx v8,0,r3
	stvx v9,r3,r9
	stvx v10,r3,r10
	stvx v11,r3,r11
	stvx v12,r3,r12
	stvx v13,r3,r14
	stvx v14,r3,r15
	stvx v15,r3,r16
	addi r3,r3,128
	bdnz 8b

	ld r14,STK_REG(R14)(r1)
	ld r15,STK_REG(R15)(r1)
	ld r16,STK_REG(R16)(r1)

	/* Up to 127B to go */
	clrldi r5,r5,(64-7)
	srdi r6,r5,4
	mtocrf 0x01,r6

	bf cr7*4+1,9f
	lvx v3,0,r4
	VPERM(v8,v0,v3,v16)
	lvx v2,r4,r9
	VPERM(v9,v3,v2,v16)
	lvx v1,r4,r10
	VPERM(v10,v2,v1,v16)
	lvx v0,r4,r11
	VPERM(v11,v1,v0,v16)
	addi r4,r4,64
	stvx v8,0,r3
	stvx v9,r3,r9
	stvx v10,r3,r10
	stvx v11,r3,r11
	addi r3,r3,64

9:	bf cr7*4+2,10f
	lvx v1,0,r4
	VPERM(v8,v0,v1,v16)
	lvx v0,r4,r9
	VPERM(v9,v1,v0,v16)
	addi r4,r4,32
	stvx v8,0,r3
	stvx v9,r3,r9
	addi r3,r3,32

10:	bf cr7*4+3,11f
	lvx v1,0,r4
	VPERM(v8,v0,v1,v16)
	addi r4,r4,16
	stvx v8,0,r3
	addi r3,r3,16

	/* Up to 15B to go */
11:	clrldi r5,r5,(64-4)
	addi r4,r4,-16	/* Unwind the +16 load offset */
	mtocrf 0x01,r5
	bf cr7*4+0,12f
	lwz r0,0(r4)	/* Less chance of a reject with word ops */
	lwz r6,4(r4)
	addi r4,r4,8
	stw r0,0(r3)
	stw r6,4(r3)
	addi r3,r3,8

12:	bf cr7*4+1,13f
	lwz r0,0(r4)
	addi r4,r4,4
	stw r0,0(r3)
	addi r3,r3,4

13:	bf cr7*4+2,14f
	lhz r0,0(r4)
	addi r4,r4,2
	sth r0,0(r3)
	addi r3,r3,2

14:	bf cr7*4+3,15f
	lbz r0,0(r4)
	stb r0,0(r3)

15:	addi r1,r1,STACKFRAMESIZE
	ld r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
	b CFUNC(exit_vmx_ops)	/* tail call optimise */
#endif /* CONFIG_ALTIVEC */