/*
 * Copyright (C) 2008-2009 Michal Simek <monstr@monstr.eu>
 * Copyright (C) 2008-2009 PetaLogix
 * Copyright (C) 2008 Jim Law - Iris LP  All rights reserved.
 *
 * This file is subject to the terms and conditions of the GNU General
 * Public License.  See the file COPYING in the main directory of this
 * archive for more details.
 *
 * Written by Jim Law <jlaw@irispower.com>
 *
 * intended to replace:
 *	memcpy in memcpy.c and
 *	memmove in memmove.c
 * ... in arch/microblaze/lib
 *
 *
 * assly_fastcopy.S
 *
 * Attempt at quicker memcpy and memmove for MicroBlaze
 *	Input :	Operand1 in Reg r5 - destination address
 *		Operand2 in Reg r6 - source address
 *		Operand3 in Reg r7 - number of bytes to transfer
 *	Output: Result in Reg r3 - starting destination address
 *
 *
 * Explanation:
 *	Perform (possibly unaligned) copy of a block of memory
 *	between mem locations with size of xfer spec'd in bytes
 */

32#include <linux/linkage.h>
33 .text
34 .globl memcpy
35 .type memcpy, @function
36 .ent memcpy
37
38memcpy:
39fast_memcpy_ascending:
40 /* move d to return register as value of function */
41 addi r3, r5, 0
42
43 addi r4, r0, 4 /* n = 4 */
44 cmpu r4, r4, r7 /* n = c - n (unsigned) */
45 blti r4, a_xfer_end /* if n < 0, less than one word to transfer */
46
47 /* transfer first 0~3 bytes to get aligned dest address */
48 andi r4, r5, 3 /* n = d & 3 */
49 /* if zero, destination already aligned */
50 beqi r4, a_dalign_done
51 /* n = 4 - n (yields 3, 2, 1 transfers for 1, 2, 3 addr offset) */
52 rsubi r4, r4, 4
53 rsub r7, r4, r7 /* c = c - n adjust c */
54
55a_xfer_first_loop:
56 /* if no bytes left to transfer, transfer the bulk */
57 beqi r4, a_dalign_done
58 lbui r11, r6, 0 /* h = *s */
59 sbi r11, r5, 0 /* *d = h */
60 addi r6, r6, 1 /* s++ */
61 addi r5, r5, 1 /* d++ */
62 brid a_xfer_first_loop /* loop */
63 addi r4, r4, -1 /* n-- (IN DELAY SLOT) */
64
65a_dalign_done:
66 addi r4, r0, 32 /* n = 32 */
67 cmpu r4, r4, r7 /* n = c - n (unsigned) */
68 /* if n < 0, less than one block to transfer */
69 blti r4, a_block_done
70
71a_block_xfer:
72 andi r4, r7, 0xffffffe0 /* n = c & ~31 */
73 rsub r7, r4, r7 /* c = c - n */
74
75 andi r9, r6, 3 /* t1 = s & 3 */
76 /* if temp != 0, unaligned transfers needed */
77 bnei r9, a_block_unaligned
78
79a_block_aligned:
80 lwi r9, r6, 0 /* t1 = *(s + 0) */
81 lwi r10, r6, 4 /* t2 = *(s + 4) */
82 lwi r11, r6, 8 /* t3 = *(s + 8) */
83 lwi r12, r6, 12 /* t4 = *(s + 12) */
84 swi r9, r5, 0 /* *(d + 0) = t1 */
85 swi r10, r5, 4 /* *(d + 4) = t2 */
86 swi r11, r5, 8 /* *(d + 8) = t3 */
87 swi r12, r5, 12 /* *(d + 12) = t4 */
88 lwi r9, r6, 16 /* t1 = *(s + 16) */
89 lwi r10, r6, 20 /* t2 = *(s + 20) */
90 lwi r11, r6, 24 /* t3 = *(s + 24) */
91 lwi r12, r6, 28 /* t4 = *(s + 28) */
92 swi r9, r5, 16 /* *(d + 16) = t1 */
93 swi r10, r5, 20 /* *(d + 20) = t2 */
94 swi r11, r5, 24 /* *(d + 24) = t3 */
95 swi r12, r5, 28 /* *(d + 28) = t4 */
96 addi r6, r6, 32 /* s = s + 32 */
97 addi r4, r4, -32 /* n = n - 32 */
98 bneid r4, a_block_aligned /* while (n) loop */
99 addi r5, r5, 32 /* d = d + 32 (IN DELAY SLOT) */
100 bri a_block_done
101
102a_block_unaligned:
103 andi r8, r6, 0xfffffffc /* as = s & ~3 */
104 add r6, r6, r4 /* s = s + n */
105 lwi r11, r8, 0 /* h = *(as + 0) */
106
107 addi r9, r9, -1
108 beqi r9, a_block_u1 /* t1 was 1 => 1 byte offset */
109 addi r9, r9, -1
110 beqi r9, a_block_u2 /* t1 was 2 => 2 byte offset */
111
112a_block_u3:
113 bslli r11, r11, 24 /* h = h << 24 */
114a_bu3_loop:
115 lwi r12, r8, 4 /* v = *(as + 4) */
116 bsrli r9, r12, 8 /* t1 = v >> 8 */
117 or r9, r11, r9 /* t1 = h | t1 */
118 swi r9, r5, 0 /* *(d + 0) = t1 */
119 bslli r11, r12, 24 /* h = v << 24 */
120 lwi r12, r8, 8 /* v = *(as + 8) */
121 bsrli r9, r12, 8 /* t1 = v >> 8 */
122 or r9, r11, r9 /* t1 = h | t1 */
123 swi r9, r5, 4 /* *(d + 4) = t1 */
124 bslli r11, r12, 24 /* h = v << 24 */
125 lwi r12, r8, 12 /* v = *(as + 12) */
126 bsrli r9, r12, 8 /* t1 = v >> 8 */
127 or r9, r11, r9 /* t1 = h | t1 */
128 swi r9, r5, 8 /* *(d + 8) = t1 */
129 bslli r11, r12, 24 /* h = v << 24 */
130 lwi r12, r8, 16 /* v = *(as + 16) */
131 bsrli r9, r12, 8 /* t1 = v >> 8 */
132 or r9, r11, r9 /* t1 = h | t1 */
133 swi r9, r5, 12 /* *(d + 12) = t1 */
134 bslli r11, r12, 24 /* h = v << 24 */
135 lwi r12, r8, 20 /* v = *(as + 20) */
136 bsrli r9, r12, 8 /* t1 = v >> 8 */
137 or r9, r11, r9 /* t1 = h | t1 */
138 swi r9, r5, 16 /* *(d + 16) = t1 */
139 bslli r11, r12, 24 /* h = v << 24 */
140 lwi r12, r8, 24 /* v = *(as + 24) */
141 bsrli r9, r12, 8 /* t1 = v >> 8 */
142 or r9, r11, r9 /* t1 = h | t1 */
143 swi r9, r5, 20 /* *(d + 20) = t1 */
144 bslli r11, r12, 24 /* h = v << 24 */
145 lwi r12, r8, 28 /* v = *(as + 28) */
146 bsrli r9, r12, 8 /* t1 = v >> 8 */
147 or r9, r11, r9 /* t1 = h | t1 */
148 swi r9, r5, 24 /* *(d + 24) = t1 */
149 bslli r11, r12, 24 /* h = v << 24 */
150 lwi r12, r8, 32 /* v = *(as + 32) */
151 bsrli r9, r12, 8 /* t1 = v >> 8 */
152 or r9, r11, r9 /* t1 = h | t1 */
153 swi r9, r5, 28 /* *(d + 28) = t1 */
154 bslli r11, r12, 24 /* h = v << 24 */
155 addi r8, r8, 32 /* as = as + 32 */
156 addi r4, r4, -32 /* n = n - 32 */
157 bneid r4, a_bu3_loop /* while (n) loop */
158 addi r5, r5, 32 /* d = d + 32 (IN DELAY SLOT) */
159 bri a_block_done
160
161a_block_u1:
162 bslli r11, r11, 8 /* h = h << 8 */
163a_bu1_loop:
164 lwi r12, r8, 4 /* v = *(as + 4) */
165 bsrli r9, r12, 24 /* t1 = v >> 24 */
166 or r9, r11, r9 /* t1 = h | t1 */
167 swi r9, r5, 0 /* *(d + 0) = t1 */
168 bslli r11, r12, 8 /* h = v << 8 */
169 lwi r12, r8, 8 /* v = *(as + 8) */
170 bsrli r9, r12, 24 /* t1 = v >> 24 */
171 or r9, r11, r9 /* t1 = h | t1 */
172 swi r9, r5, 4 /* *(d + 4) = t1 */
173 bslli r11, r12, 8 /* h = v << 8 */
174 lwi r12, r8, 12 /* v = *(as + 12) */
175 bsrli r9, r12, 24 /* t1 = v >> 24 */
176 or r9, r11, r9 /* t1 = h | t1 */
177 swi r9, r5, 8 /* *(d + 8) = t1 */
178 bslli r11, r12, 8 /* h = v << 8 */
179 lwi r12, r8, 16 /* v = *(as + 16) */
180 bsrli r9, r12, 24 /* t1 = v >> 24 */
181 or r9, r11, r9 /* t1 = h | t1 */
182 swi r9, r5, 12 /* *(d + 12) = t1 */
183 bslli r11, r12, 8 /* h = v << 8 */
184 lwi r12, r8, 20 /* v = *(as + 20) */
185 bsrli r9, r12, 24 /* t1 = v >> 24 */
186 or r9, r11, r9 /* t1 = h | t1 */
187 swi r9, r5, 16 /* *(d + 16) = t1 */
188 bslli r11, r12, 8 /* h = v << 8 */
189 lwi r12, r8, 24 /* v = *(as + 24) */
190 bsrli r9, r12, 24 /* t1 = v >> 24 */
191 or r9, r11, r9 /* t1 = h | t1 */
192 swi r9, r5, 20 /* *(d + 20) = t1 */
193 bslli r11, r12, 8 /* h = v << 8 */
194 lwi r12, r8, 28 /* v = *(as + 28) */
195 bsrli r9, r12, 24 /* t1 = v >> 24 */
196 or r9, r11, r9 /* t1 = h | t1 */
197 swi r9, r5, 24 /* *(d + 24) = t1 */
198 bslli r11, r12, 8 /* h = v << 8 */
199 lwi r12, r8, 32 /* v = *(as + 32) */
200 bsrli r9, r12, 24 /* t1 = v >> 24 */
201 or r9, r11, r9 /* t1 = h | t1 */
202 swi r9, r5, 28 /* *(d + 28) = t1 */
203 bslli r11, r12, 8 /* h = v << 8 */
204 addi r8, r8, 32 /* as = as + 32 */
205 addi r4, r4, -32 /* n = n - 32 */
206 bneid r4, a_bu1_loop /* while (n) loop */
207 addi r5, r5, 32 /* d = d + 32 (IN DELAY SLOT) */
208 bri a_block_done
209
210a_block_u2:
211 bslli r11, r11, 16 /* h = h << 16 */
212a_bu2_loop:
213 lwi r12, r8, 4 /* v = *(as + 4) */
214 bsrli r9, r12, 16 /* t1 = v >> 16 */
215 or r9, r11, r9 /* t1 = h | t1 */
216 swi r9, r5, 0 /* *(d + 0) = t1 */
217 bslli r11, r12, 16 /* h = v << 16 */
218 lwi r12, r8, 8 /* v = *(as + 8) */
219 bsrli r9, r12, 16 /* t1 = v >> 16 */
220 or r9, r11, r9 /* t1 = h | t1 */
221 swi r9, r5, 4 /* *(d + 4) = t1 */
222 bslli r11, r12, 16 /* h = v << 16 */
223 lwi r12, r8, 12 /* v = *(as + 12) */
224 bsrli r9, r12, 16 /* t1 = v >> 16 */
225 or r9, r11, r9 /* t1 = h | t1 */
226 swi r9, r5, 8 /* *(d + 8) = t1 */
227 bslli r11, r12, 16 /* h = v << 16 */
228 lwi r12, r8, 16 /* v = *(as + 16) */
229 bsrli r9, r12, 16 /* t1 = v >> 16 */
230 or r9, r11, r9 /* t1 = h | t1 */
231 swi r9, r5, 12 /* *(d + 12) = t1 */
232 bslli r11, r12, 16 /* h = v << 16 */
233 lwi r12, r8, 20 /* v = *(as + 20) */
234 bsrli r9, r12, 16 /* t1 = v >> 16 */
235 or r9, r11, r9 /* t1 = h | t1 */
236 swi r9, r5, 16 /* *(d + 16) = t1 */
237 bslli r11, r12, 16 /* h = v << 16 */
238 lwi r12, r8, 24 /* v = *(as + 24) */
239 bsrli r9, r12, 16 /* t1 = v >> 16 */
240 or r9, r11, r9 /* t1 = h | t1 */
241 swi r9, r5, 20 /* *(d + 20) = t1 */
242 bslli r11, r12, 16 /* h = v << 16 */
243 lwi r12, r8, 28 /* v = *(as + 28) */
244 bsrli r9, r12, 16 /* t1 = v >> 16 */
245 or r9, r11, r9 /* t1 = h | t1 */
246 swi r9, r5, 24 /* *(d + 24) = t1 */
247 bslli r11, r12, 16 /* h = v << 16 */
248 lwi r12, r8, 32 /* v = *(as + 32) */
249 bsrli r9, r12, 16 /* t1 = v >> 16 */
250 or r9, r11, r9 /* t1 = h | t1 */
251 swi r9, r5, 28 /* *(d + 28) = t1 */
252 bslli r11, r12, 16 /* h = v << 16 */
253 addi r8, r8, 32 /* as = as + 32 */
254 addi r4, r4, -32 /* n = n - 32 */
255 bneid r4, a_bu2_loop /* while (n) loop */
256 addi r5, r5, 32 /* d = d + 32 (IN DELAY SLOT) */
257
258a_block_done:
259 addi r4, r0, 4 /* n = 4 */
260 cmpu r4, r4, r7 /* n = c - n (unsigned) */
261 blti r4, a_xfer_end /* if n < 0, less than one word to transfer */
262
263a_word_xfer:
264 andi r4, r7, 0xfffffffc /* n = c & ~3 */
265 addi r10, r0, 0 /* offset = 0 */
266
267 andi r9, r6, 3 /* t1 = s & 3 */
268 /* if temp != 0, unaligned transfers needed */
269 bnei r9, a_word_unaligned
270
271a_word_aligned:
272 lw r9, r6, r10 /* t1 = *(s+offset) */
273 sw r9, r5, r10 /* *(d+offset) = t1 */
274 addi r4, r4,-4 /* n-- */
275 bneid r4, a_word_aligned /* loop */
276 addi r10, r10, 4 /* offset++ (IN DELAY SLOT) */
277
278 bri a_word_done
279
280a_word_unaligned:
281 andi r8, r6, 0xfffffffc /* as = s & ~3 */
282 lwi r11, r8, 0 /* h = *(as + 0) */
283 addi r8, r8, 4 /* as = as + 4 */
284
285 addi r9, r9, -1
286 beqi r9, a_word_u1 /* t1 was 1 => 1 byte offset */
287 addi r9, r9, -1
288 beqi r9, a_word_u2 /* t1 was 2 => 2 byte offset */
289
290a_word_u3:
291 bslli r11, r11, 24 /* h = h << 24 */
292a_wu3_loop:
293 lw r12, r8, r10 /* v = *(as + offset) */
294 bsrli r9, r12, 8 /* t1 = v >> 8 */
295 or r9, r11, r9 /* t1 = h | t1 */
296 sw r9, r5, r10 /* *(d + offset) = t1 */
297 bslli r11, r12, 24 /* h = v << 24 */
298 addi r4, r4,-4 /* n = n - 4 */
299 bneid r4, a_wu3_loop /* while (n) loop */
300 addi r10, r10, 4 /* offset = ofset + 4 (IN DELAY SLOT) */
301
302 bri a_word_done
303
304a_word_u1:
305 bslli r11, r11, 8 /* h = h << 8 */
306a_wu1_loop:
307 lw r12, r8, r10 /* v = *(as + offset) */
308 bsrli r9, r12, 24 /* t1 = v >> 24 */
309 or r9, r11, r9 /* t1 = h | t1 */
310 sw r9, r5, r10 /* *(d + offset) = t1 */
311 bslli r11, r12, 8 /* h = v << 8 */
312 addi r4, r4,-4 /* n = n - 4 */
313 bneid r4, a_wu1_loop /* while (n) loop */
314 addi r10, r10, 4 /* offset = ofset + 4 (IN DELAY SLOT) */
315
316 bri a_word_done
317
318a_word_u2:
319 bslli r11, r11, 16 /* h = h << 16 */
320a_wu2_loop:
321 lw r12, r8, r10 /* v = *(as + offset) */
322 bsrli r9, r12, 16 /* t1 = v >> 16 */
323 or r9, r11, r9 /* t1 = h | t1 */
324 sw r9, r5, r10 /* *(d + offset) = t1 */
325 bslli r11, r12, 16 /* h = v << 16 */
326 addi r4, r4,-4 /* n = n - 4 */
327 bneid r4, a_wu2_loop /* while (n) loop */
328 addi r10, r10, 4 /* offset = ofset + 4 (IN DELAY SLOT) */
329
330a_word_done:
331 add r5, r5, r10 /* d = d + offset */
332 add r6, r6, r10 /* s = s + offset */
333 rsub r7, r10, r7 /* c = c - offset */
334
335a_xfer_end:
336a_xfer_end_loop:
337 beqi r7, a_done /* while (c) */
338 lbui r9, r6, 0 /* t1 = *s */
339 addi r6, r6, 1 /* s++ */
340 sbi r9, r5, 0 /* *d = t1 */
341 addi r7, r7, -1 /* c-- */
342 brid a_xfer_end_loop /* loop */
343 addi r5, r5, 1 /* d++ (IN DELAY SLOT) */
344
345a_done:
346 rtsd r15, 8
347 nop
348
349.size memcpy, . - memcpy
350.end memcpy
351/*----------------------------------------------------------------------------*/
352 .globl memmove
353 .type memmove, @function
354 .ent memmove
355
356memmove:
357 cmpu r4, r5, r6 /* n = s - d */
358 bgei r4,fast_memcpy_ascending
359
360fast_memcpy_descending:
361 /* move d to return register as value of function */
362 addi r3, r5, 0
363
364 add r5, r5, r7 /* d = d + c */
365 add r6, r6, r7 /* s = s + c */
366
367 addi r4, r0, 4 /* n = 4 */
368 cmpu r4, r4, r7 /* n = c - n (unsigned) */
369 blti r4,d_xfer_end /* if n < 0, less than one word to transfer */
370
371 /* transfer first 0~3 bytes to get aligned dest address */
372 andi r4, r5, 3 /* n = d & 3 */
373 /* if zero, destination already aligned */
374 beqi r4,d_dalign_done
375 rsub r7, r4, r7 /* c = c - n adjust c */
376
377d_xfer_first_loop:
378 /* if no bytes left to transfer, transfer the bulk */
379 beqi r4,d_dalign_done
380 addi r6, r6, -1 /* s-- */
381 addi r5, r5, -1 /* d-- */
382 lbui r11, r6, 0 /* h = *s */
383 sbi r11, r5, 0 /* *d = h */
384 brid d_xfer_first_loop /* loop */
385 addi r4, r4, -1 /* n-- (IN DELAY SLOT) */
386
387d_dalign_done:
388 addi r4, r0, 32 /* n = 32 */
389 cmpu r4, r4, r7 /* n = c - n (unsigned) */
390 /* if n < 0, less than one block to transfer */
391 blti r4, d_block_done
392
393d_block_xfer:
394 andi r4, r7, 0xffffffe0 /* n = c & ~31 */
395 rsub r7, r4, r7 /* c = c - n */
396
397 andi r9, r6, 3 /* t1 = s & 3 */
398 /* if temp != 0, unaligned transfers needed */
399 bnei r9, d_block_unaligned
400
401d_block_aligned:
402 addi r6, r6, -32 /* s = s - 32 */
403 addi r5, r5, -32 /* d = d - 32 */
404 lwi r9, r6, 28 /* t1 = *(s + 28) */
405 lwi r10, r6, 24 /* t2 = *(s + 24) */
406 lwi r11, r6, 20 /* t3 = *(s + 20) */
407 lwi r12, r6, 16 /* t4 = *(s + 16) */
408 swi r9, r5, 28 /* *(d + 28) = t1 */
409 swi r10, r5, 24 /* *(d + 24) = t2 */
410 swi r11, r5, 20 /* *(d + 20) = t3 */
411 swi r12, r5, 16 /* *(d + 16) = t4 */
412 lwi r9, r6, 12 /* t1 = *(s + 12) */
413 lwi r10, r6, 8 /* t2 = *(s + 8) */
414 lwi r11, r6, 4 /* t3 = *(s + 4) */
415 lwi r12, r6, 0 /* t4 = *(s + 0) */
416 swi r9, r5, 12 /* *(d + 12) = t1 */
417 swi r10, r5, 8 /* *(d + 8) = t2 */
418 swi r11, r5, 4 /* *(d + 4) = t3 */
419 addi r4, r4, -32 /* n = n - 32 */
420 bneid r4, d_block_aligned /* while (n) loop */
421 swi r12, r5, 0 /* *(d + 0) = t4 (IN DELAY SLOT) */
422 bri d_block_done
423
424d_block_unaligned:
425 andi r8, r6, 0xfffffffc /* as = s & ~3 */
426 rsub r6, r4, r6 /* s = s - n */
427 lwi r11, r8, 0 /* h = *(as + 0) */
428
429 addi r9, r9, -1
430 beqi r9,d_block_u1 /* t1 was 1 => 1 byte offset */
431 addi r9, r9, -1
432 beqi r9,d_block_u2 /* t1 was 2 => 2 byte offset */
433
434d_block_u3:
435 bsrli r11, r11, 8 /* h = h >> 8 */
436d_bu3_loop:
437 addi r8, r8, -32 /* as = as - 32 */
438 addi r5, r5, -32 /* d = d - 32 */
439 lwi r12, r8, 28 /* v = *(as + 28) */
440 bslli r9, r12, 24 /* t1 = v << 24 */
441 or r9, r11, r9 /* t1 = h | t1 */
442 swi r9, r5, 28 /* *(d + 28) = t1 */
443 bsrli r11, r12, 8 /* h = v >> 8 */
444 lwi r12, r8, 24 /* v = *(as + 24) */
445 bslli r9, r12, 24 /* t1 = v << 24 */
446 or r9, r11, r9 /* t1 = h | t1 */
447 swi r9, r5, 24 /* *(d + 24) = t1 */
448 bsrli r11, r12, 8 /* h = v >> 8 */
449 lwi r12, r8, 20 /* v = *(as + 20) */
450 bslli r9, r12, 24 /* t1 = v << 24 */
451 or r9, r11, r9 /* t1 = h | t1 */
452 swi r9, r5, 20 /* *(d + 20) = t1 */
453 bsrli r11, r12, 8 /* h = v >> 8 */
454 lwi r12, r8, 16 /* v = *(as + 16) */
455 bslli r9, r12, 24 /* t1 = v << 24 */
456 or r9, r11, r9 /* t1 = h | t1 */
457 swi r9, r5, 16 /* *(d + 16) = t1 */
458 bsrli r11, r12, 8 /* h = v >> 8 */
459 lwi r12, r8, 12 /* v = *(as + 12) */
460 bslli r9, r12, 24 /* t1 = v << 24 */
461 or r9, r11, r9 /* t1 = h | t1 */
462 swi r9, r5, 12 /* *(d + 112) = t1 */
463 bsrli r11, r12, 8 /* h = v >> 8 */
464 lwi r12, r8, 8 /* v = *(as + 8) */
465 bslli r9, r12, 24 /* t1 = v << 24 */
466 or r9, r11, r9 /* t1 = h | t1 */
467 swi r9, r5, 8 /* *(d + 8) = t1 */
468 bsrli r11, r12, 8 /* h = v >> 8 */
469 lwi r12, r8, 4 /* v = *(as + 4) */
470 bslli r9, r12, 24 /* t1 = v << 24 */
471 or r9, r11, r9 /* t1 = h | t1 */
472 swi r9, r5, 4 /* *(d + 4) = t1 */
473 bsrli r11, r12, 8 /* h = v >> 8 */
474 lwi r12, r8, 0 /* v = *(as + 0) */
475 bslli r9, r12, 24 /* t1 = v << 24 */
476 or r9, r11, r9 /* t1 = h | t1 */
477 swi r9, r5, 0 /* *(d + 0) = t1 */
478 addi r4, r4, -32 /* n = n - 32 */
479 bneid r4, d_bu3_loop /* while (n) loop */
480 bsrli r11, r12, 8 /* h = v >> 8 (IN DELAY SLOT) */
481 bri d_block_done
482
483d_block_u1:
484 bsrli r11, r11, 24 /* h = h >> 24 */
485d_bu1_loop:
486 addi r8, r8, -32 /* as = as - 32 */
487 addi r5, r5, -32 /* d = d - 32 */
488 lwi r12, r8, 28 /* v = *(as + 28) */
489 bslli r9, r12, 8 /* t1 = v << 8 */
490 or r9, r11, r9 /* t1 = h | t1 */
491 swi r9, r5, 28 /* *(d + 28) = t1 */
492 bsrli r11, r12, 24 /* h = v >> 24 */
493 lwi r12, r8, 24 /* v = *(as + 24) */
494 bslli r9, r12, 8 /* t1 = v << 8 */
495 or r9, r11, r9 /* t1 = h | t1 */
496 swi r9, r5, 24 /* *(d + 24) = t1 */
497 bsrli r11, r12, 24 /* h = v >> 24 */
498 lwi r12, r8, 20 /* v = *(as + 20) */
499 bslli r9, r12, 8 /* t1 = v << 8 */
500 or r9, r11, r9 /* t1 = h | t1 */
501 swi r9, r5, 20 /* *(d + 20) = t1 */
502 bsrli r11, r12, 24 /* h = v >> 24 */
503 lwi r12, r8, 16 /* v = *(as + 16) */
504 bslli r9, r12, 8 /* t1 = v << 8 */
505 or r9, r11, r9 /* t1 = h | t1 */
506 swi r9, r5, 16 /* *(d + 16) = t1 */
507 bsrli r11, r12, 24 /* h = v >> 24 */
508 lwi r12, r8, 12 /* v = *(as + 12) */
509 bslli r9, r12, 8 /* t1 = v << 8 */
510 or r9, r11, r9 /* t1 = h | t1 */
511 swi r9, r5, 12 /* *(d + 112) = t1 */
512 bsrli r11, r12, 24 /* h = v >> 24 */
513 lwi r12, r8, 8 /* v = *(as + 8) */
514 bslli r9, r12, 8 /* t1 = v << 8 */
515 or r9, r11, r9 /* t1 = h | t1 */
516 swi r9, r5, 8 /* *(d + 8) = t1 */
517 bsrli r11, r12, 24 /* h = v >> 24 */
518 lwi r12, r8, 4 /* v = *(as + 4) */
519 bslli r9, r12, 8 /* t1 = v << 8 */
520 or r9, r11, r9 /* t1 = h | t1 */
521 swi r9, r5, 4 /* *(d + 4) = t1 */
522 bsrli r11, r12, 24 /* h = v >> 24 */
523 lwi r12, r8, 0 /* v = *(as + 0) */
524 bslli r9, r12, 8 /* t1 = v << 8 */
525 or r9, r11, r9 /* t1 = h | t1 */
526 swi r9, r5, 0 /* *(d + 0) = t1 */
527 addi r4, r4, -32 /* n = n - 32 */
528 bneid r4, d_bu1_loop /* while (n) loop */
529 bsrli r11, r12, 24 /* h = v >> 24 (IN DELAY SLOT) */
530 bri d_block_done
531
532d_block_u2:
533 bsrli r11, r11, 16 /* h = h >> 16 */
534d_bu2_loop:
535 addi r8, r8, -32 /* as = as - 32 */
536 addi r5, r5, -32 /* d = d - 32 */
537 lwi r12, r8, 28 /* v = *(as + 28) */
538 bslli r9, r12, 16 /* t1 = v << 16 */
539 or r9, r11, r9 /* t1 = h | t1 */
540 swi r9, r5, 28 /* *(d + 28) = t1 */
541 bsrli r11, r12, 16 /* h = v >> 16 */
542 lwi r12, r8, 24 /* v = *(as + 24) */
543 bslli r9, r12, 16 /* t1 = v << 16 */
544 or r9, r11, r9 /* t1 = h | t1 */
545 swi r9, r5, 24 /* *(d + 24) = t1 */
546 bsrli r11, r12, 16 /* h = v >> 16 */
547 lwi r12, r8, 20 /* v = *(as + 20) */
548 bslli r9, r12, 16 /* t1 = v << 16 */
549 or r9, r11, r9 /* t1 = h | t1 */
550 swi r9, r5, 20 /* *(d + 20) = t1 */
551 bsrli r11, r12, 16 /* h = v >> 16 */
552 lwi r12, r8, 16 /* v = *(as + 16) */
553 bslli r9, r12, 16 /* t1 = v << 16 */
554 or r9, r11, r9 /* t1 = h | t1 */
555 swi r9, r5, 16 /* *(d + 16) = t1 */
556 bsrli r11, r12, 16 /* h = v >> 16 */
557 lwi r12, r8, 12 /* v = *(as + 12) */
558 bslli r9, r12, 16 /* t1 = v << 16 */
559 or r9, r11, r9 /* t1 = h | t1 */
560 swi r9, r5, 12 /* *(d + 112) = t1 */
561 bsrli r11, r12, 16 /* h = v >> 16 */
562 lwi r12, r8, 8 /* v = *(as + 8) */
563 bslli r9, r12, 16 /* t1 = v << 16 */
564 or r9, r11, r9 /* t1 = h | t1 */
565 swi r9, r5, 8 /* *(d + 8) = t1 */
566 bsrli r11, r12, 16 /* h = v >> 16 */
567 lwi r12, r8, 4 /* v = *(as + 4) */
568 bslli r9, r12, 16 /* t1 = v << 16 */
569 or r9, r11, r9 /* t1 = h | t1 */
570 swi r9, r5, 4 /* *(d + 4) = t1 */
571 bsrli r11, r12, 16 /* h = v >> 16 */
572 lwi r12, r8, 0 /* v = *(as + 0) */
573 bslli r9, r12, 16 /* t1 = v << 16 */
574 or r9, r11, r9 /* t1 = h | t1 */
575 swi r9, r5, 0 /* *(d + 0) = t1 */
576 addi r4, r4, -32 /* n = n - 32 */
577 bneid r4, d_bu2_loop /* while (n) loop */
578 bsrli r11, r12, 16 /* h = v >> 16 (IN DELAY SLOT) */
579
580d_block_done:
581 addi r4, r0, 4 /* n = 4 */
582 cmpu r4, r4, r7 /* n = c - n (unsigned) */
583 blti r4,d_xfer_end /* if n < 0, less than one word to transfer */
584
585d_word_xfer:
586 andi r4, r7, 0xfffffffc /* n = c & ~3 */
587 rsub r5, r4, r5 /* d = d - n */
588 rsub r6, r4, r6 /* s = s - n */
589 rsub r7, r4, r7 /* c = c - n */
590
591 andi r9, r6, 3 /* t1 = s & 3 */
592 /* if temp != 0, unaligned transfers needed */
593 bnei r9, d_word_unaligned
594
595d_word_aligned:
596 addi r4, r4,-4 /* n-- */
597 lw r9, r6, r4 /* t1 = *(s+n) */
598 bneid r4, d_word_aligned /* loop */
599 sw r9, r5, r4 /* *(d+n) = t1 (IN DELAY SLOT) */
600
601 bri d_word_done
602
603d_word_unaligned:
604 andi r8, r6, 0xfffffffc /* as = s & ~3 */
605 lw r11, r8, r4 /* h = *(as + n) */
606
607 addi r9, r9, -1
608 beqi r9,d_word_u1 /* t1 was 1 => 1 byte offset */
609 addi r9, r9, -1
610 beqi r9,d_word_u2 /* t1 was 2 => 2 byte offset */
611
612d_word_u3:
613 bsrli r11, r11, 8 /* h = h >> 8 */
614d_wu3_loop:
615 addi r4, r4,-4 /* n = n - 4 */
616 lw r12, r8, r4 /* v = *(as + n) */
617 bslli r9, r12, 24 /* t1 = v << 24 */
618 or r9, r11, r9 /* t1 = h | t1 */
619 sw r9, r5, r4 /* *(d + n) = t1 */
620 bneid r4, d_wu3_loop /* while (n) loop */
621 bsrli r11, r12, 8 /* h = v >> 8 (IN DELAY SLOT) */
622
623 bri d_word_done
624
625d_word_u1:
626 bsrli r11, r11, 24 /* h = h >> 24 */
627d_wu1_loop:
628 addi r4, r4,-4 /* n = n - 4 */
629 lw r12, r8, r4 /* v = *(as + n) */
630 bslli r9, r12, 8 /* t1 = v << 8 */
631 or r9, r11, r9 /* t1 = h | t1 */
632 sw r9, r5, r4 /* *(d + n) = t1 */
633 bneid r4, d_wu1_loop /* while (n) loop */
634 bsrli r11, r12, 24 /* h = v >> 24 (IN DELAY SLOT) */
635
636 bri d_word_done
637
638d_word_u2:
639 bsrli r11, r11, 16 /* h = h >> 16 */
640d_wu2_loop:
641 addi r4, r4,-4 /* n = n - 4 */
642 lw r12, r8, r4 /* v = *(as + n) */
643 bslli r9, r12, 16 /* t1 = v << 16 */
644 or r9, r11, r9 /* t1 = h | t1 */
645 sw r9, r5, r4 /* *(d + n) = t1 */
646 bneid r4, d_wu2_loop /* while (n) loop */
647 bsrli r11, r12, 16 /* h = v >> 16 (IN DELAY SLOT) */
648
649d_word_done:
650
651d_xfer_end:
652d_xfer_end_loop:
653 beqi r7, a_done /* while (c) */
654 addi r6, r6, -1 /* s-- */
655 lbui r9, r6, 0 /* t1 = *s */
656 addi r5, r5, -1 /* d-- */
657 sbi r9, r5, 0 /* *d = t1 */
658 brid d_xfer_end_loop /* loop */
659 addi r7, r7, -1 /* c-- (IN DELAY SLOT) */
660
661d_done:
662 rtsd r15, 8
663 nop
664
665.size memmove, . - memmove
666.end memmove
1/*
2 * Copyright (C) 2008-2009 Michal Simek <monstr@monstr.eu>
3 * Copyright (C) 2008-2009 PetaLogix
4 * Copyright (C) 2008 Jim Law - Iris LP All rights reserved.
5 *
6 * This file is subject to the terms and conditions of the GNU General
7 * Public License. See the file COPYING in the main directory of this
8 * archive for more details.
9 *
10 * Written by Jim Law <jlaw@irispower.com>
11 *
12 * intended to replace:
13 * memcpy in memcpy.c and
14 * memmove in memmove.c
15 * ... in arch/microblaze/lib
16 *
17 *
18 * assly_fastcopy.S
19 *
20 * Attempt at quicker memcpy and memmove for MicroBlaze
21 * Input : Operand1 in Reg r5 - destination address
22 * Operand2 in Reg r6 - source address
23 * Operand3 in Reg r7 - number of bytes to transfer
24 * Output: Result in Reg r3 - starting destinaition address
25 *
26 *
27 * Explanation:
28 * Perform (possibly unaligned) copy of a block of memory
29 * between mem locations with size of xfer spec'd in bytes
30 */
31
32#ifdef __MICROBLAZEEL__
33#error Microblaze LE not support ASM optimized lib func. Disable OPT_LIB_ASM.
34#endif
35
36#include <linux/linkage.h>
37 .text
38 .globl memcpy
39 .type memcpy, @function
40 .ent memcpy
41
42memcpy:
43fast_memcpy_ascending:
44 /* move d to return register as value of function */
45 addi r3, r5, 0
46
47 addi r4, r0, 4 /* n = 4 */
48 cmpu r4, r4, r7 /* n = c - n (unsigned) */
49 blti r4, a_xfer_end /* if n < 0, less than one word to transfer */
50
51 /* transfer first 0~3 bytes to get aligned dest address */
52 andi r4, r5, 3 /* n = d & 3 */
53 /* if zero, destination already aligned */
54 beqi r4, a_dalign_done
55 /* n = 4 - n (yields 3, 2, 1 transfers for 1, 2, 3 addr offset) */
56 rsubi r4, r4, 4
57 rsub r7, r4, r7 /* c = c - n adjust c */
58
59a_xfer_first_loop:
60 /* if no bytes left to transfer, transfer the bulk */
61 beqi r4, a_dalign_done
62 lbui r11, r6, 0 /* h = *s */
63 sbi r11, r5, 0 /* *d = h */
64 addi r6, r6, 1 /* s++ */
65 addi r5, r5, 1 /* d++ */
66 brid a_xfer_first_loop /* loop */
67 addi r4, r4, -1 /* n-- (IN DELAY SLOT) */
68
69a_dalign_done:
70 addi r4, r0, 32 /* n = 32 */
71 cmpu r4, r4, r7 /* n = c - n (unsigned) */
72 /* if n < 0, less than one block to transfer */
73 blti r4, a_block_done
74
75a_block_xfer:
76 andi r4, r7, 0xffffffe0 /* n = c & ~31 */
77 rsub r7, r4, r7 /* c = c - n */
78
79 andi r9, r6, 3 /* t1 = s & 3 */
80 /* if temp != 0, unaligned transfers needed */
81 bnei r9, a_block_unaligned
82
83a_block_aligned:
84 lwi r9, r6, 0 /* t1 = *(s + 0) */
85 lwi r10, r6, 4 /* t2 = *(s + 4) */
86 lwi r11, r6, 8 /* t3 = *(s + 8) */
87 lwi r12, r6, 12 /* t4 = *(s + 12) */
88 swi r9, r5, 0 /* *(d + 0) = t1 */
89 swi r10, r5, 4 /* *(d + 4) = t2 */
90 swi r11, r5, 8 /* *(d + 8) = t3 */
91 swi r12, r5, 12 /* *(d + 12) = t4 */
92 lwi r9, r6, 16 /* t1 = *(s + 16) */
93 lwi r10, r6, 20 /* t2 = *(s + 20) */
94 lwi r11, r6, 24 /* t3 = *(s + 24) */
95 lwi r12, r6, 28 /* t4 = *(s + 28) */
96 swi r9, r5, 16 /* *(d + 16) = t1 */
97 swi r10, r5, 20 /* *(d + 20) = t2 */
98 swi r11, r5, 24 /* *(d + 24) = t3 */
99 swi r12, r5, 28 /* *(d + 28) = t4 */
100 addi r6, r6, 32 /* s = s + 32 */
101 addi r4, r4, -32 /* n = n - 32 */
102 bneid r4, a_block_aligned /* while (n) loop */
103 addi r5, r5, 32 /* d = d + 32 (IN DELAY SLOT) */
104 bri a_block_done
105
106a_block_unaligned:
107 andi r8, r6, 0xfffffffc /* as = s & ~3 */
108 add r6, r6, r4 /* s = s + n */
109 lwi r11, r8, 0 /* h = *(as + 0) */
110
111 addi r9, r9, -1
112 beqi r9, a_block_u1 /* t1 was 1 => 1 byte offset */
113 addi r9, r9, -1
114 beqi r9, a_block_u2 /* t1 was 2 => 2 byte offset */
115
116a_block_u3:
117 bslli r11, r11, 24 /* h = h << 24 */
118a_bu3_loop:
119 lwi r12, r8, 4 /* v = *(as + 4) */
120 bsrli r9, r12, 8 /* t1 = v >> 8 */
121 or r9, r11, r9 /* t1 = h | t1 */
122 swi r9, r5, 0 /* *(d + 0) = t1 */
123 bslli r11, r12, 24 /* h = v << 24 */
124 lwi r12, r8, 8 /* v = *(as + 8) */
125 bsrli r9, r12, 8 /* t1 = v >> 8 */
126 or r9, r11, r9 /* t1 = h | t1 */
127 swi r9, r5, 4 /* *(d + 4) = t1 */
128 bslli r11, r12, 24 /* h = v << 24 */
129 lwi r12, r8, 12 /* v = *(as + 12) */
130 bsrli r9, r12, 8 /* t1 = v >> 8 */
131 or r9, r11, r9 /* t1 = h | t1 */
132 swi r9, r5, 8 /* *(d + 8) = t1 */
133 bslli r11, r12, 24 /* h = v << 24 */
134 lwi r12, r8, 16 /* v = *(as + 16) */
135 bsrli r9, r12, 8 /* t1 = v >> 8 */
136 or r9, r11, r9 /* t1 = h | t1 */
137 swi r9, r5, 12 /* *(d + 12) = t1 */
138 bslli r11, r12, 24 /* h = v << 24 */
139 lwi r12, r8, 20 /* v = *(as + 20) */
140 bsrli r9, r12, 8 /* t1 = v >> 8 */
141 or r9, r11, r9 /* t1 = h | t1 */
142 swi r9, r5, 16 /* *(d + 16) = t1 */
143 bslli r11, r12, 24 /* h = v << 24 */
144 lwi r12, r8, 24 /* v = *(as + 24) */
145 bsrli r9, r12, 8 /* t1 = v >> 8 */
146 or r9, r11, r9 /* t1 = h | t1 */
147 swi r9, r5, 20 /* *(d + 20) = t1 */
148 bslli r11, r12, 24 /* h = v << 24 */
149 lwi r12, r8, 28 /* v = *(as + 28) */
150 bsrli r9, r12, 8 /* t1 = v >> 8 */
151 or r9, r11, r9 /* t1 = h | t1 */
152 swi r9, r5, 24 /* *(d + 24) = t1 */
153 bslli r11, r12, 24 /* h = v << 24 */
154 lwi r12, r8, 32 /* v = *(as + 32) */
155 bsrli r9, r12, 8 /* t1 = v >> 8 */
156 or r9, r11, r9 /* t1 = h | t1 */
157 swi r9, r5, 28 /* *(d + 28) = t1 */
158 bslli r11, r12, 24 /* h = v << 24 */
159 addi r8, r8, 32 /* as = as + 32 */
160 addi r4, r4, -32 /* n = n - 32 */
161 bneid r4, a_bu3_loop /* while (n) loop */
162 addi r5, r5, 32 /* d = d + 32 (IN DELAY SLOT) */
163 bri a_block_done
164
165a_block_u1:
166 bslli r11, r11, 8 /* h = h << 8 */
167a_bu1_loop:
168 lwi r12, r8, 4 /* v = *(as + 4) */
169 bsrli r9, r12, 24 /* t1 = v >> 24 */
170 or r9, r11, r9 /* t1 = h | t1 */
171 swi r9, r5, 0 /* *(d + 0) = t1 */
172 bslli r11, r12, 8 /* h = v << 8 */
173 lwi r12, r8, 8 /* v = *(as + 8) */
174 bsrli r9, r12, 24 /* t1 = v >> 24 */
175 or r9, r11, r9 /* t1 = h | t1 */
176 swi r9, r5, 4 /* *(d + 4) = t1 */
177 bslli r11, r12, 8 /* h = v << 8 */
178 lwi r12, r8, 12 /* v = *(as + 12) */
179 bsrli r9, r12, 24 /* t1 = v >> 24 */
180 or r9, r11, r9 /* t1 = h | t1 */
181 swi r9, r5, 8 /* *(d + 8) = t1 */
182 bslli r11, r12, 8 /* h = v << 8 */
183 lwi r12, r8, 16 /* v = *(as + 16) */
184 bsrli r9, r12, 24 /* t1 = v >> 24 */
185 or r9, r11, r9 /* t1 = h | t1 */
186 swi r9, r5, 12 /* *(d + 12) = t1 */
187 bslli r11, r12, 8 /* h = v << 8 */
188 lwi r12, r8, 20 /* v = *(as + 20) */
189 bsrli r9, r12, 24 /* t1 = v >> 24 */
190 or r9, r11, r9 /* t1 = h | t1 */
191 swi r9, r5, 16 /* *(d + 16) = t1 */
192 bslli r11, r12, 8 /* h = v << 8 */
193 lwi r12, r8, 24 /* v = *(as + 24) */
194 bsrli r9, r12, 24 /* t1 = v >> 24 */
195 or r9, r11, r9 /* t1 = h | t1 */
196 swi r9, r5, 20 /* *(d + 20) = t1 */
197 bslli r11, r12, 8 /* h = v << 8 */
198 lwi r12, r8, 28 /* v = *(as + 28) */
199 bsrli r9, r12, 24 /* t1 = v >> 24 */
200 or r9, r11, r9 /* t1 = h | t1 */
201 swi r9, r5, 24 /* *(d + 24) = t1 */
202 bslli r11, r12, 8 /* h = v << 8 */
203 lwi r12, r8, 32 /* v = *(as + 32) */
204 bsrli r9, r12, 24 /* t1 = v >> 24 */
205 or r9, r11, r9 /* t1 = h | t1 */
206 swi r9, r5, 28 /* *(d + 28) = t1 */
207 bslli r11, r12, 8 /* h = v << 8 */
208 addi r8, r8, 32 /* as = as + 32 */
209 addi r4, r4, -32 /* n = n - 32 */
210 bneid r4, a_bu1_loop /* while (n) loop */
211 addi r5, r5, 32 /* d = d + 32 (IN DELAY SLOT) */
212 bri a_block_done
213
214a_block_u2:
215 bslli r11, r11, 16 /* h = h << 16 */
216a_bu2_loop:
217 lwi r12, r8, 4 /* v = *(as + 4) */
218 bsrli r9, r12, 16 /* t1 = v >> 16 */
219 or r9, r11, r9 /* t1 = h | t1 */
220 swi r9, r5, 0 /* *(d + 0) = t1 */
221 bslli r11, r12, 16 /* h = v << 16 */
222 lwi r12, r8, 8 /* v = *(as + 8) */
223 bsrli r9, r12, 16 /* t1 = v >> 16 */
224 or r9, r11, r9 /* t1 = h | t1 */
225 swi r9, r5, 4 /* *(d + 4) = t1 */
226 bslli r11, r12, 16 /* h = v << 16 */
227 lwi r12, r8, 12 /* v = *(as + 12) */
228 bsrli r9, r12, 16 /* t1 = v >> 16 */
229 or r9, r11, r9 /* t1 = h | t1 */
230 swi r9, r5, 8 /* *(d + 8) = t1 */
231 bslli r11, r12, 16 /* h = v << 16 */
232 lwi r12, r8, 16 /* v = *(as + 16) */
233 bsrli r9, r12, 16 /* t1 = v >> 16 */
234 or r9, r11, r9 /* t1 = h | t1 */
235 swi r9, r5, 12 /* *(d + 12) = t1 */
236 bslli r11, r12, 16 /* h = v << 16 */
237 lwi r12, r8, 20 /* v = *(as + 20) */
238 bsrli r9, r12, 16 /* t1 = v >> 16 */
239 or r9, r11, r9 /* t1 = h | t1 */
240 swi r9, r5, 16 /* *(d + 16) = t1 */
241 bslli r11, r12, 16 /* h = v << 16 */
242 lwi r12, r8, 24 /* v = *(as + 24) */
243 bsrli r9, r12, 16 /* t1 = v >> 16 */
244 or r9, r11, r9 /* t1 = h | t1 */
245 swi r9, r5, 20 /* *(d + 20) = t1 */
246 bslli r11, r12, 16 /* h = v << 16 */
247 lwi r12, r8, 28 /* v = *(as + 28) */
248 bsrli r9, r12, 16 /* t1 = v >> 16 */
249 or r9, r11, r9 /* t1 = h | t1 */
250 swi r9, r5, 24 /* *(d + 24) = t1 */
251 bslli r11, r12, 16 /* h = v << 16 */
252 lwi r12, r8, 32 /* v = *(as + 32) */
253 bsrli r9, r12, 16 /* t1 = v >> 16 */
254 or r9, r11, r9 /* t1 = h | t1 */
255 swi r9, r5, 28 /* *(d + 28) = t1 */
256 bslli r11, r12, 16 /* h = v << 16 */
257 addi r8, r8, 32 /* as = as + 32 */
258 addi r4, r4, -32 /* n = n - 32 */
259 bneid r4, a_bu2_loop /* while (n) loop */
260 addi r5, r5, 32 /* d = d + 32 (IN DELAY SLOT) */
261
262a_block_done:
263 addi r4, r0, 4 /* n = 4 */
264 cmpu r4, r4, r7 /* n = c - n (unsigned) */
265 blti r4, a_xfer_end /* if n < 0, less than one word to transfer */
266
267a_word_xfer:
268 andi r4, r7, 0xfffffffc /* n = c & ~3 */
269 addi r10, r0, 0 /* offset = 0 */
270
271 andi r9, r6, 3 /* t1 = s & 3 */
272 /* if temp != 0, unaligned transfers needed */
273 bnei r9, a_word_unaligned
274
275a_word_aligned:
276 lw r9, r6, r10 /* t1 = *(s+offset) */
277 sw r9, r5, r10 /* *(d+offset) = t1 */
278 addi r4, r4,-4 /* n-- */
279 bneid r4, a_word_aligned /* loop */
280 addi r10, r10, 4 /* offset++ (IN DELAY SLOT) */
281
282 bri a_word_done
283
284a_word_unaligned:
285 andi r8, r6, 0xfffffffc /* as = s & ~3 */
286 lwi r11, r8, 0 /* h = *(as + 0) */
287 addi r8, r8, 4 /* as = as + 4 */
288
289 addi r9, r9, -1
290 beqi r9, a_word_u1 /* t1 was 1 => 1 byte offset */
291 addi r9, r9, -1
292 beqi r9, a_word_u2 /* t1 was 2 => 2 byte offset */
293
294a_word_u3:
295 bslli r11, r11, 24 /* h = h << 24 */
296a_wu3_loop:
297 lw r12, r8, r10 /* v = *(as + offset) */
298 bsrli r9, r12, 8 /* t1 = v >> 8 */
299 or r9, r11, r9 /* t1 = h | t1 */
300 sw r9, r5, r10 /* *(d + offset) = t1 */
301 bslli r11, r12, 24 /* h = v << 24 */
302 addi r4, r4,-4 /* n = n - 4 */
303 bneid r4, a_wu3_loop /* while (n) loop */
304 addi r10, r10, 4 /* offset = ofset + 4 (IN DELAY SLOT) */
305
306 bri a_word_done
307
308a_word_u1:
309 bslli r11, r11, 8 /* h = h << 8 */
310a_wu1_loop:
311 lw r12, r8, r10 /* v = *(as + offset) */
312 bsrli r9, r12, 24 /* t1 = v >> 24 */
313 or r9, r11, r9 /* t1 = h | t1 */
314 sw r9, r5, r10 /* *(d + offset) = t1 */
315 bslli r11, r12, 8 /* h = v << 8 */
316 addi r4, r4,-4 /* n = n - 4 */
317 bneid r4, a_wu1_loop /* while (n) loop */
318 addi r10, r10, 4 /* offset = ofset + 4 (IN DELAY SLOT) */
319
320 bri a_word_done
321
322a_word_u2:
323 bslli r11, r11, 16 /* h = h << 16 */
324a_wu2_loop:
325 lw r12, r8, r10 /* v = *(as + offset) */
326 bsrli r9, r12, 16 /* t1 = v >> 16 */
327 or r9, r11, r9 /* t1 = h | t1 */
328 sw r9, r5, r10 /* *(d + offset) = t1 */
329 bslli r11, r12, 16 /* h = v << 16 */
330 addi r4, r4,-4 /* n = n - 4 */
331 bneid r4, a_wu2_loop /* while (n) loop */
332 addi r10, r10, 4 /* offset = ofset + 4 (IN DELAY SLOT) */
333
334a_word_done:
335 add r5, r5, r10 /* d = d + offset */
336 add r6, r6, r10 /* s = s + offset */
337 rsub r7, r10, r7 /* c = c - offset */
338
339a_xfer_end:
340a_xfer_end_loop:
341 beqi r7, a_done /* while (c) */
342 lbui r9, r6, 0 /* t1 = *s */
343 addi r6, r6, 1 /* s++ */
344 sbi r9, r5, 0 /* *d = t1 */
345 addi r7, r7, -1 /* c-- */
346 brid a_xfer_end_loop /* loop */
347 addi r5, r5, 1 /* d++ (IN DELAY SLOT) */
348
349a_done:
350 rtsd r15, 8
351 nop
352
353.size memcpy, . - memcpy
354.end memcpy
355/*----------------------------------------------------------------------------*/
/*
 * void *memmove(void *d, const void *s, size_t c)
 *
 * Inputs:   r5 = d  destination address
 *           r6 = s  source address
 *           r7 = c  number of bytes to transfer
 * Output:   r3 = starting destination address (d)
 * Clobbers: r4, r8, r9, r10, r11, r12, flags
 *
 * Overlap-safe copy.  If s >= d an ascending copy cannot clobber
 * not-yet-read source bytes, so control branches into memcpy's
 * fast_memcpy_ascending path; otherwise the bytes are copied
 * descending (highest address first) using the same staging as the
 * ascending version: align the destination, move 32-byte blocks
 * (with word-reassembly shifts when the source is misaligned), then
 * whole words, then a byte tail.
 */
	.globl	memmove
	.type  memmove, @function
	.ent	memmove

memmove:
	cmpu	r4, r5, r6	/* n = s - d (unsigned compare) */
	/* s >= d: ascending copy is overlap-safe, reuse memcpy's path */
	bgei	r4,fast_memcpy_ascending

fast_memcpy_descending:
	/* move d to return register as value of function */
	addi	r3, r5, 0

	add	r5, r5, r7	/* d = d + c (one past last dest byte) */
	add	r6, r6, r7	/* s = s + c (one past last src byte) */

	addi	r4, r0, 4	/* n = 4 */
	cmpu	r4, r4, r7	/* n = c - n (unsigned) */
	blti	r4,d_xfer_end	/* if n < 0, less than one word to transfer */

	/* transfer first 0~3 bytes to get aligned dest address */
	andi	r4, r5, 3	/* n = d & 3 */
	/* if zero, destination already aligned */
	beqi	r4,d_dalign_done
	rsub	r7, r4, r7	/* c = c - n adjust c */

d_xfer_first_loop:
	/* if no bytes left to transfer, transfer the bulk */
	beqi	r4,d_dalign_done
	addi	r6, r6, -1	/* s-- */
	addi	r5, r5, -1	/* d-- */
	lbui	r11, r6, 0	/* h = *s */
	sbi	r11, r5, 0	/* *d = h */
	brid	d_xfer_first_loop	/* loop */
	addi	r4, r4, -1	/* n-- (IN DELAY SLOT) */

d_dalign_done:
	addi	r4, r0, 32	/* n = 32 */
	cmpu	r4, r4, r7	/* n = c - n (unsigned) */
	/* if n < 0, less than one block to transfer */
	blti	r4, d_block_done

d_block_xfer:
	andi	r4, r7, 0xffffffe0	/* n = c & ~31 */
	rsub	r7, r4, r7	/* c = c - n */

	andi	r9, r6, 3	/* t1 = s & 3 */
	/* if temp != 0, unaligned transfers needed */
	bnei	r9, d_block_unaligned

d_block_aligned:
	/* source and dest both word-aligned: copy 32 bytes per pass */
	addi	r6, r6, -32	/* s = s - 32 */
	addi	r5, r5, -32	/* d = d - 32 */
	lwi	r9, r6, 28	/* t1 = *(s + 28) */
	lwi	r10, r6, 24	/* t2 = *(s + 24) */
	lwi	r11, r6, 20	/* t3 = *(s + 20) */
	lwi	r12, r6, 16	/* t4 = *(s + 16) */
	swi	r9, r5, 28	/* *(d + 28) = t1 */
	swi	r10, r5, 24	/* *(d + 24) = t2 */
	swi	r11, r5, 20	/* *(d + 20) = t3 */
	swi	r12, r5, 16	/* *(d + 16) = t4 */
	lwi	r9, r6, 12	/* t1 = *(s + 12) */
	lwi	r10, r6, 8	/* t2 = *(s + 8) */
	lwi	r11, r6, 4	/* t3 = *(s + 4) */
	lwi	r12, r6, 0	/* t4 = *(s + 0) */
	swi	r9, r5, 12	/* *(d + 12) = t1 */
	swi	r10, r5, 8	/* *(d + 8) = t2 */
	swi	r11, r5, 4	/* *(d + 4) = t3 */
	addi	r4, r4, -32	/* n = n - 32 */
	bneid	r4, d_block_aligned	/* while (n) loop */
	swi	r12, r5, 0	/* *(d + 0) = t4 (IN DELAY SLOT) */
	bri	d_block_done

d_block_unaligned:
	/* source misaligned: read aligned words from as = s & ~3 and
	 * reassemble each destination word from two adjacent reads,
	 * carrying the leftover bytes in h (r11) between iterations */
	andi	r8, r6, 0xfffffffc	/* as = s & ~3 */
	rsub	r6, r4, r6	/* s = s - n */
	lwi	r11, r8, 0	/* h = *(as + 0) */

	addi	r9, r9, -1
	beqi	r9,d_block_u1	/* t1 was 1 => 1 byte offset */
	addi	r9, r9, -1
	beqi	r9,d_block_u2	/* t1 was 2 => 2 byte offset */

d_block_u3:
	/* source is 3 bytes past word alignment */
	bsrli	r11, r11, 8	/* h = h >> 8 */
d_bu3_loop:
	addi	r8, r8, -32	/* as = as - 32 */
	addi	r5, r5, -32	/* d = d - 32 */
	lwi	r12, r8, 28	/* v = *(as + 28) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 28	/* *(d + 28) = t1 */
	bsrli	r11, r12, 8	/* h = v >> 8 */
	lwi	r12, r8, 24	/* v = *(as + 24) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 24	/* *(d + 24) = t1 */
	bsrli	r11, r12, 8	/* h = v >> 8 */
	lwi	r12, r8, 20	/* v = *(as + 20) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 20	/* *(d + 20) = t1 */
	bsrli	r11, r12, 8	/* h = v >> 8 */
	lwi	r12, r8, 16	/* v = *(as + 16) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 16	/* *(d + 16) = t1 */
	bsrli	r11, r12, 8	/* h = v >> 8 */
	lwi	r12, r8, 12	/* v = *(as + 12) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 12	/* *(d + 12) = t1 */
	bsrli	r11, r12, 8	/* h = v >> 8 */
	lwi	r12, r8, 8	/* v = *(as + 8) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 8	/* *(d + 8) = t1 */
	bsrli	r11, r12, 8	/* h = v >> 8 */
	lwi	r12, r8, 4	/* v = *(as + 4) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 4	/* *(d + 4) = t1 */
	bsrli	r11, r12, 8	/* h = v >> 8 */
	lwi	r12, r8, 0	/* v = *(as + 0) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 0	/* *(d + 0) = t1 */
	addi	r4, r4, -32	/* n = n - 32 */
	bneid	r4, d_bu3_loop	/* while (n) loop */
	bsrli	r11, r12, 8	/* h = v >> 8 (IN DELAY SLOT) */
	bri	d_block_done

d_block_u1:
	/* source is 1 byte past word alignment */
	bsrli	r11, r11, 24	/* h = h >> 24 */
d_bu1_loop:
	addi	r8, r8, -32	/* as = as - 32 */
	addi	r5, r5, -32	/* d = d - 32 */
	lwi	r12, r8, 28	/* v = *(as + 28) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 28	/* *(d + 28) = t1 */
	bsrli	r11, r12, 24	/* h = v >> 24 */
	lwi	r12, r8, 24	/* v = *(as + 24) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 24	/* *(d + 24) = t1 */
	bsrli	r11, r12, 24	/* h = v >> 24 */
	lwi	r12, r8, 20	/* v = *(as + 20) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 20	/* *(d + 20) = t1 */
	bsrli	r11, r12, 24	/* h = v >> 24 */
	lwi	r12, r8, 16	/* v = *(as + 16) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 16	/* *(d + 16) = t1 */
	bsrli	r11, r12, 24	/* h = v >> 24 */
	lwi	r12, r8, 12	/* v = *(as + 12) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 12	/* *(d + 12) = t1 */
	bsrli	r11, r12, 24	/* h = v >> 24 */
	lwi	r12, r8, 8	/* v = *(as + 8) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 8	/* *(d + 8) = t1 */
	bsrli	r11, r12, 24	/* h = v >> 24 */
	lwi	r12, r8, 4	/* v = *(as + 4) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 4	/* *(d + 4) = t1 */
	bsrli	r11, r12, 24	/* h = v >> 24 */
	lwi	r12, r8, 0	/* v = *(as + 0) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 0	/* *(d + 0) = t1 */
	addi	r4, r4, -32	/* n = n - 32 */
	bneid	r4, d_bu1_loop	/* while (n) loop */
	bsrli	r11, r12, 24	/* h = v >> 24 (IN DELAY SLOT) */
	bri	d_block_done

d_block_u2:
	/* source is 2 bytes past word alignment */
	bsrli	r11, r11, 16	/* h = h >> 16 */
d_bu2_loop:
	addi	r8, r8, -32	/* as = as - 32 */
	addi	r5, r5, -32	/* d = d - 32 */
	lwi	r12, r8, 28	/* v = *(as + 28) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 28	/* *(d + 28) = t1 */
	bsrli	r11, r12, 16	/* h = v >> 16 */
	lwi	r12, r8, 24	/* v = *(as + 24) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 24	/* *(d + 24) = t1 */
	bsrli	r11, r12, 16	/* h = v >> 16 */
	lwi	r12, r8, 20	/* v = *(as + 20) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 20	/* *(d + 20) = t1 */
	bsrli	r11, r12, 16	/* h = v >> 16 */
	lwi	r12, r8, 16	/* v = *(as + 16) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 16	/* *(d + 16) = t1 */
	bsrli	r11, r12, 16	/* h = v >> 16 */
	lwi	r12, r8, 12	/* v = *(as + 12) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 12	/* *(d + 12) = t1 */
	bsrli	r11, r12, 16	/* h = v >> 16 */
	lwi	r12, r8, 8	/* v = *(as + 8) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 8	/* *(d + 8) = t1 */
	bsrli	r11, r12, 16	/* h = v >> 16 */
	lwi	r12, r8, 4	/* v = *(as + 4) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 4	/* *(d + 4) = t1 */
	bsrli	r11, r12, 16	/* h = v >> 16 */
	lwi	r12, r8, 0	/* v = *(as + 0) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 0	/* *(d + 0) = t1 */
	addi	r4, r4, -32	/* n = n - 32 */
	bneid	r4, d_bu2_loop	/* while (n) loop */
	bsrli	r11, r12, 16	/* h = v >> 16 (IN DELAY SLOT) */

d_block_done:
	addi	r4, r0, 4	/* n = 4 */
	cmpu	r4, r4, r7	/* n = c - n (unsigned) */
	blti	r4,d_xfer_end	/* if n < 0, less than one word to transfer */

d_word_xfer:
	andi	r4, r7, 0xfffffffc	/* n = c & ~3 */
	rsub	r5, r4, r5	/* d = d - n */
	rsub	r6, r4, r6	/* s = s - n */
	rsub	r7, r4, r7	/* c = c - n */

	andi	r9, r6, 3	/* t1 = s & 3 */
	/* if temp != 0, unaligned transfers needed */
	bnei	r9, d_word_unaligned

d_word_aligned:
	/* copy words descending, indexed by the shrinking count n */
	addi	r4, r4,-4	/* n-- */
	lw	r9, r6, r4	/* t1 = *(s+n) */
	bneid	r4, d_word_aligned	/* loop */
	sw	r9, r5, r4	/* *(d+n) = t1 (IN DELAY SLOT) */

	bri	d_word_done

d_word_unaligned:
	andi	r8, r6, 0xfffffffc	/* as = s & ~3 */
	lw	r11, r8, r4	/* h = *(as + n) */

	addi	r9, r9, -1
	beqi	r9,d_word_u1	/* t1 was 1 => 1 byte offset */
	addi	r9, r9, -1
	beqi	r9,d_word_u2	/* t1 was 2 => 2 byte offset */

d_word_u3:
	/* source is 3 bytes past word alignment */
	bsrli	r11, r11, 8	/* h = h >> 8 */
d_wu3_loop:
	addi	r4, r4,-4	/* n = n - 4 */
	lw	r12, r8, r4	/* v = *(as + n) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	sw	r9, r5, r4	/* *(d + n) = t1 */
	bneid	r4, d_wu3_loop	/* while (n) loop */
	bsrli	r11, r12, 8	/* h = v >> 8 (IN DELAY SLOT) */

	bri	d_word_done

d_word_u1:
	/* source is 1 byte past word alignment */
	bsrli	r11, r11, 24	/* h = h >> 24 */
d_wu1_loop:
	addi	r4, r4,-4	/* n = n - 4 */
	lw	r12, r8, r4	/* v = *(as + n) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	sw	r9, r5, r4	/* *(d + n) = t1 */
	bneid	r4, d_wu1_loop	/* while (n) loop */
	bsrli	r11, r12, 24	/* h = v >> 24 (IN DELAY SLOT) */

	bri	d_word_done

d_word_u2:
	/* source is 2 bytes past word alignment */
	bsrli	r11, r11, 16	/* h = h >> 16 */
d_wu2_loop:
	addi	r4, r4,-4	/* n = n - 4 */
	lw	r12, r8, r4	/* v = *(as + n) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	sw	r9, r5, r4	/* *(d + n) = t1 */
	bneid	r4, d_wu2_loop	/* while (n) loop */
	bsrli	r11, r12, 16	/* h = v >> 16 (IN DELAY SLOT) */

d_word_done:

d_xfer_end:
d_xfer_end_loop:
	/* trailing 0~3 bytes, one at a time; exits through memcpy's
	 * a_done, whose rtsd/nop epilogue is identical to d_done */
	beqi	r7, a_done	/* while (c) */
	addi	r6, r6, -1	/* s-- */
	lbui	r9, r6, 0	/* t1 = *s */
	addi	r5, r5, -1	/* d-- */
	sbi	r9, r5, 0	/* *d = t1 */
	brid	d_xfer_end_loop	/* loop */
	addi	r7, r7, -1	/* c-- (IN DELAY SLOT) */

d_done:
	/* NOTE(review): unreachable -- the loop above only exits via a_done */
	rtsd	r15, 8
	nop

.size memmove, . - memmove
.end memmove