Loading...
1/* Cloned and hacked for uClibc by Paul Mundt, December 2003 */
2/* Modified by SuperH, Inc. September 2003 */
3!
4! Fast SH memcpy
5!
6! by Toshiyasu Morita (tm@netcom.com)
7! hacked by J"orn Rennecke (joern.rennecke@superh.com) ("o for o-umlaut)
8! SH5 code Copyright 2002 SuperH Ltd.
9!
10! Entry: ARG0: destination pointer
11! ARG1: source pointer
12! ARG2: byte count
13!
14! Exit: RESULT: destination pointer
15! any other registers in the range r0-r7: trashed
! (the SHmedia code below additionally writes r8, r9, r19-r25,
! r27 and r36)
16!
17! Notes: Usually one wants to do small reads and write a longword, but
18! unfortunately it is difficult in some cases to concatenate bytes
19! into a longword on the SH, so this does a longword read and small
20! writes.
21!
22! This implementation makes two assumptions about how it is called:
23!
24! 1.: If the byte count is nonzero, the address of the last byte to be
25! copied is unsigned greater than the address of the first byte to
26! be copied. This could be easily swapped for a signed comparison,
27! but the algorithm used needs some comparison.
28!
29! 2.: When there are two or three bytes in the last word of an 11-or-more
30! bytes memory chunk to be copied, the rest of the word can be read
31! without side effects.
32! This could be easily changed by increasing the minimum size of
33! a fast memcpy and the amount subtracted from r7 before L_2l_loop be 2,
34! however, this would cost a few extra cycles on average.
35! For SHmedia, the assumption is that any quadword can be read in its
36! entirety if at least one byte is included in the copy.
37!
38
39 .section .text..SHmedia32,"ax"
40 .globl memcpy
41 .type memcpy, @function
42 .align 5
43
/*
 * memcpy (SHmedia)
 *
 * In:   r2 = destination, r3 = source, r4 = byte count,
 *       r18 = return address (moved into tr1 with ptabs before returning).
 * Out:  r2 = destination; no instruction in this routine writes r2.
 *
 * Registers written: r0, r1, r5-r9, r19-r25, r27, r36, tr0, tr1.
 *
 * Structure:
 *   - count >= 25 branches to Large (32-byte and 8-byte copy loops).
 *   - count <= 24 is dispatched through a table of 32-byte code slots
 *     that starts at L1; the slot is selected from nsb(count).
 *     Slot order: 0, 1, 2..3, 4..7, 8..15, 16..24 bytes.
 *     Every slot has to stay exactly 32 bytes (eight instructions) long,
 *     which is why the "cntd." tails of some size classes are packed
 *     into the spare space of other slots.  Do not insert or remove
 *     instructions between L1 and Large without re-checking the layout.
 */
44memcpy:
45
/*
 * Unaligned access helpers: each expands to an lo/hi instruction pair
 * addressing P+O and P+O+7 (.q, 8 bytes) or P+O+3 (.l, 4 bytes).
 * After a load pair the caller ORs D0 and D1 together.
 * STUAQ and STUAL are defined but not referenced in this routine.
 */
46#define LDUAQ(P,O,D0,D1) ldlo.q P,O,D0; ldhi.q P,O+7,D1
47#define STUAQ(P,O,D0,D1) stlo.q P,O,D0; sthi.q P,O+7,D1
48#define LDUAL(P,O,D0,D1) ldlo.l P,O,D0; ldhi.l P,O+3,D1
49#define STUAL(P,O,D0,D1) stlo.l P,O,D0; sthi.l P,O+3,D1
50
51 ld.b r3,0,r63 /* touch the first source byte; result discarded (r63) */
52 pta/l Large,tr0
53 movi 25,r0
54 bgeu/u r4,r0,tr0 /* count >= 25 (unsigned): take the Large path */
55 nsb r4,r0 /* size class of count, see the dispatch note above */
56 shlli r0,5,r0 /* scale by the 32-byte slot size */
/* NOTE(review): the "+ 1" presumably sets the low bit that marks an SHmedia branch target - confirm */
57 movi (L1-L0+63*32 + 1) & 0xffff,r1
58 sub r1, r0, r0 /* r0 = offset from L0 to the selected slot */
59L0: ptrel r0,tr0
60 add r2,r4,r5 /* r5 = destination end (dst + count) */
61 ptabs r18,tr1 /* tr1 = return target */
62 add r3,r4,r6 /* r6 = source end (src + count) */
63 blink tr0,r63 /* jump to the slot; link value discarded (r63) */
64
65/* Rearranged to make cut2 safe */
66 .balign 8
/* Tail of the 4..7 byte slot: r0 = first 4 source bytes, r7/r6 = halves of the last 4. */
67L4_7: /* 4..7 byte memcpy cntd. */
68 stlo.l r2, 0, r0 /* second half of the store of the first 4 bytes at dst */
69 or r6, r7, r6 /* merge the two halves of the last 4 source bytes */
70 sthi.l r5, -1, r6 /* store them so that they end at dst + count */
71 stlo.l r5, -4, r6
72 blink tr1,r63 /* return */
73
74 .balign 8
/* Start of the dispatch table.  Slot 0: return at once; the nops pad the slot. */
75L1: /* 0 byte memcpy */
76 nop
77 blink tr1,r63
78 nop
79 nop
80 nop
81 nop
82
/* Placed in the last two instruction positions of slot 0; r6 = last source byte. */
83L2_3: /* 2 or 3 byte memcpy cntd. */
84 st.b r5,-1,r6 /* store the last byte at dst + count - 1 */
85 blink tr1,r63 /* return */
86
/* Slot 1: three instructions; L8_15 below fills the rest of the slot. */
87 /* 1 byte memcpy */
88 ld.b r3,0,r0
89 st.b r2,0,r0
90 blink tr1,r63 /* return */
91
/* Tail of the 8..15 byte slot: r0 = first 8 source bytes, r7/r6 = halves of the last 8. */
92L8_15: /* 8..15 byte memcpy cntd. */
93 stlo.q r2, 0, r0 /* second half of the store of the first 8 bytes at dst */
94 or r6, r7, r6 /* merge the two halves of the last 8 source bytes */
95 sthi.q r5, -1, r6 /* store them so that they end at dst + count */
96 stlo.q r5, -8, r6
97 blink tr1,r63 /* return */
98
/* Slot 2: copy bytes 0 and 1 here, then the last byte at L2_3. */
99 /* 2 or 3 byte memcpy */
100 ld.b r3,0,r0 /* first source byte */
101 ld.b r2,0,r63 /* touch the destination; result discarded */
102 ld.b r3,1,r1 /* second source byte */
103 st.b r2,0,r0
104 pta/l L2_3,tr0
105 ld.b r6,-1,r6 /* last source byte (r6 was src + count) */
106 st.b r2,1,r1
107 blink tr0, r63 /* continue at L2_3 */
108
/* Slot 3: first 4 bytes and last 4 bytes, which overlap when count < 8. */
109 /* 4 .. 7 byte memcpy */
110 LDUAL (r3, 0, r0, r1) /* first 4 source bytes, halves in r0 and r1 */
111 pta L4_7, tr0
112 ldlo.l r6, -4, r7 /* last 4 source bytes, first half */
113 or r0, r1, r0
114 sthi.l r2, 3, r0 /* first half of the store at dst; L4_7 does the stlo */
115 ldhi.l r6, -1, r6 /* last 4 source bytes, second half */
116 blink tr0, r63 /* continue at L4_7 */
117
/* Slot 4: first 8 bytes and last 8 bytes, which overlap when count < 16. */
118 /* 8 .. 15 byte memcpy */
119 LDUAQ (r3, 0, r0, r1) /* first 8 source bytes, halves in r0 and r1 */
120 pta L8_15, tr0
121 ldlo.q r6, -8, r7 /* last 8 source bytes, first half */
122 or r0, r1, r0
123 sthi.q r2, 7, r0 /* first half of the store at dst; L8_15 does the stlo */
124 ldhi.q r6, -1, r6 /* last 8 source bytes, second half */
125 blink tr0, r63 /* continue at L8_15 */
126
/* Slot 5 (last slot, runs up to Large): bytes 0..7, 8..15 and the last 8 bytes. */
127 /* 16 .. 24 byte memcpy */
128 LDUAQ (r3, 0, r0, r1) /* source bytes 0..7 */
129 LDUAQ (r3, 8, r8, r9) /* source bytes 8..15 */
130 or r0, r1, r0
131 sthi.q r2, 7, r0
132 or r8, r9, r8
133 sthi.q r2, 15, r8
134 ldlo.q r6, -8, r7 /* last 8 source bytes (may overlap bytes 8..15) */
135 ldhi.q r6, -1, r6
136 stlo.q r2, 8, r8
137 stlo.q r2, 0, r0
138 or r6, r7, r6
139 sthi.q r5, -1, r6 /* store the last 8 bytes, ending at dst + count */
140 stlo.q r5, -8, r6
141 blink tr1,r63 /* return */
142
/*
 * count >= 25.  Register roles from here on:
 *   r22 = running destination pointer
 *   r6  = src - dst, so r22 + r6 is the source address matching r22
 *   r0  = source quadword already loaded for the next store
 *   r5  = dst + count - 16 (Loop_ua limit)
 */
143Large:
144 ld.b r2, 0, r63 /* touch the destination; result discarded */
145 pta/l Loop_ua, tr1
146 ori r3, -8, r7 /* r7 = (src & 7) - 8, a value in [-8, -1] */
147 sub r2, r7, r22 /* r22 = dst + 8 - (src & 7) */
148 sub r3, r2, r6 /* r6 = src - dst; r22 + r6 is 8-byte aligned */
149 add r2, r4, r5
150 ldlo.q r3, 0, r0 /* NOTE(review): presumably only the bytes up to the next 8-byte source boundary - confirm */
151 addi r5, -16, r5 /* r5 = dst + count - 16 */
152 movi 64+8, r27 // could subtract r7 from that.
153 stlo.q r2, 0, r0 /* store r0 as 8 unaligned bytes at dst; stores from r22 on follow */
154 sthi.q r2, 7, r0
155 ldx.q r22, r6, r0 /* r0 = source quadword for destination r22 */
156 bgtu/l r27, r4, tr1 /* count < 72: skip Loop_line, go to Loop_ua */
157
158 addi r5, -48, r27 /* r27 = dst + count - 64 (Loop_line limit) */
159 pta/l Loop_line, tr0
160 addi r6, 64, r36 /* index for the look-ahead load in Loop_line */
161 addi r6, -24, r19 /* r19..r21: source indexes for the 2nd..4th quadword */
162 addi r6, -16, r20
163 addi r6, -8, r21
164
/* Copies 32 bytes per iteration; r0 carries one quadword across iterations. */
165Loop_line:
166 ldx.q r22, r36, r63 /* load 64 bytes past the current source position; result discarded */
167 alloco r22, 32 /* NOTE(review): presumably allocates the destination cache block without fetching it - confirm */
168 addi r22, 32, r22
169 ldx.q r22, r19, r23 /* source quadword for r22 - 24 */
170 sthi.q r22, -25, r0 /* r0 -> r22 - 32 (sthi half) */
171 ldx.q r22, r20, r24 /* source quadword for r22 - 16 */
172 ldx.q r22, r21, r25 /* source quadword for r22 - 8 */
173 stlo.q r22, -32, r0 /* r0 -> r22 - 32 (stlo half) */
174 ldx.q r22, r6, r0 /* source quadword for r22, stored by the next iteration or Loop_ua */
175 sthi.q r22, -17, r23
176 sthi.q r22, -9, r24
177 sthi.q r22, -1, r25
178 stlo.q r22, -24, r23
179 stlo.q r22, -16, r24
180 stlo.q r22, -8, r25
181 bgeu r27, r22, tr0 /* loop while r22 <= dst + count - 64 */
182
/* Copies 8 bytes per iteration; tr1 points here until the ptabs below. */
183Loop_ua:
184 addi r22, 8, r22
185 sthi.q r22, -1, r0 /* unaligned store of r0 at r22 - 8 */
186 stlo.q r22, -8, r0
187 ldx.q r22, r6, r0 /* source quadword for r22 */
188 bgtu/l r5, r22, tr1 /* loop while r22 < dst + count - 16 */
189
/* Tail: store the pending quadword at r22, then the last 8 bytes of the copy. */
190 add r3, r4, r7 /* r7 = source end (src + count) */
191 ldlo.q r7, -8, r1 /* last 8 source bytes, first half */
192 sthi.q r22, 7, r0
193 ldhi.q r7, -1, r7 /* last 8 source bytes, second half */
194 ptabs r18,tr1 /* tr1 = return target */
195 stlo.q r22, 0, r0
196 or r1, r7, r1
197 sthi.q r5, 15, r1 /* store the last 8 bytes at dst + count - 8 */
198 stlo.q r5, 8, r1
199 blink tr1, r63 /* return */
200
201 .size memcpy,.-memcpy
1/* SPDX-License-Identifier: GPL-2.0 */
2/* Cloned and hacked for uClibc by Paul Mundt, December 2003 */
3/* Modified by SuperH, Inc. September 2003 */
4!
5! Fast SH memcpy
6!
7! by Toshiyasu Morita (tm@netcom.com)
8! hacked by J"orn Rennecke (joern.rennecke@superh.com) ("o for o-umlaut)
9! SH5 code Copyright 2002 SuperH Ltd.
10!
11! Entry: ARG0: destination pointer
12! ARG1: source pointer
13! ARG2: byte count
14!
15! Exit: RESULT: destination pointer
16! any other registers in the range r0-r7: trashed
! (the SHmedia code below additionally writes r8, r9, r19-r25,
! r27 and r36)
17!
18! Notes: Usually one wants to do small reads and write a longword, but
19! unfortunately it is difficult in some cases to concatenate bytes
20! into a longword on the SH, so this does a longword read and small
21! writes.
22!
23! This implementation makes two assumptions about how it is called:
24!
25! 1.: If the byte count is nonzero, the address of the last byte to be
26! copied is unsigned greater than the address of the first byte to
27! be copied. This could be easily swapped for a signed comparison,
28! but the algorithm used needs some comparison.
29!
30! 2.: When there are two or three bytes in the last word of an 11-or-more
31! bytes memory chunk to be copied, the rest of the word can be read
32! without side effects.
33! This could be easily changed by increasing the minimum size of
34! a fast memcpy and the amount subtracted from r7 before L_2l_loop be 2,
35! however, this would cost a few extra cycles on average.
36! For SHmedia, the assumption is that any quadword can be read in its
37! entirety if at least one byte is included in the copy.
38!
39
40 .section .text..SHmedia32,"ax"
41 .globl memcpy
42 .type memcpy, @function
43 .align 5
44
/*
 * memcpy (SHmedia)
 *
 * In:   r2 = destination, r3 = source, r4 = byte count,
 *       r18 = return address (moved into tr1 with ptabs before returning).
 * Out:  r2 = destination; no instruction in this routine writes r2.
 *
 * Registers written: r0, r1, r5-r9, r19-r25, r27, r36, tr0, tr1.
 *
 * Structure:
 *   - count >= 25 branches to Large (32-byte and 8-byte copy loops).
 *   - count <= 24 is dispatched through a table of 32-byte code slots
 *     that starts at L1; the slot is selected from nsb(count).
 *     Slot order: 0, 1, 2..3, 4..7, 8..15, 16..24 bytes.
 *     Every slot has to stay exactly 32 bytes (eight instructions) long,
 *     which is why the "cntd." tails of some size classes are packed
 *     into the spare space of other slots.  Do not insert or remove
 *     instructions between L1 and Large without re-checking the layout.
 */
45memcpy:
46
/*
 * Unaligned access helpers: each expands to an lo/hi instruction pair
 * addressing P+O and P+O+7 (.q, 8 bytes) or P+O+3 (.l, 4 bytes).
 * After a load pair the caller ORs D0 and D1 together.
 * STUAQ and STUAL are defined but not referenced in this routine.
 */
47#define LDUAQ(P,O,D0,D1) ldlo.q P,O,D0; ldhi.q P,O+7,D1
48#define STUAQ(P,O,D0,D1) stlo.q P,O,D0; sthi.q P,O+7,D1
49#define LDUAL(P,O,D0,D1) ldlo.l P,O,D0; ldhi.l P,O+3,D1
50#define STUAL(P,O,D0,D1) stlo.l P,O,D0; sthi.l P,O+3,D1
51
52 ld.b r3,0,r63 /* touch the first source byte; result discarded (r63) */
53 pta/l Large,tr0
54 movi 25,r0
55 bgeu/u r4,r0,tr0 /* count >= 25 (unsigned): take the Large path */
56 nsb r4,r0 /* size class of count, see the dispatch note above */
57 shlli r0,5,r0 /* scale by the 32-byte slot size */
/* NOTE(review): the "+ 1" presumably sets the low bit that marks an SHmedia branch target - confirm */
58 movi (L1-L0+63*32 + 1) & 0xffff,r1
59 sub r1, r0, r0 /* r0 = offset from L0 to the selected slot */
60L0: ptrel r0,tr0
61 add r2,r4,r5 /* r5 = destination end (dst + count) */
62 ptabs r18,tr1 /* tr1 = return target */
63 add r3,r4,r6 /* r6 = source end (src + count) */
64 blink tr0,r63 /* jump to the slot; link value discarded (r63) */
65
66/* Rearranged to make cut2 safe */
67 .balign 8
/* Tail of the 4..7 byte slot: r0 = first 4 source bytes, r7/r6 = halves of the last 4. */
68L4_7: /* 4..7 byte memcpy cntd. */
69 stlo.l r2, 0, r0 /* second half of the store of the first 4 bytes at dst */
70 or r6, r7, r6 /* merge the two halves of the last 4 source bytes */
71 sthi.l r5, -1, r6 /* store them so that they end at dst + count */
72 stlo.l r5, -4, r6
73 blink tr1,r63 /* return */
74
75 .balign 8
/* Start of the dispatch table.  Slot 0: return at once; the nops pad the slot. */
76L1: /* 0 byte memcpy */
77 nop
78 blink tr1,r63
79 nop
80 nop
81 nop
82 nop
83
/* Placed in the last two instruction positions of slot 0; r6 = last source byte. */
84L2_3: /* 2 or 3 byte memcpy cntd. */
85 st.b r5,-1,r6 /* store the last byte at dst + count - 1 */
86 blink tr1,r63 /* return */
87
/* Slot 1: three instructions; L8_15 below fills the rest of the slot. */
88 /* 1 byte memcpy */
89 ld.b r3,0,r0
90 st.b r2,0,r0
91 blink tr1,r63 /* return */
92
/* Tail of the 8..15 byte slot: r0 = first 8 source bytes, r7/r6 = halves of the last 8. */
93L8_15: /* 8..15 byte memcpy cntd. */
94 stlo.q r2, 0, r0 /* second half of the store of the first 8 bytes at dst */
95 or r6, r7, r6 /* merge the two halves of the last 8 source bytes */
96 sthi.q r5, -1, r6 /* store them so that they end at dst + count */
97 stlo.q r5, -8, r6
98 blink tr1,r63 /* return */
99
/* Slot 2: copy bytes 0 and 1 here, then the last byte at L2_3. */
100 /* 2 or 3 byte memcpy */
101 ld.b r3,0,r0 /* first source byte */
102 ld.b r2,0,r63 /* touch the destination; result discarded */
103 ld.b r3,1,r1 /* second source byte */
104 st.b r2,0,r0
105 pta/l L2_3,tr0
106 ld.b r6,-1,r6 /* last source byte (r6 was src + count) */
107 st.b r2,1,r1
108 blink tr0, r63 /* continue at L2_3 */
109
/* Slot 3: first 4 bytes and last 4 bytes, which overlap when count < 8. */
110 /* 4 .. 7 byte memcpy */
111 LDUAL (r3, 0, r0, r1) /* first 4 source bytes, halves in r0 and r1 */
112 pta L4_7, tr0
113 ldlo.l r6, -4, r7 /* last 4 source bytes, first half */
114 or r0, r1, r0
115 sthi.l r2, 3, r0 /* first half of the store at dst; L4_7 does the stlo */
116 ldhi.l r6, -1, r6 /* last 4 source bytes, second half */
117 blink tr0, r63 /* continue at L4_7 */
118
/* Slot 4: first 8 bytes and last 8 bytes, which overlap when count < 16. */
119 /* 8 .. 15 byte memcpy */
120 LDUAQ (r3, 0, r0, r1) /* first 8 source bytes, halves in r0 and r1 */
121 pta L8_15, tr0
122 ldlo.q r6, -8, r7 /* last 8 source bytes, first half */
123 or r0, r1, r0
124 sthi.q r2, 7, r0 /* first half of the store at dst; L8_15 does the stlo */
125 ldhi.q r6, -1, r6 /* last 8 source bytes, second half */
126 blink tr0, r63 /* continue at L8_15 */
127
/* Slot 5 (last slot, runs up to Large): bytes 0..7, 8..15 and the last 8 bytes. */
128 /* 16 .. 24 byte memcpy */
129 LDUAQ (r3, 0, r0, r1) /* source bytes 0..7 */
130 LDUAQ (r3, 8, r8, r9) /* source bytes 8..15 */
131 or r0, r1, r0
132 sthi.q r2, 7, r0
133 or r8, r9, r8
134 sthi.q r2, 15, r8
135 ldlo.q r6, -8, r7 /* last 8 source bytes (may overlap bytes 8..15) */
136 ldhi.q r6, -1, r6
137 stlo.q r2, 8, r8
138 stlo.q r2, 0, r0
139 or r6, r7, r6
140 sthi.q r5, -1, r6 /* store the last 8 bytes, ending at dst + count */
141 stlo.q r5, -8, r6
142 blink tr1,r63 /* return */
143
/*
 * count >= 25.  Register roles from here on:
 *   r22 = running destination pointer
 *   r6  = src - dst, so r22 + r6 is the source address matching r22
 *   r0  = source quadword already loaded for the next store
 *   r5  = dst + count - 16 (Loop_ua limit)
 */
144Large:
145 ld.b r2, 0, r63 /* touch the destination; result discarded */
146 pta/l Loop_ua, tr1
147 ori r3, -8, r7 /* r7 = (src & 7) - 8, a value in [-8, -1] */
148 sub r2, r7, r22 /* r22 = dst + 8 - (src & 7) */
149 sub r3, r2, r6 /* r6 = src - dst; r22 + r6 is 8-byte aligned */
150 add r2, r4, r5
151 ldlo.q r3, 0, r0 /* NOTE(review): presumably only the bytes up to the next 8-byte source boundary - confirm */
152 addi r5, -16, r5 /* r5 = dst + count - 16 */
153 movi 64+8, r27 // could subtract r7 from that.
154 stlo.q r2, 0, r0 /* store r0 as 8 unaligned bytes at dst; stores from r22 on follow */
155 sthi.q r2, 7, r0
156 ldx.q r22, r6, r0 /* r0 = source quadword for destination r22 */
157 bgtu/l r27, r4, tr1 /* count < 72: skip Loop_line, go to Loop_ua */
158
159 addi r5, -48, r27 /* r27 = dst + count - 64 (Loop_line limit) */
160 pta/l Loop_line, tr0
161 addi r6, 64, r36 /* index for the look-ahead load in Loop_line */
162 addi r6, -24, r19 /* r19..r21: source indexes for the 2nd..4th quadword */
163 addi r6, -16, r20
164 addi r6, -8, r21
165
/* Copies 32 bytes per iteration; r0 carries one quadword across iterations. */
166Loop_line:
167 ldx.q r22, r36, r63 /* load 64 bytes past the current source position; result discarded */
168 alloco r22, 32 /* NOTE(review): presumably allocates the destination cache block without fetching it - confirm */
169 addi r22, 32, r22
170 ldx.q r22, r19, r23 /* source quadword for r22 - 24 */
171 sthi.q r22, -25, r0 /* r0 -> r22 - 32 (sthi half) */
172 ldx.q r22, r20, r24 /* source quadword for r22 - 16 */
173 ldx.q r22, r21, r25 /* source quadword for r22 - 8 */
174 stlo.q r22, -32, r0 /* r0 -> r22 - 32 (stlo half) */
175 ldx.q r22, r6, r0 /* source quadword for r22, stored by the next iteration or Loop_ua */
176 sthi.q r22, -17, r23
177 sthi.q r22, -9, r24
178 sthi.q r22, -1, r25
179 stlo.q r22, -24, r23
180 stlo.q r22, -16, r24
181 stlo.q r22, -8, r25
182 bgeu r27, r22, tr0 /* loop while r22 <= dst + count - 64 */
183
/* Copies 8 bytes per iteration; tr1 points here until the ptabs below. */
184Loop_ua:
185 addi r22, 8, r22
186 sthi.q r22, -1, r0 /* unaligned store of r0 at r22 - 8 */
187 stlo.q r22, -8, r0
188 ldx.q r22, r6, r0 /* source quadword for r22 */
189 bgtu/l r5, r22, tr1 /* loop while r22 < dst + count - 16 */
190
/* Tail: store the pending quadword at r22, then the last 8 bytes of the copy. */
191 add r3, r4, r7 /* r7 = source end (src + count) */
192 ldlo.q r7, -8, r1 /* last 8 source bytes, first half */
193 sthi.q r22, 7, r0
194 ldhi.q r7, -1, r7 /* last 8 source bytes, second half */
195 ptabs r18,tr1 /* tr1 = return target */
196 stlo.q r22, 0, r0
197 or r1, r7, r1
198 sthi.q r5, 15, r1 /* store the last 8 bytes at dst + count - 8 */
199 stlo.q r5, 8, r1
200 blink tr1, r63 /* return */
201
202 .size memcpy,.-memcpy