Linux Audio

Check our new training course

Loading...
v3.5.6
 
  1/* Cloned and hacked for uClibc by Paul Mundt, December 2003 */
  2/* Modified by SuperH, Inc. September 2003 */
  3!
  4! Fast SH memcpy
  5!
  6! by Toshiyasu Morita (tm@netcom.com)
  7! hacked by J"orn Rennecke (joern.rennecke@superh.com) ("o for o-umlaut)
  8! SH5 code Copyright 2002 SuperH Ltd.
  9!
 10! Entry: ARG0: destination pointer
 11!        ARG1: source pointer
 12!        ARG2: byte count
 13!
 14! Exit:  RESULT: destination pointer
 15!        any other registers in the range r0-r7: trashed
 16!
 17! Notes: Usually one wants to do small reads and write a longword, but
 18!        unfortunately it is difficult in some cases to concatenate bytes
 19!        into a longword on the SH, so this does a longword read and small
 20!        writes.
 21!
 22! This implementation makes two assumptions about how it is called:
 23!
 24! 1.: If the byte count is nonzero, the address of the last byte to be
 25!     copied is unsigned greater than the address of the first byte to
 26!     be copied.  This could be easily swapped for a signed comparison,
 27!     but the algorithm used needs some comparison.
 28!
 29! 2.: When there are two or three bytes in the last word of an 11-or-more
 30!     bytes memory chunk to be copied, the rest of the word can be read
 31!     without side effects.
 32!     This could be easily changed by increasing the minimum size of
 33!     a fast memcpy and the amount subtracted from r7 before L_2l_loop be 2,
 34!     however, this would cost a few extra cycles on average.
 35!     For SHmedia, the assumption is that any quadword can be read in its
 36!     entirety if at least one byte is included in the copy.
 37!
 38
 39	.section .text..SHmedia32,"ax"
 40	.globl	memcpy
 41	.type	memcpy, @function
 42	.align	5
 43
/*
 * memcpy entry and size dispatch.
 *
 * Register roles, as used by the instructions in this file:
 *   r2  destination pointer (never written by this routine, so it is
 *       still intact on return)
 *   r3  source pointer
 *   r4  byte count
 *   r5  r2 + r4, one past the last destination byte
 *   r6  r3 + r4, one past the last source byte
 *   r18 return address; it is moved into tr1 with ptabs and every exit
 *       path ends in "blink tr1,r63"
 *
 * Counts of 25 or more branch to Large.  Smaller counts jump into a table
 * of 32-byte slots that starts at L1; the slot index is 63 - nsb(r4).
 */
 44memcpy:
 45
/*
 * Unaligned access helpers: a lo/hi instruction pair covering O .. O+7
 * (quadword forms) or O .. O+3 (longword forms) relative to P.  The load
 * forms leave two partial results in D0 and D1, which the callers combine
 * with "or".  STUAQ and STUAL are not referenced by the code in this file.
 */
 46#define LDUAQ(P,O,D0,D1) ldlo.q P,O,D0; ldhi.q P,O+7,D1
 47#define STUAQ(P,O,D0,D1) stlo.q P,O,D0; sthi.q P,O+7,D1
 48#define LDUAL(P,O,D0,D1) ldlo.l P,O,D0; ldhi.l P,O+3,D1
 49#define STUAL(P,O,D0,D1) stlo.l P,O,D0; sthi.l P,O+3,D1
 50
/*
 * Load the first source byte into r63; the loaded value is never used.
 * NOTE(review): presumably r63 discards writes, which would make this a
 * pure touch of the source - confirm against the SH-5 manual.
 */
 51	ld.b r3,0,r63
 52	pta/l Large,tr0
 53	movi 25,r0
/* Unsigned r4 >= 25: take the Large path. */
 54	bgeu/u r4,r0,tr0
/*
 * r0 = (L1 - L0) + (63 - nsb(r4)) * 32 + 1.  The expression is written
 * relative to L0, where ptrel consumes it, so the branch target is
 * L1 + (63 - nsb(r4)) * 32.  The slot comments further down (0, 1, 2-3,
 * 4-7, 8-15, 16-24 bytes) imply that nsb(r4) is 63 for a zero count and
 * drops by one each time the count doubles.
 * NOTE(review): the "+ 1" is presumably the SHmedia mode bit of the target
 * address - confirm against the SH-5 manual.
 */
 55	nsb r4,r0
 56	shlli r0,5,r0
 57	movi (L1-L0+63*32 + 1) & 0xffff,r1
 58	sub r1, r0, r0
 59L0:	ptrel r0,tr0
/* r5 = destination end, tr1 = return address, r6 = source end. */
 60	add r2,r4,r5
 61	ptabs r18,tr1
 62	add r3,r4,r6
/* Jump into the selected table slot. */
 63	blink tr0,r63
 64	
 64	
 65/* Rearranged to make cut2 safe */
/*
 * Small-copy jump table.  The entry code jumps to L1 + 32 * k, so every
 * slot after L1 must stay exactly 32 bytes long (eight instructions here,
 * which assumes 4-byte instructions - confirm).  Do not add or remove
 * instructions inside a slot.  Slots, in order from L1:
 *   0 bytes, 1 byte, 2-3 bytes, 4-7 bytes, 8-15 bytes, 16-24 bytes.
 * The two short slots are filled up with the "cntd." tails of other cases
 * (L2_3 and L8_15); L4_7 sits in front of the table.
 * On entry to a slot: r2 = destination, r3 = source, r5 = r2 + count,
 * r6 = r3 + count, tr1 = return address.
 */
 66	.balign 8
 67L4_7:	/* 4..7 byte memcpy cntd. */
/* Finish the first longword at r2, then store the last four bytes so
   that they end at r5 (destination end). */
 68	stlo.l r2, 0, r0
 69	or r6, r7, r6
 70	sthi.l r5, -1, r6
 71	stlo.l r5, -4, r6
 72	blink tr1,r63
 73
 74	.balign 8
 75L1:	/* 0 byte memcpy */
/* Nothing to copy: return.  The nops pad the slot so that, together with
   L2_3, it is eight instructions long. */
 76	nop
 77	blink tr1,r63
 78	nop
 79	nop
 80	nop
 81	nop
 82
 83L2_3:	/* 2 or 3 byte memcpy cntd. */
/* r6 holds the last source byte here; store it at destination end - 1. */
 84	st.b r5,-1,r6
 85	blink tr1,r63
 86
 87	/* 1 byte memcpy */
 88	ld.b r3,0,r0
 89	st.b r2,0,r0
 90	blink tr1,r63
 91
 92L8_15:	/* 8..15 byte memcpy cntd. */
/* Finish the first quadword at r2, then store the last eight bytes so
   that they end at r5 (destination end). */
 93	stlo.q r2, 0, r0
 94	or r6, r7, r6
 95	sthi.q r5, -1, r6
 96	stlo.q r5, -8, r6
 97	blink tr1,r63
 98	
 99	/* 2 or 3 byte memcpy */
/* Copies bytes 0 and 1 here and the last byte in L2_3; for a count of 2
   the last byte is byte 1, which is simply stored twice.  The load from
   r2 into r63 only touches the destination; its value is not used. */
100	ld.b r3,0,r0
101	ld.b r2,0,r63
102	ld.b r3,1,r1
103	st.b r2,0,r0
104	pta/l L2_3,tr0
105	ld.b r6,-1,r6
106	st.b r2,1,r1
107	blink tr0, r63
108
109	/* 4 .. 7 byte memcpy */
/* Loads the first four and the last four source bytes (r0/r1 and r7/r6),
   starts the first store and continues at L4_7.  The two four-byte
   stores overlap when the count is below 8. */
110	LDUAL (r3, 0, r0, r1)
111	pta L4_7, tr0
112	ldlo.l r6, -4, r7
113	or r0, r1, r0
114	sthi.l r2, 3, r0
115	ldhi.l r6, -1, r6
116	blink tr0, r63
117
118	/* 8 .. 15 byte memcpy */
/* Same scheme as the 4..7 case with quadwords: first eight and last eight
   source bytes, continued at L8_15. */
119	LDUAQ (r3, 0, r0, r1)
120	pta L8_15, tr0
121	ldlo.q r6, -8, r7
122	or r0, r1, r0
123	sthi.q r2, 7, r0
124	ldhi.q r6, -1, r6
125	blink tr0, r63
126
127	/* 16 .. 24 byte memcpy */
/* Last slot, so it may run past 32 bytes.  Copies source bytes 0..7 and
   8..15, then the last eight bytes ending at r5.  All source loads are
   issued before the final stores.  This slot uses r8 and r9 as well. */
128	LDUAQ (r3, 0, r0, r1)
129	LDUAQ (r3, 8, r8, r9)
130	or r0, r1, r0
131	sthi.q r2, 7, r0
132	or r8, r9, r8
133	sthi.q r2, 15, r8
134	ldlo.q r6, -8, r7
135	ldhi.q r6, -1, r6
136	stlo.q r2, 8, r8
137	stlo.q r2, 0, r0
138	or r6, r7, r6
139	sthi.q r5, -1, r6
140	stlo.q r5, -8, r6
141	blink tr1,r63
142
143Large:
/*
 * Copy path for counts of 25 bytes or more (selected by the bgeu test at
 * entry).  Source quadwords are read from 8-byte aligned addresses with
 * ldx.q and written to the destination with sthi.q/stlo.q pairs.
 *
 * Values set up here:
 *   r7  = r3 | -8, i.e. (r3 & 7) - 8, in the range -8 .. -1
 *   r22 = r2 - r7, the destination address that corresponds to the next
 *         8-byte boundary of the source
 *   r6  = r3 - r2, so r22 + r6 is always an 8-byte aligned source address
 *   r5  = r2 + r4 - 16
 *   r0  = source quadword already loaded, stored by the next loop pass
 *
 * NOTE(review): this path writes r19-r25, r27 and r36, which lie outside
 * the r0-r7 range the file header lists as trashed - confirm against the
 * SH-5 ABI.
 */
/* Touch the first destination byte; the loaded value is not used. */
144	ld.b r2, 0, r63
145	pta/l  Loop_ua, tr1
146	ori r3, -8, r7
147	sub r2, r7, r22
148	sub r3, r2, r6
149	add r2, r4, r5
/* Head: ldlo.q at r3 followed by a stlo.q/sthi.q pair at r2.  Destination
   bytes from r22 onwards are written again by the stores that follow.
   NOTE(review): presumably ldlo.q fetches the source bytes from r3 up to
   its 8-byte boundary - confirm against the SH-5 manual. */
150	ldlo.q r3, 0, r0
151	addi r5, -16, r5
152	movi 64+8, r27 // could subtract r7 from that.
153	stlo.q r2, 0, r0
154	sthi.q r2, 7, r0
155	ldx.q r22, r6, r0
/* 72 > count: skip the 32-byte loop and go straight to Loop_ua. */
156	bgtu/l r27, r4, tr1
157
/* Loop_line setup: r27 = r5 - 48 is the loop bound; r36, r19, r20, r21
   are source offsets relative to r22 (+64, -24, -16, -8 from r6). */
158	addi r5, -48, r27
159	pta/l Loop_line, tr0
160	addi r6, 64, r36
161	addi r6, -24, r19
162	addi r6, -16, r20
163	addi r6, -8, r21
164
165Loop_line:
/*
 * 32 bytes per pass.  The load into r63 reads the source 64 bytes ahead
 * and discards the value.  After r22 is advanced by 32, the quadword kept
 * in r0 is stored at r22 - 32 and r23/r24/r25 at r22 - 24/-16/-8, while
 * r0 is reloaded with the source quadword for r22.  Repeats while
 * r27 >= r22 (unsigned).
 * NOTE(review): alloco presumably allocates the destination cache block
 * without reading it from memory - confirm against the SH-5 manual.
 */
166	ldx.q r22, r36, r63
167	alloco r22, 32
168	addi r22, 32, r22
169	ldx.q r22, r19, r23
170	sthi.q r22, -25, r0
171	ldx.q r22, r20, r24
172	ldx.q r22, r21, r25
173	stlo.q r22, -32, r0
174	ldx.q r22, r6,  r0
175	sthi.q r22, -17, r23
176	sthi.q r22,  -9, r24
177	sthi.q r22,  -1, r25
178	stlo.q r22, -24, r23
179	stlo.q r22, -16, r24
180	stlo.q r22,  -8, r25
181	bgeu r27, r22, tr0
182
183Loop_ua:
/* 8 bytes per pass: advance r22, store the quadword kept in r0 at
   r22 - 8, load the next aligned source quadword.  tr1 still points at
   Loop_ua; repeats while r5 > r22 (unsigned). */
184	addi r22, 8, r22
185	sthi.q r22, -1, r0
186	stlo.q r22, -8, r0
187	ldx.q r22, r6, r0
188	bgtu/l r5, r22, tr1
189
/* Tail: r7 = source end.  Load the last eight source bytes (r1/r7), store
   the quadword still kept in r0 at r22, then store the last eight bytes
   at r5 + 8 .. r5 + 15, i.e. ending at the destination end.  tr1 is
   reloaded with the return address from r18 before the final blink. */
190	add r3, r4, r7
191	ldlo.q r7, -8, r1
192	sthi.q r22, 7, r0
193	ldhi.q r7, -1, r7
194	ptabs r18,tr1
195	stlo.q r22, 0, r0
196	or r1, r7, r1
197	sthi.q r5, 15, r1
198	stlo.q r5, 8, r1
199	blink tr1, r63
200
201	.size memcpy,.-memcpy
v5.4
  1/* SPDX-License-Identifier: GPL-2.0 */
  2/* Cloned and hacked for uClibc by Paul Mundt, December 2003 */
  3/* Modified by SuperH, Inc. September 2003 */
  4!
  5! Fast SH memcpy
  6!
  7! by Toshiyasu Morita (tm@netcom.com)
  8! hacked by J"orn Rennecke (joern.rennecke@superh.com) ("o for o-umlaut)
  9! SH5 code Copyright 2002 SuperH Ltd.
 10!
 11! Entry: ARG0: destination pointer
 12!        ARG1: source pointer
 13!        ARG2: byte count
 14!
 15! Exit:  RESULT: destination pointer
 16!        any other registers in the range r0-r7: trashed
 17!
 18! Notes: Usually one wants to do small reads and write a longword, but
 19!        unfortunately it is difficult in some cases to concatenate bytes
 20!        into a longword on the SH, so this does a longword read and small
 21!        writes.
 22!
 23! This implementation makes two assumptions about how it is called:
 24!
 25! 1.: If the byte count is nonzero, the address of the last byte to be
 26!     copied is unsigned greater than the address of the first byte to
 27!     be copied.  This could be easily swapped for a signed comparison,
 28!     but the algorithm used needs some comparison.
 29!
 30! 2.: When there are two or three bytes in the last word of an 11-or-more
 31!     bytes memory chunk to be copied, the rest of the word can be read
 32!     without side effects.
 33!     This could be easily changed by increasing the minimum size of
 34!     a fast memcpy and the amount subtracted from r7 before L_2l_loop be 2,
 35!     however, this would cost a few extra cycles on average.
 36!     For SHmedia, the assumption is that any quadword can be read in its
 37!     entirety if at least one byte is included in the copy.
 38!
 39
 40	.section .text..SHmedia32,"ax"
 41	.globl	memcpy
 42	.type	memcpy, @function
 43	.align	5
 44
/*
 * memcpy entry and size dispatch.
 *
 * Register roles, as used by the instructions in this file:
 *   r2  destination pointer (never written by this routine, so it is
 *       still intact on return)
 *   r3  source pointer
 *   r4  byte count
 *   r5  r2 + r4, one past the last destination byte
 *   r6  r3 + r4, one past the last source byte
 *   r18 return address; it is moved into tr1 with ptabs and every exit
 *       path ends in "blink tr1,r63"
 *
 * Counts of 25 or more branch to Large.  Smaller counts jump into a table
 * of 32-byte slots that starts at L1; the slot index is 63 - nsb(r4).
 */
 45memcpy:
 46
/*
 * Unaligned access helpers: a lo/hi instruction pair covering O .. O+7
 * (quadword forms) or O .. O+3 (longword forms) relative to P.  The load
 * forms leave two partial results in D0 and D1, which the callers combine
 * with "or".  STUAQ and STUAL are not referenced by the code in this file.
 */
 47#define LDUAQ(P,O,D0,D1) ldlo.q P,O,D0; ldhi.q P,O+7,D1
 48#define STUAQ(P,O,D0,D1) stlo.q P,O,D0; sthi.q P,O+7,D1
 49#define LDUAL(P,O,D0,D1) ldlo.l P,O,D0; ldhi.l P,O+3,D1
 50#define STUAL(P,O,D0,D1) stlo.l P,O,D0; sthi.l P,O+3,D1
 51
/*
 * Load the first source byte into r63; the loaded value is never used.
 * NOTE(review): presumably r63 discards writes, which would make this a
 * pure touch of the source - confirm against the SH-5 manual.
 */
 52	ld.b r3,0,r63
 53	pta/l Large,tr0
 54	movi 25,r0
/* Unsigned r4 >= 25: take the Large path. */
 55	bgeu/u r4,r0,tr0
/*
 * r0 = (L1 - L0) + (63 - nsb(r4)) * 32 + 1.  The expression is written
 * relative to L0, where ptrel consumes it, so the branch target is
 * L1 + (63 - nsb(r4)) * 32.  The slot comments further down (0, 1, 2-3,
 * 4-7, 8-15, 16-24 bytes) imply that nsb(r4) is 63 for a zero count and
 * drops by one each time the count doubles.
 * NOTE(review): the "+ 1" is presumably the SHmedia mode bit of the target
 * address - confirm against the SH-5 manual.
 */
 56	nsb r4,r0
 57	shlli r0,5,r0
 58	movi (L1-L0+63*32 + 1) & 0xffff,r1
 59	sub r1, r0, r0
 60L0:	ptrel r0,tr0
/* r5 = destination end, tr1 = return address, r6 = source end. */
 61	add r2,r4,r5
 62	ptabs r18,tr1
 63	add r3,r4,r6
/* Jump into the selected table slot. */
 64	blink tr0,r63
 65	
 65	
 66/* Rearranged to make cut2 safe */
/*
 * Small-copy jump table.  The entry code jumps to L1 + 32 * k, so every
 * slot after L1 must stay exactly 32 bytes long (eight instructions here,
 * which assumes 4-byte instructions - confirm).  Do not add or remove
 * instructions inside a slot.  Slots, in order from L1:
 *   0 bytes, 1 byte, 2-3 bytes, 4-7 bytes, 8-15 bytes, 16-24 bytes.
 * The two short slots are filled up with the "cntd." tails of other cases
 * (L2_3 and L8_15); L4_7 sits in front of the table.
 * On entry to a slot: r2 = destination, r3 = source, r5 = r2 + count,
 * r6 = r3 + count, tr1 = return address.
 */
 67	.balign 8
 68L4_7:	/* 4..7 byte memcpy cntd. */
/* Finish the first longword at r2, then store the last four bytes so
   that they end at r5 (destination end). */
 69	stlo.l r2, 0, r0
 70	or r6, r7, r6
 71	sthi.l r5, -1, r6
 72	stlo.l r5, -4, r6
 73	blink tr1,r63
 74
 75	.balign 8
 76L1:	/* 0 byte memcpy */
/* Nothing to copy: return.  The nops pad the slot so that, together with
   L2_3, it is eight instructions long. */
 77	nop
 78	blink tr1,r63
 79	nop
 80	nop
 81	nop
 82	nop
 83
 84L2_3:	/* 2 or 3 byte memcpy cntd. */
/* r6 holds the last source byte here; store it at destination end - 1. */
 85	st.b r5,-1,r6
 86	blink tr1,r63
 87
 88	/* 1 byte memcpy */
 89	ld.b r3,0,r0
 90	st.b r2,0,r0
 91	blink tr1,r63
 92
 93L8_15:	/* 8..15 byte memcpy cntd. */
/* Finish the first quadword at r2, then store the last eight bytes so
   that they end at r5 (destination end). */
 94	stlo.q r2, 0, r0
 95	or r6, r7, r6
 96	sthi.q r5, -1, r6
 97	stlo.q r5, -8, r6
 98	blink tr1,r63
 99	
100	/* 2 or 3 byte memcpy */
/* Copies bytes 0 and 1 here and the last byte in L2_3; for a count of 2
   the last byte is byte 1, which is simply stored twice.  The load from
   r2 into r63 only touches the destination; its value is not used. */
101	ld.b r3,0,r0
102	ld.b r2,0,r63
103	ld.b r3,1,r1
104	st.b r2,0,r0
105	pta/l L2_3,tr0
106	ld.b r6,-1,r6
107	st.b r2,1,r1
108	blink tr0, r63
109
110	/* 4 .. 7 byte memcpy */
/* Loads the first four and the last four source bytes (r0/r1 and r7/r6),
   starts the first store and continues at L4_7.  The two four-byte
   stores overlap when the count is below 8. */
111	LDUAL (r3, 0, r0, r1)
112	pta L4_7, tr0
113	ldlo.l r6, -4, r7
114	or r0, r1, r0
115	sthi.l r2, 3, r0
116	ldhi.l r6, -1, r6
117	blink tr0, r63
118
119	/* 8 .. 15 byte memcpy */
/* Same scheme as the 4..7 case with quadwords: first eight and last eight
   source bytes, continued at L8_15. */
120	LDUAQ (r3, 0, r0, r1)
121	pta L8_15, tr0
122	ldlo.q r6, -8, r7
123	or r0, r1, r0
124	sthi.q r2, 7, r0
125	ldhi.q r6, -1, r6
126	blink tr0, r63
127
128	/* 16 .. 24 byte memcpy */
/* Last slot, so it may run past 32 bytes.  Copies source bytes 0..7 and
   8..15, then the last eight bytes ending at r5.  All source loads are
   issued before the final stores.  This slot uses r8 and r9 as well. */
129	LDUAQ (r3, 0, r0, r1)
130	LDUAQ (r3, 8, r8, r9)
131	or r0, r1, r0
132	sthi.q r2, 7, r0
133	or r8, r9, r8
134	sthi.q r2, 15, r8
135	ldlo.q r6, -8, r7
136	ldhi.q r6, -1, r6
137	stlo.q r2, 8, r8
138	stlo.q r2, 0, r0
139	or r6, r7, r6
140	sthi.q r5, -1, r6
141	stlo.q r5, -8, r6
142	blink tr1,r63
143
144Large:
/*
 * Copy path for counts of 25 bytes or more (selected by the bgeu test at
 * entry).  Source quadwords are read from 8-byte aligned addresses with
 * ldx.q and written to the destination with sthi.q/stlo.q pairs.
 *
 * Values set up here:
 *   r7  = r3 | -8, i.e. (r3 & 7) - 8, in the range -8 .. -1
 *   r22 = r2 - r7, the destination address that corresponds to the next
 *         8-byte boundary of the source
 *   r6  = r3 - r2, so r22 + r6 is always an 8-byte aligned source address
 *   r5  = r2 + r4 - 16
 *   r0  = source quadword already loaded, stored by the next loop pass
 *
 * NOTE(review): this path writes r19-r25, r27 and r36, which lie outside
 * the r0-r7 range the file header lists as trashed - confirm against the
 * SH-5 ABI.
 */
/* Touch the first destination byte; the loaded value is not used. */
145	ld.b r2, 0, r63
146	pta/l  Loop_ua, tr1
147	ori r3, -8, r7
148	sub r2, r7, r22
149	sub r3, r2, r6
150	add r2, r4, r5
/* Head: ldlo.q at r3 followed by a stlo.q/sthi.q pair at r2.  Destination
   bytes from r22 onwards are written again by the stores that follow.
   NOTE(review): presumably ldlo.q fetches the source bytes from r3 up to
   its 8-byte boundary - confirm against the SH-5 manual. */
151	ldlo.q r3, 0, r0
152	addi r5, -16, r5
153	movi 64+8, r27 // could subtract r7 from that.
154	stlo.q r2, 0, r0
155	sthi.q r2, 7, r0
156	ldx.q r22, r6, r0
/* 72 > count: skip the 32-byte loop and go straight to Loop_ua. */
157	bgtu/l r27, r4, tr1
158
/* Loop_line setup: r27 = r5 - 48 is the loop bound; r36, r19, r20, r21
   are source offsets relative to r22 (+64, -24, -16, -8 from r6). */
159	addi r5, -48, r27
160	pta/l Loop_line, tr0
161	addi r6, 64, r36
162	addi r6, -24, r19
163	addi r6, -16, r20
164	addi r6, -8, r21
165
166Loop_line:
/*
 * 32 bytes per pass.  The load into r63 reads the source 64 bytes ahead
 * and discards the value.  After r22 is advanced by 32, the quadword kept
 * in r0 is stored at r22 - 32 and r23/r24/r25 at r22 - 24/-16/-8, while
 * r0 is reloaded with the source quadword for r22.  Repeats while
 * r27 >= r22 (unsigned).
 * NOTE(review): alloco presumably allocates the destination cache block
 * without reading it from memory - confirm against the SH-5 manual.
 */
167	ldx.q r22, r36, r63
168	alloco r22, 32
169	addi r22, 32, r22
170	ldx.q r22, r19, r23
171	sthi.q r22, -25, r0
172	ldx.q r22, r20, r24
173	ldx.q r22, r21, r25
174	stlo.q r22, -32, r0
175	ldx.q r22, r6,  r0
176	sthi.q r22, -17, r23
177	sthi.q r22,  -9, r24
178	sthi.q r22,  -1, r25
179	stlo.q r22, -24, r23
180	stlo.q r22, -16, r24
181	stlo.q r22,  -8, r25
182	bgeu r27, r22, tr0
183
184Loop_ua:
/* 8 bytes per pass: advance r22, store the quadword kept in r0 at
   r22 - 8, load the next aligned source quadword.  tr1 still points at
   Loop_ua; repeats while r5 > r22 (unsigned). */
185	addi r22, 8, r22
186	sthi.q r22, -1, r0
187	stlo.q r22, -8, r0
188	ldx.q r22, r6, r0
189	bgtu/l r5, r22, tr1
190
/* Tail: r7 = source end.  Load the last eight source bytes (r1/r7), store
   the quadword still kept in r0 at r22, then store the last eight bytes
   at r5 + 8 .. r5 + 15, i.e. ending at the destination end.  tr1 is
   reloaded with the return address from r18 before the final blink. */
191	add r3, r4, r7
192	ldlo.q r7, -8, r1
193	sthi.q r22, 7, r0
194	ldhi.q r7, -1, r7
195	ptabs r18,tr1
196	stlo.q r22, 0, r0
197	or r1, r7, r1
198	sthi.q r5, 15, r1
199	stlo.q r5, 8, r1
200	blink tr1, r63
201
202	.size memcpy,.-memcpy