v3.5.6
/* U3memcpy.S: UltraSparc-III optimized memcpy.
 *
 * Copyright (C) 1999, 2000, 2004 David S. Miller (davem@redhat.com)
 */

#ifdef __KERNEL__
#include <asm/visasm.h>
#include <asm/asi.h>
#define GLOBAL_SPARE	%g7
#else
#define ASI_BLK_P 0xf0
#define FPRS_FEF  0x04
#ifdef MEMCPY_DEBUG
#define VISEntryHalf rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs; \
		     clr %g1; clr %g2; clr %g3; subcc %g0, %g0, %g0;
#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
#else
#define VISEntryHalf rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs
#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
#endif
#define GLOBAL_SPARE	%g5
#endif

#ifndef EX_LD
#define EX_LD(x)	x
#endif

#ifndef EX_ST
#define EX_ST(x)	x
#endif

#ifndef EX_RETVAL
#define EX_RETVAL(x)	x
#endif

#ifndef LOAD
#define LOAD(type,addr,dest)	type [addr], dest
#endif

#ifndef STORE
#define STORE(type,src,addr)	type src, [addr]
#endif

#ifndef STORE_BLK
#define STORE_BLK(src,addr)	stda src, [addr] ASI_BLK_P
#endif

#ifndef FUNC_NAME
#define FUNC_NAME	U3memcpy
#endif

#ifndef PREAMBLE
#define PREAMBLE
#endif

#ifndef XCC
#define XCC xcc
#endif

	.register	%g2,#scratch
	.register	%g3,#scratch

	/* Special/non-trivial issues of this code:
	 *
	 * 1) %o5 is preserved from VISEntryHalf to VISExitHalf
	 * 2) Only low 32 FPU registers are used so that only the
	 *    lower half of the FPU register set is dirtied by this
	 *    code.  This is especially important in the kernel.
	 * 3) This code never prefetches cachelines past the end
	 *    of the source buffer.
	 */
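	/* (A reading of point 2, added for clarity: the VISEntryHalf/
	 * VISExitHalf pair used below is meant to save and restore only
	 * the lower half of the FP register file, so staying within
	 * %f0-%f30 keeps the FPU state traffic at half of a full
	 * VISEntry/VISExit.)
	 */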

	.text
	.align		64

	/* The cheetah's flexible spine, oversized liver, enlarged heart,
	 * slender muscular body, and claws make it the swiftest hunter
	 * in Africa and the fastest animal on land.  Can reach speeds
	 * of up to 2.4GB per second.
	 */

	.globl	FUNC_NAME
	.type	FUNC_NAME,#function
FUNC_NAME:	/* %o0=dst, %o1=src, %o2=len */
	srlx		%o2, 31, %g2
	cmp		%g2, 0
	tne		%xcc, 5
	PREAMBLE
	mov		%o0, %o4
	cmp		%o2, 0
	be,pn		%XCC, 85f
	 or		%o0, %o1, %o3
	cmp		%o2, 16
	blu,a,pn	%XCC, 80f
	 or		%o3, %o2, %o3

	cmp		%o2, (3 * 64)
	blu,pt		%XCC, 70f
	 andcc		%o3, 0x7, %g0

	/* Clobbers o5/g1/g2/g3/g7/icc/xcc.  We must preserve
	 * o5 from here until we hit VISExitHalf.
	 */
	VISEntryHalf

	/* Is 'dst' already aligned on a 64-byte boundary? */
	andcc		%o0, 0x3f, %g2
	be,pt		%XCC, 2f

	/* Compute abs((dst & 0x3f) - 0x40) into %g2.  This is the number
	 * of bytes to copy to make 'dst' 64-byte aligned.  We pre-
	 * subtract this from 'len'.
	 */
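	/* (A worked example with hypothetical values: if dst & 0x3f =
	 * 0x2b, then %g2 = 0x2b - 0x40 = -0x15, negated to 0x15 = 21
	 * bytes of head copy.  Below, %g1 = 21 & 7 = 5 bytes go one at
	 * a time and %g2 = 21 & 0x38 = 16 bytes go as doublewords.)
	 */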
	 sub		%o0, %o1, GLOBAL_SPARE
	sub		%g2, 0x40, %g2
	sub		%g0, %g2, %g2
	sub		%o2, %g2, %o2
	andcc		%g2, 0x7, %g1
	be,pt		%icc, 2f
	 and		%g2, 0x38, %g2

1:	subcc		%g1, 0x1, %g1
	EX_LD(LOAD(ldub, %o1 + 0x00, %o3))
	EX_ST(STORE(stb, %o3, %o1 + GLOBAL_SPARE))
	bgu,pt		%XCC, 1b
	 add		%o1, 0x1, %o1

	add		%o1, GLOBAL_SPARE, %o0

2:	cmp		%g2, 0x0
	and		%o1, 0x7, %g1
	be,pt		%icc, 3f
	 alignaddr	%o1, %g0, %o1

	EX_LD(LOAD(ldd, %o1, %f4))
1:	EX_LD(LOAD(ldd, %o1 + 0x8, %f6))
	add		%o1, 0x8, %o1
	subcc		%g2, 0x8, %g2
	faligndata	%f4, %f6, %f0
	EX_ST(STORE(std, %f0, %o0))
	be,pn		%icc, 3f
	 add		%o0, 0x8, %o0

	EX_LD(LOAD(ldd, %o1 + 0x8, %f4))
	add		%o1, 0x8, %o1
	subcc		%g2, 0x8, %g2
	faligndata	%f6, %f4, %f2
	EX_ST(STORE(std, %f2, %o0))
	bne,pt		%icc, 1b
	 add		%o0, 0x8, %o0

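	/* (Note: the eight prefetches below warm src + 0x000 through
	 * src + 0x1c0 with #one_read hints; the steady-state block loop
	 * then issues one more prefetch per iteration, 0x1c0 ahead of
	 * %o1.)
	 */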
3:	LOAD(prefetch, %o1 + 0x000, #one_read)
	LOAD(prefetch, %o1 + 0x040, #one_read)
	andn		%o2, (0x40 - 1), GLOBAL_SPARE
	LOAD(prefetch, %o1 + 0x080, #one_read)
	LOAD(prefetch, %o1 + 0x0c0, #one_read)
	LOAD(prefetch, %o1 + 0x100, #one_read)
	EX_LD(LOAD(ldd, %o1 + 0x000, %f0))
	LOAD(prefetch, %o1 + 0x140, #one_read)
	EX_LD(LOAD(ldd, %o1 + 0x008, %f2))
	LOAD(prefetch, %o1 + 0x180, #one_read)
	EX_LD(LOAD(ldd, %o1 + 0x010, %f4))
	LOAD(prefetch, %o1 + 0x1c0, #one_read)
	faligndata	%f0, %f2, %f16
	EX_LD(LOAD(ldd, %o1 + 0x018, %f6))
	faligndata	%f2, %f4, %f18
	EX_LD(LOAD(ldd, %o1 + 0x020, %f8))
	faligndata	%f4, %f6, %f20
	EX_LD(LOAD(ldd, %o1 + 0x028, %f10))
	faligndata	%f6, %f8, %f22

	EX_LD(LOAD(ldd, %o1 + 0x030, %f12))
	faligndata	%f8, %f10, %f24
	EX_LD(LOAD(ldd, %o1 + 0x038, %f14))
	faligndata	%f10, %f12, %f26
	EX_LD(LOAD(ldd, %o1 + 0x040, %f0))

	subcc		GLOBAL_SPARE, 0x80, GLOBAL_SPARE
	add		%o1, 0x40, %o1
	bgu,pt		%XCC, 1f
	 srl		GLOBAL_SPARE, 6, %o3
	ba,pt		%xcc, 2f
	 nop

	.align		64
1:
	EX_LD(LOAD(ldd, %o1 + 0x008, %f2))
	faligndata	%f12, %f14, %f28
	EX_LD(LOAD(ldd, %o1 + 0x010, %f4))
	faligndata	%f14, %f0, %f30
	EX_ST(STORE_BLK(%f16, %o0))
	EX_LD(LOAD(ldd, %o1 + 0x018, %f6))
	faligndata	%f0, %f2, %f16
	add		%o0, 0x40, %o0

	EX_LD(LOAD(ldd, %o1 + 0x020, %f8))
	faligndata	%f2, %f4, %f18
	EX_LD(LOAD(ldd, %o1 + 0x028, %f10))
	faligndata	%f4, %f6, %f20
	EX_LD(LOAD(ldd, %o1 + 0x030, %f12))
	subcc		%o3, 0x01, %o3
	faligndata	%f6, %f8, %f22
	EX_LD(LOAD(ldd, %o1 + 0x038, %f14))

	faligndata	%f8, %f10, %f24
	EX_LD(LOAD(ldd, %o1 + 0x040, %f0))
	LOAD(prefetch, %o1 + 0x1c0, #one_read)
	faligndata	%f10, %f12, %f26
	bg,pt		%XCC, 1b
	 add		%o1, 0x40, %o1

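	/* (Note: the loop above is software pipelined.  Raw, possibly
	 * misaligned source doublewords stream through %f0-%f14 while
	 * faligndata assembles the realigned output block in %f16-%f30,
	 * which STORE_BLK then writes out 64 bytes at a time.)
	 */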
	/* Finally we copy the last full 64-byte block. */
2:
	EX_LD(LOAD(ldd, %o1 + 0x008, %f2))
	faligndata	%f12, %f14, %f28
	EX_LD(LOAD(ldd, %o1 + 0x010, %f4))
	faligndata	%f14, %f0, %f30
	EX_ST(STORE_BLK(%f16, %o0))
	EX_LD(LOAD(ldd, %o1 + 0x018, %f6))
	faligndata	%f0, %f2, %f16
	EX_LD(LOAD(ldd, %o1 + 0x020, %f8))
	faligndata	%f2, %f4, %f18
	EX_LD(LOAD(ldd, %o1 + 0x028, %f10))
	faligndata	%f4, %f6, %f20
	EX_LD(LOAD(ldd, %o1 + 0x030, %f12))
	faligndata	%f6, %f8, %f22
	EX_LD(LOAD(ldd, %o1 + 0x038, %f14))
	faligndata	%f8, %f10, %f24
	cmp		%g1, 0
	be,pt		%XCC, 1f
	 add		%o0, 0x40, %o0
	EX_LD(LOAD(ldd, %o1 + 0x040, %f0))
1:	faligndata	%f10, %f12, %f26
	faligndata	%f12, %f14, %f28
	faligndata	%f14, %f0, %f30
	EX_ST(STORE_BLK(%f16, %o0))
	add		%o0, 0x40, %o0
	add		%o1, 0x40, %o1
	membar		#Sync

	/* Now we copy the (len modulo 64) bytes at the end.
	 * Note how we borrow the %f0 loaded above.
	 *
	 * Also notice how this code is careful not to perform a
	 * load past the end of the src buffer.
	 */
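	/* (On the guard above: when %g1 != 0 the source is misaligned,
	 * so the aligned doubleword at %o1 still contains bytes that
	 * have not been copied yet and preloading it at label 2 was
	 * safe; when %g1 == 0 that read could have run past the buffer,
	 * so %f0 is instead reloaded below only when it is needed.)
	 */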
	and		%o2, 0x3f, %o2
	andcc		%o2, 0x38, %g2
	be,pn		%XCC, 2f
	 subcc		%g2, 0x8, %g2
	be,pn		%XCC, 2f
	 cmp		%g1, 0

	sub		%o2, %g2, %o2
	be,a,pt		%XCC, 1f
	 EX_LD(LOAD(ldd, %o1 + 0x00, %f0))

1:	EX_LD(LOAD(ldd, %o1 + 0x08, %f2))
	add		%o1, 0x8, %o1
	subcc		%g2, 0x8, %g2
	faligndata	%f0, %f2, %f8
	EX_ST(STORE(std, %f8, %o0))
	be,pn		%XCC, 2f
	 add		%o0, 0x8, %o0
	EX_LD(LOAD(ldd, %o1 + 0x08, %f0))
	add		%o1, 0x8, %o1
	subcc		%g2, 0x8, %g2
	faligndata	%f2, %f0, %f8
	EX_ST(STORE(std, %f8, %o0))
	bne,pn		%XCC, 1b
	 add		%o0, 0x8, %o0

	/* If anything is left, we copy it one byte at a time.
	 * Note that %g1 is (src & 0x7) saved above before the
	 * alignaddr was performed.
	 */
2:
	cmp		%o2, 0
	add		%o1, %g1, %o1
	VISExitHalf
	be,pn		%XCC, 85f
	 sub		%o0, %o1, %o3

	andcc		%g1, 0x7, %g0
	bne,pn		%icc, 90f
	 andcc		%o2, 0x8, %g0
	be,pt		%icc, 1f
	 nop
	EX_LD(LOAD(ldx, %o1, %o5))
	EX_ST(STORE(stx, %o5, %o1 + %o3))
	add		%o1, 0x8, %o1

1:	andcc		%o2, 0x4, %g0
	be,pt		%icc, 1f
	 nop
	EX_LD(LOAD(lduw, %o1, %o5))
	EX_ST(STORE(stw, %o5, %o1 + %o3))
	add		%o1, 0x4, %o1

1:	andcc		%o2, 0x2, %g0
	be,pt		%icc, 1f
	 nop
	EX_LD(LOAD(lduh, %o1, %o5))
	EX_ST(STORE(sth, %o5, %o1 + %o3))
	add		%o1, 0x2, %o1

1:	andcc		%o2, 0x1, %g0
	be,pt		%icc, 85f
	 nop
	EX_LD(LOAD(ldub, %o1, %o5))
	ba,pt		%xcc, 85f
	 EX_ST(STORE(stb, %o5, %o1 + %o3))

	.align		64
70: /* 16 < len <= 64 */
	bne,pn		%XCC, 75f
	 sub		%o0, %o1, %o3

72:
	andn		%o2, 0xf, GLOBAL_SPARE
	and		%o2, 0xf, %o2
1:	subcc		GLOBAL_SPARE, 0x10, GLOBAL_SPARE
	EX_LD(LOAD(ldx, %o1 + 0x00, %o5))
	EX_LD(LOAD(ldx, %o1 + 0x08, %g1))
	EX_ST(STORE(stx, %o5, %o1 + %o3))
	add		%o1, 0x8, %o1
	EX_ST(STORE(stx, %g1, %o1 + %o3))
	bgu,pt		%XCC, 1b
	 add		%o1, 0x8, %o1
73:	andcc		%o2, 0x8, %g0
	be,pt		%XCC, 1f
	 nop
	sub		%o2, 0x8, %o2
	EX_LD(LOAD(ldx, %o1, %o5))
	EX_ST(STORE(stx, %o5, %o1 + %o3))
	add		%o1, 0x8, %o1
1:	andcc		%o2, 0x4, %g0
	be,pt		%XCC, 1f
	 nop
	sub		%o2, 0x4, %o2
	EX_LD(LOAD(lduw, %o1, %o5))
	EX_ST(STORE(stw, %o5, %o1 + %o3))
	add		%o1, 0x4, %o1
1:	cmp		%o2, 0
	be,pt		%XCC, 85f
	 nop
	ba,pt		%xcc, 90f
	 nop

75:
	andcc		%o0, 0x7, %g1
	sub		%g1, 0x8, %g1
	be,pn		%icc, 2f
	 sub		%g0, %g1, %g1
	sub		%o2, %g1, %o2

1:	subcc		%g1, 1, %g1
	EX_LD(LOAD(ldub, %o1, %o5))
	EX_ST(STORE(stb, %o5, %o1 + %o3))
	bgu,pt		%icc, 1b
	 add		%o1, 1, %o1

2:	add		%o1, %o3, %o0
	andcc		%o1, 0x7, %g1
	bne,pt		%icc, 8f
	 sll		%g1, 3, %g1

	cmp		%o2, 16
	bgeu,pt		%icc, 72b
	 nop
	ba,a,pt		%xcc, 73b

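	/* (Note on the loop below: it is an integer version of the
	 * unaligned copy.  %g1 holds the source's bit offset within a
	 * doubleword and %o3 = 64 - %g1; each iteration splices the
	 * tail of the previous aligned doubleword (sllx) onto the head
	 * of the next one (srlx) and emits one aligned stx.)
	 */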
8:	mov		64, %o3
	andn		%o1, 0x7, %o1
	EX_LD(LOAD(ldx, %o1, %g2))
	sub		%o3, %g1, %o3
	andn		%o2, 0x7, GLOBAL_SPARE
	sllx		%g2, %g1, %g2
1:	EX_LD(LOAD(ldx, %o1 + 0x8, %g3))
	subcc		GLOBAL_SPARE, 0x8, GLOBAL_SPARE
	add		%o1, 0x8, %o1
	srlx		%g3, %o3, %o5
	or		%o5, %g2, %o5
	EX_ST(STORE(stx, %o5, %o0))
	add		%o0, 0x8, %o0
	bgu,pt		%icc, 1b
	 sllx		%g3, %g1, %g2

	srl		%g1, 3, %g1
	andcc		%o2, 0x7, %o2
	be,pn		%icc, 85f
	 add		%o1, %g1, %o1
	ba,pt		%xcc, 90f
	 sub		%o0, %o1, %o3

	.align		64
80: /* 0 < len <= 16 */
	andcc		%o3, 0x3, %g0
	bne,pn		%XCC, 90f
	 sub		%o0, %o1, %o3

1:
	subcc		%o2, 4, %o2
	EX_LD(LOAD(lduw, %o1, %g1))
	EX_ST(STORE(stw, %g1, %o1 + %o3))
	bgu,pt		%XCC, 1b
	 add		%o1, 4, %o1

85:	retl
	 mov		EX_RETVAL(%o4), %o0

	.align		32
90:
	subcc		%o2, 1, %o2
	EX_LD(LOAD(ldub, %o1, %g1))
	EX_ST(STORE(stb, %g1, %o1 + %o3))
	bgu,pt		%XCC, 90b
	 add		%o1, 1, %o1
	retl
	 mov		EX_RETVAL(%o4), %o0

	.size		FUNC_NAME, .-FUNC_NAME
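
For orientation before the second version, here is a minimal C model of the control flow above (an editorial sketch; the name u3memcpy_model and everything inside it are illustrative, not kernel code): the same three phases of head alignment, 64-byte block moves, and tail bytes, minus the VIS registers, prefetching, and exception tables the assembly exists for.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Editorial sketch of U3memcpy's phase structure; illustrative only. */
static void *u3memcpy_model(void *dst, const void *src, size_t len)
{
	unsigned char *d = dst;
	const unsigned char *s = src;

	/* Head: byte copies until dst reaches a 64-byte boundary,
	 * mirroring the abs((dst & 0x3f) - 0x40) computation. */
	while (len && ((uintptr_t)d & 0x3f)) {
		*d++ = *s++;
		len--;
	}

	/* Body: full 64-byte blocks (the BLK-store loop). */
	while (len >= 64) {
		memcpy(d, s, 64);	/* stands in for faligndata + stda */
		d += 64;
		s += 64;
		len -= 64;
	}

	/* Tail: len modulo 64, never reading past src + original len. */
	while (len--)
		*d++ = *s++;

	return dst;	/* %o4 preserves dst for the retl at label 85 */
}

The real routine's value is in how the middle phase is scheduled, not in this skeleton.
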
v4.10.11
/* U3memcpy.S: UltraSparc-III optimized memcpy.
 *
 * Copyright (C) 1999, 2000, 2004 David S. Miller (davem@redhat.com)
 */

#ifdef __KERNEL__
#include <linux/linkage.h>
#include <asm/visasm.h>
#include <asm/asi.h>
#define GLOBAL_SPARE	%g7
#else
#define ASI_BLK_P 0xf0
#define FPRS_FEF  0x04
#ifdef MEMCPY_DEBUG
#define VISEntryHalf rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs; \
		     clr %g1; clr %g2; clr %g3; subcc %g0, %g0, %g0;
#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
#else
#define VISEntryHalf rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs
#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
#endif
#define GLOBAL_SPARE	%g5
#endif

#ifndef EX_LD
#define EX_LD(x,y)	x
#endif
#ifndef EX_LD_FP
#define EX_LD_FP(x,y)	x
#endif

#ifndef EX_ST
#define EX_ST(x,y)	x
#endif
#ifndef EX_ST_FP
#define EX_ST_FP(x,y)	x
#endif

#ifndef LOAD
#define LOAD(type,addr,dest)	type [addr], dest
#endif

#ifndef STORE
#define STORE(type,src,addr)	type src, [addr]
#endif

#ifndef STORE_BLK
#define STORE_BLK(src,addr)	stda src, [addr] ASI_BLK_P
#endif

#ifndef FUNC_NAME
#define FUNC_NAME	U3memcpy
#endif

#ifndef PREAMBLE
#define PREAMBLE
#endif

#ifndef XCC
#define XCC xcc
#endif

	.register	%g2,#scratch
	.register	%g3,#scratch

	/* Special/non-trivial issues of this code:
	 *
	 * 1) %o5 is preserved from VISEntryHalf to VISExitHalf
	 * 2) Only low 32 FPU registers are used so that only the
	 *    lower half of the FPU register set is dirtied by this
	 *    code.  This is especially important in the kernel.
	 * 3) This code never prefetches cachelines past the end
	 *    of the source buffer.
	 */

	.text
#ifndef EX_RETVAL
#define EX_RETVAL(x)	x
__restore_fp:
	VISExitHalf
	retl
	 nop
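	/* (Note on the stubs below: each U3_retl_* label is an
	 * exception fixup that computes, in %o0, how many bytes were
	 * still to be copied when the attached load or store faulted;
	 * the name spells out the expression, e.g.
	 * U3_retl_o2_plus_g2_plus_8 returns %o2 + %g2 + 8.  The _fp
	 * variants branch through __restore_fp first because the fault
	 * happened while the VIS half-state was live.)
	 */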
ENTRY(U3_retl_o2_plus_g2_plus_g1_plus_1_fp)
	add	%g1, 1, %g1
	add	%g2, %g1, %g2
	ba,pt	%xcc, __restore_fp
	 add	%o2, %g2, %o0
ENDPROC(U3_retl_o2_plus_g2_plus_g1_plus_1_fp)
ENTRY(U3_retl_o2_plus_g2_fp)
	ba,pt	%xcc, __restore_fp
	 add	%o2, %g2, %o0
ENDPROC(U3_retl_o2_plus_g2_fp)
ENTRY(U3_retl_o2_plus_g2_plus_8_fp)
	add	%g2, 8, %g2
	ba,pt	%xcc, __restore_fp
	 add	%o2, %g2, %o0
ENDPROC(U3_retl_o2_plus_g2_plus_8_fp)
ENTRY(U3_retl_o2)
	retl
	 mov	%o2, %o0
ENDPROC(U3_retl_o2)
ENTRY(U3_retl_o2_plus_1)
	retl
	 add	%o2, 1, %o0
ENDPROC(U3_retl_o2_plus_1)
ENTRY(U3_retl_o2_plus_4)
	retl
	 add	%o2, 4, %o0
ENDPROC(U3_retl_o2_plus_4)
ENTRY(U3_retl_o2_plus_8)
	retl
	 add	%o2, 8, %o0
ENDPROC(U3_retl_o2_plus_8)
ENTRY(U3_retl_o2_plus_g1_plus_1)
	add	%g1, 1, %g1
	retl
	 add	%o2, %g1, %o0
ENDPROC(U3_retl_o2_plus_g1_plus_1)
ENTRY(U3_retl_o2_fp)
	ba,pt	%xcc, __restore_fp
	 mov	%o2, %o0
ENDPROC(U3_retl_o2_fp)
ENTRY(U3_retl_o2_plus_o3_sll_6_plus_0x80_fp)
	sll	%o3, 6, %o3
	add	%o3, 0x80, %o3
	ba,pt	%xcc, __restore_fp
	 add	%o2, %o3, %o0
ENDPROC(U3_retl_o2_plus_o3_sll_6_plus_0x80_fp)
ENTRY(U3_retl_o2_plus_o3_sll_6_plus_0x40_fp)
	sll	%o3, 6, %o3
	add	%o3, 0x40, %o3
	ba,pt	%xcc, __restore_fp
	 add	%o2, %o3, %o0
ENDPROC(U3_retl_o2_plus_o3_sll_6_plus_0x40_fp)
ENTRY(U3_retl_o2_plus_GS_plus_0x10)
	add	GLOBAL_SPARE, 0x10, GLOBAL_SPARE
	retl
	 add	%o2, GLOBAL_SPARE, %o0
ENDPROC(U3_retl_o2_plus_GS_plus_0x10)
ENTRY(U3_retl_o2_plus_GS_plus_0x08)
	add	GLOBAL_SPARE, 0x08, GLOBAL_SPARE
	retl
	 add	%o2, GLOBAL_SPARE, %o0
ENDPROC(U3_retl_o2_plus_GS_plus_0x08)
ENTRY(U3_retl_o2_and_7_plus_GS)
	and	%o2, 7, %o2
	retl
	 add	%o2, GLOBAL_SPARE, %o2
ENDPROC(U3_retl_o2_and_7_plus_GS)
ENTRY(U3_retl_o2_and_7_plus_GS_plus_8)
	add	GLOBAL_SPARE, 8, GLOBAL_SPARE
	and	%o2, 7, %o2
	retl
	 add	%o2, GLOBAL_SPARE, %o2
ENDPROC(U3_retl_o2_and_7_plus_GS_plus_8)
#endif

	.align		64

	/* The cheetah's flexible spine, oversized liver, enlarged heart,
	 * slender muscular body, and claws make it the swiftest hunter
	 * in Africa and the fastest animal on land.  Can reach speeds
	 * of up to 2.4GB per second.
	 */

	.globl	FUNC_NAME
	.type	FUNC_NAME,#function
FUNC_NAME:	/* %o0=dst, %o1=src, %o2=len */
	srlx		%o2, 31, %g2
	cmp		%g2, 0
	tne		%xcc, 5
	PREAMBLE
	mov		%o0, %o4
	cmp		%o2, 0
	be,pn		%XCC, 85f
	 or		%o0, %o1, %o3
	cmp		%o2, 16
	blu,a,pn	%XCC, 80f
	 or		%o3, %o2, %o3

	cmp		%o2, (3 * 64)
	blu,pt		%XCC, 70f
	 andcc		%o3, 0x7, %g0

	/* Clobbers o5/g1/g2/g3/g7/icc/xcc.  We must preserve
	 * o5 from here until we hit VISExitHalf.
	 */
	VISEntryHalf

	/* Is 'dst' already aligned on a 64-byte boundary? */
	andcc		%o0, 0x3f, %g2
	be,pt		%XCC, 2f

	/* Compute abs((dst & 0x3f) - 0x40) into %g2.  This is the number
	 * of bytes to copy to make 'dst' 64-byte aligned.  We pre-
	 * subtract this from 'len'.
	 */
	 sub		%o0, %o1, GLOBAL_SPARE
	sub		%g2, 0x40, %g2
	sub		%g0, %g2, %g2
	sub		%o2, %g2, %o2
	andcc		%g2, 0x7, %g1
	be,pt		%icc, 2f
	 and		%g2, 0x38, %g2

1:	subcc		%g1, 0x1, %g1
	EX_LD_FP(LOAD(ldub, %o1 + 0x00, %o3), U3_retl_o2_plus_g2_plus_g1_plus_1)
	EX_ST_FP(STORE(stb, %o3, %o1 + GLOBAL_SPARE), U3_retl_o2_plus_g2_plus_g1_plus_1)
	bgu,pt		%XCC, 1b
	 add		%o1, 0x1, %o1

	add		%o1, GLOBAL_SPARE, %o0

2:	cmp		%g2, 0x0
	and		%o1, 0x7, %g1
	be,pt		%icc, 3f
	 alignaddr	%o1, %g0, %o1

	EX_LD_FP(LOAD(ldd, %o1, %f4), U3_retl_o2_plus_g2)
1:	EX_LD_FP(LOAD(ldd, %o1 + 0x8, %f6), U3_retl_o2_plus_g2)
	add		%o1, 0x8, %o1
	subcc		%g2, 0x8, %g2
	faligndata	%f4, %f6, %f0
	EX_ST_FP(STORE(std, %f0, %o0), U3_retl_o2_plus_g2_plus_8)
	be,pn		%icc, 3f
	 add		%o0, 0x8, %o0

	EX_LD_FP(LOAD(ldd, %o1 + 0x8, %f4), U3_retl_o2_plus_g2)
	add		%o1, 0x8, %o1
	subcc		%g2, 0x8, %g2
	faligndata	%f6, %f4, %f2
	EX_ST_FP(STORE(std, %f2, %o0), U3_retl_o2_plus_g2_plus_8)
	bne,pt		%icc, 1b
	 add		%o0, 0x8, %o0

3:	LOAD(prefetch, %o1 + 0x000, #one_read)
	LOAD(prefetch, %o1 + 0x040, #one_read)
	andn		%o2, (0x40 - 1), GLOBAL_SPARE
	LOAD(prefetch, %o1 + 0x080, #one_read)
	LOAD(prefetch, %o1 + 0x0c0, #one_read)
	LOAD(prefetch, %o1 + 0x100, #one_read)
	EX_LD_FP(LOAD(ldd, %o1 + 0x000, %f0), U3_retl_o2)
	LOAD(prefetch, %o1 + 0x140, #one_read)
	EX_LD_FP(LOAD(ldd, %o1 + 0x008, %f2), U3_retl_o2)
	LOAD(prefetch, %o1 + 0x180, #one_read)
	EX_LD_FP(LOAD(ldd, %o1 + 0x010, %f4), U3_retl_o2)
	LOAD(prefetch, %o1 + 0x1c0, #one_read)
	faligndata	%f0, %f2, %f16
	EX_LD_FP(LOAD(ldd, %o1 + 0x018, %f6), U3_retl_o2)
	faligndata	%f2, %f4, %f18
	EX_LD_FP(LOAD(ldd, %o1 + 0x020, %f8), U3_retl_o2)
	faligndata	%f4, %f6, %f20
	EX_LD_FP(LOAD(ldd, %o1 + 0x028, %f10), U3_retl_o2)
	faligndata	%f6, %f8, %f22

	EX_LD_FP(LOAD(ldd, %o1 + 0x030, %f12), U3_retl_o2)
	faligndata	%f8, %f10, %f24
	EX_LD_FP(LOAD(ldd, %o1 + 0x038, %f14), U3_retl_o2)
	faligndata	%f10, %f12, %f26
	EX_LD_FP(LOAD(ldd, %o1 + 0x040, %f0), U3_retl_o2)

	subcc		GLOBAL_SPARE, 0x80, GLOBAL_SPARE
	add		%o1, 0x40, %o1
	bgu,pt		%XCC, 1f
	 srl		GLOBAL_SPARE, 6, %o3
	ba,pt		%xcc, 2f
	 nop

	.align		64
1:
	EX_LD_FP(LOAD(ldd, %o1 + 0x008, %f2), U3_retl_o2_plus_o3_sll_6_plus_0x80)
	faligndata	%f12, %f14, %f28
	EX_LD_FP(LOAD(ldd, %o1 + 0x010, %f4), U3_retl_o2_plus_o3_sll_6_plus_0x80)
	faligndata	%f14, %f0, %f30
	EX_ST_FP(STORE_BLK(%f16, %o0), U3_retl_o2_plus_o3_sll_6_plus_0x80)
	EX_LD_FP(LOAD(ldd, %o1 + 0x018, %f6), U3_retl_o2_plus_o3_sll_6_plus_0x40)
	faligndata	%f0, %f2, %f16
	add		%o0, 0x40, %o0

	EX_LD_FP(LOAD(ldd, %o1 + 0x020, %f8), U3_retl_o2_plus_o3_sll_6_plus_0x40)
	faligndata	%f2, %f4, %f18
	EX_LD_FP(LOAD(ldd, %o1 + 0x028, %f10), U3_retl_o2_plus_o3_sll_6_plus_0x40)
	faligndata	%f4, %f6, %f20
	EX_LD_FP(LOAD(ldd, %o1 + 0x030, %f12), U3_retl_o2_plus_o3_sll_6_plus_0x40)
	subcc		%o3, 0x01, %o3
	faligndata	%f6, %f8, %f22
	EX_LD_FP(LOAD(ldd, %o1 + 0x038, %f14), U3_retl_o2_plus_o3_sll_6_plus_0x80)

	faligndata	%f8, %f10, %f24
	EX_LD_FP(LOAD(ldd, %o1 + 0x040, %f0), U3_retl_o2_plus_o3_sll_6_plus_0x80)
	LOAD(prefetch, %o1 + 0x1c0, #one_read)
	faligndata	%f10, %f12, %f26
	bg,pt		%XCC, 1b
	 add		%o1, 0x40, %o1

	/* Finally we copy the last full 64-byte block. */
2:
	EX_LD_FP(LOAD(ldd, %o1 + 0x008, %f2), U3_retl_o2_plus_o3_sll_6_plus_0x80)
	faligndata	%f12, %f14, %f28
	EX_LD_FP(LOAD(ldd, %o1 + 0x010, %f4), U3_retl_o2_plus_o3_sll_6_plus_0x80)
	faligndata	%f14, %f0, %f30
	EX_ST_FP(STORE_BLK(%f16, %o0), U3_retl_o2_plus_o3_sll_6_plus_0x80)
	EX_LD_FP(LOAD(ldd, %o1 + 0x018, %f6), U3_retl_o2_plus_o3_sll_6_plus_0x40)
	faligndata	%f0, %f2, %f16
	EX_LD_FP(LOAD(ldd, %o1 + 0x020, %f8), U3_retl_o2_plus_o3_sll_6_plus_0x40)
	faligndata	%f2, %f4, %f18
	EX_LD_FP(LOAD(ldd, %o1 + 0x028, %f10), U3_retl_o2_plus_o3_sll_6_plus_0x40)
	faligndata	%f4, %f6, %f20
	EX_LD_FP(LOAD(ldd, %o1 + 0x030, %f12), U3_retl_o2_plus_o3_sll_6_plus_0x40)
	faligndata	%f6, %f8, %f22
	EX_LD_FP(LOAD(ldd, %o1 + 0x038, %f14), U3_retl_o2_plus_o3_sll_6_plus_0x40)
	faligndata	%f8, %f10, %f24
	cmp		%g1, 0
	be,pt		%XCC, 1f
	 add		%o0, 0x40, %o0
	EX_LD_FP(LOAD(ldd, %o1 + 0x040, %f0), U3_retl_o2_plus_o3_sll_6_plus_0x40)
1:	faligndata	%f10, %f12, %f26
	faligndata	%f12, %f14, %f28
	faligndata	%f14, %f0, %f30
	EX_ST_FP(STORE_BLK(%f16, %o0), U3_retl_o2_plus_o3_sll_6_plus_0x40)
	add		%o0, 0x40, %o0
	add		%o1, 0x40, %o1
	membar		#Sync

	/* Now we copy the (len modulo 64) bytes at the end.
	 * Note how we borrow the %f0 loaded above.
	 *
	 * Also notice how this code is careful not to perform a
	 * load past the end of the src buffer.
	 */
	and		%o2, 0x3f, %o2
	andcc		%o2, 0x38, %g2
	be,pn		%XCC, 2f
	 subcc		%g2, 0x8, %g2
	be,pn		%XCC, 2f
	 cmp		%g1, 0

	sub		%o2, %g2, %o2
	be,a,pt		%XCC, 1f
	 EX_LD_FP(LOAD(ldd, %o1 + 0x00, %f0), U3_retl_o2_plus_g2)

1:	EX_LD_FP(LOAD(ldd, %o1 + 0x08, %f2), U3_retl_o2_plus_g2)
	add		%o1, 0x8, %o1
	subcc		%g2, 0x8, %g2
	faligndata	%f0, %f2, %f8
	EX_ST_FP(STORE(std, %f8, %o0), U3_retl_o2_plus_g2_plus_8)
	be,pn		%XCC, 2f
	 add		%o0, 0x8, %o0
	EX_LD_FP(LOAD(ldd, %o1 + 0x08, %f0), U3_retl_o2_plus_g2)
	add		%o1, 0x8, %o1
	subcc		%g2, 0x8, %g2
	faligndata	%f2, %f0, %f8
	EX_ST_FP(STORE(std, %f8, %o0), U3_retl_o2_plus_g2_plus_8)
	bne,pn		%XCC, 1b
	 add		%o0, 0x8, %o0

	/* If anything is left, we copy it one byte at a time.
	 * Note that %g1 is (src & 0x7) saved above before the
	 * alignaddr was performed.
	 */
2:
	cmp		%o2, 0
	add		%o1, %g1, %o1
	VISExitHalf
	be,pn		%XCC, 85f
	 sub		%o0, %o1, %o3

	andcc		%g1, 0x7, %g0
	bne,pn		%icc, 90f
	 andcc		%o2, 0x8, %g0
	be,pt		%icc, 1f
	 nop
	EX_LD(LOAD(ldx, %o1, %o5), U3_retl_o2)
	EX_ST(STORE(stx, %o5, %o1 + %o3), U3_retl_o2)
	add		%o1, 0x8, %o1
	sub		%o2, 8, %o2

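	/* (Note: unlike the v3.5.6 code, each tail fragment above and
	 * below also drops %o2 by the bytes just copied, so a faulting
	 * EX_LD/EX_ST here can report the exact remaining count via
	 * U3_retl_o2.)
	 */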
1:	andcc		%o2, 0x4, %g0
	be,pt		%icc, 1f
	 nop
	EX_LD(LOAD(lduw, %o1, %o5), U3_retl_o2)
	EX_ST(STORE(stw, %o5, %o1 + %o3), U3_retl_o2)
	add		%o1, 0x4, %o1
	sub		%o2, 4, %o2

1:	andcc		%o2, 0x2, %g0
	be,pt		%icc, 1f
	 nop
	EX_LD(LOAD(lduh, %o1, %o5), U3_retl_o2)
	EX_ST(STORE(sth, %o5, %o1 + %o3), U3_retl_o2)
	add		%o1, 0x2, %o1
	sub		%o2, 2, %o2

1:	andcc		%o2, 0x1, %g0
	be,pt		%icc, 85f
	 nop
	EX_LD(LOAD(ldub, %o1, %o5), U3_retl_o2)
	ba,pt		%xcc, 85f
	 EX_ST(STORE(stb, %o5, %o1 + %o3), U3_retl_o2)

	.align		64
70: /* 16 < len <= 64 */
	bne,pn		%XCC, 75f
	 sub		%o0, %o1, %o3

72:
	andn		%o2, 0xf, GLOBAL_SPARE
	and		%o2, 0xf, %o2
1:	subcc		GLOBAL_SPARE, 0x10, GLOBAL_SPARE
	EX_LD(LOAD(ldx, %o1 + 0x00, %o5), U3_retl_o2_plus_GS_plus_0x10)
	EX_LD(LOAD(ldx, %o1 + 0x08, %g1), U3_retl_o2_plus_GS_plus_0x10)
	EX_ST(STORE(stx, %o5, %o1 + %o3), U3_retl_o2_plus_GS_plus_0x10)
	add		%o1, 0x8, %o1
	EX_ST(STORE(stx, %g1, %o1 + %o3), U3_retl_o2_plus_GS_plus_0x08)
	bgu,pt		%XCC, 1b
	 add		%o1, 0x8, %o1
73:	andcc		%o2, 0x8, %g0
	be,pt		%XCC, 1f
	 nop
	sub		%o2, 0x8, %o2
	EX_LD(LOAD(ldx, %o1, %o5), U3_retl_o2_plus_8)
	EX_ST(STORE(stx, %o5, %o1 + %o3), U3_retl_o2_plus_8)
	add		%o1, 0x8, %o1
1:	andcc		%o2, 0x4, %g0
	be,pt		%XCC, 1f
	 nop
	sub		%o2, 0x4, %o2
	EX_LD(LOAD(lduw, %o1, %o5), U3_retl_o2_plus_4)
	EX_ST(STORE(stw, %o5, %o1 + %o3), U3_retl_o2_plus_4)
	add		%o1, 0x4, %o1
1:	cmp		%o2, 0
	be,pt		%XCC, 85f
	 nop
	ba,pt		%xcc, 90f
	 nop

75:
	andcc		%o0, 0x7, %g1
	sub		%g1, 0x8, %g1
	be,pn		%icc, 2f
	 sub		%g0, %g1, %g1
	sub		%o2, %g1, %o2

1:	subcc		%g1, 1, %g1
	EX_LD(LOAD(ldub, %o1, %o5), U3_retl_o2_plus_g1_plus_1)
	EX_ST(STORE(stb, %o5, %o1 + %o3), U3_retl_o2_plus_g1_plus_1)
	bgu,pt		%icc, 1b
	 add		%o1, 1, %o1

2:	add		%o1, %o3, %o0
	andcc		%o1, 0x7, %g1
	bne,pt		%icc, 8f
	 sll		%g1, 3, %g1

	cmp		%o2, 16
	bgeu,pt		%icc, 72b
	 nop
	ba,a,pt		%xcc, 73b

8:	mov		64, %o3
	andn		%o1, 0x7, %o1
	EX_LD(LOAD(ldx, %o1, %g2), U3_retl_o2)
	sub		%o3, %g1, %o3
	andn		%o2, 0x7, GLOBAL_SPARE
	sllx		%g2, %g1, %g2
1:	EX_LD(LOAD(ldx, %o1 + 0x8, %g3), U3_retl_o2_and_7_plus_GS)
	subcc		GLOBAL_SPARE, 0x8, GLOBAL_SPARE
	add		%o1, 0x8, %o1
	srlx		%g3, %o3, %o5
	or		%o5, %g2, %o5
	EX_ST(STORE(stx, %o5, %o0), U3_retl_o2_and_7_plus_GS_plus_8)
	add		%o0, 0x8, %o0
	bgu,pt		%icc, 1b
	 sllx		%g3, %g1, %g2

	srl		%g1, 3, %g1
	andcc		%o2, 0x7, %o2
	be,pn		%icc, 85f
	 add		%o1, %g1, %o1
	ba,pt		%xcc, 90f
	 sub		%o0, %o1, %o3

	.align		64
80: /* 0 < len <= 16 */
	andcc		%o3, 0x3, %g0
	bne,pn		%XCC, 90f
	 sub		%o0, %o1, %o3

1:
	subcc		%o2, 4, %o2
	EX_LD(LOAD(lduw, %o1, %g1), U3_retl_o2_plus_4)
	EX_ST(STORE(stw, %g1, %o1 + %o3), U3_retl_o2_plus_4)
	bgu,pt		%XCC, 1b
	 add		%o1, 4, %o1

85:	retl
	 mov		EX_RETVAL(%o4), %o0

	.align		32
90:
	subcc		%o2, 1, %o2
	EX_LD(LOAD(ldub, %o1, %g1), U3_retl_o2_plus_1)
	EX_ST(STORE(stb, %g1, %o1 + %o3), U3_retl_o2_plus_1)
	bgu,pt		%XCC, 90b
	 add		%o1, 1, %o1
	retl
	 mov		EX_RETVAL(%o4), %o0

	.size		FUNC_NAME, .-FUNC_NAME
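
What v4.10.11 adds over v3.5.6 is the accurate exception accounting above: every EX_LD/EX_ST carries a fixup that reports how many bytes were left when a fault hit. A rough C analogy of that convention (an editorial sketch; copy_model and fault_at are invented for illustration, and real kernels wire this up through __ex_table fixups rather than inline checks):

#include <stddef.h>

/* Editorial model: returns 0 on success, else the number of bytes NOT
 * copied, mirroring the copy_from_user convention the U3_retl_* stubs
 * implement. */
static size_t copy_model(unsigned char *dst, const unsigned char *src,
			 size_t len, size_t fault_at)
{
	size_t done = 0;

	while (done < len) {
		if (done == fault_at)		/* stand-in for a page fault */
			return len - done;	/* e.g. U3_retl_o2 returning %o2 */
		dst[done] = src[done];
		done++;
	}
	return 0;
}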