copy_user_memcpy.S - arch/sh/lib64/copy_user_memcpy.S - Linux diff v5.4

  1! SPDX-License-Identifier: GPL-2.0
  2!
  3! Fast SH memcpy
  4!
  5! by Toshiyasu Morita (tm@netcom.com)
  6! hacked by J"orn Rennecke (joern.rennecke@superh.com) ("o for o-umlaut)
  7! SH5 code Copyright 2002 SuperH Ltd.
  8!
  9! Entry: ARG0: destination pointer
 10!        ARG1: source pointer
 11!        ARG2: byte count
 12!
 13! Exit:  RESULT: destination pointer
 14!        any other registers in the range r0-r7: trashed
 15!
 16! Notes: Usually one wants to do small reads and write a longword, but
 17!        unfortunately it is difficult in some cases to concatenate bytes
 18!        into a longword on the SH, so this does a longword read and small
 19!        writes.
 20!
 21! This implementation makes two assumptions about how it is called:
 22!
 23! 1.: If the byte count is nonzero, the address of the last byte to be
 24!     copied is unsigned greater than the address of the first byte to
 25!     be copied.  This could be easily swapped for a signed comparison,
 26!     but the algorithm used needs some comparison.
 27!
 28! 2.: When there are two or three bytes in the last word of an 11-or-more
 29!     bytes memory chunk to be copied, the rest of the word can be read
 30!     without side effects.
 31!     This could be easily changed by increasing the minimum size of
 32!     a fast memcpy and the amount subtracted from r7 before L_2l_loop be 2,
 33!     however, this would cost a few extra cycles on average.
 34!     For SHmedia, the assumption is that any quadword can be read in its
 35!     entirety if at least one byte is included in the copy.
 36
 37/* Imported into Linux kernel by Richard Curnow.  This is used to implement the
 38   __copy_user function in the general case, so it has to be a distinct
 39   function from intra-kernel memcpy to allow for exception fix-ups in the
 40   event that the user pointer is bad somewhere in the copy (e.g. due to
 41   running off the end of the vma).
 42
 43   Note, this algorithm will be slightly wasteful in the case where the source
 44   and destination pointers are equally aligned, because the stlo/sthi pairs
 45   could then be merged back into single stores.  If there are a lot of cache
 46   misses, this is probably offset by the stall lengths on the preloads.
 47
 48*/
 49
 50/* NOTE : Prefetches removed and allocos guarded by synco to avoid TAKum03020
 51 * erratum.  The first two prefetches are nop-ed out to avoid upsetting the
 52 * instruction counts used in the jump address calculation.
 53 * */
 54
 55	.section .text..SHmedia32,"ax"
 56	.little
 57	.balign 32
 58	.global copy_user_memcpy
 59	.global copy_user_memcpy_end
 60copy_user_memcpy:
   	/*
   	 * Entry point and size dispatch.
   	 * The file header gives ARG0 = destination, ARG1 = source and
   	 * ARG2 = byte count; the code below uses r2, r3 and r4 in those roles
   	 * and takes the return address from r18.  r2 is never written anywhere
   	 * in this routine.
   	 * Counts of 25 or more branch to Large.  Smaller counts take a computed
   	 * branch into a table of fixed-size handler slots that begins at L1,
   	 * selected by nsb(count).
   	 */
 61
   	/*
   	 * Unaligned access helpers: a lo/hi instruction pair covering bytes
   	 * O..O+7 (Q forms) or O..O+3 (L forms) relative to P.  The load forms
   	 * leave two partial values in D0 and D1, which the users below merge
   	 * with an "or".  STUAQ and STUAL have no users in the code visible here.
   	 */
 62#define LDUAQ(P,O,D0,D1) ldlo.q P,O,D0; ldhi.q P,O+7,D1
 63#define STUAQ(P,O,D0,D1) stlo.q P,O,D0; sthi.q P,O+7,D1
 64#define LDUAL(P,O,D0,D1) ldlo.l P,O,D0; ldhi.l P,O+3,D1
 65#define STUAL(P,O,D0,D1) stlo.l P,O,D0; sthi.l P,O+3,D1
 66
   	/* The nop stands in for a prefetch removed for the TAKum03020 erratum. */
 67	nop ! ld.b r3,0,r63 ! TAKum03020
 68	pta/l Large,tr0	/* tr0 = Large, target of the size check below */
 69	movi 25,r0
 70	bgeu/u r4,r0,tr0	/* count >= 25 (unsigned compare): take the Large path */
   	/*
   	 * r0 = nsb(count).  The slot order below (0, 1, 2..3, 4..7, 8..15,
   	 * 16..24 bytes) implies nsb yields 63, 62, 61, 60, 59, 58 for those
   	 * ranges - TODO confirm against the SH-5 instruction set manual.
   	 */
 71	nsb r4,r0
 72	shlli r0,5,r0	/* r0 = nsb * 32, i.e. nsb handler slots of 32 bytes */
   	/*
   	 * r1 = (L1 - L0) + 63 slots + 1, masked to 16 bits.
   	 * NOTE(review): the "+ 1" presumably marks the target as SHmedia code
   	 * for ptrel - confirm.
   	 */
 73	movi (L1-L0+63*32 + 1) & 0xffff,r1
 74	sub r1, r0, r0	/* r0 = r1 - r0: offset from L0 to slot (63 - nsb) */
 75L0:	ptrel r0,tr0	/* tr0 = L0 + r0, the selected handler */
 76	add r2,r4,r5	/* r5 = dst + count (one past the last destination byte) */
 77	ptabs r18,tr1	/* tr1 = return address held in r18 */
 78	add r3,r4,r6	/* r6 = src + count (one past the last source byte) */
 79	blink tr0,r63	/* jump to the handler; the link value goes to r63 and is not used */
 80
 81/* Rearranged to make cut2 safe */
   	/*
   	 * Small-copy handlers (counts 0..24).  They are entered through the
   	 * computed branch at L0 with r2 = dst, r3 = src, r5 = dst + count,
   	 * r6 = src + count and tr1 = return address.
   	 * Counting from L1, each size class (0, 1, 2..3, 4..7, 8..15, 16..24)
   	 * starts exactly eight instructions after the previous one, which
   	 * matches the 32-byte slot stride used by the dispatch code (assuming
   	 * 4-byte SHmedia instruction encodings).  Handlers that do not fit in
   	 * their slot finish in a "cntd." stub (L2_3, L4_7, L8_15) reached via
   	 * tr0; the stubs L2_3 and L8_15 double as filler for the 0-byte and
   	 * 1-byte slots.  Adding or removing an instruction between L1 and the
   	 * 16..24 handler breaks the computed branch.
   	 * Every path copies the head and the tail of the buffer separately, so
   	 * the two stores may overlap when count is below the slot maximum.
   	 */
 82	.balign 8
 83L4_7:	/* 4..7 byte memcpy cntd. */
 84	stlo.l r2, 0, r0	/* second half of the unaligned store of the first 4 bytes */
 85	or r6, r7, r6	/* merge the lo/hi parts of the last source word */
 86	sthi.l r5, -1, r6	/* unaligned store of the last 4 bytes, ending at dst + count - 1 */
 87	stlo.l r5, -4, r6
 88	blink tr1,r63	/* return */
 89
 90	.balign 8
 91L1:	/* 0 byte memcpy */
 92	nop
 93	blink tr1,r63	/* nothing to copy: return */
 94	nop	/* four filler nops; together with L2_3 they complete this slot */
 95	nop
 96	nop
 97	nop
 98
 99L2_3:	/* 2 or 3 byte memcpy cntd. */
100	st.b r5,-1,r6	/* store the last byte (loaded into r6 by the 2..3 handler) */
101	blink tr1,r63	/* return */
102
103	/* 1 byte memcpy */
104	ld.b r3,0,r0
105	st.b r2,0,r0
106	blink tr1,r63	/* return */
107
108L8_15:	/* 8..15 byte memcpy cntd. */
109	stlo.q r2, 0, r0	/* second half of the unaligned store of the first 8 bytes */
110	or r6, r7, r6	/* merge the lo/hi parts of the last source quadword */
111	sthi.q r5, -1, r6	/* unaligned store of the last 8 bytes, ending at dst + count - 1 */
112	stlo.q r5, -8, r6
113	blink tr1,r63	/* return */
114
115	/* 2 or 3 byte memcpy */
   	/* Copies byte 0, byte 1 and the last byte; for count == 2 byte 1 is stored twice. */
116	ld.b r3,0,r0
117	nop ! ld.b r2,0,r63 ! TAKum03020
118	ld.b r3,1,r1
119	st.b r2,0,r0
120	pta/l L2_3,tr0	/* the final store does not fit in this slot; it lives at L2_3 */
121	ld.b r6,-1,r6	/* r6 = last source byte (r6 was src + count) */
122	st.b r2,1,r1
123	blink tr0, r63
124
125	/* 4 .. 7 byte memcpy */
126	LDUAL (r3, 0, r0, r1)
127	pta L4_7, tr0	/* remaining stores are at L4_7 */
128	ldlo.l r6, -4, r7	/* start the unaligned load of the last 4 source bytes */
129	or r0, r1, r0	/* r0 = first 4 source bytes */
130	sthi.l r2, 3, r0
131	ldhi.l r6, -1, r6
132	blink tr0, r63
133
134	/* 8 .. 15 byte memcpy */
135	LDUAQ (r3, 0, r0, r1)
136	pta L8_15, tr0	/* remaining stores are at L8_15 */
137	ldlo.q r6, -8, r7	/* start the unaligned load of the last 8 source bytes */
138	or r0, r1, r0	/* r0 = first 8 source bytes */
139	sthi.q r2, 7, r0
140	ldhi.q r6, -1, r6
141	blink tr0, r63
142
143	/* 16 .. 24 byte memcpy */
   	/*
   	 * Last slot of the table, so it may be longer than eight instructions.
   	 * Copies the first 16 bytes and the last 8 bytes.
   	 * NOTE(review): this path writes r8 and r9, while the file header lists
   	 * only r0-r7 as trashed - confirm against the calling convention.
   	 */
144	LDUAQ (r3, 0, r0, r1)
145	LDUAQ (r3, 8, r8, r9)
146	or r0, r1, r0	/* r0 = source bytes 0..7 */
147	sthi.q r2, 7, r0
148	or r8, r9, r8	/* r8 = source bytes 8..15 */
149	sthi.q r2, 15, r8
150	ldlo.q r6, -8, r7	/* unaligned load of the last 8 source bytes */
151	ldhi.q r6, -1, r6
152	stlo.q r2, 8, r8
153	stlo.q r2, 0, r0
154	or r6, r7, r6
155	sthi.q r5, -1, r6	/* unaligned store of the last 8 bytes, ending at dst + count - 1 */
156	stlo.q r5, -8, r6
157	blink tr1,r63	/* return */
158
159Large:
   	/*
   	 * Copy path for counts of 25 bytes or more (r2 = dst, r3 = src,
   	 * r4 = count, r18 = return address).
   	 * Register roles set up below:
   	 *   r22  running destination pointer
   	 *   r6   src - dst, so r22 + r6 addresses the matching source quadword
   	 *   r5   dst + count - 16, end bound for Loop_ua
   	 *   r0   source quadword loaded ahead of the store that consumes it
   	 * Stages: head store, Loop_line (32 bytes per pass, only when
   	 * count >= 72), Loop_ua (8 bytes per pass), then an unaligned store of
   	 * the last 8 bytes.
   	 * NOTE(review): this path writes r19-r25, r27 and r36, while the file
   	 * header lists only r0-r7 as trashed - confirm against the calling
   	 * convention.
   	 */
160	! ld.b r2, 0, r63 ! TAKum03020
161	pta/l  Loop_ua, tr1	/* tr1 = Loop_ua (reloaded with the return address in the tail) */
162	ori r3, -8, r7	/* r7 = (src & 7) - 8 */
163	sub r2, r7, r22	/* r22 = dst + 8 - (src & 7): dst position matching the next 8-byte source boundary */
164	sub r3, r2, r6	/* r6 = src - dst */
165	add r2, r4, r5
166	ldlo.q r3, 0, r0	/* head: source bytes from src up to that boundary (ldlo.q semantics - TODO confirm) */
167	addi r5, -16, r5	/* r5 = dst + count - 16 */
168	movi 64+8, r27 ! could subtract r7 from that.
169	stlo.q r2, 0, r0	/* store the head bytes at dst with an unaligned store pair */
170	sthi.q r2, 7, r0
171	ldx.q r22, r6, r0	/* preload the first full source quadword */
172	bgtu/l r27, r4, tr1	/* 72 > count: skip the line loop, go straight to Loop_ua */
173
174	addi r5, -48, r27	/* r27 = dst + count - 64, end bound for Loop_line */
175	pta/l Loop_line, tr0
176	addi r6, 64, r36	/* only referenced by the commented-out prefetch in Loop_line */
177	addi r6, -24, r19	/* r19..r21: source offsets for the stores at r22-24, r22-16, r22-8 */
178	addi r6, -16, r20
179	addi r6, -8, r21
180
181Loop_line:
   	/*
   	 * Copies 32 bytes per pass.  r0 arrives holding the quadword for the
   	 * first 8 bytes and leaves holding the quadword for the next pass.
   	 * Each quadword is written with a sthi.q/stlo.q pair because the
   	 * destination may be unaligned.
   	 */
182	! ldx.q r22, r36, r63 ! TAKum03020
183	alloco r22, 32	/* cache allocate at r22 + 32, followed by synco per the TAKum03020 note */
184	synco
185	addi r22, 32, r22	/* advance; this pass fills r22-32 .. r22-1 */
186	ldx.q r22, r19, r23
187	sthi.q r22, -25, r0
188	ldx.q r22, r20, r24
189	ldx.q r22, r21, r25
190	stlo.q r22, -32, r0
191	ldx.q r22, r6,  r0	/* quadword carried into the next pass (or into Loop_ua) */
192	sthi.q r22, -17, r23
193	sthi.q r22,  -9, r24
194	sthi.q r22,  -1, r25
195	stlo.q r22, -24, r23
196	stlo.q r22, -16, r24
197	stlo.q r22,  -8, r25
198	bgeu r27, r22, tr0	/* repeat while r27 >= r22 (unsigned) */
199
200Loop_ua:
   	/* Copies 8 bytes per pass, with the same load-ahead use of r0. */
201	addi r22, 8, r22
202	sthi.q r22, -1, r0
203	stlo.q r22, -8, r0
204	ldx.q r22, r6, r0
205	bgtu/l r5, r22, tr1	/* repeat while r5 > r22 (tr1 = Loop_ua) */
206
   	/* Tail: flush the pending quadword in r0, then copy the last 8 bytes. */
207	add r3, r4, r7	/* r7 = src + count */
208	ldlo.q r7, -8, r1	/* unaligned load of the last 8 source bytes into r1/r7 */
209	sthi.q r22, 7, r0
210	ldhi.q r7, -1, r7
211	ptabs r18,tr1	/* tr1 = return address held in r18 */
212	stlo.q r22, 0, r0
213	or r1, r7, r1
214	sthi.q r5, 15, r1	/* r5 + 15 = dst + count - 1, the last destination byte */
215	stlo.q r5, 8, r1
216	blink tr1, r63	/* return */
217copy_user_memcpy_end:
   	/* Global end-of-routine label; NOTE(review): presumably bounds the routine for user-copy fault fix-up - confirm against its users. */
218	nop

 
  1!
  2! Fast SH memcpy
  3!
  4! by Toshiyasu Morita (tm@netcom.com)
  5! hacked by J"orn Rennecke (joern.rennecke@superh.com) ("o for o-umlaut)
  6! SH5 code Copyright 2002 SuperH Ltd.
  7!
  8! Entry: ARG0: destination pointer
  9!        ARG1: source pointer
 10!        ARG2: byte count
 11!
 12! Exit:  RESULT: destination pointer
 13!        any other registers in the range r0-r7: trashed
 14!
 15! Notes: Usually one wants to do small reads and write a longword, but
 16!        unfortunately it is difficult in some cases to concatenate bytes
 17!        into a longword on the SH, so this does a longword read and small
 18!        writes.
 19!
 20! This implementation makes two assumptions about how it is called:
 21!
 22! 1.: If the byte count is nonzero, the address of the last byte to be
 23!     copied is unsigned greater than the address of the first byte to
 24!     be copied.  This could be easily swapped for a signed comparison,
 25!     but the algorithm used needs some comparison.
 26!
 27! 2.: When there are two or three bytes in the last word of an 11-or-more
 28!     bytes memory chunk to be copied, the rest of the word can be read
 29!     without side effects.
 30!     This could be easily changed by increasing the minimum size of
 31!     a fast memcpy and the amount subtracted from r7 before L_2l_loop be 2,
 32!     however, this would cost a few extra cycles on average.
 33!     For SHmedia, the assumption is that any quadword can be read in its
 34!     entirety if at least one byte is included in the copy.
 35
 36/* Imported into Linux kernel by Richard Curnow.  This is used to implement the
 37   __copy_user function in the general case, so it has to be a distinct
 38   function from intra-kernel memcpy to allow for exception fix-ups in the
 39   event that the user pointer is bad somewhere in the copy (e.g. due to
 40   running off the end of the vma).
 41
 42   Note, this algorithm will be slightly wasteful in the case where the source
 43   and destination pointers are equally aligned, because the stlo/sthi pairs
 44   could then be merged back into single stores.  If there are a lot of cache
 45   misses, this is probably offset by the stall lengths on the preloads.
 46
 47*/
 48
 49/* NOTE : Prefetches removed and allocos guarded by synco to avoid TAKum03020
 50 * erratum.  The first two prefetches are nop-ed out to avoid upsetting the
 51 * instruction counts used in the jump address calculation.
 52 * */
 53
 54	.section .text..SHmedia32,"ax"
 55	.little
 56	.balign 32
 57	.global copy_user_memcpy
 58	.global copy_user_memcpy_end
 59copy_user_memcpy:
   	/*
   	 * Entry point and size dispatch.
   	 * The file header gives ARG0 = destination, ARG1 = source and
   	 * ARG2 = byte count; the code below uses r2, r3 and r4 in those roles
   	 * and takes the return address from r18.  r2 is never written anywhere
   	 * in this routine.
   	 * Counts of 25 or more branch to Large.  Smaller counts take a computed
   	 * branch into a table of fixed-size handler slots that begins at L1,
   	 * selected by nsb(count).
   	 */
 60
   	/*
   	 * Unaligned access helpers: a lo/hi instruction pair covering bytes
   	 * O..O+7 (Q forms) or O..O+3 (L forms) relative to P.  The load forms
   	 * leave two partial values in D0 and D1, which the users below merge
   	 * with an "or".  STUAQ and STUAL have no users in the code visible here.
   	 */
 61#define LDUAQ(P,O,D0,D1) ldlo.q P,O,D0; ldhi.q P,O+7,D1
 62#define STUAQ(P,O,D0,D1) stlo.q P,O,D0; sthi.q P,O+7,D1
 63#define LDUAL(P,O,D0,D1) ldlo.l P,O,D0; ldhi.l P,O+3,D1
 64#define STUAL(P,O,D0,D1) stlo.l P,O,D0; sthi.l P,O+3,D1
 65
   	/* The nop stands in for a prefetch removed for the TAKum03020 erratum. */
 66	nop ! ld.b r3,0,r63 ! TAKum03020
 67	pta/l Large,tr0	/* tr0 = Large, target of the size check below */
 68	movi 25,r0
 69	bgeu/u r4,r0,tr0	/* count >= 25 (unsigned compare): take the Large path */
   	/*
   	 * r0 = nsb(count).  The slot order below (0, 1, 2..3, 4..7, 8..15,
   	 * 16..24 bytes) implies nsb yields 63, 62, 61, 60, 59, 58 for those
   	 * ranges - TODO confirm against the SH-5 instruction set manual.
   	 */
 70	nsb r4,r0
 71	shlli r0,5,r0	/* r0 = nsb * 32, i.e. nsb handler slots of 32 bytes */
   	/*
   	 * r1 = (L1 - L0) + 63 slots + 1, masked to 16 bits.
   	 * NOTE(review): the "+ 1" presumably marks the target as SHmedia code
   	 * for ptrel - confirm.
   	 */
 72	movi (L1-L0+63*32 + 1) & 0xffff,r1
 73	sub r1, r0, r0	/* r0 = r1 - r0: offset from L0 to slot (63 - nsb) */
 74L0:	ptrel r0,tr0	/* tr0 = L0 + r0, the selected handler */
 75	add r2,r4,r5	/* r5 = dst + count (one past the last destination byte) */
 76	ptabs r18,tr1	/* tr1 = return address held in r18 */
 77	add r3,r4,r6	/* r6 = src + count (one past the last source byte) */
 78	blink tr0,r63	/* jump to the handler; the link value goes to r63 and is not used */
 79
 80/* Rearranged to make cut2 safe */
   	/*
   	 * Small-copy handlers (counts 0..24).  They are entered through the
   	 * computed branch at L0 with r2 = dst, r3 = src, r5 = dst + count,
   	 * r6 = src + count and tr1 = return address.
   	 * Counting from L1, each size class (0, 1, 2..3, 4..7, 8..15, 16..24)
   	 * starts exactly eight instructions after the previous one, which
   	 * matches the 32-byte slot stride used by the dispatch code (assuming
   	 * 4-byte SHmedia instruction encodings).  Handlers that do not fit in
   	 * their slot finish in a "cntd." stub (L2_3, L4_7, L8_15) reached via
   	 * tr0; the stubs L2_3 and L8_15 double as filler for the 0-byte and
   	 * 1-byte slots.  Adding or removing an instruction between L1 and the
   	 * 16..24 handler breaks the computed branch.
   	 * Every path copies the head and the tail of the buffer separately, so
   	 * the two stores may overlap when count is below the slot maximum.
   	 */
 81	.balign 8
 82L4_7:	/* 4..7 byte memcpy cntd. */
 83	stlo.l r2, 0, r0	/* second half of the unaligned store of the first 4 bytes */
 84	or r6, r7, r6	/* merge the lo/hi parts of the last source word */
 85	sthi.l r5, -1, r6	/* unaligned store of the last 4 bytes, ending at dst + count - 1 */
 86	stlo.l r5, -4, r6
 87	blink tr1,r63	/* return */
 88
 89	.balign 8
 90L1:	/* 0 byte memcpy */
 91	nop
 92	blink tr1,r63	/* nothing to copy: return */
 93	nop	/* four filler nops; together with L2_3 they complete this slot */
 94	nop
 95	nop
 96	nop
 97
 98L2_3:	/* 2 or 3 byte memcpy cntd. */
 99	st.b r5,-1,r6	/* store the last byte (loaded into r6 by the 2..3 handler) */
100	blink tr1,r63	/* return */
101
102	/* 1 byte memcpy */
103	ld.b r3,0,r0
104	st.b r2,0,r0
105	blink tr1,r63	/* return */
106
107L8_15:	/* 8..15 byte memcpy cntd. */
108	stlo.q r2, 0, r0	/* second half of the unaligned store of the first 8 bytes */
109	or r6, r7, r6	/* merge the lo/hi parts of the last source quadword */
110	sthi.q r5, -1, r6	/* unaligned store of the last 8 bytes, ending at dst + count - 1 */
111	stlo.q r5, -8, r6
112	blink tr1,r63	/* return */
113
114	/* 2 or 3 byte memcpy */
   	/* Copies byte 0, byte 1 and the last byte; for count == 2 byte 1 is stored twice. */
115	ld.b r3,0,r0
116	nop ! ld.b r2,0,r63 ! TAKum03020
117	ld.b r3,1,r1
118	st.b r2,0,r0
119	pta/l L2_3,tr0	/* the final store does not fit in this slot; it lives at L2_3 */
120	ld.b r6,-1,r6	/* r6 = last source byte (r6 was src + count) */
121	st.b r2,1,r1
122	blink tr0, r63
123
124	/* 4 .. 7 byte memcpy */
125	LDUAL (r3, 0, r0, r1)
126	pta L4_7, tr0	/* remaining stores are at L4_7 */
127	ldlo.l r6, -4, r7	/* start the unaligned load of the last 4 source bytes */
128	or r0, r1, r0	/* r0 = first 4 source bytes */
129	sthi.l r2, 3, r0
130	ldhi.l r6, -1, r6
131	blink tr0, r63
132
133	/* 8 .. 15 byte memcpy */
134	LDUAQ (r3, 0, r0, r1)
135	pta L8_15, tr0	/* remaining stores are at L8_15 */
136	ldlo.q r6, -8, r7	/* start the unaligned load of the last 8 source bytes */
137	or r0, r1, r0	/* r0 = first 8 source bytes */
138	sthi.q r2, 7, r0
139	ldhi.q r6, -1, r6
140	blink tr0, r63
141
142	/* 16 .. 24 byte memcpy */
   	/*
   	 * Last slot of the table, so it may be longer than eight instructions.
   	 * Copies the first 16 bytes and the last 8 bytes.
   	 * NOTE(review): this path writes r8 and r9, while the file header lists
   	 * only r0-r7 as trashed - confirm against the calling convention.
   	 */
143	LDUAQ (r3, 0, r0, r1)
144	LDUAQ (r3, 8, r8, r9)
145	or r0, r1, r0	/* r0 = source bytes 0..7 */
146	sthi.q r2, 7, r0
147	or r8, r9, r8	/* r8 = source bytes 8..15 */
148	sthi.q r2, 15, r8
149	ldlo.q r6, -8, r7	/* unaligned load of the last 8 source bytes */
150	ldhi.q r6, -1, r6
151	stlo.q r2, 8, r8
152	stlo.q r2, 0, r0
153	or r6, r7, r6
154	sthi.q r5, -1, r6	/* unaligned store of the last 8 bytes, ending at dst + count - 1 */
155	stlo.q r5, -8, r6
156	blink tr1,r63	/* return */
157
158Large:
   	/*
   	 * Copy path for counts of 25 bytes or more (r2 = dst, r3 = src,
   	 * r4 = count, r18 = return address).
   	 * Register roles set up below:
   	 *   r22  running destination pointer
   	 *   r6   src - dst, so r22 + r6 addresses the matching source quadword
   	 *   r5   dst + count - 16, end bound for Loop_ua
   	 *   r0   source quadword loaded ahead of the store that consumes it
   	 * Stages: head store, Loop_line (32 bytes per pass, only when
   	 * count >= 72), Loop_ua (8 bytes per pass), then an unaligned store of
   	 * the last 8 bytes.
   	 * NOTE(review): this path writes r19-r25, r27 and r36, while the file
   	 * header lists only r0-r7 as trashed - confirm against the calling
   	 * convention.
   	 */
159	! ld.b r2, 0, r63 ! TAKum03020
160	pta/l  Loop_ua, tr1	/* tr1 = Loop_ua (reloaded with the return address in the tail) */
161	ori r3, -8, r7	/* r7 = (src & 7) - 8 */
162	sub r2, r7, r22	/* r22 = dst + 8 - (src & 7): dst position matching the next 8-byte source boundary */
163	sub r3, r2, r6	/* r6 = src - dst */
164	add r2, r4, r5
165	ldlo.q r3, 0, r0	/* head: source bytes from src up to that boundary (ldlo.q semantics - TODO confirm) */
166	addi r5, -16, r5	/* r5 = dst + count - 16 */
167	movi 64+8, r27 ! could subtract r7 from that.
168	stlo.q r2, 0, r0	/* store the head bytes at dst with an unaligned store pair */
169	sthi.q r2, 7, r0
170	ldx.q r22, r6, r0	/* preload the first full source quadword */
171	bgtu/l r27, r4, tr1	/* 72 > count: skip the line loop, go straight to Loop_ua */
172
173	addi r5, -48, r27	/* r27 = dst + count - 64, end bound for Loop_line */
174	pta/l Loop_line, tr0
175	addi r6, 64, r36	/* only referenced by the commented-out prefetch in Loop_line */
176	addi r6, -24, r19	/* r19..r21: source offsets for the stores at r22-24, r22-16, r22-8 */
177	addi r6, -16, r20
178	addi r6, -8, r21
179
180Loop_line:
   	/*
   	 * Copies 32 bytes per pass.  r0 arrives holding the quadword for the
   	 * first 8 bytes and leaves holding the quadword for the next pass.
   	 * Each quadword is written with a sthi.q/stlo.q pair because the
   	 * destination may be unaligned.
   	 */
181	! ldx.q r22, r36, r63 ! TAKum03020
182	alloco r22, 32	/* cache allocate at r22 + 32, followed by synco per the TAKum03020 note */
183	synco
184	addi r22, 32, r22	/* advance; this pass fills r22-32 .. r22-1 */
185	ldx.q r22, r19, r23
186	sthi.q r22, -25, r0
187	ldx.q r22, r20, r24
188	ldx.q r22, r21, r25
189	stlo.q r22, -32, r0
190	ldx.q r22, r6,  r0	/* quadword carried into the next pass (or into Loop_ua) */
191	sthi.q r22, -17, r23
192	sthi.q r22,  -9, r24
193	sthi.q r22,  -1, r25
194	stlo.q r22, -24, r23
195	stlo.q r22, -16, r24
196	stlo.q r22,  -8, r25
197	bgeu r27, r22, tr0	/* repeat while r27 >= r22 (unsigned) */
198
199Loop_ua:
   	/* Copies 8 bytes per pass, with the same load-ahead use of r0. */
200	addi r22, 8, r22
201	sthi.q r22, -1, r0
202	stlo.q r22, -8, r0
203	ldx.q r22, r6, r0
204	bgtu/l r5, r22, tr1	/* repeat while r5 > r22 (tr1 = Loop_ua) */
205
   	/* Tail: flush the pending quadword in r0, then copy the last 8 bytes. */
206	add r3, r4, r7	/* r7 = src + count */
207	ldlo.q r7, -8, r1	/* unaligned load of the last 8 source bytes into r1/r7 */
208	sthi.q r22, 7, r0
209	ldhi.q r7, -1, r7
210	ptabs r18,tr1	/* tr1 = return address held in r18 */
211	stlo.q r22, 0, r0
212	or r1, r7, r1
213	sthi.q r5, 15, r1	/* r5 + 15 = dst + count - 1, the last destination byte */
214	stlo.q r5, 8, r1
215	blink tr1, r63	/* return */
216copy_user_memcpy_end:
   	/* Global end-of-routine label; NOTE(review): presumably bounds the routine for user-copy fault fix-up - confirm against its users. */
217	nop