arch/riscv/lib/memmove.S
v6.8
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2022 Michael T. Kloos <michael@michaelkloos.com>
 */

#include <linux/linkage.h>
#include <asm/asm.h>

SYM_FUNC_START(__memmove)
        /*
         * Returns
         *   a0 - dest
         *
         * Parameters
         *   a0 - Inclusive first byte of dest
         *   a1 - Inclusive first byte of src
         *   a2 - Length of copy n
         *
         * Because the return value matches the parameter register a0,
         * we will not clobber or modify that register.
         *
         * Note: This currently only works on little-endian.
         * To port to big-endian, reverse the direction of shifts
         * in the 2 misaligned fixup copy loops.
         */
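Before the implementation, a minimal C sketch of the contract being implemented may help: memmove must tolerate overlapping buffers, which forces the forward/reverse choice seen throughout this file (the bltu a1, a0 branches select the reverse path when src sits below dest). This sketch is illustrative only; the function name is invented here.

        #include <stddef.h>

        /* Naive byte-at-a-time memmove: copy forward when dest is below src,
         * backward when dest is above src, so every overlapping byte is read
         * before it is overwritten. */
        void *naive_memmove(void *dest, const void *src, size_t n)
        {
                unsigned char *d = dest;
                const unsigned char *s = src;

                if (d < s) {
                        while (n--)
                                *d++ = *s++;    /* forward */
                } else if (d > s) {
                        while (n--)
                                d[n] = s[n];    /* reverse, from the last byte */
                }
                return dest;
        }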

        /* Return if nothing to do */
        beq a0, a1, .Lreturn_from_memmove
        beqz a2, .Lreturn_from_memmove

        /*
         * Register Uses
         *      Forward Copy: a1 - Index counter of src
         *      Reverse Copy: a4 - Index counter of src
         *      Forward Copy: t3 - Index counter of dest
         *      Reverse Copy: t4 - Index counter of dest
         *   Both Copy Modes: t5 - Inclusive first SZREG-aligned address of dest
         *   Both Copy Modes: t6 - Non-inclusive last SZREG-aligned address of dest
         *   Both Copy Modes: t0 - Link / Temporary for load-store
         *   Both Copy Modes: t1 - Temporary for load-store
         *   Both Copy Modes: t2 - Temporary for load-store
         *   Both Copy Modes: a5 - dest to src alignment offset
         *   Both Copy Modes: a6 - Shift amount
         *   Both Copy Modes: a7 - Inverse shift amount
         *   Both Copy Modes: a2 - Alternate breakpoint for unrolled loops
         */

        /*
         * Solve for some register values now.
         * Byte copy does not need t5 or t6.
         */
        mv   t3, a0
        add  t4, a0, a2
        add  a4, a1, a2

        /*
         * Byte copy if copying less than (2 * SZREG) bytes. This can
         * cause problems with the bulk copy implementation and is
         * small enough not to bother.
         */
        andi t0, a2, -(2 * SZREG)
        beqz t0, .Lbyte_copy
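The size check just above is a mask rather than a compare: n & -(2 * SZREG) clears the low bits of n, so the result is zero exactly when n < 2 * SZREG. The same test in C (a sketch; SZREG is hard-coded to 8 for rv64 here, it would be 4 on rv32):

        /* Sketch of: andi t0, a2, -(2 * SZREG); beqz t0, .Lbyte_copy */
        #define SZREG 8UL       /* bytes per register on rv64 */

        static int small_enough_for_byte_copy(unsigned long n)
        {
                /* -(2 * SZREG) == ~(2 * SZREG - 1): low four bits clear */
                return (n & -(2 * SZREG)) == 0; /* true iff n < 2 * SZREG */
        }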

        /*
         * Now solve for t5 and t6.
         */
        andi t5, t3, -SZREG
        andi t6, t4, -SZREG
        /*
         * If dest (register t3) rounded down to the nearest naturally
         * aligned SZREG address does not equal dest, then add SZREG
         * to find the low bound of SZREG alignment in the dest memory
         * region.  Note that this could overshoot the dest memory
         * region if n is less than SZREG.  This is one reason why
         * we always byte copy if n is less than SZREG.
         * Otherwise, dest is already naturally aligned to SZREG.
         */
        beq  t5, t3, 1f
                addi t5, t5, SZREG
        1:

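In C terms, t5 and t6 bracket the SZREG-aligned sub-range of dest: the end (andi t6, t4, -SZREG) is simply rounded down, while the start must be rounded up into the region. A hedged sketch of the round-up step, with invented names:

        #define SZREG 8UL       /* bytes per register on rv64 */

        /* First SZREG-aligned address at or above start; the t5 computation. */
        static unsigned long align_up(unsigned long start)
        {
                unsigned long t5 = start & ~(SZREG - 1); /* andi t5, t3, -SZREG */
                if (t5 != start)
                        t5 += SZREG;    /* round up into the dest region */
                return t5;
        }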
        /*
         * If the dest and src are co-aligned to SZREG, then there is
         * no need for the full rigmarole of a misaligned fixup copy.
         * Instead, do a simpler co-aligned copy.
         */
        xor  t0, a0, a1
        andi t1, t0, (SZREG - 1)
        beqz t1, .Lcoaligned_copy
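This xor/andi pair is the usual XOR co-alignment test: dest ^ src has zero low bits exactly when both pointers share the same offset within an SZREG-sized word, which is the only case in which plain word copies line up. The equivalent C (a sketch, names invented):

        #define SZREG 8UL

        static int coaligned(unsigned long dest, unsigned long src)
        {
                return ((dest ^ src) & (SZREG - 1)) == 0;
        }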
        /* Fall through to misaligned fixup copy */

.Lmisaligned_fixup_copy:
        bltu a1, a0, .Lmisaligned_fixup_copy_reverse

.Lmisaligned_fixup_copy_forward:
        jal  t0, .Lbyte_copy_until_aligned_forward

        andi a5, a1, (SZREG - 1) /* Find the alignment offset of src (a1) */
        slli a6, a5, 3 /* Multiply by 8 to convert that to bits to shift */
        sub  a5, a1, t3 /* Find the difference between src and dest */
        andi a1, a1, -SZREG /* Align the src pointer */
        addi a2, t6, SZREG /* The other breakpoint for the unrolled loop */

        /*
         * Compute The Inverse Shift
         * a7 = XLEN - a6 = XLEN + -a6
         * Two's complement negation to find the negative: -a6 = ~a6 + 1
         * Add that to XLEN.  XLEN = SZREG * 8.
         */
        not  a7, a6
        addi a7, a7, (SZREG * 8 + 1)

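Worked through on rv64 (XLEN = 64): not gives ~a6 = -a6 - 1, and adding SZREG * 8 + 1 = 65 cancels the trailing -1, leaving 64 - a6. For example, a6 = 24 (a 3-byte offset) gives ~24 + 65 = 40 = 64 - 24. A one-line C sketch of the identity (a6 is never zero on this path, since co-aligned inputs were diverted earlier):

        #define SZREG 8UL

        static unsigned long inverse_shift(unsigned long a6)
        {
                return ~a6 + (SZREG * 8 + 1);   /* == SZREG * 8 - a6 */
        }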
        /*
         * Fix Misalignment Copy Loop - Forward
         * load_val0 = load_ptr[0];
         * do {
         *      load_val1 = load_ptr[1];
         *      store_ptr += 2;
         *      store_ptr[0 - 2] = (load_val0 >> {a6}) | (load_val1 << {a7});
         *
         *      if (store_ptr == {a2})
         *              break;
         *
         *      load_val0 = load_ptr[2];
         *      load_ptr += 2;
         *      store_ptr[1 - 2] = (load_val1 >> {a6}) | (load_val0 << {a7});
         *
         * } while (store_ptr != store_ptr_end);
         * store_ptr = store_ptr_end;
         */

        REG_L t0, (0 * SZREG)(a1)
        1:
        REG_L t1, (1 * SZREG)(a1)
        addi  t3, t3, (2 * SZREG)
        srl   t0, t0, a6
        sll   t2, t1, a7
        or    t2, t0, t2
        REG_S t2, ((0 * SZREG) - (2 * SZREG))(t3)

        beq   t3, a2, 2f

        REG_L t0, (2 * SZREG)(a1)
        addi  a1, a1, (2 * SZREG)
        srl   t1, t1, a6
        sll   t2, t0, a7
        or    t2, t1, t2
        REG_S t2, ((1 * SZREG) - (2 * SZREG))(t3)

        bne   t3, t6, 1b
        2:
        mv    t3, t6 /* Fix the dest pointer in case the loop was broken */

        add  a1, t3, a5 /* Restore the src pointer */
        j .Lbyte_copy_forward /* Copy any remaining bytes */
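For reference, here is a hedged C rendering of the loop above as a standalone function. Pointer arithmetic is in word units; shift and ishift correspond to a6 and a7 in bits; store_break is the a2 breakpoint, one word past store_end, taken when the aligned word count is odd. This mirrors the structure; it is not the kernel's code:

        /* Little-endian only, like the assembly. */
        static void misaligned_forward(unsigned long *store,
                                       unsigned long *store_end,
                                       unsigned long *store_break,
                                       const unsigned long *load,
                                       unsigned int shift, unsigned int ishift)
        {
                unsigned long v0 = load[0], v1;

                do {
                        v1 = load[1];
                        store += 2;
                        store[-2] = (v0 >> shift) | (v1 << ishift);
                        if (store == store_break)       /* odd word count */
                                break;
                        v0 = load[2];
                        load += 2;
                        store[-1] = (v1 >> shift) | (v0 << ishift);
                } while (store != store_end);
                /* the mv t3, t6 afterwards snaps store back to store_end */
        }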

.Lmisaligned_fixup_copy_reverse:
        jal  t0, .Lbyte_copy_until_aligned_reverse

        andi a5, a4, (SZREG - 1) /* Find the alignment offset of src (a4) */
        slli a6, a5, 3 /* Multiply by 8 to convert that to bits to shift */
        sub  a5, a4, t4 /* Find the difference between src and dest */
        andi a4, a4, -SZREG /* Align the src pointer */
        addi a2, t5, -SZREG /* The other breakpoint for the unrolled loop */

        /*
         * Compute The Inverse Shift
         * a7 = XLEN - a6 = XLEN + -a6
         * Two's complement negation to find the negative: -a6 = ~a6 + 1
         * Add that to XLEN.  XLEN = SZREG * 8.
         */
        not  a7, a6
        addi a7, a7, (SZREG * 8 + 1)

        /*
         * Fix Misalignment Copy Loop - Reverse
         * load_val1 = load_ptr[0];
         * do {
         *      load_val0 = load_ptr[-1];
         *      store_ptr -= 2;
         *      store_ptr[1] = (load_val0 >> {a6}) | (load_val1 << {a7});
         *
         *      if (store_ptr == {a2})
         *              break;
         *
         *      load_val1 = load_ptr[-2];
         *      load_ptr -= 2;
         *      store_ptr[0] = (load_val1 >> {a6}) | (load_val0 << {a7});
         *
         * } while (store_ptr != store_ptr_end);
         * store_ptr = store_ptr_end;
         */

        REG_L t1, ( 0 * SZREG)(a4)
        1:
        REG_L t0, (-1 * SZREG)(a4)
        addi  t4, t4, (-2 * SZREG)
        sll   t1, t1, a7
        srl   t2, t0, a6
        or    t2, t1, t2
        REG_S t2, ( 1 * SZREG)(t4)

        beq   t4, a2, 2f

        REG_L t1, (-2 * SZREG)(a4)
        addi  a4, a4, (-2 * SZREG)
        sll   t0, t0, a7
        srl   t2, t1, a6
        or    t2, t0, t2
        REG_S t2, ( 0 * SZREG)(t4)

        bne   t4, t5, 1b
        2:
        mv    t4, t5 /* Fix the dest pointer in case the loop was broken */

        add  a4, t4, a5 /* Restore the src pointer */
        j .Lbyte_copy_reverse /* Copy any remaining bytes */

/*
 * Simple copy loops for SZREG co-aligned memory locations.
 * These also make calls to do byte copies for any unaligned
 * data at their terminations.
 */
.Lcoaligned_copy:
        bltu a1, a0, .Lcoaligned_copy_reverse

.Lcoaligned_copy_forward:
        jal t0, .Lbyte_copy_until_aligned_forward

        1:
        REG_L t1, ( 0 * SZREG)(a1)
        addi  a1, a1, SZREG
        addi  t3, t3, SZREG
        REG_S t1, (-1 * SZREG)(t3)
        bne   t3, t6, 1b

        j .Lbyte_copy_forward /* Copy any remaining bytes */
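The co-aligned forward loop is the simple case: one register-wide load/store per SZREG bytes. It can run as a do/while because the earlier n >= 2 * SZREG check guarantees at least one full aligned word. A C sketch with invented names:

        static void coaligned_forward(unsigned long *d, unsigned long *d_end,
                                      const unsigned long *s)
        {
                do {
                        *d++ = *s++;    /* one REG_L/REG_S pair per word */
                } while (d != d_end);
        }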

.Lcoaligned_copy_reverse:
        jal t0, .Lbyte_copy_until_aligned_reverse

        1:
        REG_L t1, (-1 * SZREG)(a4)
        addi  a4, a4, -SZREG
        addi  t4, t4, -SZREG
        REG_S t1, ( 0 * SZREG)(t4)
        bne   t4, t5, 1b

        j .Lbyte_copy_reverse /* Copy any remaining bytes */

/*
 * These are basically sub-functions within the function.  They
 * are used to byte copy until the dest pointer is in alignment,
 * at which point a bulk copy method can be used by the
 * calling code.  These work on the same registers as the bulk
 * copy loops.  Therefore, the register values can be picked
 * up from where they were left and we avoid code duplication
 * without any overhead except the call-in and return jumps.
 */
.Lbyte_copy_until_aligned_forward:
        beq  t3, t5, 2f
        1:
        lb   t1,  0(a1)
        addi a1, a1, 1
        addi t3, t3, 1
        sb   t1, -1(t3)
        bne  t3, t5, 1b
        2:
        jalr zero, 0x0(t0) /* Return to multibyte copy loop */

.Lbyte_copy_until_aligned_reverse:
        beq  t4, t6, 2f
        1:
        lb   t1, -1(a4)
        addi a4, a4, -1
        addi t4, t4, -1
        sb   t1,  0(t4)
        bne  t4, t6, 1b
        2:
        jalr zero, 0x0(t0) /* Return to multibyte copy loop */
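The jal t0 / jalr zero, 0x0(t0) pairing is a hand-rolled call and return through t0 that leaves the real return-address register ra untouched, so each block acts like a small shared helper. A loose C analogue of the forward one (names invented; the assembly keeps this state in registers rather than passing pointers):

        /* Advance *s and *d one byte at a time until *d reaches aligned_start. */
        static void byte_copy_until_aligned(unsigned char **d,
                                            const unsigned char **s,
                                            unsigned char *aligned_start)
        {
                while (*d != aligned_start)
                        *(*d)++ = *(*s)++;
        }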

/*
 * Simple byte copy loops.
 * These will byte copy until they reach the end of data to copy.
 * At that point, they will return from memmove.
 */
.Lbyte_copy:
        bltu a1, a0, .Lbyte_copy_reverse

.Lbyte_copy_forward:
        beq  t3, t4, 2f
        1:
        lb   t1,  0(a1)
        addi a1, a1, 1
        addi t3, t3, 1
        sb   t1, -1(t3)
        bne  t3, t4, 1b
        2:
        ret

.Lbyte_copy_reverse:
        beq  t4, t3, 2f
        1:
        lb   t1, -1(a4)
        addi a4, a4, -1
        addi t4, t4, -1
        sb   t1,  0(t4)
        bne  t4, t3, 1b
        2:

.Lreturn_from_memmove:
        ret

SYM_FUNC_END(__memmove)
SYM_FUNC_ALIAS_WEAK(memmove, __memmove)
SYM_FUNC_ALIAS(__pi_memmove, __memmove)
SYM_FUNC_ALIAS(__pi___memmove, __memmove)
v6.2

The v6.2 version of this file is identical to the v6.8 version above except for how the symbols are declared: the internal labels are plain symbols without the assembler-local .L prefix (misaligned_fixup_copy, coaligned_copy, byte_copy, byte_copy_until_aligned_forward/_reverse, return_from_memmove, and so on), the weak memmove symbol is created by wrapping the body in SYM_FUNC_START_WEAK(memmove) ... SYM_FUNC_END(memmove) alongside __memmove rather than with SYM_FUNC_ALIAS_WEAK, and the __pi_memmove and __pi___memmove aliases are not present.