/*
 * This file is subject to the terms and conditions of the GNU General Public
 * License. See the file "COPYING" in the main directory of this archive
 * for more details.
 *
 * Quick'n'dirty IP checksum ...
 *
 * Copyright (C) 1998, 1999 Ralf Baechle
 * Copyright (C) 1999 Silicon Graphics, Inc.
 * Copyright (C) 2007 Maciej W. Rozycki
 * Copyright (C) 2014 Imagination Technologies Ltd.
 */
#include <linux/errno.h>
#include <asm/asm.h>
#include <asm/asm-offsets.h>
#include <asm/export.h>
#include <asm/regdef.h>

#ifdef CONFIG_64BIT
/*
 * As we are sharing the code base with the mips32 tree (which uses the o32
 * ABI register definitions), we need to redefine the register definitions
 * from the n64 ABI register naming to the o32 ABI register naming.
 */
#undef t0
#undef t1
#undef t2
#undef t3
#define t0	$8
#define t1	$9
#define t2	$10
#define t3	$11
#define t4	$12
#define t5	$13
#define t6	$14
#define t7	$15

#define USE_DOUBLE
#endif

#ifdef USE_DOUBLE

#define LOAD   ld
#define LOAD32 lwu
#define ADD    daddu
#define NBYTES 8

#else

#define LOAD   lw
#define LOAD32 lw
#define ADD    addu
#define NBYTES 4

#endif /* USE_DOUBLE */

#define UNIT(unit)  ((unit)*NBYTES)

#define ADDC(sum,reg)						\
	.set	push;						\
	.set	noat;						\
	ADD	sum, reg;					\
	sltu	v1, sum, reg;					\
	ADD	sum, v1;					\
	.set	pop

#define ADDC32(sum,reg)						\
	.set	push;						\
	.set	noat;						\
	addu	sum, reg;					\
	sltu	v1, sum, reg;					\
	addu	sum, v1;					\
	.set	pop
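
/*
 * ADDC/ADDC32 implement the end-around-carry addition used by the IP
 * checksum: if the unsigned add wraps, sltu catches the overflow and the
 * carry is folded back into the sum.  A rough C sketch of what
 * ADDC(sum, reg) computes (illustration only, not part of the build):
 *
 *	sum += reg;
 *	if (sum < reg)		// unsigned wrap-around occurred
 *		sum += 1;	// end-around carry
 */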

#define CSUM_BIGCHUNK1(src, offset, sum, _t0, _t1, _t2, _t3)	\
	LOAD	_t0, (offset + UNIT(0))(src);			\
	LOAD	_t1, (offset + UNIT(1))(src);			\
	LOAD	_t2, (offset + UNIT(2))(src);			\
	LOAD	_t3, (offset + UNIT(3))(src);			\
	ADDC(_t0, _t1);						\
	ADDC(_t2, _t3);						\
	ADDC(sum, _t0);						\
	ADDC(sum, _t2)

#ifdef USE_DOUBLE
#define CSUM_BIGCHUNK(src, offset, sum, _t0, _t1, _t2, _t3)	\
	CSUM_BIGCHUNK1(src, offset, sum, _t0, _t1, _t2, _t3)
#else
#define CSUM_BIGCHUNK(src, offset, sum, _t0, _t1, _t2, _t3)	\
	CSUM_BIGCHUNK1(src, offset, sum, _t0, _t1, _t2, _t3);	\
	CSUM_BIGCHUNK1(src, offset + 0x10, sum, _t0, _t1, _t2, _t3)
#endif
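
/*
 * CSUM_BIGCHUNK always consumes 32 bytes: one CSUM_BIGCHUNK1 of four
 * 8-byte loads on 64-bit, or two CSUM_BIGCHUNK1s of four 4-byte loads
 * on 32-bit.  Pairing the temporaries (_t0+_t1, _t2+_t3) before folding
 * into sum keeps the two adds independent, which should help on
 * wide-issue cores.
 */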

/*
 * a0: source address
 * a1: length of the area to checksum
 * a2: partial checksum
 */

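/*
 * This matches the C prototype in the kernel's checksum headers:
 *
 *	__wsum csum_partial(const void *buff, int len, __wsum sum);
 *
 * i.e. a0 = buff, a1 = len, and the partial checksum passed in a2 is
 * added in just before the result is returned in v0.
 */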
#define src a0
#define sum v0

	.text
	.set	noreorder
	.align	5
LEAF(csum_partial)
EXPORT_SYMBOL(csum_partial)
	move	sum, zero
	move	t7, zero

	sltiu	t8, a1, 0x8
	bnez	t8, .Lsmall_csumcpy		/* < 8 bytes to copy */
	move	t2, a1

	andi	t7, src, 0x1			/* odd buffer? */

.Lhword_align:
	beqz	t7, .Lword_align
	andi	t8, src, 0x2

	lbu	t0, (src)
	LONG_SUBU	a1, a1, 0x1
#ifdef __MIPSEL__
	sll	t0, t0, 8
#endif
	ADDC(sum, t0)
	PTR_ADDU	src, src, 0x1
	andi	t8, src, 0x2

.Lword_align:
	beqz	t8, .Ldword_align
	sltiu	t8, a1, 56

	lhu	t0, (src)
	LONG_SUBU	a1, a1, 0x2
	ADDC(sum, t0)
	sltiu	t8, a1, 56
	PTR_ADDU	src, src, 0x2

.Ldword_align:
	bnez	t8, .Ldo_end_words
	move	t8, a1

	andi	t8, src, 0x4
	beqz	t8, .Lqword_align
	andi	t8, src, 0x8

	LOAD32	t0, 0x00(src)
	LONG_SUBU	a1, a1, 0x4
	ADDC(sum, t0)
	PTR_ADDU	src, src, 0x4
	andi	t8, src, 0x8

.Lqword_align:
	beqz	t8, .Loword_align
	andi	t8, src, 0x10

#ifdef USE_DOUBLE
	ld	t0, 0x00(src)
	LONG_SUBU	a1, a1, 0x8
	ADDC(sum, t0)
#else
	lw	t0, 0x00(src)
	lw	t1, 0x04(src)
	LONG_SUBU	a1, a1, 0x8
	ADDC(sum, t0)
	ADDC(sum, t1)
#endif
	PTR_ADDU	src, src, 0x8
	andi	t8, src, 0x10

.Loword_align:
	beqz	t8, .Lbegin_movement
	LONG_SRL	t8, a1, 0x7

#ifdef USE_DOUBLE
	ld	t0, 0x00(src)
	ld	t1, 0x08(src)
	ADDC(sum, t0)
	ADDC(sum, t1)
#else
	CSUM_BIGCHUNK1(src, 0x00, sum, t0, t1, t3, t4)
#endif
	LONG_SUBU	a1, a1, 0x10
	PTR_ADDU	src, src, 0x10
	LONG_SRL	t8, a1, 0x7

.Lbegin_movement:
	beqz	t8, 1f
	andi	t2, a1, 0x40

.Lmove_128bytes:
	CSUM_BIGCHUNK(src, 0x00, sum, t0, t1, t3, t4)
	CSUM_BIGCHUNK(src, 0x20, sum, t0, t1, t3, t4)
	CSUM_BIGCHUNK(src, 0x40, sum, t0, t1, t3, t4)
	CSUM_BIGCHUNK(src, 0x60, sum, t0, t1, t3, t4)
	LONG_SUBU	t8, t8, 0x01
	.set	reorder				/* DADDI_WAR */
	PTR_ADDU	src, src, 0x80
	bnez	t8, .Lmove_128bytes
	.set	noreorder

1:
	beqz	t2, 1f
	andi	t2, a1, 0x20

.Lmove_64bytes:
	CSUM_BIGCHUNK(src, 0x00, sum, t0, t1, t3, t4)
	CSUM_BIGCHUNK(src, 0x20, sum, t0, t1, t3, t4)
	PTR_ADDU	src, src, 0x40

1:
	beqz	t2, .Ldo_end_words
	andi	t8, a1, 0x1c

.Lmove_32bytes:
	CSUM_BIGCHUNK(src, 0x00, sum, t0, t1, t3, t4)
	andi	t8, a1, 0x1c
	PTR_ADDU	src, src, 0x20

.Ldo_end_words:
	beqz	t8, .Lsmall_csumcpy
	andi	t2, a1, 0x3
	LONG_SRL	t8, t8, 0x2

.Lend_words:
	LOAD32	t0, (src)
	LONG_SUBU	t8, t8, 0x1
	ADDC(sum, t0)
	.set	reorder				/* DADDI_WAR */
	PTR_ADDU	src, src, 0x4
	bnez	t8, .Lend_words
	.set	noreorder

/* unknown src alignment and < 8 bytes to go */
.Lsmall_csumcpy:
	move	a1, t2

	andi	t0, a1, 4
	beqz	t0, 1f
	andi	t0, a1, 2

	/* Still a full word to go */
	ulw	t1, (src)
	PTR_ADDIU	src, 4
#ifdef USE_DOUBLE
	dsll	t1, t1, 32			/* clear lower 32bit */
#endif
	ADDC(sum, t1)

1:	move	t1, zero
	beqz	t0, 1f
	andi	t0, a1, 1

	/* Still a halfword to go */
	ulhu	t1, (src)
	PTR_ADDIU	src, 2

1:	beqz	t0, 1f
	sll	t1, t1, 16

	lbu	t2, (src)
	nop

#ifdef __MIPSEB__
	sll	t2, t2, 8
#endif
	or	t1, t2

1:	ADDC(sum, t1)

	/* fold checksum */
#ifdef USE_DOUBLE
	dsll32	v1, sum, 0
	daddu	sum, v1
	sltu	v1, sum, v1
	dsra32	sum, sum, 0
	addu	sum, v1
#endif
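
/*
 * 64-bit fold, sketched in C (illustration only): the running sum may
 * use all 64 bits, but the result must be a 32-bit partial checksum,
 * so the two halves are added together with their end-around carry:
 *
 *	u32 lo = sum, hi = sum >> 32;
 *	u32 folded = lo + hi;
 *	folded += (folded < hi);	// carry out of the 32-bit add
 */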

	/* odd buffer alignment? */
#if defined(CONFIG_CPU_MIPSR2) || defined(CONFIG_CPU_MIPSR5) || \
    defined(CONFIG_CPU_LOONGSON64)
	.set	push
	.set	arch=mips32r2
	wsbh	v1, sum
	movn	sum, v1, t7
	.set	pop
#else
	beqz	t7, 1f			/* odd buffer alignment? */
	lui	v1, 0x00ff
	addu	v1, 0x00ff
	and	t0, sum, v1
	sll	t0, t0, 8
	srl	sum, sum, 8
	and	sum, sum, v1
	or	sum, sum, t0
1:
#endif
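
/*
 * Why the swap: if the buffer started on an odd address, every byte was
 * accumulated one position off within its 16-bit word, so the two bytes
 * of each halfword must be exchanged before the sum is usable.  wsbh
 * does this directly on R2+ cores; the fallback builds the mask
 * 0x00ff00ff and swaps the byte lanes by hand, i.e. roughly:
 *
 *	sum = ((sum & 0x00ff00ff) << 8) | ((sum >> 8) & 0x00ff00ff);
 */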
	.set	reorder
	/* Add the passed partial csum. */
	ADDC32(sum, a2)
	jr	ra
	.set	noreorder
	END(csum_partial)


/*
 * checksum and copy routines based on memcpy.S
 *
 *	csum_partial_copy_nocheck(src, dst, len)
 *	__csum_partial_copy_kernel(src, dst, len)
 *
 * See "Spec" in memcpy.S for details.  Unlike __copy_user, all
 * functions in this file use the standard calling convention.
 */
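
/*
 * For reference, the C-level shape these entry points implement
 * (assumed from the argument list above) is:
 *
 *	__wsum csum_partial_copy_nocheck(const void *src, void *dst,
 *					 int len);
 *
 * a0 = src, a1 = dst, a2 = len; the checksum of the copied data is
 * returned in v0, and 0 is returned if a fault is taken.
 */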

#define src a0
#define dst a1
#define len a2
#define sum v0
#define odd t8

/*
 * All exception handlers simply return 0.
 */

/* Instruction type */
#define LD_INSN 1
#define ST_INSN 2
#define LEGACY_MODE 1
#define EVA_MODE    2
#define USEROP   1
#define KERNELOP 2

/*
 * Wrapper to add an entry in the exception table
 * in case the insn causes a memory exception.
 * Arguments:
 *	insn : Load/store instruction
 *	type : Instruction type
 *	reg  : Register
 *	addr : Address
 */
#define EXC(insn, type, reg, addr)				\
	.if \mode == LEGACY_MODE;				\
9:		insn reg, addr;					\
		.section __ex_table,"a";			\
		PTR_WD	9b, .L_exc;				\
		.previous;					\
	/* This is enabled in EVA mode */			\
	.else;							\
		/* If loading from user or storing to user */	\
		.if ((\from == USEROP) && (type == LD_INSN)) || \
		    ((\to == USEROP) && (type == ST_INSN));	\
9:			__BUILD_EVA_INSN(insn##e, reg, addr);	\
			.section __ex_table,"a";		\
			PTR_WD	9b, .L_exc;			\
			.previous;				\
		.else;						\
			/* EVA without exception */		\
			insn reg, addr;				\
		.endif;						\
	.endif
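
/*
 * How the fixup works (sketch): each faultable access gets a local
 * label "9:", and a (faulting insn, handler) pair is emitted into the
 * __ex_table section.  If the access takes a memory exception at run
 * time, the fault handler looks the PC up in that table and resumes at
 * .L_exc, which returns 0 instead of a checksum.
 */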

#undef LOAD

#ifdef USE_DOUBLE

#define LOADK	ld /* No exception */
#define LOAD(reg, addr)		EXC(ld, LD_INSN, reg, addr)
#define LOADBU(reg, addr)	EXC(lbu, LD_INSN, reg, addr)
#define LOADL(reg, addr)	EXC(ldl, LD_INSN, reg, addr)
#define LOADR(reg, addr)	EXC(ldr, LD_INSN, reg, addr)
#define STOREB(reg, addr)	EXC(sb, ST_INSN, reg, addr)
#define STOREL(reg, addr)	EXC(sdl, ST_INSN, reg, addr)
#define STORER(reg, addr)	EXC(sdr, ST_INSN, reg, addr)
#define STORE(reg, addr)	EXC(sd, ST_INSN, reg, addr)
#define ADD    daddu
#define SUB    dsubu
#define SRL    dsrl
#define SLL    dsll
#define SLLV   dsllv
#define SRLV   dsrlv
#define NBYTES 8
#define LOG_NBYTES 3

#else

#define LOADK	lw /* No exception */
#define LOAD(reg, addr)		EXC(lw, LD_INSN, reg, addr)
#define LOADBU(reg, addr)	EXC(lbu, LD_INSN, reg, addr)
#define LOADL(reg, addr)	EXC(lwl, LD_INSN, reg, addr)
#define LOADR(reg, addr)	EXC(lwr, LD_INSN, reg, addr)
#define STOREB(reg, addr)	EXC(sb, ST_INSN, reg, addr)
#define STOREL(reg, addr)	EXC(swl, ST_INSN, reg, addr)
#define STORER(reg, addr)	EXC(swr, ST_INSN, reg, addr)
#define STORE(reg, addr)	EXC(sw, ST_INSN, reg, addr)
#define ADD    addu
#define SUB    subu
#define SRL    srl
#define SLL    sll
#define SLLV   sllv
#define SRLV   srlv
#define NBYTES 4
#define LOG_NBYTES 2

#endif /* USE_DOUBLE */

#ifdef CONFIG_CPU_LITTLE_ENDIAN
#define LDFIRST LOADR
#define LDREST	LOADL
#define STFIRST STORER
#define STREST	STOREL
#define SHIFT_DISCARD SLLV
#define SHIFT_DISCARD_REVERT SRLV
#else
#define LDFIRST LOADL
#define LDREST	LOADR
#define STFIRST STOREL
#define STREST	STORER
#define SHIFT_DISCARD SRLV
#define SHIFT_DISCARD_REVERT SLLV
#endif
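
/*
 * LDFIRST/LDREST pair the classic MIPS unaligned-access instructions:
 * lwl/lwr (ldl/ldr on 64-bit) each fetch the part of a word that lies
 * on their side of the enclosing aligned word, so the pair assembles a
 * full register from any byte address.  Which of the two touches the
 * low-address bytes depends on endianness, hence the swapped mappings
 * above.
 */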

#define FIRST(unit) ((unit)*NBYTES)
#define REST(unit)  (FIRST(unit)+NBYTES-1)

#define ADDRMASK (NBYTES-1)

#ifndef CONFIG_CPU_DADDI_WORKAROUNDS
	.set	noat
#else
	.set	at=v1
#endif

	.macro __BUILD_CSUM_PARTIAL_COPY_USER mode, from, to

	li	sum, -1
	move	odd, zero
	/*
	 * Note: dst & src may be unaligned, len may be 0
	 * Temps
	 */
	/*
	 * The "issue break"s below are very approximate.
	 * Issue delays for dcache fills will perturb the schedule, as will
	 * load queue full replay traps, etc.
	 *
	 * If len < NBYTES use byte operations.
	 */
	sltu	t2, len, NBYTES
	and	t1, dst, ADDRMASK
	bnez	t2, .Lcopy_bytes_checklen\@
	and	t0, src, ADDRMASK
	andi	odd, dst, 0x1			/* odd buffer? */
	bnez	t1, .Ldst_unaligned\@
	nop
	bnez	t0, .Lsrc_unaligned_dst_aligned\@
	/*
	 * use delay slot for fall-through
	 * src and dst are aligned; need to compute rem
	 */
.Lboth_aligned\@:
	SRL	t0, len, LOG_NBYTES+3		# +3 for 8 units/iter
	beqz	t0, .Lcleanup_both_aligned\@	# len < 8*NBYTES
	nop
	SUB	len, 8*NBYTES			# subtract here for bgez loop
	.align	4
1:
	LOAD(t0, UNIT(0)(src))
	LOAD(t1, UNIT(1)(src))
	LOAD(t2, UNIT(2)(src))
	LOAD(t3, UNIT(3)(src))
	LOAD(t4, UNIT(4)(src))
	LOAD(t5, UNIT(5)(src))
	LOAD(t6, UNIT(6)(src))
	LOAD(t7, UNIT(7)(src))
	SUB	len, len, 8*NBYTES
	ADD	src, src, 8*NBYTES
	STORE(t0, UNIT(0)(dst))
	ADDC(t0, t1)
	STORE(t1, UNIT(1)(dst))
	ADDC(sum, t0)
	STORE(t2, UNIT(2)(dst))
	ADDC(t2, t3)
	STORE(t3, UNIT(3)(dst))
	ADDC(sum, t2)
	STORE(t4, UNIT(4)(dst))
	ADDC(t4, t5)
	STORE(t5, UNIT(5)(dst))
	ADDC(sum, t4)
	STORE(t6, UNIT(6)(dst))
	ADDC(t6, t7)
	STORE(t7, UNIT(7)(dst))
	ADDC(sum, t6)
	.set	reorder				/* DADDI_WAR */
	ADD	dst, dst, 8*NBYTES
	bgez	len, 1b
	.set	noreorder
	ADD	len, 8*NBYTES			# revert len (see above)

	/*
	 * len == the number of bytes left to copy < 8*NBYTES
	 */
.Lcleanup_both_aligned\@:
#define rem t7
	beqz	len, .Ldone\@
	sltu	t0, len, 4*NBYTES
	bnez	t0, .Lless_than_4units\@
	and	rem, len, (NBYTES-1)		# rem = len % NBYTES
	/*
	 * len >= 4*NBYTES
	 */
	LOAD(t0, UNIT(0)(src))
	LOAD(t1, UNIT(1)(src))
	LOAD(t2, UNIT(2)(src))
	LOAD(t3, UNIT(3)(src))
	SUB	len, len, 4*NBYTES
	ADD	src, src, 4*NBYTES
	STORE(t0, UNIT(0)(dst))
	ADDC(t0, t1)
	STORE(t1, UNIT(1)(dst))
	ADDC(sum, t0)
	STORE(t2, UNIT(2)(dst))
	ADDC(t2, t3)
	STORE(t3, UNIT(3)(dst))
	ADDC(sum, t2)
	.set	reorder				/* DADDI_WAR */
	ADD	dst, dst, 4*NBYTES
	beqz	len, .Ldone\@
	.set	noreorder
.Lless_than_4units\@:
	/*
	 * rem = len % NBYTES
	 */
	beq	rem, len, .Lcopy_bytes\@
	nop
1:
	LOAD(t0, 0(src))
	ADD	src, src, NBYTES
	SUB	len, len, NBYTES
	STORE(t0, 0(dst))
	ADDC(sum, t0)
	.set	reorder				/* DADDI_WAR */
	ADD	dst, dst, NBYTES
	bne	rem, len, 1b
	.set	noreorder

	/*
	 * src and dst are aligned, need to copy rem bytes (rem < NBYTES)
	 * A loop would do only a byte at a time with possible branch
	 * mispredicts.  Can't do an explicit LOAD dst,mask,or,STORE
	 * because we can't assume read access to dst.  Instead, use
	 * STREST dst, which doesn't require read access to dst.
	 *
	 * This code should perform better than a simple loop on modern,
	 * wide-issue mips processors because the code has fewer branches and
	 * more instruction-level parallelism.
	 */
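
/*
 * Worked example (assuming little-endian and NBYTES == 4, with 2 bytes
 * left): "bits" becomes 32 - 16 = 16, SHIFT_DISCARD (sllv here) pushes
 * the two unwanted high bytes out of the register, STREST stores only
 * the surviving bytes at dst, and SHIFT_DISCARD_REVERT shifts the kept
 * bytes back to their original lanes, with zeros above, so ADDC sums
 * exactly the bytes that were copied.
 */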
#define bits t2
	beqz	len, .Ldone\@
	ADD	t1, dst, len	# t1 is just past last byte of dst
	li	bits, 8*NBYTES
	SLL	rem, len, 3	# rem = number of bits to keep
	LOAD(t0, 0(src))
	SUB	bits, bits, rem # bits = number of bits to discard
	SHIFT_DISCARD t0, t0, bits
	STREST(t0, -1(t1))
	SHIFT_DISCARD_REVERT t0, t0, bits
	.set	reorder
	ADDC(sum, t0)
	b	.Ldone\@
	.set	noreorder
.Ldst_unaligned\@:
	/*
	 * dst is unaligned
	 * t0 = src & ADDRMASK
	 * t1 = dst & ADDRMASK; T1 > 0
	 * len >= NBYTES
	 *
	 * Copy enough bytes to align dst
	 * Set match = (src and dst have same alignment)
	 */
#define match rem
	LDFIRST(t3, FIRST(0)(src))
	ADD	t2, zero, NBYTES
	LDREST(t3, REST(0)(src))
	SUB	t2, t2, t1	# t2 = number of bytes copied
	xor	match, t0, t1
	STFIRST(t3, FIRST(0)(dst))
	SLL	t4, t1, 3	# t4 = number of bits to discard
	SHIFT_DISCARD t3, t3, t4
	/* no SHIFT_DISCARD_REVERT to handle odd buffer properly */
	ADDC(sum, t3)
	beq	len, t2, .Ldone\@
	SUB	len, len, t2
	ADD	dst, dst, t2
	beqz	match, .Lboth_aligned\@
	ADD	src, src, t2

.Lsrc_unaligned_dst_aligned\@:
	SRL	t0, len, LOG_NBYTES+2	# +2 for 4 units/iter
	beqz	t0, .Lcleanup_src_unaligned\@
	and	rem, len, (4*NBYTES-1)	# rem = len % 4*NBYTES
1:
/*
 * Avoid consecutive LD*'s to the same register since some mips
 * implementations can't issue them in the same cycle.
 * It's OK to load FIRST(N+1) before REST(N) because the two addresses
 * are to the same unit (unless src is aligned, but it's not).
 */
	LDFIRST(t0, FIRST(0)(src))
	LDFIRST(t1, FIRST(1)(src))
	SUB	len, len, 4*NBYTES
	LDREST(t0, REST(0)(src))
	LDREST(t1, REST(1)(src))
	LDFIRST(t2, FIRST(2)(src))
	LDFIRST(t3, FIRST(3)(src))
	LDREST(t2, REST(2)(src))
	LDREST(t3, REST(3)(src))
	ADD	src, src, 4*NBYTES
#ifdef CONFIG_CPU_SB1
	nop				# improves slotting
#endif
	STORE(t0, UNIT(0)(dst))
	ADDC(t0, t1)
	STORE(t1, UNIT(1)(dst))
	ADDC(sum, t0)
	STORE(t2, UNIT(2)(dst))
	ADDC(t2, t3)
	STORE(t3, UNIT(3)(dst))
	ADDC(sum, t2)
	.set	reorder			/* DADDI_WAR */
	ADD	dst, dst, 4*NBYTES
	bne	len, rem, 1b
	.set	noreorder

.Lcleanup_src_unaligned\@:
	beqz	len, .Ldone\@
	and	rem, len, NBYTES-1	# rem = len % NBYTES
	beq	rem, len, .Lcopy_bytes\@
	nop
1:
	LDFIRST(t0, FIRST(0)(src))
	LDREST(t0, REST(0)(src))
	ADD	src, src, NBYTES
	SUB	len, len, NBYTES
	STORE(t0, 0(dst))
	ADDC(sum, t0)
	.set	reorder			/* DADDI_WAR */
	ADD	dst, dst, NBYTES
	bne	len, rem, 1b
	.set	noreorder

.Lcopy_bytes_checklen\@:
	beqz	len, .Ldone\@
	nop
.Lcopy_bytes\@:
	/* 0 < len < NBYTES */
#ifdef CONFIG_CPU_LITTLE_ENDIAN
#define SHIFT_START 0
#define SHIFT_INC 8
#else
#define SHIFT_START 8*(NBYTES-1)
#define SHIFT_INC -8
#endif
	move	t2, zero		# partial word
	li	t3, SHIFT_START		# shift
#define COPY_BYTE(N)			\
	LOADBU(t0, N(src));		\
	SUB	len, len, 1;		\
	STOREB(t0, N(dst));		\
	SLLV	t0, t0, t3;		\
	addu	t3, SHIFT_INC;		\
	beqz	len, .Lcopy_bytes_done\@; \
	or	t2, t0
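
/*
 * COPY_BYTE also accumulates the checksum: each byte is shifted into
 * its proper lane of the partial word t2 (t3 steps by 8 bits, upward
 * from the low lane on little-endian, downward from the high lane on
 * big-endian), and t2 is added into sum once the tail is done.
 */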

	COPY_BYTE(0)
	COPY_BYTE(1)
#ifdef USE_DOUBLE
	COPY_BYTE(2)
	COPY_BYTE(3)
	COPY_BYTE(4)
	COPY_BYTE(5)
#endif
	LOADBU(t0, NBYTES-2(src))
	SUB	len, len, 1
	STOREB(t0, NBYTES-2(dst))
	SLLV	t0, t0, t3
	or	t2, t0
.Lcopy_bytes_done\@:
	ADDC(sum, t2)
.Ldone\@:
	/* fold checksum */
	.set	push
	.set	noat
#ifdef USE_DOUBLE
	dsll32	v1, sum, 0
	daddu	sum, v1
	sltu	v1, sum, v1
	dsra32	sum, sum, 0
	addu	sum, v1
#endif

#if defined(CONFIG_CPU_MIPSR2) || defined(CONFIG_CPU_MIPSR5) || \
    defined(CONFIG_CPU_LOONGSON64)
	.set	push
	.set	arch=mips32r2
	wsbh	v1, sum
	movn	sum, v1, odd
	.set	pop
#else
	beqz	odd, 1f			/* odd buffer alignment? */
	lui	v1, 0x00ff
	addu	v1, 0x00ff
	and	t0, sum, v1
	sll	t0, t0, 8
	srl	sum, sum, 8
	and	sum, sum, v1
	or	sum, sum, t0
1:
#endif
	.set	pop
	.set	reorder
	jr	ra
	.set	noreorder
	.endm

	.set	noreorder
.L_exc:
	jr	ra
	li	v0, 0

FEXPORT(__csum_partial_copy_nocheck)
EXPORT_SYMBOL(__csum_partial_copy_nocheck)
#ifndef CONFIG_EVA
FEXPORT(__csum_partial_copy_to_user)
EXPORT_SYMBOL(__csum_partial_copy_to_user)
FEXPORT(__csum_partial_copy_from_user)
EXPORT_SYMBOL(__csum_partial_copy_from_user)
#endif
__BUILD_CSUM_PARTIAL_COPY_USER LEGACY_MODE USEROP USEROP

#ifdef CONFIG_EVA
LEAF(__csum_partial_copy_to_user)
__BUILD_CSUM_PARTIAL_COPY_USER EVA_MODE KERNELOP USEROP
END(__csum_partial_copy_to_user)

LEAF(__csum_partial_copy_from_user)
__BUILD_CSUM_PARTIAL_COPY_USER EVA_MODE USEROP KERNELOP
END(__csum_partial_copy_from_user)
#endif