Linux Audio

Check our new training course

Loading...
Note: File does not exist in v6.2.
  1/*
  2 * Copyright 2010 Tilera Corporation. All Rights Reserved.
  3 *
  4 *   This program is free software; you can redistribute it and/or
  5 *   modify it under the terms of the GNU General Public License
  6 *   as published by the Free Software Foundation, version 2.
  7 *
  8 *   This program is distributed in the hope that it will be useful, but
  9 *   WITHOUT ANY WARRANTY; without even the implied warranty of
 10 *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
 11 *   NON INFRINGEMENT.  See the GNU General Public License for
 12 *   more details.
 13 */
 14
 15#include <arch/chip.h>
 16
 17#include <linux/types.h>
 18#include <linux/string.h>
 19#include <linux/module.h>
 20
 21#undef memset
 22
/*
 * memset() - fill a memory region with a constant byte.
 * @s: start of the region to fill.
 * @c: fill value; the byte-wise loops store it through a uint8_t
 *     pointer, the word-wise loops store a 32-bit value built from it.
 * @n: number of bytes to fill.
 *
 * Regions shorter than BYTE_CUTOFF are written one byte at a time.
 * Longer regions first have their start aligned to 4 bytes and their
 * length trimmed to a multiple of 4 (both with byte stores), and are
 * then written as 32-bit words, a full cache line per loop iteration
 * where possible.  Depending on CHIP_HAS_WH64(), the cache-line loop is
 * paired with either __insn_prefetch() or __insn_wh64().
 *
 * Returns the original pointer @s.
 */
 23void *memset(void *s, int c, size_t n)
 24{
 25	uint32_t *out32;	/* word store cursor, set once out8 is aligned */
 26	int n32;		/* number of 32-bit words still to be written */
 27	uint32_t v16, v32;	/* v32 is the value stored by every word write */
 28	uint8_t *out8 = s;	/* byte store cursor */
 29#if !CHIP_HAS_WH64()
 30	int ahead32;		/* prefetch distance ahead of out32, in words */
 31#else
 32	int to_align32;		/* words to write until out32 is line-aligned */
 33#endif
 34
 35	/* Experimentation shows that a trivial tight loop is a win up until
 36	 * around a size of 20, where writing a word at a time starts to win.
 37	 */
 38#define BYTE_CUTOFF 20
 39
 40#if BYTE_CUTOFF < 3
 41	/* This must be at least this big, or some code later
 42	 * on doesn't work.
 43	 */
 44#error "BYTE_CUTOFF is too small"
 45#endif
 46
 47	if (n < BYTE_CUTOFF) {
 48		/* Strangely, this turns out to be the tightest way to
 49		 * write this loop.
 50		 */
 51		if (n != 0) {
 52			do {
 53				/* Strangely, combining these into one line
 54				 * performs worse.
 55				 */
 56				*out8 = c;
 57				out8++;
 58			} while (--n != 0);
 59		}
 60
 61		return s;
 62	}
 63
 64#if !CHIP_HAS_WH64()
 65	/* Use a spare issue slot to start prefetching the first cache
 66	 * line early. This instruction is free as the store can be buried
 67	 * in otherwise idle issue slots doing ALU ops.
 68	 */
 69	__insn_prefetch(out8);
 70
 71	/* We prefetch the end so that a short memset that spans two cache
 72	 * lines gets some prefetching benefit. Again we believe this is free
 73	 * to issue.
 74	 */
 75	__insn_prefetch(&out8[n - 1]);
 76#endif /* !CHIP_HAS_WH64() */
 77
 78
 79	/* Align 'out8'. We know n >= 3 so this won't write past the end.
	 * (n >= BYTE_CUTOFF on this path, and at most 3 bytes are written
	 * before the pointer becomes 4-byte aligned.)
	 */
 80	while (((uintptr_t) out8 & 3) != 0) {
 81		*out8++ = c;
 82		--n;
 83	}
 84
 85	/* Align 'n': write the trailing (n & 3) bytes back to front so
	 * that the remaining length is a whole number of 32-bit words.
	 */
 86	while (n & 3)
 87		out8[--n] = c;
 88
	/* From here on the region is word-aligned at both ends, so switch
	 * from the byte cursor/count to the word cursor/count.
	 */
 89	out32 = (uint32_t *) out8;
 90	n32 = n >> 2;
 91
 92	/* Tile input byte out to 32 bits.
	 * NOTE(review): this relies on __insn_intlb/__insn_intlh replicating
	 * the low byte of 'c' into every byte of v32 -- confirm against the
	 * Tile intrinsic reference.
	 */
 93	v16 = __insn_intlb(c, c);
 94	v32 = __insn_intlh(v16, v16);
 95
 96	/* This must be at least 8 or the following loop doesn't work. */
 97#define CACHE_LINE_SIZE_IN_WORDS (CHIP_L2_LINE_SIZE() / 4)
 98
 99#if !CHIP_HAS_WH64()
100
	/* Default distance of one line; used as-is when the region covers
	 * at most two cache lines' worth of words.
	 */
101	ahead32 = CACHE_LINE_SIZE_IN_WORDS;
102
103	/* We already prefetched the first and last cache lines, so
104	 * we only need to do more prefetching if we are storing
105	 * to more than two cache lines.
106	 */
107	if (n32 > CACHE_LINE_SIZE_IN_WORDS * 2) {
108		int i;
109
110		/* Prefetch the next several cache lines.
111		 * This is the setup code for the software-pipelined
112		 * loop below.
		 *
		 * ahead32 becomes n32 rounded down to a whole number of
		 * cache lines (assumes CACHE_LINE_SIZE_IN_WORDS is a power
		 * of two), capped at MAX_PREFETCH lines.
113		 */
114#define MAX_PREFETCH 5
115		ahead32 = n32 & -CACHE_LINE_SIZE_IN_WORDS;
116		if (ahead32 > MAX_PREFETCH * CACHE_LINE_SIZE_IN_WORDS)
117			ahead32 = MAX_PREFETCH * CACHE_LINE_SIZE_IN_WORDS;
118
119		for (i = CACHE_LINE_SIZE_IN_WORDS;
120		     i < ahead32; i += CACHE_LINE_SIZE_IN_WORDS)
121			__insn_prefetch(&out32[i]);
122	}
123
	/* Each iteration below prefetches at out32[ahead32] and then stores
	 * exactly one cache line's worth of words.  The loop is entered only
	 * when more than ahead32 words remain, so the prefetch address lies
	 * inside the region being filled.
	 */
124	if (n32 > ahead32) {
125		while (1) {
126			int j;
127
128			/* Prefetch by reading one word several cache lines
129			 * ahead.  Since loads are non-blocking this will
130			 * cause the full cache line to be read while we are
131			 * finishing earlier cache lines.  Using a store
132			 * here causes microarchitectural performance
133			 * problems where a victimizing store miss goes to
134			 * the head of the retry FIFO and locks the pipe for
135			 * a few cycles.  So a few subsequent stores in this
136			 * loop go into the retry FIFO, and then later
137			 * stores see other stores to the same cache line
138			 * are already in the retry FIFO and themselves go
139			 * into the retry FIFO, filling it up and grinding
140			 * to a halt waiting for the original miss to be
141			 * satisfied.
142			 */
143			__insn_prefetch(&out32[ahead32]);
144
			/* The 4x unrolled store loop below needs the line
			 * size in words to be a multiple of 4.
			 */
145#if CACHE_LINE_SIZE_IN_WORDS % 4 != 0
146#error "Unhandled CACHE_LINE_SIZE_IN_WORDS"
147#endif
148
			/* Account for the line about to be stored. */
149			n32 -= CACHE_LINE_SIZE_IN_WORDS;
150
151			/* Save icache space by only partially unrolling
152			 * this loop.
153			 */
154			for (j = CACHE_LINE_SIZE_IN_WORDS / 4; j > 0; j--) {
155				*out32++ = v32;
156				*out32++ = v32;
157				*out32++ = v32;
158				*out32++ = v32;
159			}
160
161			/* To save compiled code size, reuse this loop even
162			 * when we run out of prefetching to do by dropping
163			 * ahead32 down.
164			 */
165			if (n32 <= ahead32) {
166				/* Not even a full cache line left,
167				 * so stop now.
168				 */
169				if (n32 < CACHE_LINE_SIZE_IN_WORDS)
170					break;
171
172				/* Choose a small enough value that we don't
173				 * prefetch past the end.  There's no sense
174				 * in touching cache lines we don't have to.
				 *
				 * With this value the prefetch targets the
				 * last word of the line stored in the same
				 * iteration, and the loop keeps running
				 * until fewer than one full line remains.
175				 */
176				ahead32 = CACHE_LINE_SIZE_IN_WORDS - 1;
177			}
178		}
179	}
180
181#else /* CHIP_HAS_WH64() */
182
183	/* Determine how many words we need to emit before the 'out32'
184	 * pointer becomes aligned modulo the cache line size.
185	 */
186	to_align32 =
187		(-((uintptr_t)out32 >> 2)) & (CACHE_LINE_SIZE_IN_WORDS - 1);
188
189	/* Only bother aligning and using wh64 if there is at least
190	 * one full cache line to process.  This check also prevents
191	 * overrunning the end of the buffer with alignment words.
	 *
	 * Both operands are signed ints, so when n32 is smaller than a
	 * cache line the right-hand side is negative and the test fails.
192	 */
193	if (to_align32 <= n32 - CACHE_LINE_SIZE_IN_WORDS) {
194		int lines_left;
195
196		/* Align out32 mod the cache line size so we can use wh64. */
197		n32 -= to_align32;
198		for (; to_align32 != 0; to_align32--) {
199			*out32 = v32;
200			out32++;
201		}
202
203		/* Use unsigned divide to turn this into a right shift.
		 * The guard above leaves n32 >= CACHE_LINE_SIZE_IN_WORDS
		 * here, so lines_left is at least 1 and the do-while loops
		 * below are entered with work to do.
		 */
204		lines_left = (unsigned)n32 / CACHE_LINE_SIZE_IN_WORDS;
205
206		do {
207			/* Only wh64 a few lines at a time, so we don't
208			 * exceed the maximum number of victim lines.
			 *
			 * x is the number of lines handled in this batch.
			 * NOTE(review): the 'while (--i)' loop below needs
			 * x >= 1, i.e. CHIP_MAX_OUTSTANDING_VICTIMS() >= 1
			 * -- confirm against <arch/chip.h>.
209			 */
210			int x = ((lines_left < CHIP_MAX_OUTSTANDING_VICTIMS())
211				  ? lines_left
212				  : CHIP_MAX_OUTSTANDING_VICTIMS());
213			uint32_t *wh = out32;
214			int i = x;
215			int j;
216
217			lines_left -= x;
218
			/* Issue wh64 on each of the x lines in the batch.
			 * NOTE(review): presumably wh64 claims the line in
			 * the cache without reading its old contents --
			 * confirm with the Tile ISA documentation.
			 */
219			do {
220				__insn_wh64(wh);
221				wh += CACHE_LINE_SIZE_IN_WORDS;
222			} while (--i);
223
			/* Fill the x lines, four words per iteration. */
224			for (j = x * (CACHE_LINE_SIZE_IN_WORDS / 4);
225			     j != 0; j--) {
226				*out32++ = v32;
227				*out32++ = v32;
228				*out32++ = v32;
229				*out32++ = v32;
230			}
231		} while (lines_left != 0);
232
233		/* We processed all full lines above, so only this many
234		 * words remain to be processed.
235		 */
236		n32 &= CACHE_LINE_SIZE_IN_WORDS - 1;
237	}
238
239#endif /* CHIP_HAS_WH64() */
240
241	/* Now handle any leftover values: whatever words the cache-line
	 * loops above did not cover are written one at a time.
	 */
242	if (n32 != 0) {
243		do {
244			*out32 = v32;
245			out32++;
246		} while (--n32 != 0);
247	}
248
249	return s;
250}
251EXPORT_SYMBOL(memset);