// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright 2024 Rivos Inc.
 */

#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/jump_label.h>
#include <linux/kthread.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/types.h>
#include <asm/cpufeature.h>
#include <asm/hwprobe.h>
#include <asm/vector.h>

#include "copy-unaligned.h"

#define MISALIGNED_ACCESS_JIFFIES_LG2 1
#define MISALIGNED_BUFFER_SIZE 0x4000
#define MISALIGNED_BUFFER_ORDER get_order(MISALIGNED_BUFFER_SIZE)
#define MISALIGNED_COPY_SIZE ((MISALIGNED_BUFFER_SIZE / 2) - 0x80)
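
/*
 * The per-CPU results defined below are what the riscv_hwprobe() syscall
 * reports to userspace. A minimal userspace sketch, assuming kernel headers
 * that provide __NR_riscv_hwprobe and RISCV_HWPROBE_KEY_MISALIGNED_SCALAR_PERF
 * (the vector result is read the same way through
 * RISCV_HWPROBE_KEY_MISALIGNED_VECTOR_PERF):
 *
 *	#include <asm/hwprobe.h>
 *	#include <stdio.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	struct riscv_hwprobe pair = {
 *		.key = RISCV_HWPROBE_KEY_MISALIGNED_SCALAR_PERF,
 *	};
 *
 *	// A cpusetsize of 0 with cpus == NULL queries all online CPUs.
 *	if (syscall(__NR_riscv_hwprobe, &pair, 1, 0, NULL, 0) == 0)
 *		printf("scalar misaligned perf: %lld\n", (long long)pair.value);
 */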
DEFINE_PER_CPU(long, misaligned_access_speed) = RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN;
DEFINE_PER_CPU(long, vector_misaligned_access) = RISCV_HWPROBE_MISALIGNED_VECTOR_UNSUPPORTED;

#ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS
static cpumask_t fast_misaligned_access;
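/*
 * Benchmark misaligned scalar copies on this CPU: after a warmup pass, wait
 * for a jiffy edge, then for 2^MISALIGNED_ACCESS_JIFFIES_LG2 jiffies record
 * the best (lowest) cycle count of repeated misaligned word copies, and do
 * the same for the byte-wise copy. The CPU is marked FAST only when the
 * misaligned word copy wins; the result is cached in misaligned_access_speed.
 */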
static int check_unaligned_access(void *param)
{
	int cpu = smp_processor_id();
	u64 start_cycles, end_cycles;
	u64 word_cycles;
	u64 byte_cycles;
	int ratio;
	unsigned long start_jiffies, now;
	struct page *page = param;
	void *dst;
	void *src;
	long speed = RISCV_HWPROBE_MISALIGNED_SCALAR_SLOW;

	if (per_cpu(misaligned_access_speed, cpu) != RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN)
		return 0;

	/* Make an unaligned destination buffer. */
	dst = (void *)((unsigned long)page_address(page) | 0x1);
	/* Unalign src as well, but differently (off by 1 + 2 = 3). */
	src = dst + (MISALIGNED_BUFFER_SIZE / 2);
	src += 2;
	word_cycles = -1ULL;
	/* Do a warmup. */
	__riscv_copy_words_unaligned(dst, src, MISALIGNED_COPY_SIZE);
	preempt_disable();
	start_jiffies = jiffies;
	while ((now = jiffies) == start_jiffies)
		cpu_relax();

	/*
	 * For a fixed amount of time, repeatedly try the function, and take
	 * the best time in cycles as the measurement.
	 */
	while (time_before(jiffies, now + (1 << MISALIGNED_ACCESS_JIFFIES_LG2))) {
		start_cycles = get_cycles64();
		/* Ensure the CSR read can't reorder with respect to the copy. */
		mb();
		__riscv_copy_words_unaligned(dst, src, MISALIGNED_COPY_SIZE);
		/* Ensure the copy ends before the end time is snapped. */
		mb();
		end_cycles = get_cycles64();
		if ((end_cycles - start_cycles) < word_cycles)
			word_cycles = end_cycles - start_cycles;
	}

	byte_cycles = -1ULL;
	__riscv_copy_bytes_unaligned(dst, src, MISALIGNED_COPY_SIZE);
	start_jiffies = jiffies;
	while ((now = jiffies) == start_jiffies)
		cpu_relax();

	while (time_before(jiffies, now + (1 << MISALIGNED_ACCESS_JIFFIES_LG2))) {
		start_cycles = get_cycles64();
		mb();
		__riscv_copy_bytes_unaligned(dst, src, MISALIGNED_COPY_SIZE);
		mb();
		end_cycles = get_cycles64();
		if ((end_cycles - start_cycles) < byte_cycles)
			byte_cycles = end_cycles - start_cycles;
	}

	preempt_enable();

	/* Don't divide by zero. */
	if (!word_cycles || !byte_cycles) {
		pr_warn("cpu%d: rdtime lacks granularity needed to measure unaligned access speed\n",
			cpu);

		return 0;
	}

	if (word_cycles < byte_cycles)
		speed = RISCV_HWPROBE_MISALIGNED_SCALAR_FAST;

	ratio = div_u64((byte_cycles * 100), word_cycles);
	pr_info("cpu%d: Ratio of byte access time to unaligned word access is %d.%02d, unaligned accesses are %s\n",
		cpu,
		ratio / 100,
		ratio % 100,
		(speed == RISCV_HWPROBE_MISALIGNED_SCALAR_FAST) ? "fast" : "slow");

	per_cpu(misaligned_access_speed, cpu) = speed;

	/*
	 * Record this CPU's result in fast_misaligned_access. The cpumask
	 * operations are atomic, so updates from different CPUs cannot race.
	 */
	if (speed == RISCV_HWPROBE_MISALIGNED_SCALAR_FAST)
		cpumask_set_cpu(cpu, &fast_misaligned_access);
	else
		cpumask_clear_cpu(cpu, &fast_misaligned_access);

	return 0;
}

static void check_unaligned_access_nonboot_cpu(void *param)
{
	unsigned int cpu = smp_processor_id();
	struct page **pages = param;

	if (smp_processor_id() != 0)
		check_unaligned_access(pages[cpu]);
}
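
/*
 * Enabled only while every online CPU reports fast misaligned accesses (see
 * modify_unaligned_access_branches() below); re-evaluated from the CPU
 * hotplug callbacks.
 */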
DEFINE_STATIC_KEY_FALSE(fast_unaligned_access_speed_key);

static void modify_unaligned_access_branches(cpumask_t *mask, int weight)
{
	if (cpumask_weight(mask) == weight)
		static_branch_enable_cpuslocked(&fast_unaligned_access_speed_key);
	else
		static_branch_disable_cpuslocked(&fast_unaligned_access_speed_key);
}

static void set_unaligned_access_static_branches_except_cpu(int cpu)
{
	/*
	 * Same as set_unaligned_access_static_branches, except excludes the
	 * given CPU from the result. When a CPU is hotplugged into an offline
	 * state, this function is called before the CPU is set to offline in
	 * the cpumask, and thus the CPU needs to be explicitly excluded.
	 */

	cpumask_t fast_except_me;

	cpumask_and(&fast_except_me, &fast_misaligned_access, cpu_online_mask);
	cpumask_clear_cpu(cpu, &fast_except_me);

	modify_unaligned_access_branches(&fast_except_me, num_online_cpus() - 1);
}

static void set_unaligned_access_static_branches(void)
{
	/*
	 * This will be called after check_unaligned_access_all_cpus so the
	 * result of unaligned access speed for all CPUs will be available.
	 *
	 * To avoid the number of online cpus changing between reading
	 * cpu_online_mask and calling num_online_cpus, cpus_read_lock must be
	 * held before calling this function.
	 */

	cpumask_t fast_and_online;

	cpumask_and(&fast_and_online, &fast_misaligned_access, cpu_online_mask);

	modify_unaligned_access_branches(&fast_and_online, num_online_cpus());
}

static int lock_and_set_unaligned_access_static_branch(void)
{
	cpus_read_lock();
	set_unaligned_access_static_branches();
	cpus_read_unlock();

	return 0;
}

arch_initcall_sync(lock_and_set_unaligned_access_static_branch);
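
/*
 * CPU hotplug callbacks, registered from
 * check_unaligned_access_speed_all_cpus(): a newly onlined CPU that does not
 * yet have a cached result is re-probed here, and the
 * fast_unaligned_access_speed_key static branch is recomputed on both the
 * online and offline paths.
 */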
static int riscv_online_cpu(unsigned int cpu)
{
	static struct page *buf;

	/* We are already set since the last check */
	if (per_cpu(misaligned_access_speed, cpu) != RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN)
		goto exit;

	check_unaligned_access_emulated(NULL);
	buf = alloc_pages(GFP_KERNEL, MISALIGNED_BUFFER_ORDER);
	if (!buf) {
		pr_warn("Allocation failure, not measuring misaligned performance\n");
		return -ENOMEM;
	}

	check_unaligned_access(buf);
	__free_pages(buf, MISALIGNED_BUFFER_ORDER);

exit:
	set_unaligned_access_static_branches();

	return 0;
}

static int riscv_offline_cpu(unsigned int cpu)
{
	set_unaligned_access_static_branches_except_cpu(cpu);

	return 0;
}

/* Measure unaligned access speed on all CPUs present at boot in parallel. */
static int check_unaligned_access_speed_all_cpus(void)
{
	unsigned int cpu;
	unsigned int cpu_count = num_possible_cpus();
	struct page **bufs = kcalloc(cpu_count, sizeof(*bufs), GFP_KERNEL);

	if (!bufs) {
		pr_warn("Allocation failure, not measuring misaligned performance\n");
		return 0;
	}

	/*
	 * Allocate separate buffers for each CPU so there's no fighting over
	 * cache lines.
	 */
	for_each_cpu(cpu, cpu_online_mask) {
		bufs[cpu] = alloc_pages(GFP_KERNEL, MISALIGNED_BUFFER_ORDER);
		if (!bufs[cpu]) {
			pr_warn("Allocation failure, not measuring misaligned performance\n");
			goto out;
		}
	}

	/* Check everybody except 0, who stays behind to tend jiffies. */
	on_each_cpu(check_unaligned_access_nonboot_cpu, bufs, 1);

	/* Check core 0. */
	smp_call_on_cpu(0, check_unaligned_access, bufs[0], true);

	/*
	 * Setup hotplug callbacks for any new CPUs that come online or go
	 * offline.
	 */
	cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "riscv:online",
				  riscv_online_cpu, riscv_offline_cpu);

out:
	for_each_cpu(cpu, cpu_online_mask) {
		if (bufs[cpu])
			__free_pages(bufs[cpu], MISALIGNED_BUFFER_ORDER);
	}

	kfree(bufs);
	return 0;
}
#else /* CONFIG_RISCV_PROBE_UNALIGNED_ACCESS */
static int check_unaligned_access_speed_all_cpus(void)
{
	return 0;
}
#endif

#ifdef CONFIG_RISCV_PROBE_VECTOR_UNALIGNED_ACCESS
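/*
 * Vector counterpart of check_unaligned_access(): the same best-of timing
 * loop, but using the vector copy routines inside a
 * kernel_vector_begin()/kernel_vector_end() section. Runs as a work item via
 * schedule_on_each_cpu(), hence the work_struct parameter.
 */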
static void check_vector_unaligned_access(struct work_struct *work __always_unused)
{
	int cpu = smp_processor_id();
	u64 start_cycles, end_cycles;
	u64 word_cycles;
	u64 byte_cycles;
	int ratio;
	unsigned long start_jiffies, now;
	struct page *page;
	void *dst;
	void *src;
	long speed = RISCV_HWPROBE_MISALIGNED_VECTOR_SLOW;

	if (per_cpu(vector_misaligned_access, cpu) != RISCV_HWPROBE_MISALIGNED_VECTOR_UNKNOWN)
		return;

	page = alloc_pages(GFP_KERNEL, MISALIGNED_BUFFER_ORDER);
	if (!page) {
		pr_warn("Allocation failure, not measuring vector misaligned performance\n");
		return;
	}

	/* Make an unaligned destination buffer. */
	dst = (void *)((unsigned long)page_address(page) | 0x1);
	/* Unalign src as well, but differently (off by 1 + 2 = 3). */
	src = dst + (MISALIGNED_BUFFER_SIZE / 2);
	src += 2;
	word_cycles = -1ULL;

	/* Do a warmup. */
	kernel_vector_begin();
	__riscv_copy_vec_words_unaligned(dst, src, MISALIGNED_COPY_SIZE);

	start_jiffies = jiffies;
	while ((now = jiffies) == start_jiffies)
		cpu_relax();

	/*
	 * For a fixed amount of time, repeatedly try the function, and take
	 * the best time in cycles as the measurement.
	 */
	while (time_before(jiffies, now + (1 << MISALIGNED_ACCESS_JIFFIES_LG2))) {
		start_cycles = get_cycles64();
		/* Ensure the CSR read can't reorder with respect to the copy. */
		mb();
		__riscv_copy_vec_words_unaligned(dst, src, MISALIGNED_COPY_SIZE);
		/* Ensure the copy ends before the end time is snapped. */
		mb();
		end_cycles = get_cycles64();
		if ((end_cycles - start_cycles) < word_cycles)
			word_cycles = end_cycles - start_cycles;
	}

	byte_cycles = -1ULL;
	__riscv_copy_vec_bytes_unaligned(dst, src, MISALIGNED_COPY_SIZE);
	start_jiffies = jiffies;
	while ((now = jiffies) == start_jiffies)
		cpu_relax();

	while (time_before(jiffies, now + (1 << MISALIGNED_ACCESS_JIFFIES_LG2))) {
		start_cycles = get_cycles64();
		/* Ensure the CSR read can't reorder with respect to the copy. */
		mb();
		__riscv_copy_vec_bytes_unaligned(dst, src, MISALIGNED_COPY_SIZE);
		/* Ensure the copy ends before the end time is snapped. */
		mb();
		end_cycles = get_cycles64();
		if ((end_cycles - start_cycles) < byte_cycles)
			byte_cycles = end_cycles - start_cycles;
	}

	kernel_vector_end();

	/* Don't divide by zero. */
	if (!word_cycles || !byte_cycles) {
		pr_warn("cpu%d: rdtime lacks granularity needed to measure unaligned vector access speed\n",
			cpu);

		return;
	}

	if (word_cycles < byte_cycles)
		speed = RISCV_HWPROBE_MISALIGNED_VECTOR_FAST;

	ratio = div_u64((byte_cycles * 100), word_cycles);
	pr_info("cpu%d: Ratio of vector byte access time to vector unaligned word access is %d.%02d, unaligned accesses are %s\n",
		cpu,
		ratio / 100,
		ratio % 100,
		(speed == RISCV_HWPROBE_MISALIGNED_VECTOR_FAST) ? "fast" : "slow");

	per_cpu(vector_misaligned_access, cpu) = speed;
}

static int riscv_online_cpu_vec(unsigned int cpu)
{
	if (!has_vector())
		return 0;

	if (per_cpu(vector_misaligned_access, cpu) != RISCV_HWPROBE_MISALIGNED_VECTOR_UNSUPPORTED)
		return 0;

	check_vector_unaligned_access_emulated(NULL);
	check_vector_unaligned_access(NULL);
	return 0;
}

/* Measure unaligned access speed on all CPUs present at boot in parallel. */
static int vec_check_unaligned_access_speed_all_cpus(void *unused __always_unused)
{
	schedule_on_each_cpu(check_vector_unaligned_access);

	/*
	 * Setup hotplug callbacks for any new CPUs that come online or go
	 * offline.
	 */
	cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "riscv:online",
				  riscv_online_cpu_vec, NULL);

	return 0;
}
#else /* CONFIG_RISCV_PROBE_VECTOR_UNALIGNED_ACCESS */
static int vec_check_unaligned_access_speed_all_cpus(void *unused __always_unused)
{
	return 0;
}
#endif
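
/*
 * Boot-time entry point, run as an arch_initcall. The emulation probes run
 * first; the scalar benchmark is skipped when every CPU emulates misaligned
 * accesses, and the vector benchmark (started only if the probe did not mark
 * every CPU as unsupported) is handed off to a kthread so it does not hold
 * up boot.
 */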
static int check_unaligned_access_all_cpus(void)
{
	bool all_cpus_emulated, all_cpus_vec_unsupported;

	all_cpus_emulated = check_unaligned_access_emulated_all_cpus();
	all_cpus_vec_unsupported = check_vector_unaligned_access_emulated_all_cpus();

	if (!all_cpus_vec_unsupported &&
	    IS_ENABLED(CONFIG_RISCV_PROBE_VECTOR_UNALIGNED_ACCESS)) {
		kthread_run(vec_check_unaligned_access_speed_all_cpus,
			    NULL, "vec_check_unaligned_access_speed_all_cpus");
	}

	if (!all_cpus_emulated)
		return check_unaligned_access_speed_all_cpus();

	return 0;
}

arch_initcall(check_unaligned_access_all_cpus);