// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright IBM Corp. 2024
 */

#define KMSG_COMPONENT "hd"
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt

/*
 * Hiperdispatch:
 * Dynamically calculates the optimum number of high capacity COREs
 * by considering the state the system is in. When hiperdispatch decides
 * that a capacity update is necessary, it schedules a topology update.
 * During topology updates the CPU capacities are always re-adjusted.
 *
 * There are two places where CPU capacities are accessed within
 * hiperdispatch:
 * -> hiperdispatch's recurring work function reads CPU capacities to
 *    determine the high capacity CPU count.
 * -> during a topology update hiperdispatch's adjustment function
 *    updates CPU capacities.
 * These two can run on different CPUs in parallel, which can cause
 * hiperdispatch to make wrong decisions. This can potentially cause
 * some overhead by leading to extra rebuild_sched_domains() calls
 * for correction. Access to capacities within hiperdispatch has to be
 * serialized to prevent the overhead.
 *
 * Hiperdispatch decision making revolves around steal time.
 * The HD_STEAL_THRESHOLD value is taken as reference. Whenever steal time
 * crosses the threshold value, hiperdispatch falls back to giving high
 * capacities to entitled CPUs only. When steal time drops below the
 * threshold boundary, hiperdispatch utilizes all CPUs by giving all
 * of them high capacity.
 *
 * The theory behind HD_STEAL_THRESHOLD is related to SMP thread
 * performance. Comparing the throughput of:
 * - a single CORE, with N threads, running N tasks
 * - N separate COREs running N tasks,
 * using individual COREs for individual tasks yields better
 * performance. This performance difference is roughly ~30% (it can change
 * between machine generations).
 *
 * Hiperdispatch tries to hint the scheduler to use individual COREs for
 * each task, as long as steal time on those COREs is less than 30%,
 * thereby delaying the throughput loss caused by using SMP threads.
 */

#include <linux/cpumask.h>
#include <linux/debugfs.h>
#include <linux/device.h>
#include <linux/kernel_stat.h>
#include <linux/kstrtox.h>
#include <linux/ktime.h>
#include <linux/sysctl.h>
#include <linux/types.h>
#include <linux/workqueue.h>
#include <asm/hiperdispatch.h>
#include <asm/setup.h>
#include <asm/smp.h>
#include <asm/topology.h>

#define CREATE_TRACE_POINTS
#include <asm/trace/hiperdispatch.h>

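/*
 * Tuning defaults: the work function re-evaluates steal time every
 * HD_DELAY_INTERVAL = HZ / 4 jiffies, i.e. roughly every 250 ms, and the
 * first evaluation after a topology change is delayed by
 * HD_DELAY_FACTOR * HD_DELAY_INTERVAL, i.e. roughly one second, so the
 * workload can settle. HD_STEAL_AVG_WEIGHT is the smoothing weight of the
 * steal time moving average (see hd_steal_avg()).
 */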
#define HD_DELAY_FACTOR			(4)
#define HD_DELAY_INTERVAL		(HZ / 4)
#define HD_STEAL_THRESHOLD		30
#define HD_STEAL_AVG_WEIGHT		16

static cpumask_t hd_vl_coremask;	/* Mask containing all vertical low COREs */
static cpumask_t hd_vmvl_cpumask;	/* Mask containing vertical medium and low CPUs */
static int hd_high_capacity_cores;	/* Current CORE count with high capacity */
static int hd_entitled_cores;		/* Total vertical high and medium CORE count */
static int hd_online_cores;		/* Current online CORE count */

static unsigned long hd_previous_steal;	/* Previous iteration's CPU steal timer total */
static unsigned long hd_high_time;	/* Total time spent while all cpus have high capacity */
static unsigned long hd_low_time;	/* Total time spent while vl cpus have low capacity */
static atomic64_t hd_adjustments;	/* Total occurrence count of hiperdispatch adjustments */

static unsigned int hd_steal_threshold = HD_STEAL_THRESHOLD;
static unsigned int hd_delay_factor = HD_DELAY_FACTOR;
static int hd_enabled;

static void hd_capacity_work_fn(struct work_struct *work);
static DECLARE_DELAYED_WORK(hd_capacity_work, hd_capacity_work_fn);

static int hd_set_hiperdispatch_mode(int enable)
{
	if (!MACHINE_HAS_TOPOLOGY)
		enable = 0;
	if (hd_enabled == enable)
		return 0;
	hd_enabled = enable;
	return 1;
}

void hd_reset_state(void)
{
	cpumask_clear(&hd_vl_coremask);
	cpumask_clear(&hd_vmvl_cpumask);
	hd_entitled_cores = 0;
	hd_online_cores = 0;
}

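/*
 * Account one core during a topology scan. Illustrative example, assuming
 * this is invoked once per online core: a configuration with two vertical
 * high cores, one vertical medium core and one vertical low core results in
 * hd_entitled_cores = 3 and hd_online_cores = 4, with the vertical low core
 * set in hd_vl_coremask and the SMT siblings of the medium and low cores
 * set in hd_vmvl_cpumask.
 */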
void hd_add_core(int cpu)
{
	const struct cpumask *siblings;
	int polarization;

	hd_online_cores++;
	polarization = smp_cpu_get_polarization(cpu);
	siblings = topology_sibling_cpumask(cpu);
	switch (polarization) {
	case POLARIZATION_VH:
		hd_entitled_cores++;
		break;
	case POLARIZATION_VM:
		hd_entitled_cores++;
		cpumask_or(&hd_vmvl_cpumask, &hd_vmvl_cpumask, siblings);
		break;
	case POLARIZATION_VL:
		cpumask_set_cpu(cpu, &hd_vl_coremask);
		cpumask_or(&hd_vmvl_cpumask, &hd_vmvl_cpumask, siblings);
		break;
	}
}

/* Serialize update and read operations of debug counters. */
static DEFINE_MUTEX(hd_counter_mutex);

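/*
 * Accumulate the time spent in each mode: hd_high_time ("greedy") while all
 * online cores have high capacity, hd_low_time ("conservative") while the
 * vertical low cores are capped. Both are accounted in milliseconds and
 * exported via debugfs (see hd_create_debugfs_counters()).
 */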
static void hd_update_times(void)
{
	static ktime_t prev;
	ktime_t now;

	/*
	 * Check if hiperdispatch is active; if not, set prev to 0.
	 * This makes it possible to identify the first update iteration
	 * after hiperdispatch has been enabled.
	 */
	if (hd_entitled_cores == 0 || hd_enabled == 0) {
		prev = ktime_set(0, 0);
		return;
	}
	now = ktime_get();
	if (ktime_after(prev, 0)) {
		if (hd_high_capacity_cores == hd_online_cores)
			hd_high_time += ktime_ms_delta(now, prev);
		else
			hd_low_time += ktime_ms_delta(now, prev);
	}
	prev = now;
}

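/*
 * Re-apply the current capacity decision to the vertical low cores during a
 * topology update. Illustrative example: with hd_entitled_cores = 4 and a
 * requested hd_high_capacity_cores of 6, upscaling_cores is 2, so the first
 * two cores in hd_vl_coremask get CPU_CAPACITY_HIGH, the remaining ones get
 * CPU_CAPACITY_LOW, and hd_high_capacity_cores ends up at 6 again.
 */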
static void hd_update_capacities(void)
{
	int cpu, upscaling_cores;
	unsigned long capacity;

	upscaling_cores = hd_high_capacity_cores - hd_entitled_cores;
	capacity = upscaling_cores > 0 ? CPU_CAPACITY_HIGH : CPU_CAPACITY_LOW;
	hd_high_capacity_cores = hd_entitled_cores;
	for_each_cpu(cpu, &hd_vl_coremask) {
		smp_set_core_capacity(cpu, capacity);
		if (capacity != CPU_CAPACITY_HIGH)
			continue;
		hd_high_capacity_cores++;
		upscaling_cores--;
		if (upscaling_cores == 0)
			capacity = CPU_CAPACITY_LOW;
	}
}

void hd_disable_hiperdispatch(void)
{
	cancel_delayed_work_sync(&hd_capacity_work);
	hd_high_capacity_cores = hd_online_cores;
	hd_previous_steal = 0;
}

int hd_enable_hiperdispatch(void)
{
	mutex_lock(&hd_counter_mutex);
	hd_update_times();
	mutex_unlock(&hd_counter_mutex);
	if (hd_enabled == 0)
		return 0;
	if (hd_entitled_cores == 0)
		return 0;
	if (hd_online_cores <= hd_entitled_cores)
		return 0;
	mod_delayed_work(system_wq, &hd_capacity_work, HD_DELAY_INTERVAL * hd_delay_factor);
	hd_update_capacities();
	return 1;
}

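/*
 * Exponentially weighted moving average of the measured steal percentage:
 * avg = (avg * (HD_STEAL_AVG_WEIGHT - 1) + new) / HD_STEAL_AVG_WEIGHT, so a
 * new sample contributes 1/16 by default. The slow decay keeps
 * hiperdispatch from toggling capacities on short steal time spikes.
 */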
static unsigned long hd_steal_avg(unsigned long new)
{
	static unsigned long steal;

	steal = (steal * (HD_STEAL_AVG_WEIGHT - 1) + new) / HD_STEAL_AVG_WEIGHT;
	return steal;
}

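/*
 * Average steal percentage of the vertical medium and low CPUs since the
 * previous invocation, roughly:
 *
 *   percentage = (steal - hd_previous_steal) * 100 / (time_delta * cpus)
 *
 * Both the kcpustat steal counters and time_delta are in nanoseconds, so
 * the units cancel out.
 */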
static unsigned long hd_calculate_steal_percentage(void)
{
	unsigned long time_delta, steal_delta, steal, percentage;
	static ktime_t prev;
	int cpus, cpu;
	ktime_t now;

	cpus = 0;
	steal = 0;
	percentage = 0;
	for_each_cpu(cpu, &hd_vmvl_cpumask) {
		steal += kcpustat_cpu(cpu).cpustat[CPUTIME_STEAL];
		cpus++;
	}
	/*
	 * If there are no vertical medium and low CPUs, steal time
	 * is 0, as vertical high CPUs shouldn't experience steal time.
	 */
	if (cpus == 0)
		return percentage;
	now = ktime_get();
	time_delta = ktime_to_ns(ktime_sub(now, prev));
	if (steal > hd_previous_steal && hd_previous_steal != 0) {
		steal_delta = (steal - hd_previous_steal) * 100 / time_delta;
		percentage = steal_delta / cpus;
	}
	hd_previous_steal = steal;
	prev = now;
	return percentage;
}

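/*
 * Periodic decision function: compare the smoothed steal percentage against
 * hd_steal_threshold and schedule a topology update whenever the targeted
 * number of high capacity cores changes.
 */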
static void hd_capacity_work_fn(struct work_struct *work)
{
	unsigned long steal_percentage, new_cores;

	mutex_lock(&smp_cpu_state_mutex);
	/*
	 * If the online core count is less than or equal to the entitled
	 * core count, hiperdispatch does not need to make any adjustments;
	 * call a topology update to disable hiperdispatch.
	 * Normally this check is handled on topology update, but during cpu
	 * unplug, topology and cpu mask updates are done in reverse
	 * order, causing hd_enable_hiperdispatch() to get stale data.
	 */
	if (hd_online_cores <= hd_entitled_cores) {
		topology_schedule_update();
		mutex_unlock(&smp_cpu_state_mutex);
		return;
	}
	steal_percentage = hd_steal_avg(hd_calculate_steal_percentage());
	if (steal_percentage < hd_steal_threshold)
		new_cores = hd_online_cores;
	else
		new_cores = hd_entitled_cores;
	if (hd_high_capacity_cores != new_cores) {
		trace_s390_hd_rebuild_domains(hd_high_capacity_cores, new_cores);
		hd_high_capacity_cores = new_cores;
		atomic64_inc(&hd_adjustments);
		topology_schedule_update();
	}
	trace_s390_hd_work_fn(steal_percentage, hd_entitled_cores, hd_high_capacity_cores);
	mutex_unlock(&smp_cpu_state_mutex);
	schedule_delayed_work(&hd_capacity_work, HD_DELAY_INTERVAL);
}

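/*
 * Sysctl handler for the hiperdispatch on/off switch. With the usual
 * procfs mount the control is reachable as /proc/sys/s390/hiperdispatch,
 * e.g. "echo 1 > /proc/sys/s390/hiperdispatch" enables hiperdispatch.
 */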
static int hiperdispatch_ctl_handler(const struct ctl_table *ctl, int write,
				     void *buffer, size_t *lenp, loff_t *ppos)
{
	int hiperdispatch;
	int rc;
	struct ctl_table ctl_entry = {
		.procname	= ctl->procname,
		.data		= &hiperdispatch,
		.maxlen		= sizeof(int),
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_ONE,
	};

	hiperdispatch = hd_enabled;
	rc = proc_douintvec_minmax(&ctl_entry, write, buffer, lenp, ppos);
	if (rc < 0 || !write)
		return rc;
	mutex_lock(&smp_cpu_state_mutex);
	if (hd_set_hiperdispatch_mode(hiperdispatch))
		topology_schedule_update();
	mutex_unlock(&smp_cpu_state_mutex);
	return 0;
}

static struct ctl_table hiperdispatch_ctl_table[] = {
	{
		.procname	= "hiperdispatch",
		.mode		= 0644,
		.proc_handler	= hiperdispatch_ctl_handler,
	},
};

static ssize_t hd_steal_threshold_show(struct device *dev,
				       struct device_attribute *attr,
				       char *buf)
{
	return sysfs_emit(buf, "%u\n", hd_steal_threshold);
}

static ssize_t hd_steal_threshold_store(struct device *dev,
					struct device_attribute *attr,
					const char *buf,
					size_t count)
{
	unsigned int val;
	int rc;

	rc = kstrtouint(buf, 0, &val);
	if (rc)
		return rc;
	if (val > 100)
		return -ERANGE;
	hd_steal_threshold = val;
	return count;
}

static DEVICE_ATTR_RW(hd_steal_threshold);

static ssize_t hd_delay_factor_show(struct device *dev,
				    struct device_attribute *attr,
				    char *buf)
{
	return sysfs_emit(buf, "%u\n", hd_delay_factor);
}

static ssize_t hd_delay_factor_store(struct device *dev,
				     struct device_attribute *attr,
				     const char *buf,
				     size_t count)
{
	unsigned int val;
	int rc;

	rc = kstrtouint(buf, 0, &val);
	if (rc)
		return rc;
	if (!val)
		return -ERANGE;
	hd_delay_factor = val;
	return count;
}

static DEVICE_ATTR_RW(hd_delay_factor);

static struct attribute *hd_attrs[] = {
	&dev_attr_hd_steal_threshold.attr,
	&dev_attr_hd_delay_factor.attr,
	NULL,
};

static const struct attribute_group hd_attr_group = {
	.name  = "hiperdispatch",
	.attrs = hd_attrs,
};

static int hd_greedy_time_get(void *unused, u64 *val)
{
	mutex_lock(&hd_counter_mutex);
	hd_update_times();
	*val = hd_high_time;
	mutex_unlock(&hd_counter_mutex);
	return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(hd_greedy_time_fops, hd_greedy_time_get, NULL, "%llu\n");

static int hd_conservative_time_get(void *unused, u64 *val)
{
	mutex_lock(&hd_counter_mutex);
	hd_update_times();
	*val = hd_low_time;
	mutex_unlock(&hd_counter_mutex);
	return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(hd_conservative_time_fops, hd_conservative_time_get, NULL, "%llu\n");

static int hd_adjustment_count_get(void *unused, u64 *val)
{
	*val = atomic64_read(&hd_adjustments);
	return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(hd_adjustments_fops, hd_adjustment_count_get, NULL, "%llu\n");

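/*
 * Debug counters; assuming debugfs is mounted at /sys/kernel/debug and
 * arch_debugfs_dir is the s390 subdirectory, they appear as
 * /sys/kernel/debug/s390/hiperdispatch/{greedy_time_ms,conservative_time_ms,
 * adjustment_count}.
 */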
static void __init hd_create_debugfs_counters(void)
{
	struct dentry *dir;

	dir = debugfs_create_dir("hiperdispatch", arch_debugfs_dir);
	debugfs_create_file("conservative_time_ms", 0400, dir, NULL, &hd_conservative_time_fops);
	debugfs_create_file("greedy_time_ms", 0400, dir, NULL, &hd_greedy_time_fops);
	debugfs_create_file("adjustment_count", 0400, dir, NULL, &hd_adjustments_fops);
}

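/*
 * Register the tunables under the cpu subsystem root device; with the
 * default sysfs mount they show up as
 * /sys/devices/system/cpu/hiperdispatch/hd_steal_threshold and
 * /sys/devices/system/cpu/hiperdispatch/hd_delay_factor.
 */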
static void __init hd_create_attributes(void)
{
	struct device *dev;

	dev = bus_get_dev_root(&cpu_subsys);
	if (!dev)
		return;
	if (sysfs_create_group(&dev->kobj, &hd_attr_group))
		pr_warn("Unable to create hiperdispatch attribute group\n");
	put_device(dev);
}

static int __init hd_init(void)
{
	if (IS_ENABLED(CONFIG_HIPERDISPATCH_ON)) {
		hd_set_hiperdispatch_mode(1);
		topology_schedule_update();
	}
	if (!register_sysctl("s390", hiperdispatch_ctl_table))
		pr_warn("Failed to register s390.hiperdispatch sysctl attribute\n");
	hd_create_debugfs_counters();
	hd_create_attributes();
	return 0;
}
late_initcall(hd_init);