v6.2
// SPDX-License-Identifier: GPL-2.0-only
/*
 * intel_powerclamp.c - package c-state idle injection
 *
 * Copyright (c) 2012, Intel Corporation.
 *
 * Authors:
 *     Arjan van de Ven <arjan@linux.intel.com>
 *     Jacob Pan <jacob.jun.pan@linux.intel.com>
 *
 *	TODO:
 *           1. better handle wakeup from external interrupts, currently a fixed
 *              compensation is added to clamping duration when excessive amount
 *              of wakeups are observed during idle time. the reason is that in
 *              case of external interrupts without need for ack, clamping down
 *              cpu in non-irq context does not reduce irq. for majority of the
 *              cases, clamping down cpu does help reduce irq as well, we should
 *              be able to differentiate the two cases and give a quantitative
 *              solution for the irqs that we can control. perhaps based on
 *              get_cpu_iowait_time_us()
 *
 *	     2. synchronization with other hw blocks
 */

#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/cpu.h>
#include <linux/thermal.h>
#include <linux/slab.h>
#include <linux/tick.h>
#include <linux/debugfs.h>
#include <linux/seq_file.h>
#include <linux/sched/rt.h>
#include <uapi/linux/sched/types.h>

#include <asm/nmi.h>
#include <asm/msr.h>
#include <asm/mwait.h>
#include <asm/cpu_device_id.h>
#include <asm/hardirq.h>

#define MAX_TARGET_RATIO (50U)
/* For each undisturbed clamping period (no extra wake ups during idle time),
 * we increment the confidence counter for the given target ratio.
 * CONFIDENCE_OK defines the level where runtime calibration results are
 * valid.
 */
#define CONFIDENCE_OK (3)
/* Default idle injection duration; the driver adjusts sleep time to meet the
 * target idle ratio. Similar to frequency modulation.
 */
#define DEFAULT_DURATION_JIFFIES (6)

static unsigned int target_mwait;
static struct dentry *debug_dir;

/* user selected target */
static unsigned int set_target_ratio;
static unsigned int current_ratio;
static bool should_skip;

static unsigned int control_cpu; /* The cpu assigned to collect stat and update
				  * control parameters. default to BSP but BSP
				  * can be offlined.
				  */
static bool clamping;

struct powerclamp_worker_data {
	struct kthread_worker *worker;
	struct kthread_work balancing_work;
	struct kthread_delayed_work idle_injection_work;
	unsigned int cpu;
	unsigned int count;
	unsigned int guard;
	unsigned int window_size_now;
	unsigned int target_ratio;
	unsigned int duration_jiffies;
	bool clamping;
};

static struct powerclamp_worker_data __percpu *worker_data;
static struct thermal_cooling_device *cooling_dev;
static unsigned long *cpu_clamping_mask;  /* bit map for tracking per cpu
					   * clamping kthread worker
					   */

static unsigned int duration;
static unsigned int pkg_cstate_ratio_cur;
static unsigned int window_size;

static int duration_set(const char *arg, const struct kernel_param *kp)
{
	int ret = 0;
	unsigned long new_duration;

	ret = kstrtoul(arg, 10, &new_duration);
	if (ret)
		goto exit;
	if (new_duration > 25 || new_duration < 6) {
		pr_err("Out of recommended range %lu, between 6-25ms\n",
			new_duration);
		ret = -EINVAL;
	}

	duration = clamp(new_duration, 6ul, 25ul);
	smp_mb();

exit:

	return ret;
}

static const struct kernel_param_ops duration_ops = {
	.set = duration_set,
	.get = param_get_int,
};


module_param_cb(duration, &duration_ops, &duration, 0644);
MODULE_PARM_DESC(duration, "forced idle time for each attempt in msec.");

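/*
 * Example usage of the "duration" parameter (hypothetical values):
 *
 *   modprobe intel_powerclamp duration=10
 *   echo 10 > /sys/module/intel_powerclamp/parameters/duration
 *
 * Note that in this version an out-of-range write returns -EINVAL, yet the
 * stored value is still clamped into the 6..25 ms range before the error is
 * returned (there is no early goto on the error path).
 */
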
struct powerclamp_calibration_data {
	unsigned long confidence;  /* used for calibration, basically a counter
				    * that gets incremented each time a clamping
				    * period is completed without extra wakeups;
				    * once that counter reaches the given level,
				    * compensation is deemed usable.
				    */
	unsigned long steady_comp; /* steady state compensation used when
				    * no extra wakeups occurred.
				    */
	unsigned long dynamic_comp; /* compensate excessive wakeup from idle
				     * mostly from external interrupts.
				     */
};

static struct powerclamp_calibration_data cal_data[MAX_TARGET_RATIO];

static int window_size_set(const char *arg, const struct kernel_param *kp)
{
	int ret = 0;
	unsigned long new_window_size;

	ret = kstrtoul(arg, 10, &new_window_size);
	if (ret)
		goto exit_win;
	if (new_window_size > 10 || new_window_size < 2) {
		pr_err("Out of recommended window size %lu, between 2-10\n",
			new_window_size);
		ret = -EINVAL;
	}

	window_size = clamp(new_window_size, 2ul, 10ul);
	smp_mb();

exit_win:

	return ret;
}

static const struct kernel_param_ops window_size_ops = {
	.set = window_size_set,
	.get = param_get_int,
};

module_param_cb(window_size, &window_size_ops, &window_size, 0644);
MODULE_PARM_DESC(window_size, "sliding window in number of clamping cycles\n"
	"\tpowerclamp controls idle ratio within this window. larger\n"
	"\twindow size results in slower response time but more smooth\n"
	"\tclamping results. default to 2.");

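/*
 * Example (hypothetical value): with window_size=4 the controlling CPU
 * re-measures the package C-state ratio and recomputes the compensation
 * once every 4 injection cycles instead of every cycle, trading response
 * time for smoother clamping.
 */
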
static void find_target_mwait(void)
{
	unsigned int eax, ebx, ecx, edx;
	unsigned int highest_cstate = 0;
	unsigned int highest_subcstate = 0;
	int i;

	if (boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF)
		return;

	cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx);

	if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) ||
	    !(ecx & CPUID5_ECX_INTERRUPT_BREAK))
		return;

	edx >>= MWAIT_SUBSTATE_SIZE;
	for (i = 0; i < 7 && edx; i++, edx >>= MWAIT_SUBSTATE_SIZE) {
		if (edx & MWAIT_SUBSTATE_MASK) {
			highest_cstate = i;
			highest_subcstate = edx & MWAIT_SUBSTATE_MASK;
		}
	}
	target_mwait = (highest_cstate << MWAIT_SUBSTATE_SIZE) |
		(highest_subcstate - 1);

}

struct pkg_cstate_info {
	bool skip;
	int msr_index;
	int cstate_id;
};

#define PKG_CSTATE_INIT(id) {				\
		.msr_index = MSR_PKG_C##id##_RESIDENCY, \
		.cstate_id = id				\
			}

static struct pkg_cstate_info pkg_cstates[] = {
	PKG_CSTATE_INIT(2),
	PKG_CSTATE_INIT(3),
	PKG_CSTATE_INIT(6),
	PKG_CSTATE_INIT(7),
	PKG_CSTATE_INIT(8),
	PKG_CSTATE_INIT(9),
	PKG_CSTATE_INIT(10),
	{NULL},
};

static bool has_pkg_state_counter(void)
{
	u64 val;
	struct pkg_cstate_info *info = pkg_cstates;

	/* check if any one of the counter msrs exists */
	while (info->msr_index) {
		if (!rdmsrl_safe(info->msr_index, &val))
			return true;
		info++;
	}

	return false;
}

static u64 pkg_state_counter(void)
{
	u64 val;
	u64 count = 0;
	struct pkg_cstate_info *info = pkg_cstates;

	while (info->msr_index) {
		if (!info->skip) {
			if (!rdmsrl_safe(info->msr_index, &val))
				count += val;
			else
				info->skip = true;
		}
		info++;
	}

	return count;
}

static unsigned int get_compensation(int ratio)
{
	unsigned int comp = 0;

	/* we only use compensation if all adjacent ones are good */
	if (ratio == 1 &&
		cal_data[ratio].confidence >= CONFIDENCE_OK &&
		cal_data[ratio + 1].confidence >= CONFIDENCE_OK &&
		cal_data[ratio + 2].confidence >= CONFIDENCE_OK) {
		comp = (cal_data[ratio].steady_comp +
			cal_data[ratio + 1].steady_comp +
			cal_data[ratio + 2].steady_comp) / 3;
	} else if (ratio == MAX_TARGET_RATIO - 1 &&
		cal_data[ratio].confidence >= CONFIDENCE_OK &&
		cal_data[ratio - 1].confidence >= CONFIDENCE_OK &&
		cal_data[ratio - 2].confidence >= CONFIDENCE_OK) {
		comp = (cal_data[ratio].steady_comp +
			cal_data[ratio - 1].steady_comp +
			cal_data[ratio - 2].steady_comp) / 3;
	} else if (cal_data[ratio].confidence >= CONFIDENCE_OK &&
		cal_data[ratio - 1].confidence >= CONFIDENCE_OK &&
		cal_data[ratio + 1].confidence >= CONFIDENCE_OK) {
		comp = (cal_data[ratio].steady_comp +
			cal_data[ratio - 1].steady_comp +
			cal_data[ratio + 1].steady_comp) / 3;
	}

	/* do not exceed limit */
	if (comp + ratio >= MAX_TARGET_RATIO)
		comp = MAX_TARGET_RATIO - ratio - 1;

	return comp;
}

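/*
 * Worked example (hypothetical numbers): for ratio=20, if cal_data[19],
 * cal_data[20] and cal_data[21] all have confidence >= CONFIDENCE_OK with
 * steady_comp values 2, 3 and 4, then comp = (2 + 3 + 4) / 3 = 3, so the
 * driver injects a 23% idle ratio in order to observe 20% at the package
 * level.
 */
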
static void adjust_compensation(int target_ratio, unsigned int win)
{
	int delta;
	struct powerclamp_calibration_data *d = &cal_data[target_ratio];

	/*
	 * adjust compensations if confidence level has not been reached.
	 */
	if (d->confidence >= CONFIDENCE_OK)
		return;

	delta = set_target_ratio - current_ratio;
	/* filter out bad data */
	if (delta >= 0 && delta <= (1+target_ratio/10)) {
		if (d->steady_comp)
			d->steady_comp =
				roundup(delta+d->steady_comp, 2)/2;
		else
			d->steady_comp = delta;
		d->confidence++;
	}
}

static bool powerclamp_adjust_controls(unsigned int target_ratio,
				unsigned int guard, unsigned int win)
{
	static u64 msr_last, tsc_last;
	u64 msr_now, tsc_now;
	u64 val64;

	/* check result for the last window */
	msr_now = pkg_state_counter();
	tsc_now = rdtsc();

	/* calculate pkg cstate vs tsc ratio */
	if (!msr_last || !tsc_last)
		current_ratio = 1;
	else if (tsc_now-tsc_last) {
		val64 = 100*(msr_now-msr_last);
		do_div(val64, (tsc_now-tsc_last));
		current_ratio = val64;
	}

	/* update record */
	msr_last = msr_now;
	tsc_last = tsc_now;

	adjust_compensation(target_ratio, win);

	/* if we are above target+guard, skip */
	return set_target_ratio + guard <= current_ratio;
}

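/*
 * Example (hypothetical counter deltas): if the summed package C-state
 * residency MSRs advanced by 30,000,000 while the TSC advanced by
 * 100,000,000 over the last window, current_ratio = 100 * 30e6 / 100e6
 * = 30, i.e. roughly 30% of wall time was spent in package C-states.
 * This assumes the residency counters tick at TSC frequency, which is
 * the case on the processors this driver targets.
 */
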
static void clamp_balancing_func(struct kthread_work *work)
{
	struct powerclamp_worker_data *w_data;
	int sleeptime;
	unsigned long target_jiffies;
	unsigned int compensated_ratio;
	int interval; /* jiffies to sleep for each attempt */

	w_data = container_of(work, struct powerclamp_worker_data,
			      balancing_work);

	/*
	 * make sure user selected ratio does not take effect until
	 * the next round. adjust target_ratio if user has changed
	 * target such that we can converge quickly.
	 */
	w_data->target_ratio = READ_ONCE(set_target_ratio);
	w_data->guard = 1 + w_data->target_ratio / 20;
	w_data->window_size_now = window_size;
	w_data->duration_jiffies = msecs_to_jiffies(duration);
	w_data->count++;

	/*
	 * systems may have different ability to enter package level
	 * c-states, thus we need to compensate the injected idle ratio
	 * to achieve the actual target reported by the HW.
	 */
	compensated_ratio = w_data->target_ratio +
		get_compensation(w_data->target_ratio);
	if (compensated_ratio <= 0)
		compensated_ratio = 1;
	interval = w_data->duration_jiffies * 100 / compensated_ratio;

	/* align idle time */
	target_jiffies = roundup(jiffies, interval);
	sleeptime = target_jiffies - jiffies;
	if (sleeptime <= 0)
		sleeptime = 1;

	if (clamping && w_data->clamping && cpu_online(w_data->cpu))
		kthread_queue_delayed_work(w_data->worker,
					   &w_data->idle_injection_work,
					   sleeptime);
}

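/*
 * Worked example (hypothetical values): with duration_jiffies = 6 and a
 * compensated ratio of 30, interval = 6 * 100 / 30 = 20 jiffies, so each
 * cycle injects 6 jiffies of idle out of every 20, i.e. ~30% idle time.
 */
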
static void clamp_idle_injection_func(struct kthread_work *work)
{
	struct powerclamp_worker_data *w_data;

	w_data = container_of(work, struct powerclamp_worker_data,
			      idle_injection_work.work);

	/*
	 * only elected controlling cpu can collect stats and update
	 * control parameters.
	 */
	if (w_data->cpu == control_cpu &&
	    !(w_data->count % w_data->window_size_now)) {
		should_skip =
			powerclamp_adjust_controls(w_data->target_ratio,
						   w_data->guard,
						   w_data->window_size_now);
		smp_mb();
	}

	if (should_skip)
		goto balance;

	play_idle(jiffies_to_usecs(w_data->duration_jiffies));

balance:
	if (clamping && w_data->clamping && cpu_online(w_data->cpu))
		kthread_queue_work(w_data->worker, &w_data->balancing_work);
}

/*
 * 1 HZ polling while clamping is active, useful for userspace
 * to monitor actual idle ratio.
 */
static void poll_pkg_cstate(struct work_struct *dummy);
static DECLARE_DELAYED_WORK(poll_pkg_cstate_work, poll_pkg_cstate);
static void poll_pkg_cstate(struct work_struct *dummy)
{
	static u64 msr_last;
	static u64 tsc_last;

	u64 msr_now;
	u64 tsc_now;
	u64 val64;

	msr_now = pkg_state_counter();
	tsc_now = rdtsc();

	/* calculate pkg cstate vs tsc ratio */
	if (!msr_last || !tsc_last)
		pkg_cstate_ratio_cur = 1;
	else {
		if (tsc_now - tsc_last) {
			val64 = 100 * (msr_now - msr_last);
			do_div(val64, (tsc_now - tsc_last));
			pkg_cstate_ratio_cur = val64;
		}
	}

	/* update record */
	msr_last = msr_now;
	tsc_last = tsc_now;

	if (true == clamping)
		schedule_delayed_work(&poll_pkg_cstate_work, HZ);
}

static void start_power_clamp_worker(unsigned long cpu)
{
	struct powerclamp_worker_data *w_data = per_cpu_ptr(worker_data, cpu);
	struct kthread_worker *worker;

	worker = kthread_create_worker_on_cpu(cpu, 0, "kidle_inj/%ld", cpu);
	if (IS_ERR(worker))
		return;

	w_data->worker = worker;
	w_data->count = 0;
	w_data->cpu = cpu;
	w_data->clamping = true;
	set_bit(cpu, cpu_clamping_mask);
	sched_set_fifo(worker->task);
	kthread_init_work(&w_data->balancing_work, clamp_balancing_func);
	kthread_init_delayed_work(&w_data->idle_injection_work,
				  clamp_idle_injection_func);
	kthread_queue_work(w_data->worker, &w_data->balancing_work);
}

static void stop_power_clamp_worker(unsigned long cpu)
{
	struct powerclamp_worker_data *w_data = per_cpu_ptr(worker_data, cpu);

	if (!w_data->worker)
		return;

	w_data->clamping = false;
	/*
	 * Make sure that all works that get queued after this point see
	 * the clamping disabled. The counterpart is not needed because
	 * there is an implicit memory barrier when the queued work
	 * is processed.
	 */
	smp_wmb();
	kthread_cancel_work_sync(&w_data->balancing_work);
	kthread_cancel_delayed_work_sync(&w_data->idle_injection_work);
	/*
	 * The balancing work still might be queued here because
	 * the handling of the "clamping" variable, cancel, and queue
	 * operations are not synchronized via a lock. But it is not
	 * a big deal. The balancing work is fast and destroying the
	 * kthread will wait for it.
	 */
	clear_bit(w_data->cpu, cpu_clamping_mask);
	kthread_destroy_worker(w_data->worker);

	w_data->worker = NULL;
}

static int start_power_clamp(void)
{
	unsigned long cpu;

	set_target_ratio = clamp(set_target_ratio, 0U, MAX_TARGET_RATIO - 1);
	/* prevent cpu hotplug */
	cpus_read_lock();

	/* prefer BSP */
	control_cpu = cpumask_first(cpu_online_mask);

	clamping = true;
	schedule_delayed_work(&poll_pkg_cstate_work, 0);

	/* start one kthread worker per online cpu */
	for_each_online_cpu(cpu) {
		start_power_clamp_worker(cpu);
	}
	cpus_read_unlock();

	return 0;
}

static void end_power_clamp(void)
{
	int i;

	/*
	 * Block requeuing in all the kthread workers. They will flush and
	 * stop faster.
	 */
	clamping = false;
	for_each_set_bit(i, cpu_clamping_mask, num_possible_cpus()) {
		pr_debug("clamping worker for cpu %d alive, destroy\n", i);
		stop_power_clamp_worker(i);
	}
}

static int powerclamp_cpu_online(unsigned int cpu)
{
	if (clamping == false)
		return 0;
	start_power_clamp_worker(cpu);
	/* prefer BSP as controlling CPU */
	if (cpu == 0) {
		control_cpu = 0;
		smp_mb();
	}
	return 0;
}

static int powerclamp_cpu_predown(unsigned int cpu)
{
	if (clamping == false)
		return 0;

	stop_power_clamp_worker(cpu);
	if (cpu != control_cpu)
		return 0;

	control_cpu = cpumask_first(cpu_online_mask);
	if (control_cpu == cpu)
		control_cpu = cpumask_next(cpu, cpu_online_mask);
	smp_mb();
	return 0;
}

static int powerclamp_get_max_state(struct thermal_cooling_device *cdev,
				 unsigned long *state)
{
	*state = MAX_TARGET_RATIO;

	return 0;
}

static int powerclamp_get_cur_state(struct thermal_cooling_device *cdev,
				 unsigned long *state)
{
	if (true == clamping)
		*state = pkg_cstate_ratio_cur;
	else
		/* to save power, do not poll idle ratio while not clamping */
		*state = -1; /* indicates invalid state */

	return 0;
}

static int powerclamp_set_cur_state(struct thermal_cooling_device *cdev,
				 unsigned long new_target_ratio)
{
	int ret = 0;

	new_target_ratio = clamp(new_target_ratio, 0UL,
				(unsigned long) (MAX_TARGET_RATIO-1));
	if (set_target_ratio == 0 && new_target_ratio > 0) {
		pr_info("Start idle injection to reduce power\n");
		set_target_ratio = new_target_ratio;
		ret = start_power_clamp();
		goto exit_set;
	} else	if (set_target_ratio > 0 && new_target_ratio == 0) {
		pr_info("Stop forced idle injection\n");
		end_power_clamp();
		set_target_ratio = 0;
	} else	/* adjust currently running */ {
		set_target_ratio = new_target_ratio;
		/* make new set_target_ratio visible to other cpus */
		smp_mb();
	}

exit_set:
	return ret;
}

/* bind to generic thermal layer as cooling device*/
static const struct thermal_cooling_device_ops powerclamp_cooling_ops = {
	.get_max_state = powerclamp_get_max_state,
	.get_cur_state = powerclamp_get_cur_state,
	.set_cur_state = powerclamp_set_cur_state,
};

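/*
 * Example of driving this cooling device from userspace (X is the index
 * assigned by the thermal core at registration and varies per system):
 *
 *   cat /sys/class/thermal/cooling_deviceX/type        # intel_powerclamp
 *   echo 25 > /sys/class/thermal/cooling_deviceX/cur_state
 *
 * requests a 25% target idle ratio; writing 0 stops idle injection.
 */
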
static const struct x86_cpu_id __initconst intel_powerclamp_ids[] = {
	X86_MATCH_VENDOR_FEATURE(INTEL, X86_FEATURE_MWAIT, NULL),
	{}
};
MODULE_DEVICE_TABLE(x86cpu, intel_powerclamp_ids);

static int __init powerclamp_probe(void)
{

	if (!x86_match_cpu(intel_powerclamp_ids)) {
		pr_err("CPU does not support MWAIT\n");
		return -ENODEV;
	}

	/* The goal for idle time alignment is to achieve package cstate. */
	if (!has_pkg_state_counter()) {
		pr_info("No package C-state available\n");
		return -ENODEV;
	}

	/* find the deepest mwait value */
	find_target_mwait();

	return 0;
}

static int powerclamp_debug_show(struct seq_file *m, void *unused)
{
	int i = 0;

	seq_printf(m, "controlling cpu: %d\n", control_cpu);
	seq_printf(m, "pct confidence steady dynamic (compensation)\n");
	for (i = 0; i < MAX_TARGET_RATIO; i++) {
		seq_printf(m, "%d\t%lu\t%lu\t%lu\n",
			i,
			cal_data[i].confidence,
			cal_data[i].steady_comp,
			cal_data[i].dynamic_comp);
	}

	return 0;
}

DEFINE_SHOW_ATTRIBUTE(powerclamp_debug);

static inline void powerclamp_create_debug_files(void)
{
	debug_dir = debugfs_create_dir("intel_powerclamp", NULL);

	debugfs_create_file("powerclamp_calib", S_IRUGO, debug_dir, cal_data,
			    &powerclamp_debug_fops);
}

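/*
 * The calibration table can be inspected at runtime, assuming debugfs is
 * mounted at the usual location:
 *
 *   cat /sys/kernel/debug/intel_powerclamp/powerclamp_calib
 */
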
static enum cpuhp_state hp_state;

static int __init powerclamp_init(void)
{
	int retval;

	cpu_clamping_mask = bitmap_zalloc(num_possible_cpus(), GFP_KERNEL);
	if (!cpu_clamping_mask)
		return -ENOMEM;

	/* probe cpu features and ids here */
	retval = powerclamp_probe();
	if (retval)
		goto exit_free;

	/* set default limit, maybe adjusted during runtime based on feedback */
	window_size = 2;
	retval = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
					   "thermal/intel_powerclamp:online",
					   powerclamp_cpu_online,
					   powerclamp_cpu_predown);
	if (retval < 0)
		goto exit_free;

	hp_state = retval;

	worker_data = alloc_percpu(struct powerclamp_worker_data);
	if (!worker_data) {
		retval = -ENOMEM;
		goto exit_unregister;
	}

	cooling_dev = thermal_cooling_device_register("intel_powerclamp", NULL,
						&powerclamp_cooling_ops);
	if (IS_ERR(cooling_dev)) {
		retval = -ENODEV;
		goto exit_free_thread;
	}

	if (!duration)
		duration = jiffies_to_msecs(DEFAULT_DURATION_JIFFIES);

	powerclamp_create_debug_files();

	return 0;

exit_free_thread:
	free_percpu(worker_data);
exit_unregister:
	cpuhp_remove_state_nocalls(hp_state);
exit_free:
	bitmap_free(cpu_clamping_mask);
	return retval;
}
module_init(powerclamp_init);

static void __exit powerclamp_exit(void)
{
	end_power_clamp();
	cpuhp_remove_state_nocalls(hp_state);
	free_percpu(worker_data);
	thermal_cooling_device_unregister(cooling_dev);
	bitmap_free(cpu_clamping_mask);

	cancel_delayed_work_sync(&poll_pkg_cstate_work);
	debugfs_remove_recursive(debug_dir);
}
module_exit(powerclamp_exit);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>");
MODULE_AUTHOR("Jacob Pan <jacob.jun.pan@linux.intel.com>");
MODULE_DESCRIPTION("Package Level C-state Idle Injection for Intel CPUs");

v6.8
// SPDX-License-Identifier: GPL-2.0-only
/*
 * intel_powerclamp.c - package c-state idle injection
 *
 * Copyright (c) 2012-2023, Intel Corporation.
 *
 * Authors:
 *     Arjan van de Ven <arjan@linux.intel.com>
 *     Jacob Pan <jacob.jun.pan@linux.intel.com>
 *
 *	TODO:
 *           1. better handle wakeup from external interrupts, currently a fixed
 *              compensation is added to clamping duration when excessive amount
 *              of wakeups are observed during idle time. the reason is that in
 *              case of external interrupts without need for ack, clamping down
 *              cpu in non-irq context does not reduce irq. for majority of the
 *              cases, clamping down cpu does help reduce irq as well, we should
 *              be able to differentiate the two cases and give a quantitative
 *              solution for the irqs that we can control. perhaps based on
 *              get_cpu_iowait_time_us()
 *
 *	     2. synchronization with other hw blocks
 */

#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/delay.h>
#include <linux/cpu.h>
#include <linux/thermal.h>
#include <linux/debugfs.h>
#include <linux/seq_file.h>
#include <linux/idle_inject.h>

#include <asm/msr.h>
#include <asm/mwait.h>
#include <asm/cpu_device_id.h>

#define MAX_TARGET_RATIO (100U)
/* For each undisturbed clamping period (no extra wake ups during idle time),
 * we increment the confidence counter for the given target ratio.
 * CONFIDENCE_OK defines the level where runtime calibration results are
 * valid.
 */
#define CONFIDENCE_OK (3)
/* Default idle injection duration; the driver adjusts sleep time to meet the
 * target idle ratio. Similar to frequency modulation.
 */
#define DEFAULT_DURATION_JIFFIES (6)

static struct dentry *debug_dir;
static bool poll_pkg_cstate_enable;

/* Idle ratio observed using package C-state counters */
static unsigned int current_ratio;

/* When true, skip the next idle injection cycle */
static bool should_skip;

struct powerclamp_data {
	unsigned int cpu;
	unsigned int count;
	unsigned int guard;
	unsigned int window_size_now;
	unsigned int target_ratio;
	bool clamping;
};

static struct powerclamp_data powerclamp_data;

static struct thermal_cooling_device *cooling_dev;

static DEFINE_MUTEX(powerclamp_lock);

/* This duration is in microseconds */
static unsigned int duration;
static unsigned int pkg_cstate_ratio_cur;
static unsigned int window_size;

 80
 81static int duration_set(const char *arg, const struct kernel_param *kp)
 82{
 83	int ret = 0;
 84	unsigned long new_duration;
 85
 86	ret = kstrtoul(arg, 10, &new_duration);
 87	if (ret)
 88		goto exit;
 89	if (new_duration > 25 || new_duration < 6) {
 90		pr_err("Out of recommended range %lu, between 6-25ms\n",
 91			new_duration);
 92		ret = -EINVAL;
 93		goto exit;
 94	}
 95
 96	mutex_lock(&powerclamp_lock);
 97	duration = clamp(new_duration, 6ul, 25ul) * 1000;
 98	mutex_unlock(&powerclamp_lock);
 99exit:
100
101	return ret;
102}
103
104static int duration_get(char *buf, const struct kernel_param *kp)
105{
106	int ret;
107
108	mutex_lock(&powerclamp_lock);
109	ret = sysfs_emit(buf, "%d\n", duration / 1000);
110	mutex_unlock(&powerclamp_lock);
111
112	return ret;
113}
114
115static const struct kernel_param_ops duration_ops = {
116	.set = duration_set,
117	.get = duration_get,
118};
119
120module_param_cb(duration, &duration_ops, NULL, 0644);
 
121MODULE_PARM_DESC(duration, "forced idle time for each attempt in msec.");
122
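/*
 * Example (hypothetical value): "echo 10" into
 * /sys/module/intel_powerclamp/parameters/duration stores 10000 internally
 * (the variable is kept in microseconds); duration_get() divides by 1000 so
 * reading the parameter reports 10 again. Unlike v6.2, an out-of-range
 * value is rejected before anything is stored.
 */
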
#define DEFAULT_MAX_IDLE	50
#define MAX_ALL_CPU_IDLE	75

static u8 max_idle = DEFAULT_MAX_IDLE;

static cpumask_var_t idle_injection_cpu_mask;

static int allocate_copy_idle_injection_mask(const struct cpumask *copy_mask)
{
	if (cpumask_available(idle_injection_cpu_mask))
		goto copy_mask;

	/* This mask is allocated only one time and freed during module exit */
	if (!alloc_cpumask_var(&idle_injection_cpu_mask, GFP_KERNEL))
		return -ENOMEM;

copy_mask:
	cpumask_copy(idle_injection_cpu_mask, copy_mask);

	return 0;
}

/* Return true if the cpumask and idle percent combination is invalid */
static bool check_invalid(cpumask_var_t mask, u8 idle)
{
	if (cpumask_equal(cpu_present_mask, mask) && idle > MAX_ALL_CPU_IDLE)
		return true;

	return false;
}

static int cpumask_set(const char *arg, const struct kernel_param *kp)
{
	cpumask_var_t new_mask;
	int ret;

	mutex_lock(&powerclamp_lock);

	/* Can't set mask when cooling device is in use */
	if (powerclamp_data.clamping) {
		ret = -EAGAIN;
		goto skip_cpumask_set;
	}

	ret = alloc_cpumask_var(&new_mask, GFP_KERNEL);
	if (!ret)
		goto skip_cpumask_set;

	ret = bitmap_parse(arg, strlen(arg), cpumask_bits(new_mask),
			   nr_cpumask_bits);
	if (ret)
		goto free_cpumask_set;

	if (cpumask_empty(new_mask) || check_invalid(new_mask, max_idle)) {
		ret = -EINVAL;
		goto free_cpumask_set;
	}

	/*
	 * When module parameters are passed from kernel command line
	 * during insmod, the module parameter callback is called
	 * before powerclamp_init(), so we can't assume that some
	 * cpumask can be allocated and copied before here. Also
	 * in this case this cpumask is used as the default mask.
	 */
	ret = allocate_copy_idle_injection_mask(new_mask);

free_cpumask_set:
	free_cpumask_var(new_mask);
skip_cpumask_set:
	mutex_unlock(&powerclamp_lock);

	return ret;
}

static int cpumask_get(char *buf, const struct kernel_param *kp)
{
	if (!cpumask_available(idle_injection_cpu_mask))
		return -ENODEV;

	return bitmap_print_to_pagebuf(false, buf, cpumask_bits(idle_injection_cpu_mask),
				       nr_cpumask_bits);
}

static const struct kernel_param_ops cpumask_ops = {
	.set = cpumask_set,
	.get = cpumask_get,
};

module_param_cb(cpumask, &cpumask_ops, NULL, 0644);
MODULE_PARM_DESC(cpumask, "Mask of CPUs to use for idle injection.");

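/*
 * Example (hypothetical mask): bitmap_parse() expects a hex CPU bitmap, so
 *
 *   echo f > /sys/module/intel_powerclamp/parameters/cpumask
 *
 * restricts idle injection to CPUs 0-3. The mask cannot be changed while
 * the cooling device is actively clamping (-EAGAIN).
 */
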
static int max_idle_set(const char *arg, const struct kernel_param *kp)
{
	u8 new_max_idle;
	int ret = 0;

	mutex_lock(&powerclamp_lock);

	/* Can't set mask when cooling device is in use */
	if (powerclamp_data.clamping) {
		ret = -EAGAIN;
		goto skip_limit_set;
	}

	ret = kstrtou8(arg, 10, &new_max_idle);
	if (ret)
		goto skip_limit_set;

	if (new_max_idle > MAX_TARGET_RATIO) {
		ret = -EINVAL;
		goto skip_limit_set;
	}

	if (!cpumask_available(idle_injection_cpu_mask)) {
		ret = allocate_copy_idle_injection_mask(cpu_present_mask);
		if (ret)
			goto skip_limit_set;
	}

	if (check_invalid(idle_injection_cpu_mask, new_max_idle)) {
		ret = -EINVAL;
		goto skip_limit_set;
	}

	max_idle = new_max_idle;

skip_limit_set:
	mutex_unlock(&powerclamp_lock);

	return ret;
}

static const struct kernel_param_ops max_idle_ops = {
	.set = max_idle_set,
	.get = param_get_byte,
};

module_param_cb(max_idle, &max_idle_ops, &max_idle, 0644);
MODULE_PARM_DESC(max_idle, "maximum injected idle time to the total CPU time ratio in percent range:1-100");

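/*
 * Example: max_idle caps the largest cooling-device state userspace may
 * request. When the injection mask covers every present CPU, values above
 * MAX_ALL_CPU_IDLE (75%) are rejected so that the system can never be
 * clamped fully idle.
 */
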
struct powerclamp_calibration_data {
	unsigned long confidence;  /* used for calibration, basically a counter
				    * that gets incremented each time a clamping
				    * period is completed without extra wakeups;
				    * once that counter reaches the given level,
				    * compensation is deemed usable.
				    */
	unsigned long steady_comp; /* steady state compensation used when
				    * no extra wakeups occurred.
				    */
	unsigned long dynamic_comp; /* compensate excessive wakeup from idle
				     * mostly from external interrupts.
				     */
};

static struct powerclamp_calibration_data cal_data[MAX_TARGET_RATIO];

static int window_size_set(const char *arg, const struct kernel_param *kp)
{
	int ret = 0;
	unsigned long new_window_size;

	ret = kstrtoul(arg, 10, &new_window_size);
	if (ret)
		goto exit_win;
	if (new_window_size > 10 || new_window_size < 2) {
		pr_err("Out of recommended window size %lu, between 2-10\n",
			new_window_size);
		ret = -EINVAL;
	}

	window_size = clamp(new_window_size, 2ul, 10ul);
	smp_mb();

exit_win:

	return ret;
}

static const struct kernel_param_ops window_size_ops = {
	.set = window_size_set,
	.get = param_get_int,
};

module_param_cb(window_size, &window_size_ops, &window_size, 0644);
MODULE_PARM_DESC(window_size, "sliding window in number of clamping cycles\n"
	"\tpowerclamp controls idle ratio within this window. larger\n"
	"\twindow size results in slower response time but more smooth\n"
	"\tclamping results. default to 2.");

struct pkg_cstate_info {
	bool skip;
	int msr_index;
	int cstate_id;
};

#define PKG_CSTATE_INIT(id) {				\
		.msr_index = MSR_PKG_C##id##_RESIDENCY, \
		.cstate_id = id				\
			}

static struct pkg_cstate_info pkg_cstates[] = {
	PKG_CSTATE_INIT(2),
	PKG_CSTATE_INIT(3),
	PKG_CSTATE_INIT(6),
	PKG_CSTATE_INIT(7),
	PKG_CSTATE_INIT(8),
	PKG_CSTATE_INIT(9),
	PKG_CSTATE_INIT(10),
	{NULL},
};

static bool has_pkg_state_counter(void)
{
	u64 val;
	struct pkg_cstate_info *info = pkg_cstates;

	/* check if any one of the counter msrs exists */
	while (info->msr_index) {
		if (!rdmsrl_safe(info->msr_index, &val))
			return true;
		info++;
	}

	return false;
}

static u64 pkg_state_counter(void)
{
	u64 val;
	u64 count = 0;
	struct pkg_cstate_info *info = pkg_cstates;

	while (info->msr_index) {
		if (!info->skip) {
			if (!rdmsrl_safe(info->msr_index, &val))
				count += val;
			else
				info->skip = true;
		}
		info++;
	}

	return count;
}

static unsigned int get_compensation(int ratio)
{
	unsigned int comp = 0;

	if (!poll_pkg_cstate_enable)
		return 0;

	/* we only use compensation if all adjacent ones are good */
	if (ratio == 1 &&
		cal_data[ratio].confidence >= CONFIDENCE_OK &&
		cal_data[ratio + 1].confidence >= CONFIDENCE_OK &&
		cal_data[ratio + 2].confidence >= CONFIDENCE_OK) {
		comp = (cal_data[ratio].steady_comp +
			cal_data[ratio + 1].steady_comp +
			cal_data[ratio + 2].steady_comp) / 3;
	} else if (ratio == MAX_TARGET_RATIO - 1 &&
		cal_data[ratio].confidence >= CONFIDENCE_OK &&
		cal_data[ratio - 1].confidence >= CONFIDENCE_OK &&
		cal_data[ratio - 2].confidence >= CONFIDENCE_OK) {
		comp = (cal_data[ratio].steady_comp +
			cal_data[ratio - 1].steady_comp +
			cal_data[ratio - 2].steady_comp) / 3;
	} else if (cal_data[ratio].confidence >= CONFIDENCE_OK &&
		cal_data[ratio - 1].confidence >= CONFIDENCE_OK &&
		cal_data[ratio + 1].confidence >= CONFIDENCE_OK) {
		comp = (cal_data[ratio].steady_comp +
			cal_data[ratio - 1].steady_comp +
			cal_data[ratio + 1].steady_comp) / 3;
	}

	/* do not exceed limit */
	if (comp + ratio >= MAX_TARGET_RATIO)
		comp = MAX_TARGET_RATIO - ratio - 1;

	return comp;
}

static void adjust_compensation(int target_ratio, unsigned int win)
{
	int delta;
	struct powerclamp_calibration_data *d = &cal_data[target_ratio];

	/*
	 * adjust compensations if confidence level has not been reached.
	 */
	if (d->confidence >= CONFIDENCE_OK)
		return;

	delta = powerclamp_data.target_ratio - current_ratio;
	/* filter out bad data */
	if (delta >= 0 && delta <= (1+target_ratio/10)) {
		if (d->steady_comp)
			d->steady_comp =
				roundup(delta+d->steady_comp, 2)/2;
		else
			d->steady_comp = delta;
		d->confidence++;
	}
}

static bool powerclamp_adjust_controls(unsigned int target_ratio,
				unsigned int guard, unsigned int win)
{
	static u64 msr_last, tsc_last;
	u64 msr_now, tsc_now;
	u64 val64;

	/* check result for the last window */
	msr_now = pkg_state_counter();
	tsc_now = rdtsc();

	/* calculate pkg cstate vs tsc ratio */
	if (!msr_last || !tsc_last)
		current_ratio = 1;
	else if (tsc_now-tsc_last) {
		val64 = 100*(msr_now-msr_last);
		do_div(val64, (tsc_now-tsc_last));
		current_ratio = val64;
	}

	/* update record */
	msr_last = msr_now;
	tsc_last = tsc_now;

	adjust_compensation(target_ratio, win);

	/* if we are above target+guard, skip */
	return powerclamp_data.target_ratio + guard <= current_ratio;
}

/*
 * This function calculates runtime from the current target ratio.
 * This function gets called under powerclamp_lock.
 */
static unsigned int get_run_time(void)
{
	unsigned int compensated_ratio;
	unsigned int runtime;

	/*
	 * make sure user selected ratio does not take effect until
	 * the next round. adjust target_ratio if user has changed
	 * target such that we can converge quickly.
	 */
	powerclamp_data.guard = 1 + powerclamp_data.target_ratio / 20;
	powerclamp_data.window_size_now = window_size;

	/*
	 * systems may have different ability to enter package level
	 * c-states, thus we need to compensate the injected idle ratio
	 * to achieve the actual target reported by the HW.
	 */
	compensated_ratio = powerclamp_data.target_ratio +
		get_compensation(powerclamp_data.target_ratio);
	if (compensated_ratio <= 0)
		compensated_ratio = 1;

	runtime = duration * 100 / compensated_ratio - duration;

	return runtime;
}

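/*
 * Worked example (hypothetical values): with duration = 6000 us of idle
 * per cycle and a compensated ratio of 25, runtime = 6000 * 100 / 25 -
 * 6000 = 18000 us, i.e. each injection cycle runs for 18 ms and idles
 * for 6 ms, which is 25% injected idle.
 */
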
/*
 * 1 HZ polling while clamping is active, useful for userspace
 * to monitor actual idle ratio.
 */
static void poll_pkg_cstate(struct work_struct *dummy);
static DECLARE_DELAYED_WORK(poll_pkg_cstate_work, poll_pkg_cstate);
static void poll_pkg_cstate(struct work_struct *dummy)
{
	static u64 msr_last;
	static u64 tsc_last;

	u64 msr_now;
	u64 tsc_now;
	u64 val64;

	msr_now = pkg_state_counter();
	tsc_now = rdtsc();

	/* calculate pkg cstate vs tsc ratio */
	if (!msr_last || !tsc_last)
		pkg_cstate_ratio_cur = 1;
	else {
		if (tsc_now - tsc_last) {
			val64 = 100 * (msr_now - msr_last);
			do_div(val64, (tsc_now - tsc_last));
			pkg_cstate_ratio_cur = val64;
		}
	}

	/* update record */
	msr_last = msr_now;
	tsc_last = tsc_now;

	mutex_lock(&powerclamp_lock);
	if (powerclamp_data.clamping)
		schedule_delayed_work(&poll_pkg_cstate_work, HZ);
	mutex_unlock(&powerclamp_lock);
}

static struct idle_inject_device *ii_dev;

/*
 * This function is called from idle injection core on timer expiry
 * for the run duration. This allows powerclamp to readjust or skip
 * injecting idle for this cycle.
 */
static bool idle_inject_update(void)
{
	bool update = false;

	/* We can't sleep in this callback */
	if (!mutex_trylock(&powerclamp_lock))
		return true;

	if (!(powerclamp_data.count % powerclamp_data.window_size_now)) {

		should_skip = powerclamp_adjust_controls(powerclamp_data.target_ratio,
							 powerclamp_data.guard,
							 powerclamp_data.window_size_now);
		update = true;
	}

	if (update) {
		unsigned int runtime = get_run_time();

		idle_inject_set_duration(ii_dev, runtime, duration);
	}

	powerclamp_data.count++;

	mutex_unlock(&powerclamp_lock);

	if (should_skip)
		return false;

	return true;
}

/* This function starts idle injection by calling idle_inject_start() */
static void trigger_idle_injection(void)
{
	unsigned int runtime = get_run_time();

	idle_inject_set_duration(ii_dev, runtime, duration);
	idle_inject_start(ii_dev);
	powerclamp_data.clamping = true;
}

/*
 * This function is called from start_power_clamp() to register
 * CPUs with the powercap idle injection core and set the default
 * idle duration and latency.
 */
static int powerclamp_idle_injection_register(void)
{
	poll_pkg_cstate_enable = false;
	if (cpumask_equal(cpu_present_mask, idle_injection_cpu_mask)) {
		ii_dev = idle_inject_register_full(idle_injection_cpu_mask, idle_inject_update);
		if (topology_max_packages() == 1 && topology_max_die_per_package() == 1)
			poll_pkg_cstate_enable = true;
	} else {
		ii_dev = idle_inject_register(idle_injection_cpu_mask);
	}

	if (!ii_dev) {
		pr_err("powerclamp: idle_inject_register failed\n");
		return -EAGAIN;
	}

	idle_inject_set_duration(ii_dev, TICK_USEC, duration);
	idle_inject_set_latency(ii_dev, UINT_MAX);

	return 0;
}

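/*
 * A note on the powercap idle injection API used above (declared in
 * include/linux/idle_inject.h): idle_inject_register_full() additionally
 * takes a per-cycle update callback, which powerclamp uses to recompute
 * the run/idle budget and to skip cycles; idle_inject_set_duration(ii_dev,
 * run_us, idle_us) programs both halves of each injection cycle.
 */
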
/*
 * This function is called from end_power_clamp() to stop idle injection
 * and unregister CPUs from the powercap idle injection core.
 */
static void remove_idle_injection(void)
{
	if (!powerclamp_data.clamping)
		return;

	powerclamp_data.clamping = false;
	idle_inject_stop(ii_dev);
}

/*
 * This function is called when the user changes the cooling device
 * state from zero to some other value.
 */
static int start_power_clamp(void)
{
	int ret;

	ret = powerclamp_idle_injection_register();
	if (!ret) {
		trigger_idle_injection();
		if (poll_pkg_cstate_enable)
			schedule_delayed_work(&poll_pkg_cstate_work, 0);
	}

	return ret;
}

/*
 * This function is called when the user changes the cooling device
 * state from a non-zero value to zero.
 */
static void end_power_clamp(void)
{
	if (powerclamp_data.clamping) {
		remove_idle_injection();
		idle_inject_unregister(ii_dev);
	}
}

static int powerclamp_get_max_state(struct thermal_cooling_device *cdev,
				 unsigned long *state)
{
	*state = MAX_TARGET_RATIO;

	return 0;
}

static int powerclamp_get_cur_state(struct thermal_cooling_device *cdev,
				 unsigned long *state)
{
	mutex_lock(&powerclamp_lock);
	*state = powerclamp_data.target_ratio;
	mutex_unlock(&powerclamp_lock);

	return 0;
}

static int powerclamp_set_cur_state(struct thermal_cooling_device *cdev,
				 unsigned long new_target_ratio)
{
	int ret = 0;

	mutex_lock(&powerclamp_lock);

	new_target_ratio = clamp(new_target_ratio, 0UL,
				(unsigned long) (max_idle - 1));

	if (powerclamp_data.target_ratio == new_target_ratio)
		goto exit_set;

	if (!powerclamp_data.target_ratio && new_target_ratio > 0) {
		pr_info("Start idle injection to reduce power\n");
		powerclamp_data.target_ratio = new_target_ratio;
		ret = start_power_clamp();
		if (ret)
			powerclamp_data.target_ratio = 0;
		goto exit_set;
	} else	if (powerclamp_data.target_ratio > 0 && new_target_ratio == 0) {
		pr_info("Stop forced idle injection\n");
		end_power_clamp();
		powerclamp_data.target_ratio = 0;
	} else	/* adjust currently running */ {
		unsigned int runtime;

		powerclamp_data.target_ratio = new_target_ratio;
		runtime = get_run_time();
		idle_inject_set_duration(ii_dev, runtime, duration);
	}

exit_set:
	mutex_unlock(&powerclamp_lock);

	return ret;
}

/* bind to generic thermal layer as cooling device*/
static const struct thermal_cooling_device_ops powerclamp_cooling_ops = {
	.get_max_state = powerclamp_get_max_state,
	.get_cur_state = powerclamp_get_cur_state,
	.set_cur_state = powerclamp_set_cur_state,
};

static const struct x86_cpu_id __initconst intel_powerclamp_ids[] = {
	X86_MATCH_VENDOR_FEATURE(INTEL, X86_FEATURE_MWAIT, NULL),
	{}
};
MODULE_DEVICE_TABLE(x86cpu, intel_powerclamp_ids);

static int __init powerclamp_probe(void)
{

	if (!x86_match_cpu(intel_powerclamp_ids)) {
		pr_err("CPU does not support MWAIT\n");
		return -ENODEV;
	}

	/* The goal for idle time alignment is to achieve package cstate. */
	if (!has_pkg_state_counter()) {
		pr_info("No package C-state available\n");
		return -ENODEV;
	}

	return 0;
}

static int powerclamp_debug_show(struct seq_file *m, void *unused)
{
	int i = 0;

	seq_printf(m, "pct confidence steady dynamic (compensation)\n");
	for (i = 0; i < MAX_TARGET_RATIO; i++) {
		seq_printf(m, "%d\t%lu\t%lu\t%lu\n",
			i,
			cal_data[i].confidence,
			cal_data[i].steady_comp,
			cal_data[i].dynamic_comp);
	}

	return 0;
}

DEFINE_SHOW_ATTRIBUTE(powerclamp_debug);

static inline void powerclamp_create_debug_files(void)
{
	debug_dir = debugfs_create_dir("intel_powerclamp", NULL);

	debugfs_create_file("powerclamp_calib", S_IRUGO, debug_dir, cal_data,
			    &powerclamp_debug_fops);
}

static int __init powerclamp_init(void)
{
	int retval;

	/* probe cpu features and ids here */
	retval = powerclamp_probe();
	if (retval)
		return retval;

	mutex_lock(&powerclamp_lock);
	if (!cpumask_available(idle_injection_cpu_mask))
		retval = allocate_copy_idle_injection_mask(cpu_present_mask);
	mutex_unlock(&powerclamp_lock);

	if (retval)
		return retval;

	/* set default limit, maybe adjusted during runtime based on feedback */
	window_size = 2;

	cooling_dev = thermal_cooling_device_register("intel_powerclamp", NULL,
						      &powerclamp_cooling_ops);
	if (IS_ERR(cooling_dev))
		return -ENODEV;

	if (!duration)
		duration = jiffies_to_usecs(DEFAULT_DURATION_JIFFIES);

	powerclamp_create_debug_files();

	return 0;
}
module_init(powerclamp_init);

static void __exit powerclamp_exit(void)
{
	mutex_lock(&powerclamp_lock);
	end_power_clamp();
	mutex_unlock(&powerclamp_lock);

	thermal_cooling_device_unregister(cooling_dev);

	cancel_delayed_work_sync(&poll_pkg_cstate_work);
	debugfs_remove_recursive(debug_dir);

	if (cpumask_available(idle_injection_cpu_mask))
		free_cpumask_var(idle_injection_cpu_mask);
}
module_exit(powerclamp_exit);

MODULE_IMPORT_NS(IDLE_INJECT);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>");
MODULE_AUTHOR("Jacob Pan <jacob.jun.pan@linux.intel.com>");
MODULE_DESCRIPTION("Package Level C-state Idle Injection for Intel CPUs");