v6.2
// SPDX-License-Identifier: GPL-2.0-only
/*
 * intel_powerclamp.c - package c-state idle injection
 *
 * Copyright (c) 2012, Intel Corporation.
 *
 * Authors:
 *     Arjan van de Ven <arjan@linux.intel.com>
 *     Jacob Pan <jacob.jun.pan@linux.intel.com>
 *
 *	TODO:
 *           1. Better handle wakeups from external interrupts. Currently a
 *              fixed compensation is added to the clamping duration when an
 *              excessive number of wakeups is observed during idle time. The
 *              reason is that for external interrupts that do not need an
 *              ack, clamping down the cpu in non-irq context does not reduce
 *              the irq rate. For the majority of cases clamping down the cpu
 *              does help reduce irqs as well, so we should be able to
 *              differentiate the two cases and give a quantitative solution
 *              for the irqs that we can control, perhaps based on
 *              get_cpu_iowait_time_us().
 *
 *	     2. Synchronization with other hw blocks.
 */

#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/cpu.h>
#include <linux/thermal.h>
#include <linux/slab.h>
#include <linux/tick.h>
#include <linux/debugfs.h>
#include <linux/seq_file.h>
#include <linux/sched/rt.h>
#include <uapi/linux/sched/types.h>

#include <asm/nmi.h>
#include <asm/msr.h>
#include <asm/mwait.h>
#include <asm/cpu_device_id.h>
#include <asm/hardirq.h>

#define MAX_TARGET_RATIO (50U)
/* For each undisturbed clamping period (no extra wakeups during idle time),
 * we increment the confidence counter for the given target ratio.
 * CONFIDENCE_OK defines the level at which runtime calibration results
 * are considered valid.
 */
#define CONFIDENCE_OK (3)
/* Default idle injection duration; the driver adjusts the sleep time to
 * meet the target idle ratio, similar to frequency modulation.
 */
#define DEFAULT_DURATION_JIFFIES (6)
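/*
 * Worked example (HZ values are illustrative): powerclamp_init() converts
 * this default with jiffies_to_msecs(), so with HZ=1000 the default
 * duration is 6 ms and with HZ=250 it is 24 ms - both within the 6-25 ms
 * range that duration_set() below recommends.
 */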

static unsigned int target_mwait;
static struct dentry *debug_dir;

/* user selected target */
static unsigned int set_target_ratio;
static unsigned int current_ratio;
static bool should_skip;

static unsigned int control_cpu; /* The cpu assigned to collect stats and
				  * update control parameters. Defaults to
				  * the BSP, but the BSP can be offlined.
				  */
static bool clamping;

struct powerclamp_worker_data {
	struct kthread_worker *worker;
	struct kthread_work balancing_work;
	struct kthread_delayed_work idle_injection_work;
	unsigned int cpu;
	unsigned int count;
	unsigned int guard;
	unsigned int window_size_now;
	unsigned int target_ratio;
	unsigned int duration_jiffies;
	bool clamping;
};

static struct powerclamp_worker_data __percpu *worker_data;
static struct thermal_cooling_device *cooling_dev;
static unsigned long *cpu_clamping_mask;  /* bit map for tracking per cpu
					   * clamping kthread worker
					   */

static unsigned int duration;
static unsigned int pkg_cstate_ratio_cur;
static unsigned int window_size;

static int duration_set(const char *arg, const struct kernel_param *kp)
{
	int ret = 0;
	unsigned long new_duration;

	ret = kstrtoul(arg, 10, &new_duration);
	if (ret)
		goto exit;
	if (new_duration > 25 || new_duration < 6) {
		pr_err("Out of recommended range %lu, between 6-25ms\n",
			new_duration);
		ret = -EINVAL;
	}

	duration = clamp(new_duration, 6ul, 25ul);
	smp_mb();

exit:
	return ret;
}

static const struct kernel_param_ops duration_ops = {
	.set = duration_set,
	.get = param_get_int,
};

module_param_cb(duration, &duration_ops, &duration, 0644);
MODULE_PARM_DESC(duration, "forced idle time for each attempt in msec.");
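/*
 * Usage sketch (illustrative): the parameter is registered with mode 0644,
 * so it can be changed at runtime through sysfs, e.g.
 *
 *   echo 10 > /sys/module/intel_powerclamp/parameters/duration
 *
 * duration_set() warns and returns -EINVAL for values outside 6-25 ms.
 */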

struct powerclamp_calibration_data {
	unsigned long confidence;  /* used for calibration; this counter is
				    * incremented each time a clamping period
				    * completes without extra wakeups. Once it
				    * reaches a given level, the compensation
				    * is deemed usable.
				    */
	unsigned long steady_comp; /* steady state compensation used when
				    * no extra wakeups occurred.
				    */
	unsigned long dynamic_comp; /* compensates for excessive wakeups from
				     * idle, mostly from external interrupts.
				     */
};

static struct powerclamp_calibration_data cal_data[MAX_TARGET_RATIO];

static int window_size_set(const char *arg, const struct kernel_param *kp)
{
	int ret = 0;
	unsigned long new_window_size;

	ret = kstrtoul(arg, 10, &new_window_size);
	if (ret)
		goto exit_win;
	if (new_window_size > 10 || new_window_size < 2) {
		pr_err("Out of recommended window size %lu, between 2-10\n",
			new_window_size);
		ret = -EINVAL;
	}

	window_size = clamp(new_window_size, 2ul, 10ul);
	smp_mb();

exit_win:
	return ret;
}

static const struct kernel_param_ops window_size_ops = {
	.set = window_size_set,
	.get = param_get_int,
};

module_param_cb(window_size, &window_size_ops, &window_size, 0644);
MODULE_PARM_DESC(window_size, "sliding window in number of clamping cycles\n"
	"\tpowerclamp controls idle ratio within this window. A larger\n"
	"\twindow size results in slower response time but smoother\n"
	"\tclamping results. Defaults to 2.");
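/*
 * Note (derived from the code below): the window is measured in
 * idle-injection cycles, not wall-clock time. With window_size = 2,
 * clamp_idle_injection_func() re-runs powerclamp_adjust_controls() on the
 * control cpu every second cycle (w_data->count % window_size_now == 0),
 * so a larger window lets calibration average over a longer history of
 * package c-state residency.
 */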

static void find_target_mwait(void)
{
	unsigned int eax, ebx, ecx, edx;
	unsigned int highest_cstate = 0;
	unsigned int highest_subcstate = 0;
	int i;

	if (boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF)
		return;

	cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx);

	if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) ||
	    !(ecx & CPUID5_ECX_INTERRUPT_BREAK))
		return;

	edx >>= MWAIT_SUBSTATE_SIZE;
	for (i = 0; i < 7 && edx; i++, edx >>= MWAIT_SUBSTATE_SIZE) {
		if (edx & MWAIT_SUBSTATE_MASK) {
			highest_cstate = i;
			highest_subcstate = edx & MWAIT_SUBSTATE_MASK;
		}
	}
	target_mwait = (highest_cstate << MWAIT_SUBSTATE_SIZE) |
		(highest_subcstate - 1);
}
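/*
 * Worked example with made-up CPUID values: if the deepest state reported
 * in CPUID leaf 5 EDX is index 5 with 2 sub-states, the loop above leaves
 * highest_cstate = 5 and highest_subcstate = 2, so target_mwait becomes
 * (5 << 4) | (2 - 1) = 0x51 (assuming MWAIT_SUBSTATE_SIZE == 4), i.e. the
 * MWAIT hint encoding for that c-state/sub-state pair.
 */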

struct pkg_cstate_info {
	bool skip;
	int msr_index;
	int cstate_id;
};

#define PKG_CSTATE_INIT(id) {				\
		.msr_index = MSR_PKG_C##id##_RESIDENCY, \
		.cstate_id = id				\
			}

static struct pkg_cstate_info pkg_cstates[] = {
	PKG_CSTATE_INIT(2),
	PKG_CSTATE_INIT(3),
	PKG_CSTATE_INIT(6),
	PKG_CSTATE_INIT(7),
	PKG_CSTATE_INIT(8),
	PKG_CSTATE_INIT(9),
	PKG_CSTATE_INIT(10),
	{NULL},
};

static bool has_pkg_state_counter(void)
{
	u64 val;
	struct pkg_cstate_info *info = pkg_cstates;

	/* check if any one of the counter msrs exists */
	while (info->msr_index) {
		if (!rdmsrl_safe(info->msr_index, &val))
			return true;
		info++;
	}

	return false;
}

static u64 pkg_state_counter(void)
{
	u64 val;
	u64 count = 0;
	struct pkg_cstate_info *info = pkg_cstates;

	while (info->msr_index) {
		if (!info->skip) {
			if (!rdmsrl_safe(info->msr_index, &val))
				count += val;
			else
				info->skip = true;
		}
		info++;
	}

	return count;
}

static unsigned int get_compensation(int ratio)
{
	unsigned int comp = 0;

	/* we only use compensation if all adjacent ones are good */
	if (ratio == 1 &&
		cal_data[ratio].confidence >= CONFIDENCE_OK &&
		cal_data[ratio + 1].confidence >= CONFIDENCE_OK &&
		cal_data[ratio + 2].confidence >= CONFIDENCE_OK) {
		comp = (cal_data[ratio].steady_comp +
			cal_data[ratio + 1].steady_comp +
			cal_data[ratio + 2].steady_comp) / 3;
	} else if (ratio == MAX_TARGET_RATIO - 1 &&
		cal_data[ratio].confidence >= CONFIDENCE_OK &&
		cal_data[ratio - 1].confidence >= CONFIDENCE_OK &&
		cal_data[ratio - 2].confidence >= CONFIDENCE_OK) {
		comp = (cal_data[ratio].steady_comp +
			cal_data[ratio - 1].steady_comp +
			cal_data[ratio - 2].steady_comp) / 3;
	} else if (cal_data[ratio].confidence >= CONFIDENCE_OK &&
		cal_data[ratio - 1].confidence >= CONFIDENCE_OK &&
		cal_data[ratio + 1].confidence >= CONFIDENCE_OK) {
		comp = (cal_data[ratio].steady_comp +
			cal_data[ratio - 1].steady_comp +
			cal_data[ratio + 1].steady_comp) / 3;
	}

	/* do not exceed limit */
	if (comp + ratio >= MAX_TARGET_RATIO)
		comp = MAX_TARGET_RATIO - ratio - 1;

	return comp;
}
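/*
 * Worked example with made-up calibration data: for ratio == 30, if
 * cal_data[29..31].steady_comp are 3, 4 and 5 and all three confidence
 * counters have reached CONFIDENCE_OK, the last branch above returns
 * (4 + 3 + 5) / 3 = 4, i.e. four extra percentage points of injected idle
 * are added when targeting 30% package c-state residency.
 */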

static void adjust_compensation(int target_ratio, unsigned int win)
{
	int delta;
	struct powerclamp_calibration_data *d = &cal_data[target_ratio];

	/*
	 * adjust compensations if confidence level has not been reached.
	 */
	if (d->confidence >= CONFIDENCE_OK)
		return;

	delta = set_target_ratio - current_ratio;
	/* filter out bad data */
	if (delta >= 0 && delta <= (1 + target_ratio / 10)) {
		if (d->steady_comp)
			d->steady_comp =
				roundup(delta + d->steady_comp, 2) / 2;
		else
			d->steady_comp = delta;
		d->confidence++;
	}
}
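/*
 * Worked example (hypothetical numbers): with set_target_ratio = 30 and a
 * measured current_ratio of 27, delta = 3. If d->steady_comp was 4, the
 * running average becomes roundup(3 + 4, 2) / 2 = 8 / 2 = 4 and the
 * confidence counter for this target ratio is incremented.
 */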

static bool powerclamp_adjust_controls(unsigned int target_ratio,
				unsigned int guard, unsigned int win)
{
	static u64 msr_last, tsc_last;
	u64 msr_now, tsc_now;
	u64 val64;

	/* check result for the last window */
	msr_now = pkg_state_counter();
	tsc_now = rdtsc();

	/* calculate pkg cstate vs tsc ratio */
	if (!msr_last || !tsc_last)
		current_ratio = 1;
	else if (tsc_now - tsc_last) {
		val64 = 100 * (msr_now - msr_last);
		do_div(val64, (tsc_now - tsc_last));
		current_ratio = val64;
	}

	/* update record */
	msr_last = msr_now;
	tsc_last = tsc_now;

	adjust_compensation(target_ratio, win);

	/* if we are above target+guard, skip */
	return set_target_ratio + guard <= current_ratio;
}
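/*
 * Worked example (hypothetical counter deltas): if the summed package
 * c-state residency counters advanced by 25,000,000 while the TSC advanced
 * by 100,000,000 over the last window, current_ratio = 100 * 25e6 / 100e6
 * = 25, i.e. the package spent about 25% of the window in package c-states.
 */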

static void clamp_balancing_func(struct kthread_work *work)
{
	struct powerclamp_worker_data *w_data;
	int sleeptime;
	unsigned long target_jiffies;
	unsigned int compensated_ratio;
	int interval; /* jiffies to sleep for each attempt */

	w_data = container_of(work, struct powerclamp_worker_data,
			      balancing_work);

	/*
	 * make sure the user-selected ratio does not take effect until
	 * the next round; adjust target_ratio if the user has changed
	 * the target so that we can converge quickly.
	 */
	w_data->target_ratio = READ_ONCE(set_target_ratio);
	w_data->guard = 1 + w_data->target_ratio / 20;
	w_data->window_size_now = window_size;
	w_data->duration_jiffies = msecs_to_jiffies(duration);
	w_data->count++;

	/*
	 * systems differ in their ability to enter package-level c-states,
	 * thus we need to compensate the injected idle ratio to achieve
	 * the actual target reported by the HW.
	 */
	compensated_ratio = w_data->target_ratio +
		get_compensation(w_data->target_ratio);
	if (compensated_ratio <= 0)
		compensated_ratio = 1;
	interval = w_data->duration_jiffies * 100 / compensated_ratio;

	/* align idle time */
	target_jiffies = roundup(jiffies, interval);
	sleeptime = target_jiffies - jiffies;
	if (sleeptime <= 0)
		sleeptime = 1;

	if (clamping && w_data->clamping && cpu_online(w_data->cpu))
		kthread_queue_delayed_work(w_data->worker,
					   &w_data->idle_injection_work,
					   sleeptime);
}
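/*
 * Worked example (values taken from the defaults): with duration_jiffies =
 * 6 and compensated_ratio = 25, interval = 6 * 100 / 25 = 24 jiffies, so
 * each cycle injects 6 jiffies of idle out of every 24, i.e. roughly 25%
 * idle time. Rounding jiffies up to a multiple of the interval keeps the
 * injection windows aligned across cpus.
 */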

static void clamp_idle_injection_func(struct kthread_work *work)
{
	struct powerclamp_worker_data *w_data;

	w_data = container_of(work, struct powerclamp_worker_data,
			      idle_injection_work.work);

	/*
	 * only the elected controlling cpu can collect stats and update
	 * control parameters.
	 */
	if (w_data->cpu == control_cpu &&
	    !(w_data->count % w_data->window_size_now)) {
		should_skip =
			powerclamp_adjust_controls(w_data->target_ratio,
						   w_data->guard,
						   w_data->window_size_now);
		smp_mb();
	}

	if (should_skip)
		goto balance;

	play_idle(jiffies_to_usecs(w_data->duration_jiffies));

balance:
	if (clamping && w_data->clamping && cpu_online(w_data->cpu))
		kthread_queue_work(w_data->worker, &w_data->balancing_work);
}

/*
 * 1 HZ polling while clamping is active, useful for userspace
 * to monitor actual idle ratio.
 */
static void poll_pkg_cstate(struct work_struct *dummy);
static DECLARE_DELAYED_WORK(poll_pkg_cstate_work, poll_pkg_cstate);
static void poll_pkg_cstate(struct work_struct *dummy)
{
	static u64 msr_last;
	static u64 tsc_last;

	u64 msr_now;
	u64 tsc_now;
	u64 val64;

	msr_now = pkg_state_counter();
	tsc_now = rdtsc();

	/* calculate pkg cstate vs tsc ratio */
	if (!msr_last || !tsc_last)
		pkg_cstate_ratio_cur = 1;
	else {
		if (tsc_now - tsc_last) {
			val64 = 100 * (msr_now - msr_last);
			do_div(val64, (tsc_now - tsc_last));
			pkg_cstate_ratio_cur = val64;
		}
	}

	/* update record */
	msr_last = msr_now;
	tsc_last = tsc_now;

	if (clamping)
		schedule_delayed_work(&poll_pkg_cstate_work, HZ);
}

static void start_power_clamp_worker(unsigned long cpu)
{
	struct powerclamp_worker_data *w_data = per_cpu_ptr(worker_data, cpu);
	struct kthread_worker *worker;

	worker = kthread_create_worker_on_cpu(cpu, 0, "kidle_inj/%ld", cpu);
	if (IS_ERR(worker))
		return;

	w_data->worker = worker;
	w_data->count = 0;
	w_data->cpu = cpu;
	w_data->clamping = true;
	set_bit(cpu, cpu_clamping_mask);
	sched_set_fifo(worker->task);
	kthread_init_work(&w_data->balancing_work, clamp_balancing_func);
	kthread_init_delayed_work(&w_data->idle_injection_work,
				  clamp_idle_injection_func);
	kthread_queue_work(w_data->worker, &w_data->balancing_work);
}

static void stop_power_clamp_worker(unsigned long cpu)
{
	struct powerclamp_worker_data *w_data = per_cpu_ptr(worker_data, cpu);

	if (!w_data->worker)
		return;

	w_data->clamping = false;
	/*
	 * Make sure that all works that get queued after this point see
	 * the clamping disabled. The counterpart is not needed because
	 * there is an implicit memory barrier when the queued work
	 * is processed.
	 */
	smp_wmb();
	kthread_cancel_work_sync(&w_data->balancing_work);
	kthread_cancel_delayed_work_sync(&w_data->idle_injection_work);
	/*
	 * The balancing work might still be queued here because
	 * the handling of the "clamping" variable, cancel, and queue
	 * operations are not synchronized via a lock. But it is not
	 * a big deal. The balancing work is fast and
	 * kthread_destroy_worker() will wait for it.
	 */
	clear_bit(w_data->cpu, cpu_clamping_mask);
	kthread_destroy_worker(w_data->worker);

	w_data->worker = NULL;
}

static int start_power_clamp(void)
{
	unsigned long cpu;

	set_target_ratio = clamp(set_target_ratio, 0U, MAX_TARGET_RATIO - 1);
	/* prevent cpu hotplug */
	cpus_read_lock();

	/* prefer BSP */
	control_cpu = cpumask_first(cpu_online_mask);

	clamping = true;
	schedule_delayed_work(&poll_pkg_cstate_work, 0);

	/* start one kthread worker per online cpu */
	for_each_online_cpu(cpu) {
		start_power_clamp_worker(cpu);
	}
	cpus_read_unlock();

	return 0;
}

static void end_power_clamp(void)
{
	int i;

	/*
	 * Block requeuing in all the kthread workers. They will flush and
	 * stop faster.
	 */
	clamping = false;
	for_each_set_bit(i, cpu_clamping_mask, num_possible_cpus()) {
		pr_debug("clamping worker for cpu %d alive, destroy\n", i);
		stop_power_clamp_worker(i);
	}
}

static int powerclamp_cpu_online(unsigned int cpu)
{
	if (!clamping)
		return 0;
	start_power_clamp_worker(cpu);
	/* prefer BSP as controlling CPU */
	if (cpu == 0) {
		control_cpu = 0;
		smp_mb();
	}
	return 0;
}

static int powerclamp_cpu_predown(unsigned int cpu)
{
	if (!clamping)
		return 0;

	stop_power_clamp_worker(cpu);
	if (cpu != control_cpu)
		return 0;

	control_cpu = cpumask_first(cpu_online_mask);
	if (control_cpu == cpu)
		control_cpu = cpumask_next(cpu, cpu_online_mask);
	smp_mb();
	return 0;
}

static int powerclamp_get_max_state(struct thermal_cooling_device *cdev,
				 unsigned long *state)
{
	*state = MAX_TARGET_RATIO;

	return 0;
}

static int powerclamp_get_cur_state(struct thermal_cooling_device *cdev,
				 unsigned long *state)
{
	if (clamping)
		*state = pkg_cstate_ratio_cur;
	else
		/* to save power, do not poll idle ratio while not clamping */
		*state = -1; /* indicates invalid state */

	return 0;
}

static int powerclamp_set_cur_state(struct thermal_cooling_device *cdev,
				 unsigned long new_target_ratio)
{
	int ret = 0;

	new_target_ratio = clamp(new_target_ratio, 0UL,
				(unsigned long) (MAX_TARGET_RATIO - 1));
	if (set_target_ratio == 0 && new_target_ratio > 0) {
		pr_info("Start idle injection to reduce power\n");
		set_target_ratio = new_target_ratio;
		ret = start_power_clamp();
		goto exit_set;
	} else if (set_target_ratio > 0 && new_target_ratio == 0) {
		pr_info("Stop forced idle injection\n");
		end_power_clamp();
		set_target_ratio = 0;
	} else { /* adjust currently running */
		set_target_ratio = new_target_ratio;
		/* make new set_target_ratio visible to other cpus */
		smp_mb();
	}

exit_set:
	return ret;
}

/* bind to generic thermal layer as cooling device */
static const struct thermal_cooling_device_ops powerclamp_cooling_ops = {
	.get_max_state = powerclamp_get_max_state,
	.get_cur_state = powerclamp_get_cur_state,
	.set_cur_state = powerclamp_set_cur_state,
};
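/*
 * Usage sketch (illustrative; the cooling device index X is assigned at
 * registration time): the target ratio is driven through the generic
 * thermal sysfs interface, e.g.
 *
 *   cat /sys/class/thermal/cooling_deviceX/type      -> intel_powerclamp
 *   echo 25 > /sys/class/thermal/cooling_deviceX/cur_state
 *
 * which invokes powerclamp_set_cur_state() with new_target_ratio = 25.
 */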

static const struct x86_cpu_id __initconst intel_powerclamp_ids[] = {
	X86_MATCH_VENDOR_FEATURE(INTEL, X86_FEATURE_MWAIT, NULL),
	{}
};
MODULE_DEVICE_TABLE(x86cpu, intel_powerclamp_ids);

static int __init powerclamp_probe(void)
{
	if (!x86_match_cpu(intel_powerclamp_ids)) {
		pr_err("CPU does not support MWAIT\n");
		return -ENODEV;
	}

	/* The goal for idle time alignment is to achieve package cstate. */
	if (!has_pkg_state_counter()) {
		pr_info("No package C-state available\n");
		return -ENODEV;
	}

	/* find the deepest mwait value */
	find_target_mwait();

	return 0;
}

static int powerclamp_debug_show(struct seq_file *m, void *unused)
{
	int i = 0;

	seq_printf(m, "controlling cpu: %d\n", control_cpu);
	seq_puts(m, "pct confidence steady dynamic (compensation)\n");
	for (i = 0; i < MAX_TARGET_RATIO; i++) {
		seq_printf(m, "%d\t%lu\t%lu\t%lu\n",
			i,
			cal_data[i].confidence,
			cal_data[i].steady_comp,
			cal_data[i].dynamic_comp);
	}

	return 0;
}

DEFINE_SHOW_ATTRIBUTE(powerclamp_debug);

static inline void powerclamp_create_debug_files(void)
{
	debug_dir = debugfs_create_dir("intel_powerclamp", NULL);

	debugfs_create_file("powerclamp_calib", S_IRUGO, debug_dir, cal_data,
			    &powerclamp_debug_fops);
}
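/*
 * Usage sketch (assumes debugfs is mounted at /sys/kernel/debug): the
 * per-ratio calibration table can be inspected with
 *
 *   cat /sys/kernel/debug/intel_powerclamp/powerclamp_calib
 *
 * which prints one line per target ratio via powerclamp_debug_show().
 */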

static enum cpuhp_state hp_state;

static int __init powerclamp_init(void)
{
	int retval;

	cpu_clamping_mask = bitmap_zalloc(num_possible_cpus(), GFP_KERNEL);
	if (!cpu_clamping_mask)
		return -ENOMEM;

	/* probe cpu features and ids here */
	retval = powerclamp_probe();
	if (retval)
		goto exit_free;

	/* set default limit, maybe adjusted during runtime based on feedback */
	window_size = 2;
	retval = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
					   "thermal/intel_powerclamp:online",
					   powerclamp_cpu_online,
					   powerclamp_cpu_predown);
	if (retval < 0)
		goto exit_free;

	hp_state = retval;

	worker_data = alloc_percpu(struct powerclamp_worker_data);
	if (!worker_data) {
		retval = -ENOMEM;
		goto exit_unregister;
	}

	cooling_dev = thermal_cooling_device_register("intel_powerclamp", NULL,
						&powerclamp_cooling_ops);
	if (IS_ERR(cooling_dev)) {
		retval = -ENODEV;
		goto exit_free_thread;
	}

	if (!duration)
		duration = jiffies_to_msecs(DEFAULT_DURATION_JIFFIES);

	powerclamp_create_debug_files();

	return 0;

exit_free_thread:
	free_percpu(worker_data);
exit_unregister:
	cpuhp_remove_state_nocalls(hp_state);
exit_free:
	bitmap_free(cpu_clamping_mask);
	return retval;
}
module_init(powerclamp_init);

static void __exit powerclamp_exit(void)
{
	end_power_clamp();
	cpuhp_remove_state_nocalls(hp_state);
	free_percpu(worker_data);
	thermal_cooling_device_unregister(cooling_dev);
	bitmap_free(cpu_clamping_mask);

	cancel_delayed_work_sync(&poll_pkg_cstate_work);
	debugfs_remove_recursive(debug_dir);
}
module_exit(powerclamp_exit);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>");
MODULE_AUTHOR("Jacob Pan <jacob.jun.pan@linux.intel.com>");
MODULE_DESCRIPTION("Package Level C-state Idle Injection for Intel CPUs");