/*
 * intel_powerclamp.c - package c-state idle injection
 *
 * Copyright (c) 2012, Intel Corporation.
 *
 * Authors:
 *     Arjan van de Ven <arjan@linux.intel.com>
 *     Jacob Pan <jacob.jun.pan@linux.intel.com>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
 *
 *
 *	TODO:
 *           1. better handle wakeups from external interrupts. currently a
 *              fixed compensation is added to the clamping duration when an
 *              excessive number of wakeups is observed during idle time. the
 *              reason is that for external interrupts which need no ack,
 *              clamping down the cpu in non-irq context does not reduce the
 *              irq rate. for the majority of cases, clamping down the cpu
 *              does help reduce irqs as well; we should be able to
 *              differentiate the two cases and give a quantitative solution
 *              for the irqs that we can control. perhaps based on
 *              get_cpu_iowait_time_us()
 *
 *	     2. synchronization with other hw blocks
 *
 *
 */

#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/cpu.h>
#include <linux/thermal.h>
#include <linux/slab.h>
#include <linux/tick.h>
#include <linux/debugfs.h>
#include <linux/seq_file.h>
#include <linux/sched/rt.h>

#include <asm/nmi.h>
#include <asm/msr.h>
#include <asm/mwait.h>
#include <asm/cpu_device_id.h>
#include <asm/idle.h>
#include <asm/hardirq.h>

#define MAX_TARGET_RATIO (50U)
/* For each undisturbed clamping period (no extra wakeups during idle time),
 * we increment the confidence counter for the given target ratio.
 * CONFIDENCE_OK defines the level where runtime calibration results are
 * valid.
 */
#define CONFIDENCE_OK (3)
/* Default idle injection duration; the driver adjusts the sleep time to meet
 * the target idle ratio. Similar to frequency modulation.
 */
#define DEFAULT_DURATION_JIFFIES (6)

static unsigned int target_mwait;
static struct dentry *debug_dir;

/* user selected target */
static unsigned int set_target_ratio;
static unsigned int current_ratio;
static bool should_skip;
static bool reduce_irq;
static atomic_t idle_wakeup_counter;
static unsigned int control_cpu; /* The cpu assigned to collect stats and
				  * update control parameters. Defaults to the
				  * BSP, but the BSP can be offlined.
				  */
static bool clamping;


static struct task_struct * __percpu *powerclamp_thread;
static struct thermal_cooling_device *cooling_dev;
static unsigned long *cpu_clamping_mask;  /* bit map for tracking per cpu
					   * clamping thread
					   */

static unsigned int duration;
static unsigned int pkg_cstate_ratio_cur;
static unsigned int window_size;

static int duration_set(const char *arg, const struct kernel_param *kp)
{
	int ret = 0;
	unsigned long new_duration;

	ret = kstrtoul(arg, 10, &new_duration);
	if (ret)
		goto exit;
	if (new_duration > 25 || new_duration < 6) {
		pr_err("Out of recommended range %lu, between 6-25ms\n",
			new_duration);
		ret = -EINVAL;
	}

	duration = clamp(new_duration, 6ul, 25ul);
	smp_mb();

exit:

	return ret;
}

static const struct kernel_param_ops duration_ops = {
	.set = duration_set,
	.get = param_get_int,
};


module_param_cb(duration, &duration_ops, &duration, 0644);
MODULE_PARM_DESC(duration, "forced idle time for each attempt in msec.");
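
/*
 * Illustrative runtime usage (standard module parameter sysfs path), e.g.:
 *   echo 10 > /sys/module/intel_powerclamp/parameters/duration
 * Writes outside the recommended 6-25 ms range are rejected by
 * duration_set() above.
 */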

struct powerclamp_calibration_data {
	unsigned long confidence;  /* used for calibration, basically a counter
				    * that gets incremented each time a clamping
				    * period is completed without extra wakeups.
				    * once that counter reaches a given level,
				    * compensation is deemed usable.
				    */
	unsigned long steady_comp; /* steady state compensation used when
				    * no extra wakeups occurred.
				    */
	unsigned long dynamic_comp; /* compensates for excessive wakeups from
				     * idle, mostly from external interrupts.
				     */
};

static struct powerclamp_calibration_data cal_data[MAX_TARGET_RATIO];

static int window_size_set(const char *arg, const struct kernel_param *kp)
{
	int ret = 0;
	unsigned long new_window_size;

	ret = kstrtoul(arg, 10, &new_window_size);
	if (ret)
		goto exit_win;
	if (new_window_size > 10 || new_window_size < 2) {
		pr_err("Out of recommended window size %lu, between 2-10\n",
			new_window_size);
		ret = -EINVAL;
	}

	window_size = clamp(new_window_size, 2ul, 10ul);
	smp_mb();

exit_win:

	return ret;
}

static const struct kernel_param_ops window_size_ops = {
	.set = window_size_set,
	.get = param_get_int,
};

module_param_cb(window_size, &window_size_ops, &window_size, 0644);
MODULE_PARM_DESC(window_size, "sliding window in number of clamping cycles\n"
	"\tpowerclamp controls the idle ratio within this window. a larger\n"
	"\twindow size results in slower response time but smoother\n"
	"\tclamping results. defaults to 2.");

static void find_target_mwait(void)
{
	unsigned int eax, ebx, ecx, edx;
	unsigned int highest_cstate = 0;
	unsigned int highest_subcstate = 0;
	int i;

	if (boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF)
		return;

	cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx);

	if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) ||
	    !(ecx & CPUID5_ECX_INTERRUPT_BREAK))
		return;

	edx >>= MWAIT_SUBSTATE_SIZE;
	for (i = 0; i < 7 && edx; i++, edx >>= MWAIT_SUBSTATE_SIZE) {
		if (edx & MWAIT_SUBSTATE_MASK) {
			highest_cstate = i;
			highest_subcstate = edx & MWAIT_SUBSTATE_MASK;
		}
	}
	target_mwait = (highest_cstate << MWAIT_SUBSTATE_SIZE) |
		(highest_subcstate - 1);

}
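
/*
 * Illustrative example (values assumed): if CPUID leaf 5 enumerates MWAIT
 * sub-states only up to its third extended C-state field, with two
 * sub-states reported there, the loop above ends with highest_cstate = 2
 * and highest_subcstate = 2, so target_mwait = (2 << 4) | 1 = 0x21, i.e.
 * the deepest MWAIT hint advertised by CPUID.
 */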

struct pkg_cstate_info {
	bool skip;
	int msr_index;
	int cstate_id;
};

#define PKG_CSTATE_INIT(id) {				\
		.msr_index = MSR_PKG_C##id##_RESIDENCY, \
		.cstate_id = id				\
			}

static struct pkg_cstate_info pkg_cstates[] = {
	PKG_CSTATE_INIT(2),
	PKG_CSTATE_INIT(3),
	PKG_CSTATE_INIT(6),
	PKG_CSTATE_INIT(7),
	PKG_CSTATE_INIT(8),
	PKG_CSTATE_INIT(9),
	PKG_CSTATE_INIT(10),
	{NULL},
};
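
/*
 * For reference, PKG_CSTATE_INIT(6) expands to
 *   { .msr_index = MSR_PKG_C6_RESIDENCY, .cstate_id = 6 },
 * so the table above simply lists the package C-state residency MSRs that
 * may exist on a given CPU; counters that fault on read are skipped at
 * runtime.
 */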

static bool has_pkg_state_counter(void)
{
	u64 val;
	struct pkg_cstate_info *info = pkg_cstates;

	/* check if any one of the counter msrs exists */
	while (info->msr_index) {
		if (!rdmsrl_safe(info->msr_index, &val))
			return true;
		info++;
	}

	return false;
}

static u64 pkg_state_counter(void)
{
	u64 val;
	u64 count = 0;
	struct pkg_cstate_info *info = pkg_cstates;

	while (info->msr_index) {
		if (!info->skip) {
			if (!rdmsrl_safe(info->msr_index, &val))
				count += val;
			else
				info->skip = true;
		}
		info++;
	}

	return count;
}

static void noop_timer(unsigned long foo)
{
	/* empty... just the fact that we get the interrupt wakes us up */
}

static unsigned int get_compensation(int ratio)
{
	unsigned int comp = 0;

	/* we only use compensation if all adjacent ones are good */
	if (ratio == 1 &&
		cal_data[ratio].confidence >= CONFIDENCE_OK &&
		cal_data[ratio + 1].confidence >= CONFIDENCE_OK &&
		cal_data[ratio + 2].confidence >= CONFIDENCE_OK) {
		comp = (cal_data[ratio].steady_comp +
			cal_data[ratio + 1].steady_comp +
			cal_data[ratio + 2].steady_comp) / 3;
	} else if (ratio == MAX_TARGET_RATIO - 1 &&
		cal_data[ratio].confidence >= CONFIDENCE_OK &&
		cal_data[ratio - 1].confidence >= CONFIDENCE_OK &&
		cal_data[ratio - 2].confidence >= CONFIDENCE_OK) {
		comp = (cal_data[ratio].steady_comp +
			cal_data[ratio - 1].steady_comp +
			cal_data[ratio - 2].steady_comp) / 3;
	} else if (cal_data[ratio].confidence >= CONFIDENCE_OK &&
		cal_data[ratio - 1].confidence >= CONFIDENCE_OK &&
		cal_data[ratio + 1].confidence >= CONFIDENCE_OK) {
		comp = (cal_data[ratio].steady_comp +
			cal_data[ratio - 1].steady_comp +
			cal_data[ratio + 1].steady_comp) / 3;
	}

	/* REVISIT: simple penalty of double idle injection */
	if (reduce_irq)
		comp = ratio;
	/* do not exceed limit */
	if (comp + ratio >= MAX_TARGET_RATIO)
		comp = MAX_TARGET_RATIO - ratio - 1;

	return comp;
}
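
/*
 * Illustrative numbers (assumed): for ratio = 30 with cal_data[29..31] all
 * at or above CONFIDENCE_OK and steady_comp values of 4, 5 and 6, the last
 * branch above averages the three neighbours and returns (5 + 4 + 6) / 3 = 5
 * extra percentage points of injected idle on top of the target ratio.
 */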

static void adjust_compensation(int target_ratio, unsigned int win)
{
	int delta;
	struct powerclamp_calibration_data *d = &cal_data[target_ratio];

	/*
	 * Do not adjust compensation if the confidence level has already been
	 * reached, or if there were too many wakeups during the last idle
	 * injection period; in that case we cannot trust the data.
	 */
	if (d->confidence >= CONFIDENCE_OK ||
		atomic_read(&idle_wakeup_counter) >
		win * num_online_cpus())
		return;

	delta = set_target_ratio - current_ratio;
	/* filter out bad data */
	if (delta >= 0 && delta <= (1+target_ratio/10)) {
		if (d->steady_comp)
			d->steady_comp =
				roundup(delta+d->steady_comp, 2)/2;
		else
			d->steady_comp = delta;
		d->confidence++;
	}
}

static bool powerclamp_adjust_controls(unsigned int target_ratio,
				unsigned int guard, unsigned int win)
{
	static u64 msr_last, tsc_last;
	u64 msr_now, tsc_now;
	u64 val64;

	/* check result for the last window */
	msr_now = pkg_state_counter();
	tsc_now = rdtsc();

	/* calculate pkg cstate vs tsc ratio */
	if (!msr_last || !tsc_last)
		current_ratio = 1;
	else if (tsc_now-tsc_last) {
		val64 = 100*(msr_now-msr_last);
		do_div(val64, (tsc_now-tsc_last));
		current_ratio = val64;
	}

	/* update record */
	msr_last = msr_now;
	tsc_last = tsc_now;

	adjust_compensation(target_ratio, win);
	/*
	 * too many external interrupts; set the flag so that we can take
	 * measures later.
	 */
	reduce_irq = atomic_read(&idle_wakeup_counter) >=
		2 * win * num_online_cpus();

	atomic_set(&idle_wakeup_counter, 0);
	/* if we are above target+guard, skip */
	return set_target_ratio + guard <= current_ratio;
}
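
/*
 * Illustrative numbers (assumed): if the package C-state residency counters
 * advanced by 250,000 cycles while the TSC advanced by 1,000,000 cycles over
 * the last window, current_ratio = 100 * 250000 / 1000000 = 25, i.e. the
 * package spent roughly 25% of the window in package C-states.
 */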

static int clamp_thread(void *arg)
{
	int cpunr = (unsigned long)arg;
	DEFINE_TIMER(wakeup_timer, noop_timer, 0, 0);
	static const struct sched_param param = {
		.sched_priority = MAX_USER_RT_PRIO/2,
	};
	unsigned int count = 0;
	unsigned int target_ratio;

	set_bit(cpunr, cpu_clamping_mask);
	set_freezable();
	init_timer_on_stack(&wakeup_timer);
	sched_setscheduler(current, SCHED_FIFO, &param);

	while (true == clamping && !kthread_should_stop() &&
		cpu_online(cpunr)) {
		int sleeptime;
		unsigned long target_jiffies;
		unsigned int guard;
		unsigned int compensation = 0;
		int interval; /* jiffies to sleep for each attempt */
		unsigned int duration_jiffies = msecs_to_jiffies(duration);
		unsigned int window_size_now;

		try_to_freeze();
		/*
		 * make sure the user selected ratio does not take effect until
		 * the next round. adjust target_ratio if the user has changed
		 * the target, so that we can converge quickly.
		 */
		target_ratio = set_target_ratio;
		guard = 1 + target_ratio/20;
		window_size_now = window_size;
		count++;

		/*
		 * systems may differ in their ability to enter package level
		 * c-states, thus we need to compensate the injected idle ratio
		 * to achieve the actual target reported by the HW.
		 */
		compensation = get_compensation(target_ratio);
		interval = duration_jiffies*100/(target_ratio+compensation);

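		/*
		 * Illustrative numbers (assumed): with duration = 6 jiffies,
		 * a target ratio of 25% and a compensation of 2, the interval
		 * works out to 6 * 100 / (25 + 2) = 22 jiffies, so roughly 6
		 * of every 22 jiffies are spent in injected idle.
		 */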
		/* align idle time */
		target_jiffies = roundup(jiffies, interval);
		sleeptime = target_jiffies - jiffies;
		if (sleeptime <= 0)
			sleeptime = 1;
		schedule_timeout_interruptible(sleeptime);
		/*
		 * only the elected controlling cpu can collect stats and
		 * update control parameters.
		 */
		if (cpunr == control_cpu && !(count%window_size_now)) {
			should_skip =
				powerclamp_adjust_controls(target_ratio,
							guard, window_size_now);
			smp_mb();
		}

		if (should_skip)
			continue;

		target_jiffies = jiffies + duration_jiffies;
		mod_timer(&wakeup_timer, target_jiffies);
		if (unlikely(local_softirq_pending()))
			continue;
		/*
		 * stop tick sched during idle time; interrupts are still
		 * allowed, thus jiffies are updated properly.
		 */
		preempt_disable();
		/* mwait until target jiffies is reached */
		while (time_before(jiffies, target_jiffies)) {
			unsigned long ecx = 1;
			unsigned long eax = target_mwait;

			/*
			 * REVISIT: may call enter_idle() to notify drivers who
			 * can save power during cpu idle. same for exit_idle()
			 */
			local_touch_nmi();
			stop_critical_timings();
			mwait_idle_with_hints(eax, ecx);
			start_critical_timings();
			atomic_inc(&idle_wakeup_counter);
		}
		preempt_enable();
	}
	del_timer_sync(&wakeup_timer);
	clear_bit(cpunr, cpu_clamping_mask);

	return 0;
}

/*
 * 1 HZ polling while clamping is active, useful for userspace
 * to monitor actual idle ratio.
 */
static void poll_pkg_cstate(struct work_struct *dummy);
static DECLARE_DELAYED_WORK(poll_pkg_cstate_work, poll_pkg_cstate);
static void poll_pkg_cstate(struct work_struct *dummy)
{
	static u64 msr_last;
	static u64 tsc_last;
	static unsigned long jiffies_last;

	u64 msr_now;
	unsigned long jiffies_now;
	u64 tsc_now;
	u64 val64;

	msr_now = pkg_state_counter();
	tsc_now = rdtsc();
	jiffies_now = jiffies;

	/* calculate pkg cstate vs tsc ratio */
	if (!msr_last || !tsc_last)
		pkg_cstate_ratio_cur = 1;
	else {
		if (tsc_now - tsc_last) {
			val64 = 100 * (msr_now - msr_last);
			do_div(val64, (tsc_now - tsc_last));
			pkg_cstate_ratio_cur = val64;
		}
	}

	/* update record */
	msr_last = msr_now;
	jiffies_last = jiffies_now;
	tsc_last = tsc_now;

	if (true == clamping)
		schedule_delayed_work(&poll_pkg_cstate_work, HZ);
}

static int start_power_clamp(void)
{
	unsigned long cpu;
	struct task_struct *thread;

	/* abort if no package C-state residency counter is available */
	if (!has_pkg_state_counter()) {
		pr_err("pkg cstate counter not functional, abort\n");
		return -EINVAL;
	}

	set_target_ratio = clamp(set_target_ratio, 0U, MAX_TARGET_RATIO - 1);
	/* prevent cpu hotplug */
	get_online_cpus();

	/* prefer BSP */
	control_cpu = 0;
	if (!cpu_online(control_cpu))
		control_cpu = smp_processor_id();

	clamping = true;
	schedule_delayed_work(&poll_pkg_cstate_work, 0);

	/* start one thread per online cpu */
	for_each_online_cpu(cpu) {
		struct task_struct **p =
			per_cpu_ptr(powerclamp_thread, cpu);

		thread = kthread_create_on_node(clamp_thread,
						(void *) cpu,
						cpu_to_node(cpu),
						"kidle_inject/%ld", cpu);
		/* bind to cpu here */
		if (likely(!IS_ERR(thread))) {
			kthread_bind(thread, cpu);
			wake_up_process(thread);
			*p = thread;
		}

	}
	put_online_cpus();

	return 0;
}

static void end_power_clamp(void)
{
	int i;
	struct task_struct *thread;

	clamping = false;
	/*
	 * Make the change to clamping visible to other cpus and give the
	 * per-cpu clamping threads some time to exit; any that remain are
	 * killed below.
	 */
	smp_mb();
	msleep(20);
	if (bitmap_weight(cpu_clamping_mask, num_possible_cpus())) {
		for_each_set_bit(i, cpu_clamping_mask, num_possible_cpus()) {
			pr_debug("clamping thread for cpu %d alive, kill\n", i);
			thread = *per_cpu_ptr(powerclamp_thread, i);
			kthread_stop(thread);
		}
	}
}

static int powerclamp_cpu_callback(struct notifier_block *nfb,
				unsigned long action, void *hcpu)
{
	unsigned long cpu = (unsigned long)hcpu;
	struct task_struct *thread;
	struct task_struct **percpu_thread =
		per_cpu_ptr(powerclamp_thread, cpu);

	if (false == clamping)
		goto exit_ok;

	switch (action) {
	case CPU_ONLINE:
		thread = kthread_create_on_node(clamp_thread,
						(void *) cpu,
						cpu_to_node(cpu),
						"kidle_inject/%lu", cpu);
		if (likely(!IS_ERR(thread))) {
			kthread_bind(thread, cpu);
			wake_up_process(thread);
			*percpu_thread = thread;
		}
		/* prefer BSP as controlling CPU */
		if (cpu == 0) {
			control_cpu = 0;
			smp_mb();
		}
		break;
	case CPU_DEAD:
		if (test_bit(cpu, cpu_clamping_mask)) {
			pr_err("cpu %lu dead but powerclamping thread is not\n",
				cpu);
			kthread_stop(*percpu_thread);
		}
		if (cpu == control_cpu) {
			control_cpu = smp_processor_id();
			smp_mb();
		}
	}

exit_ok:
	return NOTIFY_OK;
}

static struct notifier_block powerclamp_cpu_notifier = {
	.notifier_call = powerclamp_cpu_callback,
};

static int powerclamp_get_max_state(struct thermal_cooling_device *cdev,
				 unsigned long *state)
{
	*state = MAX_TARGET_RATIO;

	return 0;
}

static int powerclamp_get_cur_state(struct thermal_cooling_device *cdev,
				 unsigned long *state)
{
	if (true == clamping)
		*state = pkg_cstate_ratio_cur;
	else
		/* to save power, do not poll idle ratio while not clamping */
		*state = -1; /* indicates invalid state */

	return 0;
}

static int powerclamp_set_cur_state(struct thermal_cooling_device *cdev,
				 unsigned long new_target_ratio)
{
	int ret = 0;

	new_target_ratio = clamp(new_target_ratio, 0UL,
				(unsigned long) (MAX_TARGET_RATIO-1));
	if (set_target_ratio == 0 && new_target_ratio > 0) {
		pr_info("Start idle injection to reduce power\n");
		set_target_ratio = new_target_ratio;
		ret = start_power_clamp();
		goto exit_set;
	} else	if (set_target_ratio > 0 && new_target_ratio == 0) {
		pr_info("Stop forced idle injection\n");
		set_target_ratio = 0;
		end_power_clamp();
	} else	/* adjust currently running */ {
		set_target_ratio = new_target_ratio;
		/* make new set_target_ratio visible to other cpus */
		smp_mb();
	}

exit_set:
	return ret;
}

/* bind to generic thermal layer as cooling device */
static struct thermal_cooling_device_ops powerclamp_cooling_ops = {
	.get_max_state = powerclamp_get_max_state,
	.get_cur_state = powerclamp_get_cur_state,
	.set_cur_state = powerclamp_set_cur_state,
};
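
/*
 * Illustrative usage via the generic thermal sysfs interface (device index
 * X depends on the system): the cooling device registered below appears as
 * /sys/class/thermal/cooling_deviceX with type "intel_powerclamp"; writing
 * e.g. 25 to its cur_state file requests 25% injected idle time, and
 * writing 0 stops the injection.
 */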

/* runs on Nehalem and later */
static const struct x86_cpu_id intel_powerclamp_ids[] __initconst = {
	{ X86_VENDOR_INTEL, 6, 0x1a},
	{ X86_VENDOR_INTEL, 6, 0x1c},
	{ X86_VENDOR_INTEL, 6, 0x1e},
	{ X86_VENDOR_INTEL, 6, 0x1f},
	{ X86_VENDOR_INTEL, 6, 0x25},
	{ X86_VENDOR_INTEL, 6, 0x26},
	{ X86_VENDOR_INTEL, 6, 0x2a},
	{ X86_VENDOR_INTEL, 6, 0x2c},
	{ X86_VENDOR_INTEL, 6, 0x2d},
	{ X86_VENDOR_INTEL, 6, 0x2e},
	{ X86_VENDOR_INTEL, 6, 0x2f},
	{ X86_VENDOR_INTEL, 6, 0x37},
	{ X86_VENDOR_INTEL, 6, 0x3a},
	{ X86_VENDOR_INTEL, 6, 0x3c},
	{ X86_VENDOR_INTEL, 6, 0x3d},
	{ X86_VENDOR_INTEL, 6, 0x3e},
	{ X86_VENDOR_INTEL, 6, 0x3f},
	{ X86_VENDOR_INTEL, 6, 0x45},
	{ X86_VENDOR_INTEL, 6, 0x46},
	{ X86_VENDOR_INTEL, 6, 0x47},
	{ X86_VENDOR_INTEL, 6, 0x4c},
	{ X86_VENDOR_INTEL, 6, 0x4d},
	{ X86_VENDOR_INTEL, 6, 0x4e},
	{ X86_VENDOR_INTEL, 6, 0x4f},
	{ X86_VENDOR_INTEL, 6, 0x56},
	{ X86_VENDOR_INTEL, 6, 0x57},
	{ X86_VENDOR_INTEL, 6, 0x5e},
	{}
};
MODULE_DEVICE_TABLE(x86cpu, intel_powerclamp_ids);

static int __init powerclamp_probe(void)
{
	if (!x86_match_cpu(intel_powerclamp_ids)) {
		pr_err("Intel powerclamp does not run on family %d model %d\n",
				boot_cpu_data.x86, boot_cpu_data.x86_model);
		return -ENODEV;
	}
	if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC) ||
		!boot_cpu_has(X86_FEATURE_CONSTANT_TSC) ||
		!boot_cpu_has(X86_FEATURE_MWAIT) ||
		!boot_cpu_has(X86_FEATURE_ARAT))
		return -ENODEV;

	/* find the deepest mwait value */
	find_target_mwait();

	return 0;
}

static int powerclamp_debug_show(struct seq_file *m, void *unused)
{
	int i = 0;

	seq_printf(m, "controlling cpu: %d\n", control_cpu);
	seq_printf(m, "pct confidence steady dynamic (compensation)\n");
	for (i = 0; i < MAX_TARGET_RATIO; i++) {
		seq_printf(m, "%d\t%lu\t%lu\t%lu\n",
			i,
			cal_data[i].confidence,
			cal_data[i].steady_comp,
			cal_data[i].dynamic_comp);
	}

	return 0;
}
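
/*
 * Illustrative output of the debugfs file created below (numbers assumed),
 * one tab-separated line per target ratio pct:
 *   controlling cpu: 0
 *   pct confidence steady dynamic (compensation)
 *   0       0       0       0
 *   ...
 *   25      3       2       0
 */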

static int powerclamp_debug_open(struct inode *inode,
			struct file *file)
{
	return single_open(file, powerclamp_debug_show, inode->i_private);
}

static const struct file_operations powerclamp_debug_fops = {
	.open		= powerclamp_debug_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
	.owner		= THIS_MODULE,
};

static inline void powerclamp_create_debug_files(void)
{
	debug_dir = debugfs_create_dir("intel_powerclamp", NULL);
	if (!debug_dir)
		return;

	if (!debugfs_create_file("powerclamp_calib", S_IRUGO, debug_dir,
					cal_data, &powerclamp_debug_fops))
		goto file_error;

	return;

file_error:
	debugfs_remove_recursive(debug_dir);
}

static int __init powerclamp_init(void)
{
	int retval;
	int bitmap_size;

	bitmap_size = BITS_TO_LONGS(num_possible_cpus()) * sizeof(long);
	cpu_clamping_mask = kzalloc(bitmap_size, GFP_KERNEL);
	if (!cpu_clamping_mask)
		return -ENOMEM;

	/* probe cpu features and ids here */
	retval = powerclamp_probe();
	if (retval)
		goto exit_free;

	/* set default limit, maybe adjusted during runtime based on feedback */
	window_size = 2;
	register_hotcpu_notifier(&powerclamp_cpu_notifier);

	powerclamp_thread = alloc_percpu(struct task_struct *);
	if (!powerclamp_thread) {
		retval = -ENOMEM;
		goto exit_unregister;
	}

	cooling_dev = thermal_cooling_device_register("intel_powerclamp", NULL,
						&powerclamp_cooling_ops);
	if (IS_ERR(cooling_dev)) {
		retval = -ENODEV;
		goto exit_free_thread;
	}

	if (!duration)
		duration = jiffies_to_msecs(DEFAULT_DURATION_JIFFIES);

	powerclamp_create_debug_files();

	return 0;

exit_free_thread:
	free_percpu(powerclamp_thread);
exit_unregister:
	unregister_hotcpu_notifier(&powerclamp_cpu_notifier);
exit_free:
	kfree(cpu_clamping_mask);
	return retval;
}
module_init(powerclamp_init);

static void __exit powerclamp_exit(void)
{
	unregister_hotcpu_notifier(&powerclamp_cpu_notifier);
	end_power_clamp();
	free_percpu(powerclamp_thread);
	thermal_cooling_device_unregister(cooling_dev);
	kfree(cpu_clamping_mask);

	cancel_delayed_work_sync(&poll_pkg_cstate_work);
	debugfs_remove_recursive(debug_dir);
}
module_exit(powerclamp_exit);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>");
MODULE_AUTHOR("Jacob Pan <jacob.jun.pan@linux.intel.com>");
MODULE_DESCRIPTION("Package Level C-state Idle Injection for Intel CPUs");