Linux Audio

Check our new training course

Linux kernel drivers training

May 6-19, 2025
Register
Loading...
Note: File does not exist in v3.1.
  1// SPDX-License-Identifier: GPL-2.0-only
  2/*
  3 * x86_pkg_temp_thermal driver
  4 * Copyright (c) 2013, Intel Corporation.
  5 */
  6#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  7
  8#include <linux/module.h>
  9#include <linux/init.h>
 10#include <linux/err.h>
 11#include <linux/param.h>
 12#include <linux/device.h>
 13#include <linux/platform_device.h>
 14#include <linux/cpu.h>
 15#include <linux/smp.h>
 16#include <linux/slab.h>
 17#include <linux/pm.h>
 18#include <linux/thermal.h>
 19#include <linux/debugfs.h>
 20
 21#include <asm/cpu_device_id.h>
 22
 23#include "thermal_interrupt.h"
 24
/*
 * Rate control delay: the idea is to introduce a debounce effect.
 * This should be long enough to avoid redundant events when a
 * threshold is set to a temperature which is constantly violated,
 * but short enough to take any action. The action can be removing
 * the threshold or changing it to the next interesting setting.
 * Based on experiments, around every 5 seconds under load will
 * give us a significant temperature change.
 */
 35#define PKG_TEMP_THERMAL_NOTIFY_DELAY	5000
 36static int notify_delay_ms = PKG_TEMP_THERMAL_NOTIFY_DELAY;
 37module_param(notify_delay_ms, int, 0644);
 38MODULE_PARM_DESC(notify_delay_ms,
 39	"User space notification delay in milli seconds.");
 40
/*
 * Number of trip points in the thermal zone. Currently it can't
 * be more than 2. The MSR can allow setting and getting notifications
 * for only 2 thresholds. This define enforces this, in case cpuid
 * returns a wrong value for the number of thresholds.
 */
 46#define MAX_NUMBER_OF_TRIPS	2
 47
 48struct zone_device {
 49	int				cpu;
 50	bool				work_scheduled;
 51	u32				tj_max;
 52	u32				msr_pkg_therm_low;
 53	u32				msr_pkg_therm_high;
 54	struct delayed_work		work;
 55	struct thermal_zone_device	*tzone;
 56	struct cpumask			cpumask;
 57};
 58
 59static struct thermal_zone_params pkg_temp_tz_params = {
 60	.no_hwmon	= true,
 61};
 62
 63/* Keep track of how many zone pointers we allocated in init() */
 64static int max_id __read_mostly;
 65/* Array of zone pointers */
 66static struct zone_device **zones;
 67/* Serializes interrupt notification, work and hotplug */
 68static DEFINE_RAW_SPINLOCK(pkg_temp_lock);
 69/* Protects zone operation in the work function against hotplug removal */
 70static DEFINE_MUTEX(thermal_zone_mutex);
 71
 72/* The dynamically assigned cpu hotplug state for module_exit() */
 73static enum cpuhp_state pkg_thermal_hp_state __read_mostly;
 74
 75/* Debug counters to show using debugfs */
 76static struct dentry *debugfs;
 77static unsigned int pkg_interrupt_cnt;
 78static unsigned int pkg_work_cnt;
 79
 80static void pkg_temp_debugfs_init(void)
 81{
 82	debugfs = debugfs_create_dir("pkg_temp_thermal", NULL);
 83
 84	debugfs_create_u32("pkg_thres_interrupt", S_IRUGO, debugfs,
 85			   &pkg_interrupt_cnt);
 86	debugfs_create_u32("pkg_thres_work", S_IRUGO, debugfs,
 87			   &pkg_work_cnt);
 88}
 89
 90/*
 91 * Protection:
 92 *
 93 * - cpu hotplug: Read serialized by cpu hotplug lock
 94 *		  Write must hold pkg_temp_lock
 95 *
 96 * - Other callsites: Must hold pkg_temp_lock
 97 */
 98static struct zone_device *pkg_temp_thermal_get_dev(unsigned int cpu)
 99{
100	int id = topology_logical_die_id(cpu);
101
102	if (id >= 0 && id < max_id)
103		return zones[id];
104	return NULL;
105}
106
/*
 * tj-max is interesting because the threshold is set relative to this
 * temperature.
 */
111static int get_tj_max(int cpu, u32 *tj_max)
112{
113	u32 eax, edx, val;
114	int err;
115
116	err = rdmsr_safe_on_cpu(cpu, MSR_IA32_TEMPERATURE_TARGET, &eax, &edx);
117	if (err)
118		return err;
119
120	val = (eax >> 16) & 0xff;
121	*tj_max = val * 1000;
122
123	return val ? 0 : -EINVAL;
124}
125
126static int sys_get_curr_temp(struct thermal_zone_device *tzd, int *temp)
127{
128	struct zone_device *zonedev = tzd->devdata;
129	u32 eax, edx;
130
131	rdmsr_on_cpu(zonedev->cpu, MSR_IA32_PACKAGE_THERM_STATUS,
132			&eax, &edx);
133	if (eax & 0x80000000) {
134		*temp = zonedev->tj_max - ((eax >> 16) & 0x7f) * 1000;
135		pr_debug("sys_get_curr_temp %d\n", *temp);
136		return 0;
137	}
138	return -EINVAL;
139}
140
141static int sys_get_trip_temp(struct thermal_zone_device *tzd,
142			     int trip, int *temp)
143{
144	struct zone_device *zonedev = tzd->devdata;
145	unsigned long thres_reg_value;
146	u32 mask, shift, eax, edx;
147	int ret;
148
149	if (trip >= MAX_NUMBER_OF_TRIPS)
150		return -EINVAL;
151
152	if (trip) {
153		mask = THERM_MASK_THRESHOLD1;
154		shift = THERM_SHIFT_THRESHOLD1;
155	} else {
156		mask = THERM_MASK_THRESHOLD0;
157		shift = THERM_SHIFT_THRESHOLD0;
158	}
159
160	ret = rdmsr_on_cpu(zonedev->cpu, MSR_IA32_PACKAGE_THERM_INTERRUPT,
161			   &eax, &edx);
162	if (ret < 0)
163		return ret;
164
165	thres_reg_value = (eax & mask) >> shift;
166	if (thres_reg_value)
167		*temp = zonedev->tj_max - thres_reg_value * 1000;
168	else
169		*temp = THERMAL_TEMP_INVALID;
170	pr_debug("sys_get_trip_temp %d\n", *temp);
171
172	return 0;
173}
174
175static int
176sys_set_trip_temp(struct thermal_zone_device *tzd, int trip, int temp)
177{
178	struct zone_device *zonedev = tzd->devdata;
179	u32 l, h, mask, shift, intr;
180	int ret;
181
182	if (trip >= MAX_NUMBER_OF_TRIPS || temp >= zonedev->tj_max)
183		return -EINVAL;
184
185	ret = rdmsr_on_cpu(zonedev->cpu, MSR_IA32_PACKAGE_THERM_INTERRUPT,
186			   &l, &h);
187	if (ret < 0)
188		return ret;
189
190	if (trip) {
191		mask = THERM_MASK_THRESHOLD1;
192		shift = THERM_SHIFT_THRESHOLD1;
193		intr = THERM_INT_THRESHOLD1_ENABLE;
194	} else {
195		mask = THERM_MASK_THRESHOLD0;
196		shift = THERM_SHIFT_THRESHOLD0;
197		intr = THERM_INT_THRESHOLD0_ENABLE;
198	}
199	l &= ~mask;
200	/*
201	* When users space sets a trip temperature == 0, which is indication
202	* that, it is no longer interested in receiving notifications.
203	*/
204	if (!temp) {
205		l &= ~intr;
206	} else {
207		l |= (zonedev->tj_max - temp)/1000 << shift;
208		l |= intr;
209	}
210
211	return wrmsr_on_cpu(zonedev->cpu, MSR_IA32_PACKAGE_THERM_INTERRUPT,
212			l, h);
213}
214
/*
 * Thermal zone callback: both package thresholds are exposed as
 * passive trip points; this zone has no active cooling.
 */
static int sys_get_trip_type(struct thermal_zone_device *thermal, int trip,
			     enum thermal_trip_type *type)
{
	*type = THERMAL_TRIP_PASSIVE;
	return 0;
}
221
/*
 * Thermal zone callback registry: current temperature readout plus
 * get/set of the two MSR-backed trip point thresholds.
 */
static struct thermal_zone_device_ops tzone_ops = {
	.get_temp = sys_get_curr_temp,
	.get_trip_temp = sys_get_trip_temp,
	.get_trip_type = sys_get_trip_type,
	.set_trip_temp = sys_set_trip_temp,
};
229
/*
 * Hooked up as platform_thermal_package_rate_control in init().
 * Returning true tells the core thermal interrupt code that this
 * driver does its own rate limiting (the notify_delay_ms debounce
 * via delayed work).
 */
static bool pkg_thermal_rate_control(void)
{
	return true;
}
234
235/* Enable threshold interrupt on local package/cpu */
236static inline void enable_pkg_thres_interrupt(void)
237{
238	u8 thres_0, thres_1;
239	u32 l, h;
240
241	rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
242	/* only enable/disable if it had valid threshold value */
243	thres_0 = (l & THERM_MASK_THRESHOLD0) >> THERM_SHIFT_THRESHOLD0;
244	thres_1 = (l & THERM_MASK_THRESHOLD1) >> THERM_SHIFT_THRESHOLD1;
245	if (thres_0)
246		l |= THERM_INT_THRESHOLD0_ENABLE;
247	if (thres_1)
248		l |= THERM_INT_THRESHOLD1_ENABLE;
249	wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
250}
251
252/* Disable threshold interrupt on local package/cpu */
253static inline void disable_pkg_thres_interrupt(void)
254{
255	u32 l, h;
256
257	rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
258
259	l &= ~(THERM_INT_THRESHOLD0_ENABLE | THERM_INT_THRESHOLD1_ENABLE);
260	wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
261}
262
/*
 * Deferred handler for a package threshold interrupt. Runs on a CPU
 * of the affected package after the notify_delay_ms debounce delay:
 * clears the threshold log bits, re-enables the threshold interrupt
 * (disabled in pkg_thermal_notify()) and notifies the thermal core.
 */
static void pkg_temp_thermal_threshold_work_fn(struct work_struct *work)
{
	struct thermal_zone_device *tzone = NULL;
	int cpu = smp_processor_id();
	struct zone_device *zonedev;

	/*
	 * thermal_zone_mutex keeps zonedev->tzone alive across the
	 * thermal_zone_device_update() call below; pkg_temp_lock
	 * serializes against the interrupt notifier and hotplug.
	 */
	mutex_lock(&thermal_zone_mutex);
	raw_spin_lock_irq(&pkg_temp_lock);
	++pkg_work_cnt;

	zonedev = pkg_temp_thermal_get_dev(cpu);
	if (!zonedev) {
		/* The package went away via hotplug while we were queued */
		raw_spin_unlock_irq(&pkg_temp_lock);
		mutex_unlock(&thermal_zone_mutex);
		return;
	}
	/* Allow pkg_thermal_notify() to queue us again */
	zonedev->work_scheduled = false;

	thermal_clear_package_intr_status(PACKAGE_LEVEL, THERM_LOG_THRESHOLD0 | THERM_LOG_THRESHOLD1);
	tzone = zonedev->tzone;

	enable_pkg_thres_interrupt();
	raw_spin_unlock_irq(&pkg_temp_lock);

	/*
	 * If tzone is not NULL, then thermal_zone_mutex will prevent the
	 * concurrent removal in the cpu offline callback.
	 */
	if (tzone)
		thermal_zone_device_update(tzone, THERMAL_EVENT_UNSPECIFIED);

	mutex_unlock(&thermal_zone_mutex);
}
296
297static void pkg_thermal_schedule_work(int cpu, struct delayed_work *work)
298{
299	unsigned long ms = msecs_to_jiffies(notify_delay_ms);
300
301	schedule_delayed_work_on(cpu, work, ms);
302}
303
/*
 * Package thermal interrupt notifier, hooked up as
 * platform_thermal_package_notify in init(). May run in interrupt
 * context (hence the irqsave lock): it disables the threshold
 * interrupt and defers the real handling to the debounced work,
 * which re-enables the interrupt when done.
 */
static int pkg_thermal_notify(u64 msr_val)
{
	int cpu = smp_processor_id();
	struct zone_device *zonedev;
	unsigned long flags;

	raw_spin_lock_irqsave(&pkg_temp_lock, flags);
	++pkg_interrupt_cnt;

	disable_pkg_thres_interrupt();

	/* Work is per package, so scheduling it once is enough. */
	zonedev = pkg_temp_thermal_get_dev(cpu);
	if (zonedev && !zonedev->work_scheduled) {
		zonedev->work_scheduled = true;
		pkg_thermal_schedule_work(zonedev->cpu, &zonedev->work);
	}

	raw_spin_unlock_irqrestore(&pkg_temp_lock, flags);
	return 0;
}
325
/*
 * Create and register the thermal zone for the package/die containing
 * @cpu. Called from pkg_thermal_cpu_online() for the first online CPU
 * of a package. Returns 0 on success or a negative error code.
 */
static int pkg_temp_thermal_device_add(unsigned int cpu)
{
	int id = topology_logical_die_id(cpu);
	u32 tj_max, eax, ebx, ecx, edx;
	struct zone_device *zonedev;
	int thres_count, err;

	/* Should not happen: the zones array was sized for max_id in init() */
	if (id >= max_id)
		return -ENOMEM;

	/* Low bits of CPUID(6).EBX report the number of thresholds */
	cpuid(6, &eax, &ebx, &ecx, &edx);
	thres_count = ebx & 0x07;
	if (!thres_count)
		return -ENODEV;

	/* The MSR supports at most 2 thresholds, whatever cpuid claims */
	thres_count = clamp_val(thres_count, 0, MAX_NUMBER_OF_TRIPS);

	err = get_tj_max(cpu, &tj_max);
	if (err)
		return err;

	zonedev = kzalloc(sizeof(*zonedev), GFP_KERNEL);
	if (!zonedev)
		return -ENOMEM;

	INIT_DELAYED_WORK(&zonedev->work, pkg_temp_thermal_threshold_work_fn);
	zonedev->cpu = cpu;
	zonedev->tj_max = tj_max;
	/* Writable-trips mask: 0x03 for two trips, 0x01 for one */
	zonedev->tzone = thermal_zone_device_register("x86_pkg_temp",
			thres_count,
			(thres_count == MAX_NUMBER_OF_TRIPS) ? 0x03 : 0x01,
			zonedev, &tzone_ops, &pkg_temp_tz_params, 0, 0);
	if (IS_ERR(zonedev->tzone)) {
		err = PTR_ERR(zonedev->tzone);
		kfree(zonedev);
		return err;
	}
	err = thermal_zone_device_enable(zonedev->tzone);
	if (err) {
		thermal_zone_device_unregister(zonedev->tzone);
		kfree(zonedev);
		return err;
	}
	/* Store MSR value for package thermal interrupt, to restore at exit */
	rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, zonedev->msr_pkg_therm_low,
	      zonedev->msr_pkg_therm_high);

	cpumask_set_cpu(cpu, &zonedev->cpumask);
	/* Publish the zone under the lock so notify/work can find it */
	raw_spin_lock_irq(&pkg_temp_lock);
	zones[id] = zonedev;
	raw_spin_unlock_irq(&pkg_temp_lock);
	return 0;
}
379
/*
 * CPU hotplug offline callback. If other CPUs of the package remain
 * online, migrate the zone's target CPU; if this is the last CPU of
 * the package, unregister the thermal zone, restore the saved
 * interrupt MSR and free the zone. The lock drop/retake around
 * cancel_delayed_work_sync() is deliberate - see the comments below.
 */
static int pkg_thermal_cpu_offline(unsigned int cpu)
{
	struct zone_device *zonedev = pkg_temp_thermal_get_dev(cpu);
	bool lastcpu, was_target;
	int target;

	if (!zonedev)
		return 0;

	target = cpumask_any_but(&zonedev->cpumask, cpu);
	cpumask_clear_cpu(cpu, &zonedev->cpumask);
	/* cpumask_any_but() returns >= nr_cpu_ids when no other CPU is left */
	lastcpu = target >= nr_cpu_ids;
	/*
	 * Remove the sysfs files, if this is the last cpu in the package
	 * before doing further cleanups.
	 */
	if (lastcpu) {
		struct thermal_zone_device *tzone = zonedev->tzone;

		/*
		 * We must protect against a work function calling
		 * thermal_zone_update, after/while unregister. We null out
		 * the pointer under the zone mutex, so the worker function
		 * won't try to call.
		 */
		mutex_lock(&thermal_zone_mutex);
		zonedev->tzone = NULL;
		mutex_unlock(&thermal_zone_mutex);

		thermal_zone_device_unregister(tzone);
	}

	/* Protect against work and interrupts */
	raw_spin_lock_irq(&pkg_temp_lock);

	/*
	 * Check whether this cpu was the current target and store the new
	 * one. When we drop the lock, then the interrupt notify function
	 * will see the new target.
	 */
	was_target = zonedev->cpu == cpu;
	zonedev->cpu = target;

	/*
	 * If this is the last CPU in the package remove the package
	 * reference from the array and restore the interrupt MSR. When we
	 * drop the lock neither the interrupt notify function nor the
	 * worker will see the package anymore.
	 */
	if (lastcpu) {
		zones[topology_logical_die_id(cpu)] = NULL;
		/* After this point nothing touches the MSR anymore. */
		wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT,
		      zonedev->msr_pkg_therm_low, zonedev->msr_pkg_therm_high);
	}

	/*
	 * Check whether there is work scheduled and whether the work is
	 * targeted at the outgoing CPU.
	 */
	if (zonedev->work_scheduled && was_target) {
		/*
		 * To cancel the work we need to drop the lock, otherwise
		 * we might deadlock if the work needs to be flushed.
		 */
		raw_spin_unlock_irq(&pkg_temp_lock);
		cancel_delayed_work_sync(&zonedev->work);
		raw_spin_lock_irq(&pkg_temp_lock);
		/*
		 * If this is not the last cpu in the package and the work
		 * did not run after we dropped the lock above, then we
		 * need to reschedule the work, otherwise the interrupt
		 * stays disabled forever.
		 */
		if (!lastcpu && zonedev->work_scheduled)
			pkg_thermal_schedule_work(target, &zonedev->work);
	}

	raw_spin_unlock_irq(&pkg_temp_lock);

	/* Final cleanup if this is the last cpu */
	if (lastcpu)
		kfree(zonedev);
	return 0;
}
465
466static int pkg_thermal_cpu_online(unsigned int cpu)
467{
468	struct zone_device *zonedev = pkg_temp_thermal_get_dev(cpu);
469	struct cpuinfo_x86 *c = &cpu_data(cpu);
470
471	/* Paranoia check */
472	if (!cpu_has(c, X86_FEATURE_DTHERM) || !cpu_has(c, X86_FEATURE_PTS))
473		return -ENODEV;
474
475	/* If the package exists, nothing to do */
476	if (zonedev) {
477		cpumask_set_cpu(cpu, &zonedev->cpumask);
478		return 0;
479	}
480	return pkg_temp_thermal_device_add(cpu);
481}
482
/* Match any Intel CPU with the Package Thermal Status (PTS) feature */
static const struct x86_cpu_id __initconst pkg_temp_thermal_ids[] = {
	X86_MATCH_VENDOR_FEATURE(INTEL, X86_FEATURE_PTS, NULL),
	{}
};
MODULE_DEVICE_TABLE(x86cpu, pkg_temp_thermal_ids);
488
/*
 * Module init: verify the CPU supports package thermal status,
 * allocate the per-die zone pointer array, register the CPU hotplug
 * callbacks (which create the zones as CPUs come online) and hook up
 * the package thermal interrupt notifier.
 */
static int __init pkg_temp_thermal_init(void)
{
	int ret;

	if (!x86_match_cpu(pkg_temp_thermal_ids))
		return -ENODEV;

	/* One zone pointer slot per logical die across all packages */
	max_id = topology_max_packages() * topology_max_die_per_package();
	zones = kcalloc(max_id, sizeof(struct zone_device *),
			   GFP_KERNEL);
	if (!zones)
		return -ENOMEM;

	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "thermal/x86_pkg:online",
				pkg_thermal_cpu_online,	pkg_thermal_cpu_offline);
	if (ret < 0)
		goto err;

	/* Store the state for module exit */
	pkg_thermal_hp_state = ret;

	platform_thermal_package_notify = pkg_thermal_notify;
	platform_thermal_package_rate_control = pkg_thermal_rate_control;

	 /* Don't care if it fails */
	pkg_temp_debugfs_init();
	return 0;

err:
	kfree(zones);
	return ret;
}
module_init(pkg_temp_thermal_init)
522
/*
 * Module exit: unhook the interrupt notifier first so no new work can
 * be scheduled, then remove the hotplug state (tearing down all zones
 * via the offline callback), the debugfs entries and the zone array.
 */
static void __exit pkg_temp_thermal_exit(void)
{
	platform_thermal_package_notify = NULL;
	platform_thermal_package_rate_control = NULL;

	cpuhp_remove_state(pkg_thermal_hp_state);
	debugfs_remove_recursive(debugfs);
	kfree(zones);
}
module_exit(pkg_temp_thermal_exit)
533
534MODULE_DESCRIPTION("X86 PKG TEMP Thermal Driver");
535MODULE_AUTHOR("Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>");
536MODULE_LICENSE("GPL v2");