// SPDX-License-Identifier: GPL-2.0-only
/*
 * x86_pkg_temp_thermal driver
 * Copyright (c) 2013, Intel Corporation.
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/init.h>
#include <linux/err.h>
#include <linux/param.h>
#include <linux/device.h>
#include <linux/platform_device.h>
#include <linux/cpu.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/pm.h>
#include <linux/thermal.h>
#include <linux/debugfs.h>

#include <asm/cpu_device_id.h>

#include "thermal_interrupt.h"

/*
 * Rate control delay: The idea is to introduce a debounce effect.
 * The delay should be long enough to avoid a flood of events when a
 * threshold is set to a temperature which is constantly violated, but
 * short enough that user space can still react in time. The reaction
 * can be to remove the threshold or to change it to the next
 * interesting setting. Based on experiments, roughly every 5 seconds
 * under load there is a significant temperature change.
 */
#define PKG_TEMP_THERMAL_NOTIFY_DELAY	5000
static int notify_delay_ms = PKG_TEMP_THERMAL_NOTIFY_DELAY;
module_param(notify_delay_ms, int, 0644);
MODULE_PARM_DESC(notify_delay_ms,
	"User space notification delay in milliseconds.");

/*
 * Number of trip points in the thermal zone. Currently it can't be more
 * than 2: the MSR only allows setting and getting notifications for two
 * thresholds. This define enforces that, in case CPUID returns a wrong
 * value for the number of thresholds.
 */
#define MAX_NUMBER_OF_TRIPS	2

struct zone_device {
	int				cpu;
	bool				work_scheduled;
	u32				tj_max;
	u32				msr_pkg_therm_low;
	u32				msr_pkg_therm_high;
	struct delayed_work		work;
	struct thermal_zone_device	*tzone;
	struct cpumask			cpumask;
};

static struct thermal_zone_params pkg_temp_tz_params = {
	.no_hwmon	= true,
};

/* Keep track of how many zone pointers we allocated in init() */
static int max_id __read_mostly;
/* Array of zone pointers */
static struct zone_device **zones;
/* Serializes interrupt notification, work and hotplug */
static DEFINE_RAW_SPINLOCK(pkg_temp_lock);
/* Protects zone operation in the work function against hotplug removal */
static DEFINE_MUTEX(thermal_zone_mutex);

/* The dynamically assigned cpu hotplug state for module_exit() */
static enum cpuhp_state pkg_thermal_hp_state __read_mostly;

/* Debug counters exposed via debugfs */
static struct dentry *debugfs;
static unsigned int pkg_interrupt_cnt;
static unsigned int pkg_work_cnt;

static void pkg_temp_debugfs_init(void)
{
	debugfs = debugfs_create_dir("pkg_temp_thermal", NULL);

	debugfs_create_u32("pkg_thres_interrupt", S_IRUGO, debugfs,
			   &pkg_interrupt_cnt);
	debugfs_create_u32("pkg_thres_work", S_IRUGO, debugfs,
			   &pkg_work_cnt);
}
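
/*
 * A quick inspection sketch, assuming debugfs is mounted at the usual
 * /sys/kernel/debug:
 *	cat /sys/kernel/debug/pkg_temp_thermal/pkg_thres_interrupt
 *	cat /sys/kernel/debug/pkg_temp_thermal/pkg_thres_work
 */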

/*
 * Protection:
 *
 * - cpu hotplug: Read serialized by cpu hotplug lock
 *		  Write must hold pkg_temp_lock
 *
 * - Other callsites: Must hold pkg_temp_lock
 */
static struct zone_device *pkg_temp_thermal_get_dev(unsigned int cpu)
{
	int id = topology_logical_die_id(cpu);

	if (id >= 0 && id < max_id)
		return zones[id];
	return NULL;
}

/*
 * tj_max is interesting because thresholds are set relative to this
 * temperature.
 */
static int get_tj_max(int cpu, u32 *tj_max)
{
	u32 eax, edx, val;
	int err;

	err = rdmsr_safe_on_cpu(cpu, MSR_IA32_TEMPERATURE_TARGET, &eax, &edx);
	if (err)
		return err;

	val = (eax >> 16) & 0xff;
	*tj_max = val * 1000;

	return val ? 0 : -EINVAL;
}

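/*
 * The current temperature comes from MSR_IA32_PACKAGE_THERM_STATUS:
 * bit 31 flags the digital readout as valid, and bits 22:16 hold the
 * temperature as an offset below tj_max, in degrees C.
 */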
static int sys_get_curr_temp(struct thermal_zone_device *tzd, int *temp)
{
	struct zone_device *zonedev = tzd->devdata;
	u32 eax, edx;

	rdmsr_on_cpu(zonedev->cpu, MSR_IA32_PACKAGE_THERM_STATUS,
		     &eax, &edx);
	if (eax & 0x80000000) {
		*temp = zonedev->tj_max - ((eax >> 16) & 0x7f) * 1000;
		pr_debug("sys_get_curr_temp %d\n", *temp);
		return 0;
	}
	return -EINVAL;
}

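/*
 * Both programmable thresholds live in MSR_IA32_PACKAGE_THERM_INTERRUPT
 * and use the same encoding as the status readout: an offset below
 * tj_max in degrees C. An offset of zero means the threshold is not
 * armed.
 */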
static int sys_get_trip_temp(struct thermal_zone_device *tzd,
			     int trip, int *temp)
{
	struct zone_device *zonedev = tzd->devdata;
	unsigned long thres_reg_value;
	u32 mask, shift, eax, edx;
	int ret;

	if (trip >= MAX_NUMBER_OF_TRIPS)
		return -EINVAL;

	if (trip) {
		mask = THERM_MASK_THRESHOLD1;
		shift = THERM_SHIFT_THRESHOLD1;
	} else {
		mask = THERM_MASK_THRESHOLD0;
		shift = THERM_SHIFT_THRESHOLD0;
	}

	ret = rdmsr_on_cpu(zonedev->cpu, MSR_IA32_PACKAGE_THERM_INTERRUPT,
			   &eax, &edx);
	if (ret < 0)
		return ret;

	thres_reg_value = (eax & mask) >> shift;
	if (thres_reg_value)
		*temp = zonedev->tj_max - thres_reg_value * 1000;
	else
		*temp = THERMAL_TEMP_INVALID;
	pr_debug("sys_get_trip_temp %d\n", *temp);

	return 0;
}

static int
sys_set_trip_temp(struct thermal_zone_device *tzd, int trip, int temp)
{
	struct zone_device *zonedev = tzd->devdata;
	u32 l, h, mask, shift, intr;
	int ret;

	if (trip >= MAX_NUMBER_OF_TRIPS || temp >= zonedev->tj_max)
		return -EINVAL;

	ret = rdmsr_on_cpu(zonedev->cpu, MSR_IA32_PACKAGE_THERM_INTERRUPT,
			   &l, &h);
	if (ret < 0)
		return ret;

	if (trip) {
		mask = THERM_MASK_THRESHOLD1;
		shift = THERM_SHIFT_THRESHOLD1;
		intr = THERM_INT_THRESHOLD1_ENABLE;
	} else {
		mask = THERM_MASK_THRESHOLD0;
		shift = THERM_SHIFT_THRESHOLD0;
		intr = THERM_INT_THRESHOLD0_ENABLE;
	}
	l &= ~mask;
	/*
	 * When user space sets a trip temperature of 0, it indicates that
	 * it is no longer interested in receiving notifications.
	 */
	if (!temp) {
		l &= ~intr;
	} else {
		l |= (zonedev->tj_max - temp)/1000 << shift;
		l |= intr;
	}

	return wrmsr_on_cpu(zonedev->cpu, MSR_IA32_PACKAGE_THERM_INTERRUPT,
			    l, h);
}
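
/*
 * A usage sketch: once a zone is registered, user space can arm a
 * threshold through the generic thermal sysfs interface, e.g.
 *	echo 80000 > /sys/class/thermal/thermal_zoneX/trip_point_0_temp
 * (X being the zone id assigned by the thermal core, values in
 * millidegrees C below tj_max), which ends up in sys_set_trip_temp()
 * above.
 */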

static int sys_get_trip_type(struct thermal_zone_device *thermal, int trip,
			     enum thermal_trip_type *type)
{
	*type = THERMAL_TRIP_PASSIVE;
	return 0;
}

/* Thermal zone callback registry */
static struct thermal_zone_device_ops tzone_ops = {
	.get_temp = sys_get_curr_temp,
	.get_trip_temp = sys_get_trip_temp,
	.get_trip_type = sys_get_trip_type,
	.set_trip_temp = sys_set_trip_temp,
};

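/*
 * Returning true here tells the package thermal interrupt code that
 * this driver implements its own notification rate control (via the
 * delayed work), so no generic rate limiting should be applied.
 */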
static bool pkg_thermal_rate_control(void)
{
	return true;
}

/* Enable threshold interrupt on local package/cpu */
static inline void enable_pkg_thres_interrupt(void)
{
	u8 thres_0, thres_1;
	u32 l, h;

	rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
	/* Only enable the interrupt for thresholds with a valid value programmed */
	thres_0 = (l & THERM_MASK_THRESHOLD0) >> THERM_SHIFT_THRESHOLD0;
	thres_1 = (l & THERM_MASK_THRESHOLD1) >> THERM_SHIFT_THRESHOLD1;
	if (thres_0)
		l |= THERM_INT_THRESHOLD0_ENABLE;
	if (thres_1)
		l |= THERM_INT_THRESHOLD1_ENABLE;
	wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
}

/* Disable threshold interrupt on local package/cpu */
static inline void disable_pkg_thres_interrupt(void)
{
	u32 l, h;

	rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);

	l &= ~(THERM_INT_THRESHOLD0_ENABLE | THERM_INT_THRESHOLD1_ENABLE);
	wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
}

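/*
 * Delayed work scheduled from pkg_thermal_notify(): clear the threshold
 * log bits, re-arm the threshold interrupt and let the thermal core
 * re-read the zone temperature.
 */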
static void pkg_temp_thermal_threshold_work_fn(struct work_struct *work)
{
	struct thermal_zone_device *tzone = NULL;
	int cpu = smp_processor_id();
	struct zone_device *zonedev;

	mutex_lock(&thermal_zone_mutex);
	raw_spin_lock_irq(&pkg_temp_lock);
	++pkg_work_cnt;

	zonedev = pkg_temp_thermal_get_dev(cpu);
	if (!zonedev) {
		raw_spin_unlock_irq(&pkg_temp_lock);
		mutex_unlock(&thermal_zone_mutex);
		return;
	}
	zonedev->work_scheduled = false;

	thermal_clear_package_intr_status(PACKAGE_LEVEL, THERM_LOG_THRESHOLD0 | THERM_LOG_THRESHOLD1);
	tzone = zonedev->tzone;

	enable_pkg_thres_interrupt();
	raw_spin_unlock_irq(&pkg_temp_lock);

	/*
	 * If tzone is not NULL, then thermal_zone_mutex will prevent the
	 * concurrent removal in the cpu offline callback.
	 */
	if (tzone)
		thermal_zone_device_update(tzone, THERMAL_EVENT_UNSPECIFIED);

	mutex_unlock(&thermal_zone_mutex);
}

static void pkg_thermal_schedule_work(int cpu, struct delayed_work *work)
{
	unsigned long delay = msecs_to_jiffies(notify_delay_ms);

	schedule_delayed_work_on(cpu, work, delay);
}

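/*
 * Called from the package thermal interrupt handler. The threshold
 * interrupt stays disabled until the delayed work has run, which
 * debounces the notification rate (see notify_delay_ms above).
 */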
static int pkg_thermal_notify(u64 msr_val)
{
	int cpu = smp_processor_id();
	struct zone_device *zonedev;
	unsigned long flags;

	raw_spin_lock_irqsave(&pkg_temp_lock, flags);
	++pkg_interrupt_cnt;

	disable_pkg_thres_interrupt();

	/* Work is per package, so scheduling it once is enough. */
	zonedev = pkg_temp_thermal_get_dev(cpu);
	if (zonedev && !zonedev->work_scheduled) {
		zonedev->work_scheduled = true;
		pkg_thermal_schedule_work(zonedev->cpu, &zonedev->work);
	}

	raw_spin_unlock_irqrestore(&pkg_temp_lock, flags);
	return 0;
}

static int pkg_temp_thermal_device_add(unsigned int cpu)
{
	int id = topology_logical_die_id(cpu);
	u32 tj_max, eax, ebx, ecx, edx;
	struct zone_device *zonedev;
	int thres_count, err;

	if (id >= max_id)
		return -ENOMEM;

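	/*
	 * CPUID leaf 6 (thermal and power management), EBX low bits:
	 * number of programmable interrupt thresholds in the digital
	 * thermal sensor.
	 */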
	cpuid(6, &eax, &ebx, &ecx, &edx);
	thres_count = ebx & 0x07;
	if (!thres_count)
		return -ENODEV;

	thres_count = clamp_val(thres_count, 0, MAX_NUMBER_OF_TRIPS);

	err = get_tj_max(cpu, &tj_max);
	if (err)
		return err;

	zonedev = kzalloc(sizeof(*zonedev), GFP_KERNEL);
	if (!zonedev)
		return -ENOMEM;

	INIT_DELAYED_WORK(&zonedev->work, pkg_temp_thermal_threshold_work_fn);
	zonedev->cpu = cpu;
	zonedev->tj_max = tj_max;
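	/*
	 * The third argument of thermal_zone_device_register() below is
	 * the mask of trip points writable from sysfs: one bit per
	 * supported threshold.
	 */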
	zonedev->tzone = thermal_zone_device_register("x86_pkg_temp",
			thres_count,
			(thres_count == MAX_NUMBER_OF_TRIPS) ? 0x03 : 0x01,
			zonedev, &tzone_ops, &pkg_temp_tz_params, 0, 0);
	if (IS_ERR(zonedev->tzone)) {
		err = PTR_ERR(zonedev->tzone);
		kfree(zonedev);
		return err;
	}
	err = thermal_zone_device_enable(zonedev->tzone);
	if (err) {
		thermal_zone_device_unregister(zonedev->tzone);
		kfree(zonedev);
		return err;
	}
	/* Store MSR value for package thermal interrupt, to restore at exit */
	rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, zonedev->msr_pkg_therm_low,
	      zonedev->msr_pkg_therm_high);

	cpumask_set_cpu(cpu, &zonedev->cpumask);
	raw_spin_lock_irq(&pkg_temp_lock);
	zones[id] = zonedev;
	raw_spin_unlock_irq(&pkg_temp_lock);
	return 0;
}

static int pkg_thermal_cpu_offline(unsigned int cpu)
{
	struct zone_device *zonedev = pkg_temp_thermal_get_dev(cpu);
	bool lastcpu, was_target;
	int target;

	if (!zonedev)
		return 0;

	target = cpumask_any_but(&zonedev->cpumask, cpu);
	cpumask_clear_cpu(cpu, &zonedev->cpumask);
	lastcpu = target >= nr_cpu_ids;
	/*
	 * If this is the last cpu in the package, remove the sysfs files
	 * before doing further cleanups.
	 */
	if (lastcpu) {
		struct thermal_zone_device *tzone = zonedev->tzone;

		/*
		 * We must protect against the work function calling
		 * thermal_zone_device_update() during/after unregistration.
		 * NULL out the pointer under the zone mutex, so the work
		 * function won't try to call it.
		 */
		mutex_lock(&thermal_zone_mutex);
		zonedev->tzone = NULL;
		mutex_unlock(&thermal_zone_mutex);

		thermal_zone_device_unregister(tzone);
	}

	/* Protect against work and interrupts */
	raw_spin_lock_irq(&pkg_temp_lock);

	/*
	 * Check whether this cpu was the current target and store the new
	 * one. When we drop the lock, then the interrupt notify function
	 * will see the new target.
	 */
	was_target = zonedev->cpu == cpu;
	zonedev->cpu = target;

	/*
	 * If this is the last CPU in the package, remove the package
	 * reference from the array and restore the interrupt MSR. When we
	 * drop the lock, neither the interrupt notify function nor the
	 * worker will see the package anymore.
	 */
	if (lastcpu) {
		zones[topology_logical_die_id(cpu)] = NULL;
		/* After this point nothing touches the MSR anymore. */
		wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT,
		      zonedev->msr_pkg_therm_low, zonedev->msr_pkg_therm_high);
	}

	/*
	 * Check whether there is work scheduled and whether the work is
	 * targeted at the outgoing CPU.
	 */
	if (zonedev->work_scheduled && was_target) {
		/*
		 * To cancel the work we need to drop the lock, otherwise
		 * we might deadlock if the work needs to be flushed.
		 */
		raw_spin_unlock_irq(&pkg_temp_lock);
		cancel_delayed_work_sync(&zonedev->work);
		raw_spin_lock_irq(&pkg_temp_lock);
		/*
		 * If this is not the last cpu in the package and the work
		 * did not run after we dropped the lock above, then we
		 * need to reschedule the work, otherwise the interrupt
		 * stays disabled forever.
		 */
		if (!lastcpu && zonedev->work_scheduled)
			pkg_thermal_schedule_work(target, &zonedev->work);
	}

	raw_spin_unlock_irq(&pkg_temp_lock);

	/* Final cleanup if this is the last cpu */
	if (lastcpu)
		kfree(zonedev);
	return 0;
}

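/*
 * CPU hotplug "online" callback: the first cpu of a package creates the
 * zone, subsequent cpus merely join its cpumask.
 */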
static int pkg_thermal_cpu_online(unsigned int cpu)
{
	struct zone_device *zonedev = pkg_temp_thermal_get_dev(cpu);
	struct cpuinfo_x86 *c = &cpu_data(cpu);

	/* Paranoia check */
	if (!cpu_has(c, X86_FEATURE_DTHERM) || !cpu_has(c, X86_FEATURE_PTS))
		return -ENODEV;

	/* If the package exists, nothing to do */
	if (zonedev) {
		cpumask_set_cpu(cpu, &zonedev->cpumask);
		return 0;
	}
	return pkg_temp_thermal_device_add(cpu);
}

static const struct x86_cpu_id __initconst pkg_temp_thermal_ids[] = {
	X86_MATCH_VENDOR_FEATURE(INTEL, X86_FEATURE_PTS, NULL),
	{}
};
MODULE_DEVICE_TABLE(x86cpu, pkg_temp_thermal_ids);

static int __init pkg_temp_thermal_init(void)
{
	int ret;

	if (!x86_match_cpu(pkg_temp_thermal_ids))
		return -ENODEV;

	max_id = topology_max_packages() * topology_max_die_per_package();
	zones = kcalloc(max_id, sizeof(struct zone_device *),
			GFP_KERNEL);
	if (!zones)
		return -ENOMEM;

	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "thermal/x86_pkg:online",
				pkg_thermal_cpu_online, pkg_thermal_cpu_offline);
	if (ret < 0)
		goto err;

	/* Store the state for module exit */
	pkg_thermal_hp_state = ret;

	platform_thermal_package_notify = pkg_thermal_notify;
	platform_thermal_package_rate_control = pkg_thermal_rate_control;

	/* Don't care if it fails */
	pkg_temp_debugfs_init();
	return 0;

err:
	kfree(zones);
	return ret;
}
module_init(pkg_temp_thermal_init)

static void __exit pkg_temp_thermal_exit(void)
{
	platform_thermal_package_notify = NULL;
	platform_thermal_package_rate_control = NULL;

	cpuhp_remove_state(pkg_thermal_hp_state);
	debugfs_remove_recursive(debugfs);
	kfree(zones);
}
module_exit(pkg_temp_thermal_exit)

MODULE_DESCRIPTION("X86 PKG TEMP Thermal Driver");
MODULE_AUTHOR("Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>");
MODULE_LICENSE("GPL v2");