Loading...
Note: File does not exist in v3.1.
1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 * x86_pkg_temp_thermal driver
4 * Copyright (c) 2013, Intel Corporation.
5 */
6#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
7
8#include <linux/module.h>
9#include <linux/init.h>
10#include <linux/intel_tcc.h>
11#include <linux/err.h>
12#include <linux/param.h>
13#include <linux/device.h>
14#include <linux/platform_device.h>
15#include <linux/cpu.h>
16#include <linux/smp.h>
17#include <linux/slab.h>
18#include <linux/pm.h>
19#include <linux/thermal.h>
20#include <linux/debugfs.h>
21
22#include <asm/cpu_device_id.h>
23
24#include "thermal_interrupt.h"
25
/*
 * Rate control delay: the idea is to introduce a debounce effect.
 * This should be long enough to avoid redundant events when a
 * threshold is set to a temperature which is constantly violated,
 * yet short enough to take timely action. The action can be to
 * remove the threshold or change it to the next interesting
 * setting. Based on experiments, roughly every 5 seconds under
 * load will give us a significant temperature change.
 */
36#define PKG_TEMP_THERMAL_NOTIFY_DELAY 5000
37static int notify_delay_ms = PKG_TEMP_THERMAL_NOTIFY_DELAY;
38module_param(notify_delay_ms, int, 0644);
39MODULE_PARM_DESC(notify_delay_ms,
40 "User space notification delay in milli seconds.");
41
/* Number of trip points in the thermal zone. Currently it can't
 * be more than 2. The MSR can allow setting and getting notifications
 * for only 2 thresholds. This define enforces that, in case cpuid
 * returns a wrong value for the number of thresholds.
 */
47#define MAX_NUMBER_OF_TRIPS 2
48
/* Per package/die state backing one thermal zone */
struct zone_device {
	int cpu;			/* CPU used for MSR access on this package */
	bool work_scheduled;		/* delayed work pending; serialized by pkg_temp_lock */
	u32 msr_pkg_therm_low;		/* saved THERM_INTERRUPT MSR, restored on removal */
	u32 msr_pkg_therm_high;
	struct delayed_work work;	/* deferred threshold notification handler */
	struct thermal_zone_device *tzone;
	struct cpumask cpumask;		/* online CPUs belonging to this package/die */
};
58
/* Zone parameters: suppress automatic hwmon interface creation for this zone */
static struct thermal_zone_params pkg_temp_tz_params = {
	.no_hwmon = true,
};
62
/* Keep track of how many zone pointers we allocated in init() */
static int max_id __read_mostly;
/* Array of zone pointers, indexed by logical die id */
static struct zone_device **zones;
/* Serializes interrupt notification, work and hotplug */
static DEFINE_RAW_SPINLOCK(pkg_temp_lock);
/* Protects zone operation in the work function against hotplug removal */
static DEFINE_MUTEX(thermal_zone_mutex);

/* The dynamically assigned cpu hotplug state for module_exit() */
static enum cpuhp_state pkg_thermal_hp_state __read_mostly;

/* Debug counters exposed read-only via debugfs */
static struct dentry *debugfs;
static unsigned int pkg_interrupt_cnt;
static unsigned int pkg_work_cnt;
79
80static void pkg_temp_debugfs_init(void)
81{
82 debugfs = debugfs_create_dir("pkg_temp_thermal", NULL);
83
84 debugfs_create_u32("pkg_thres_interrupt", S_IRUGO, debugfs,
85 &pkg_interrupt_cnt);
86 debugfs_create_u32("pkg_thres_work", S_IRUGO, debugfs,
87 &pkg_work_cnt);
88}
89
90/*
91 * Protection:
92 *
93 * - cpu hotplug: Read serialized by cpu hotplug lock
94 * Write must hold pkg_temp_lock
95 *
96 * - Other callsites: Must hold pkg_temp_lock
97 */
98static struct zone_device *pkg_temp_thermal_get_dev(unsigned int cpu)
99{
100 int id = topology_logical_die_id(cpu);
101
102 if (id >= 0 && id < max_id)
103 return zones[id];
104 return NULL;
105}
106
107static int sys_get_curr_temp(struct thermal_zone_device *tzd, int *temp)
108{
109 struct zone_device *zonedev = thermal_zone_device_priv(tzd);
110 int val, ret;
111
112 ret = intel_tcc_get_temp(zonedev->cpu, &val, true);
113 if (ret < 0)
114 return ret;
115
116 *temp = val * 1000;
117 pr_debug("sys_get_curr_temp %d\n", *temp);
118 return 0;
119}
120
121static int
122sys_set_trip_temp(struct thermal_zone_device *tzd, int trip, int temp)
123{
124 struct zone_device *zonedev = thermal_zone_device_priv(tzd);
125 u32 l, h, mask, shift, intr;
126 int tj_max, val, ret;
127
128 tj_max = intel_tcc_get_tjmax(zonedev->cpu);
129 if (tj_max < 0)
130 return tj_max;
131 tj_max *= 1000;
132
133 val = (tj_max - temp)/1000;
134
135 if (trip >= MAX_NUMBER_OF_TRIPS || val < 0 || val > 0x7f)
136 return -EINVAL;
137
138 ret = rdmsr_on_cpu(zonedev->cpu, MSR_IA32_PACKAGE_THERM_INTERRUPT,
139 &l, &h);
140 if (ret < 0)
141 return ret;
142
143 if (trip) {
144 mask = THERM_MASK_THRESHOLD1;
145 shift = THERM_SHIFT_THRESHOLD1;
146 intr = THERM_INT_THRESHOLD1_ENABLE;
147 } else {
148 mask = THERM_MASK_THRESHOLD0;
149 shift = THERM_SHIFT_THRESHOLD0;
150 intr = THERM_INT_THRESHOLD0_ENABLE;
151 }
152 l &= ~mask;
153 /*
154 * When users space sets a trip temperature == 0, which is indication
155 * that, it is no longer interested in receiving notifications.
156 */
157 if (!temp) {
158 l &= ~intr;
159 } else {
160 l |= val << shift;
161 l |= intr;
162 }
163
164 return wrmsr_on_cpu(zonedev->cpu, MSR_IA32_PACKAGE_THERM_INTERRUPT,
165 l, h);
166}
167
/*
 * Thermal zone callback registry: temperature reads go through intel_tcc,
 * trip writes are programmed straight into the threshold MSR.
 */
static const struct thermal_zone_device_ops tzone_ops = {
	.get_temp = sys_get_curr_temp,
	.set_trip_temp = sys_set_trip_temp,
};
173
/*
 * Reported via platform_thermal_package_rate_control: this driver does
 * its own rate limiting of threshold notifications (see notify_delay_ms).
 */
static bool pkg_thermal_rate_control(void)
{
	return true;
}
178
179/* Enable threshold interrupt on local package/cpu */
180static inline void enable_pkg_thres_interrupt(void)
181{
182 u8 thres_0, thres_1;
183 u32 l, h;
184
185 rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
186 /* only enable/disable if it had valid threshold value */
187 thres_0 = (l & THERM_MASK_THRESHOLD0) >> THERM_SHIFT_THRESHOLD0;
188 thres_1 = (l & THERM_MASK_THRESHOLD1) >> THERM_SHIFT_THRESHOLD1;
189 if (thres_0)
190 l |= THERM_INT_THRESHOLD0_ENABLE;
191 if (thres_1)
192 l |= THERM_INT_THRESHOLD1_ENABLE;
193 wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
194}
195
196/* Disable threshold interrupt on local package/cpu */
197static inline void disable_pkg_thres_interrupt(void)
198{
199 u32 l, h;
200
201 rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
202
203 l &= ~(THERM_INT_THRESHOLD0_ENABLE | THERM_INT_THRESHOLD1_ENABLE);
204 wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
205}
206
/*
 * Deferred threshold work, scheduled from the interrupt notifier after
 * the rate-control delay. Runs on a CPU of the affected package: clears
 * the threshold log bits, re-enables the threshold interrupt and then
 * notifies the thermal core.
 */
static void pkg_temp_thermal_threshold_work_fn(struct work_struct *work)
{
	struct thermal_zone_device *tzone = NULL;
	int cpu = smp_processor_id();
	struct zone_device *zonedev;

	mutex_lock(&thermal_zone_mutex);
	raw_spin_lock_irq(&pkg_temp_lock);
	++pkg_work_cnt;

	zonedev = pkg_temp_thermal_get_dev(cpu);
	if (!zonedev) {
		/* Zone already removed by CPU hotplug; nothing left to do */
		raw_spin_unlock_irq(&pkg_temp_lock);
		mutex_unlock(&thermal_zone_mutex);
		return;
	}
	zonedev->work_scheduled = false;

	/* Acknowledge the logged threshold events before re-arming */
	thermal_clear_package_intr_status(PACKAGE_LEVEL, THERM_LOG_THRESHOLD0 | THERM_LOG_THRESHOLD1);
	tzone = zonedev->tzone;

	enable_pkg_thres_interrupt();
	raw_spin_unlock_irq(&pkg_temp_lock);

	/*
	 * If tzone is not NULL, then thermal_zone_mutex will prevent the
	 * concurrent removal in the cpu offline callback.
	 */
	if (tzone)
		thermal_zone_device_update(tzone, THERMAL_EVENT_UNSPECIFIED);

	mutex_unlock(&thermal_zone_mutex);
}
240
241static void pkg_thermal_schedule_work(int cpu, struct delayed_work *work)
242{
243 unsigned long ms = msecs_to_jiffies(notify_delay_ms);
244
245 schedule_delayed_work_on(cpu, work, ms);
246}
247
248static int pkg_thermal_notify(u64 msr_val)
249{
250 int cpu = smp_processor_id();
251 struct zone_device *zonedev;
252 unsigned long flags;
253
254 raw_spin_lock_irqsave(&pkg_temp_lock, flags);
255 ++pkg_interrupt_cnt;
256
257 disable_pkg_thres_interrupt();
258
259 /* Work is per package, so scheduling it once is enough. */
260 zonedev = pkg_temp_thermal_get_dev(cpu);
261 if (zonedev && !zonedev->work_scheduled) {
262 zonedev->work_scheduled = true;
263 pkg_thermal_schedule_work(zonedev->cpu, &zonedev->work);
264 }
265
266 raw_spin_unlock_irqrestore(&pkg_temp_lock, flags);
267 return 0;
268}
269
270static int pkg_temp_thermal_trips_init(int cpu, int tj_max,
271 struct thermal_trip *trips, int num_trips)
272{
273 unsigned long thres_reg_value;
274 u32 mask, shift, eax, edx;
275 int ret, i;
276
277 for (i = 0; i < num_trips; i++) {
278
279 if (i) {
280 mask = THERM_MASK_THRESHOLD1;
281 shift = THERM_SHIFT_THRESHOLD1;
282 } else {
283 mask = THERM_MASK_THRESHOLD0;
284 shift = THERM_SHIFT_THRESHOLD0;
285 }
286
287 ret = rdmsr_on_cpu(cpu, MSR_IA32_PACKAGE_THERM_INTERRUPT,
288 &eax, &edx);
289 if (ret < 0)
290 return ret;
291
292 thres_reg_value = (eax & mask) >> shift;
293
294 trips[i].temperature = thres_reg_value ?
295 tj_max - thres_reg_value * 1000 : THERMAL_TEMP_INVALID;
296
297 trips[i].type = THERMAL_TRIP_PASSIVE;
298 trips[i].flags |= THERMAL_TRIP_FLAG_RW_TEMP;
299
300 pr_debug("%s: cpu=%d, trip=%d, temp=%d\n",
301 __func__, cpu, i, trips[i].temperature);
302 }
303
304 return 0;
305}
306
307static int pkg_temp_thermal_device_add(unsigned int cpu)
308{
309 struct thermal_trip trips[MAX_NUMBER_OF_TRIPS] = { 0 };
310 int id = topology_logical_die_id(cpu);
311 u32 eax, ebx, ecx, edx;
312 struct zone_device *zonedev;
313 int thres_count, err;
314 int tj_max;
315
316 if (id >= max_id)
317 return -ENOMEM;
318
319 cpuid(6, &eax, &ebx, &ecx, &edx);
320 thres_count = ebx & 0x07;
321 if (!thres_count)
322 return -ENODEV;
323
324 thres_count = clamp_val(thres_count, 0, MAX_NUMBER_OF_TRIPS);
325
326 tj_max = intel_tcc_get_tjmax(cpu);
327 if (tj_max < 0)
328 return tj_max;
329
330 zonedev = kzalloc(sizeof(*zonedev), GFP_KERNEL);
331 if (!zonedev)
332 return -ENOMEM;
333
334 err = pkg_temp_thermal_trips_init(cpu, tj_max, trips, thres_count);
335 if (err)
336 goto out_kfree_zonedev;
337
338 INIT_DELAYED_WORK(&zonedev->work, pkg_temp_thermal_threshold_work_fn);
339 zonedev->cpu = cpu;
340 zonedev->tzone = thermal_zone_device_register_with_trips("x86_pkg_temp",
341 trips, thres_count,
342 zonedev, &tzone_ops, &pkg_temp_tz_params, 0, 0);
343 if (IS_ERR(zonedev->tzone)) {
344 err = PTR_ERR(zonedev->tzone);
345 goto out_kfree_zonedev;
346 }
347 err = thermal_zone_device_enable(zonedev->tzone);
348 if (err)
349 goto out_unregister_tz;
350
351 /* Store MSR value for package thermal interrupt, to restore at exit */
352 rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, zonedev->msr_pkg_therm_low,
353 zonedev->msr_pkg_therm_high);
354
355 cpumask_set_cpu(cpu, &zonedev->cpumask);
356 raw_spin_lock_irq(&pkg_temp_lock);
357 zones[id] = zonedev;
358 raw_spin_unlock_irq(&pkg_temp_lock);
359
360 return 0;
361
362out_unregister_tz:
363 thermal_zone_device_unregister(zonedev->tzone);
364out_kfree_zonedev:
365 kfree(zonedev);
366 return err;
367}
368
/*
 * Hotplug offline callback: drop @cpu from its zone's cpumask, migrate
 * the MSR target CPU if needed, and on the last CPU of a package
 * unregister the zone, restore the saved interrupt MSR and free the
 * zone. The lock drop/retake around cancel_delayed_work_sync() is
 * deliberate — see the inline comments.
 */
static int pkg_thermal_cpu_offline(unsigned int cpu)
{
	struct zone_device *zonedev = pkg_temp_thermal_get_dev(cpu);
	bool lastcpu, was_target;
	int target;

	if (!zonedev)
		return 0;

	/* Pick any other CPU still in this package as the new MSR target */
	target = cpumask_any_but(&zonedev->cpumask, cpu);
	cpumask_clear_cpu(cpu, &zonedev->cpumask);
	lastcpu = target >= nr_cpu_ids;
	/*
	 * Remove the sysfs files, if this is the last cpu in the package
	 * before doing further cleanups.
	 */
	if (lastcpu) {
		struct thermal_zone_device *tzone = zonedev->tzone;

		/*
		 * We must protect against a work function calling
		 * thermal_zone_update, after/while unregister. We null out
		 * the pointer under the zone mutex, so the worker function
		 * won't try to call.
		 */
		mutex_lock(&thermal_zone_mutex);
		zonedev->tzone = NULL;
		mutex_unlock(&thermal_zone_mutex);

		thermal_zone_device_unregister(tzone);
	}

	/* Protect against work and interrupts */
	raw_spin_lock_irq(&pkg_temp_lock);

	/*
	 * Check whether this cpu was the current target and store the new
	 * one. When we drop the lock, then the interrupt notify function
	 * will see the new target.
	 */
	was_target = zonedev->cpu == cpu;
	zonedev->cpu = target;

	/*
	 * If this is the last CPU in the package remove the package
	 * reference from the array and restore the interrupt MSR. When we
	 * drop the lock neither the interrupt notify function nor the
	 * worker will see the package anymore.
	 */
	if (lastcpu) {
		zones[topology_logical_die_id(cpu)] = NULL;
		/* After this point nothing touches the MSR anymore. */
		wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT,
		      zonedev->msr_pkg_therm_low, zonedev->msr_pkg_therm_high);
	}

	/*
	 * Check whether there is work scheduled and whether the work is
	 * targeted at the outgoing CPU.
	 */
	if (zonedev->work_scheduled && was_target) {
		/*
		 * To cancel the work we need to drop the lock, otherwise
		 * we might deadlock if the work needs to be flushed.
		 */
		raw_spin_unlock_irq(&pkg_temp_lock);
		cancel_delayed_work_sync(&zonedev->work);
		raw_spin_lock_irq(&pkg_temp_lock);
		/*
		 * If this is not the last cpu in the package and the work
		 * did not run after we dropped the lock above, then we
		 * need to reschedule the work, otherwise the interrupt
		 * stays disabled forever.
		 */
		if (!lastcpu && zonedev->work_scheduled)
			pkg_thermal_schedule_work(target, &zonedev->work);
	}

	raw_spin_unlock_irq(&pkg_temp_lock);

	/* Final cleanup if this is the last cpu */
	if (lastcpu)
		kfree(zonedev);

	return 0;
}
455
456static int pkg_thermal_cpu_online(unsigned int cpu)
457{
458 struct zone_device *zonedev = pkg_temp_thermal_get_dev(cpu);
459 struct cpuinfo_x86 *c = &cpu_data(cpu);
460
461 /* Paranoia check */
462 if (!cpu_has(c, X86_FEATURE_DTHERM) || !cpu_has(c, X86_FEATURE_PTS))
463 return -ENODEV;
464
465 /* If the package exists, nothing to do */
466 if (zonedev) {
467 cpumask_set_cpu(cpu, &zonedev->cpumask);
468 return 0;
469 }
470 return pkg_temp_thermal_device_add(cpu);
471}
472
/* Match any Intel CPU advertising Package Thermal Status (PTS) */
static const struct x86_cpu_id __initconst pkg_temp_thermal_ids[] = {
	X86_MATCH_VENDOR_FEATURE(INTEL, X86_FEATURE_PTS, NULL),
	{}
};
MODULE_DEVICE_TABLE(x86cpu, pkg_temp_thermal_ids);
478
/*
 * Module init: allocate the zone pointer array, register the hotplug
 * callbacks (which create the per-package zones), then hook up the
 * package thermal interrupt notifier. Order matters: the notifier is
 * installed only after the zones array exists.
 */
static int __init pkg_temp_thermal_init(void)
{
	int ret;

	if (!x86_match_cpu(pkg_temp_thermal_ids))
		return -ENODEV;

	/* One zone pointer slot per possible package/die */
	max_id = topology_max_packages() * topology_max_dies_per_package();
	zones = kcalloc(max_id, sizeof(struct zone_device *),
			GFP_KERNEL);
	if (!zones)
		return -ENOMEM;

	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "thermal/x86_pkg:online",
				pkg_thermal_cpu_online, pkg_thermal_cpu_offline);
	if (ret < 0)
		goto err;

	/* Store the state for module exit */
	pkg_thermal_hp_state = ret;

	platform_thermal_package_notify = pkg_thermal_notify;
	platform_thermal_package_rate_control = pkg_thermal_rate_control;

	/* Don't care if it fails */
	pkg_temp_debugfs_init();
	return 0;

err:
	kfree(zones);
	return ret;
}
module_init(pkg_temp_thermal_init)
512
/*
 * Module exit: tear down in reverse order of init. The interrupt
 * notifier is cleared first so no new work can be scheduled while
 * cpuhp_remove_state() runs the offline callback on every zone.
 */
static void __exit pkg_temp_thermal_exit(void)
{
	platform_thermal_package_notify = NULL;
	platform_thermal_package_rate_control = NULL;

	cpuhp_remove_state(pkg_thermal_hp_state);
	debugfs_remove_recursive(debugfs);
	kfree(zones);
}
module_exit(pkg_temp_thermal_exit)
523
524MODULE_IMPORT_NS(INTEL_TCC);
525MODULE_DESCRIPTION("X86 PKG TEMP Thermal Driver");
526MODULE_AUTHOR("Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>");
527MODULE_LICENSE("GPL v2");