Linux Audio

Check our new training course

Loading...
Note: File does not exist in v6.13.7.
   1/*
   2 *  linux/drivers/thermal/cpu_cooling.c
   3 *
   4 *  Copyright (C) 2012	Samsung Electronics Co., Ltd(http://www.samsung.com)
   5 *  Copyright (C) 2012  Amit Daniel <amit.kachhap@linaro.org>
   6 *
   7 *  Copyright (C) 2014  Viresh Kumar <viresh.kumar@linaro.org>
   8 *
   9 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  10 *  This program is free software; you can redistribute it and/or modify
  11 *  it under the terms of the GNU General Public License as published by
  12 *  the Free Software Foundation; version 2 of the License.
  13 *
  14 *  This program is distributed in the hope that it will be useful, but
  15 *  WITHOUT ANY WARRANTY; without even the implied warranty of
  16 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17 *  General Public License for more details.
  18 *
  19 *  You should have received a copy of the GNU General Public License along
  20 *  with this program; if not, write to the Free Software Foundation, Inc.,
  21 *  59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
  22 *
  23 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  24 */
  25#include <linux/module.h>
  26#include <linux/thermal.h>
  27#include <linux/cpufreq.h>
  28#include <linux/err.h>
  29#include <linux/pm_opp.h>
  30#include <linux/slab.h>
  31#include <linux/cpu.h>
  32#include <linux/cpu_cooling.h>
  33
  34#include <trace/events/thermal.h>
  35
  36/*
  37 * Cooling state <-> CPUFreq frequency
  38 *
  39 * Cooling states are translated to frequencies throughout this driver and this
  40 * is the relation between them.
  41 *
  42 * Highest cooling state corresponds to lowest possible frequency.
  43 *
  44 * i.e.
  45 *	level 0 --> 1st Max Freq
  46 *	level 1 --> 2nd Max Freq
  47 *	...
  48 */
  49
  50/**
  51 * struct power_table - frequency to power conversion
  52 * @frequency:	frequency in KHz
  53 * @power:	power in mW
  54 *
  55 * This structure is built when the cooling device registers and helps
  56 * in translating frequency to power and viceversa.
  57 */
  58struct power_table {
  59	u32 frequency;
  60	u32 power;
  61};
  62
  63/**
  64 * struct cpufreq_cooling_device - data for cooling device with cpufreq
  65 * @id: unique integer value corresponding to each cpufreq_cooling_device
  66 *	registered.
  67 * @cool_dev: thermal_cooling_device pointer to keep track of the
  68 *	registered cooling device.
  69 * @cpufreq_state: integer value representing the current state of cpufreq
  70 *	cooling	devices.
  71 * @clipped_freq: integer value representing the absolute value of the clipped
  72 *	frequency.
  73 * @max_level: maximum cooling level. One less than total number of valid
  74 *	cpufreq frequencies.
  75 * @allowed_cpus: all the cpus involved for this cpufreq_cooling_device.
  76 * @node: list_head to link all cpufreq_cooling_device together.
  77 * @last_load: load measured by the latest call to cpufreq_get_requested_power()
  78 * @time_in_idle: previous reading of the absolute time that this cpu was idle
  79 * @time_in_idle_timestamp: wall time of the last invocation of
  80 *	get_cpu_idle_time_us()
  81 * @dyn_power_table: array of struct power_table for frequency to power
  82 *	conversion, sorted in ascending order.
  83 * @dyn_power_table_entries: number of entries in the @dyn_power_table array
  84 * @cpu_dev: the first cpu_device from @allowed_cpus that has OPPs registered
  85 * @plat_get_static_power: callback to calculate the static power
  86 *
  87 * This structure is required for keeping information of each registered
  88 * cpufreq_cooling_device.
  89 */
  90struct cpufreq_cooling_device {
  91	int id;
  92	struct thermal_cooling_device *cool_dev;
  93	unsigned int cpufreq_state;
  94	unsigned int clipped_freq;
  95	unsigned int max_level;
  96	unsigned int *freq_table;	/* In descending order */
  97	struct cpumask allowed_cpus;
  98	struct list_head node;
  99	u32 last_load;
 100	u64 *time_in_idle;
 101	u64 *time_in_idle_timestamp;
 102	struct power_table *dyn_power_table;
 103	int dyn_power_table_entries;
 104	struct device *cpu_dev;
 105	get_static_t plat_get_static_power;
 106};
 107static DEFINE_IDR(cpufreq_idr);
 108static DEFINE_MUTEX(cooling_cpufreq_lock);
 109
 110static unsigned int cpufreq_dev_count;
 111
 112static DEFINE_MUTEX(cooling_list_lock);
 113static LIST_HEAD(cpufreq_dev_list);
 114
 115/**
 116 * get_idr - function to get a unique id.
 117 * @idr: struct idr * handle used to create a id.
 118 * @id: int * value generated by this function.
 119 *
 120 * This function will populate @id with an unique
 121 * id, using the idr API.
 122 *
 123 * Return: 0 on success, an error code on failure.
 124 */
 125static int get_idr(struct idr *idr, int *id)
 126{
 127	int ret;
 128
 129	mutex_lock(&cooling_cpufreq_lock);
 130	ret = idr_alloc(idr, NULL, 0, 0, GFP_KERNEL);
 131	mutex_unlock(&cooling_cpufreq_lock);
 132	if (unlikely(ret < 0))
 133		return ret;
 134	*id = ret;
 135
 136	return 0;
 137}
 138
 139/**
 140 * release_idr - function to free the unique id.
 141 * @idr: struct idr * handle used for creating the id.
 142 * @id: int value representing the unique id.
 143 */
 144static void release_idr(struct idr *idr, int id)
 145{
 146	mutex_lock(&cooling_cpufreq_lock);
 147	idr_remove(idr, id);
 148	mutex_unlock(&cooling_cpufreq_lock);
 149}
 150
 151/* Below code defines functions to be used for cpufreq as cooling device */
 152
 153/**
 154 * get_level: Find the level for a particular frequency
 155 * @cpufreq_dev: cpufreq_dev for which the property is required
 156 * @freq: Frequency
 157 *
 158 * Return: level on success, THERMAL_CSTATE_INVALID on error.
 159 */
 160static unsigned long get_level(struct cpufreq_cooling_device *cpufreq_dev,
 161			       unsigned int freq)
 162{
 163	unsigned long level;
 164
 165	for (level = 0; level <= cpufreq_dev->max_level; level++) {
 166		if (freq == cpufreq_dev->freq_table[level])
 167			return level;
 168
 169		if (freq > cpufreq_dev->freq_table[level])
 170			break;
 171	}
 172
 173	return THERMAL_CSTATE_INVALID;
 174}
 175
 176/**
 177 * cpufreq_cooling_get_level - for a given cpu, return the cooling level.
 178 * @cpu: cpu for which the level is required
 179 * @freq: the frequency of interest
 180 *
 181 * This function will match the cooling level corresponding to the
 182 * requested @freq and return it.
 183 *
 184 * Return: The matched cooling level on success or THERMAL_CSTATE_INVALID
 185 * otherwise.
 186 */
 187unsigned long cpufreq_cooling_get_level(unsigned int cpu, unsigned int freq)
 188{
 189	struct cpufreq_cooling_device *cpufreq_dev;
 190
 191	mutex_lock(&cooling_list_lock);
 192	list_for_each_entry(cpufreq_dev, &cpufreq_dev_list, node) {
 193		if (cpumask_test_cpu(cpu, &cpufreq_dev->allowed_cpus)) {
 194			mutex_unlock(&cooling_list_lock);
 195			return get_level(cpufreq_dev, freq);
 196		}
 197	}
 198	mutex_unlock(&cooling_list_lock);
 199
 200	pr_err("%s: cpu:%d not part of any cooling device\n", __func__, cpu);
 201	return THERMAL_CSTATE_INVALID;
 202}
 203EXPORT_SYMBOL_GPL(cpufreq_cooling_get_level);
 204
 205/**
 206 * cpufreq_thermal_notifier - notifier callback for cpufreq policy change.
 207 * @nb:	struct notifier_block * with callback info.
 208 * @event: value showing cpufreq event for which this function invoked.
 209 * @data: callback-specific data
 210 *
 211 * Callback to hijack the notification on cpufreq policy transition.
 212 * Every time there is a change in policy, we will intercept and
 213 * update the cpufreq policy with thermal constraints.
 214 *
 215 * Return: 0 (success)
 216 */
 217static int cpufreq_thermal_notifier(struct notifier_block *nb,
 218				    unsigned long event, void *data)
 219{
 220	struct cpufreq_policy *policy = data;
 221	unsigned long clipped_freq;
 222	struct cpufreq_cooling_device *cpufreq_dev;
 223
 224	if (event != CPUFREQ_ADJUST)
 225		return NOTIFY_DONE;
 226
 227	mutex_lock(&cooling_list_lock);
 228	list_for_each_entry(cpufreq_dev, &cpufreq_dev_list, node) {
 229		if (!cpumask_test_cpu(policy->cpu, &cpufreq_dev->allowed_cpus))
 230			continue;
 231
 232		/*
 233		 * policy->max is the maximum allowed frequency defined by user
 234		 * and clipped_freq is the maximum that thermal constraints
 235		 * allow.
 236		 *
 237		 * If clipped_freq is lower than policy->max, then we need to
 238		 * readjust policy->max.
 239		 *
 240		 * But, if clipped_freq is greater than policy->max, we don't
 241		 * need to do anything.
 242		 */
 243		clipped_freq = cpufreq_dev->clipped_freq;
 244
 245		if (policy->max > clipped_freq)
 246			cpufreq_verify_within_limits(policy, 0, clipped_freq);
 247		break;
 248	}
 249	mutex_unlock(&cooling_list_lock);
 250
 251	return NOTIFY_OK;
 252}
 253
 254/**
 255 * build_dyn_power_table() - create a dynamic power to frequency table
 256 * @cpufreq_device:	the cpufreq cooling device in which to store the table
 257 * @capacitance: dynamic power coefficient for these cpus
 258 *
 259 * Build a dynamic power to frequency table for this cpu and store it
 260 * in @cpufreq_device.  This table will be used in cpu_power_to_freq() and
 261 * cpu_freq_to_power() to convert between power and frequency
 262 * efficiently.  Power is stored in mW, frequency in KHz.  The
 263 * resulting table is in ascending order.
 264 *
 265 * Return: 0 on success, -EINVAL if there are no OPPs for any CPUs,
 266 * -ENOMEM if we run out of memory or -EAGAIN if an OPP was
 267 * added/enabled while the function was executing.
 268 */
 269static int build_dyn_power_table(struct cpufreq_cooling_device *cpufreq_device,
 270				 u32 capacitance)
 271{
 272	struct power_table *power_table;
 273	struct dev_pm_opp *opp;
 274	struct device *dev = NULL;
 275	int num_opps = 0, cpu, i, ret = 0;
 276	unsigned long freq;
 277
 278	for_each_cpu(cpu, &cpufreq_device->allowed_cpus) {
 279		dev = get_cpu_device(cpu);
 280		if (!dev) {
 281			dev_warn(&cpufreq_device->cool_dev->device,
 282				 "No cpu device for cpu %d\n", cpu);
 283			continue;
 284		}
 285
 286		num_opps = dev_pm_opp_get_opp_count(dev);
 287		if (num_opps > 0)
 288			break;
 289		else if (num_opps < 0)
 290			return num_opps;
 291	}
 292
 293	if (num_opps == 0)
 294		return -EINVAL;
 295
 296	power_table = kcalloc(num_opps, sizeof(*power_table), GFP_KERNEL);
 297	if (!power_table)
 298		return -ENOMEM;
 299
 300	rcu_read_lock();
 301
 302	for (freq = 0, i = 0;
 303	     opp = dev_pm_opp_find_freq_ceil(dev, &freq), !IS_ERR(opp);
 304	     freq++, i++) {
 305		u32 freq_mhz, voltage_mv;
 306		u64 power;
 307
 308		if (i >= num_opps) {
 309			rcu_read_unlock();
 310			ret = -EAGAIN;
 311			goto free_power_table;
 312		}
 313
 314		freq_mhz = freq / 1000000;
 315		voltage_mv = dev_pm_opp_get_voltage(opp) / 1000;
 316
 317		/*
 318		 * Do the multiplication with MHz and millivolt so as
 319		 * to not overflow.
 320		 */
 321		power = (u64)capacitance * freq_mhz * voltage_mv * voltage_mv;
 322		do_div(power, 1000000000);
 323
 324		/* frequency is stored in power_table in KHz */
 325		power_table[i].frequency = freq / 1000;
 326
 327		/* power is stored in mW */
 328		power_table[i].power = power;
 329	}
 330
 331	rcu_read_unlock();
 332
 333	if (i != num_opps) {
 334		ret = PTR_ERR(opp);
 335		goto free_power_table;
 336	}
 337
 338	cpufreq_device->cpu_dev = dev;
 339	cpufreq_device->dyn_power_table = power_table;
 340	cpufreq_device->dyn_power_table_entries = i;
 341
 342	return 0;
 343
 344free_power_table:
 345	kfree(power_table);
 346
 347	return ret;
 348}
 349
 350static u32 cpu_freq_to_power(struct cpufreq_cooling_device *cpufreq_device,
 351			     u32 freq)
 352{
 353	int i;
 354	struct power_table *pt = cpufreq_device->dyn_power_table;
 355
 356	for (i = 1; i < cpufreq_device->dyn_power_table_entries; i++)
 357		if (freq < pt[i].frequency)
 358			break;
 359
 360	return pt[i - 1].power;
 361}
 362
 363static u32 cpu_power_to_freq(struct cpufreq_cooling_device *cpufreq_device,
 364			     u32 power)
 365{
 366	int i;
 367	struct power_table *pt = cpufreq_device->dyn_power_table;
 368
 369	for (i = 1; i < cpufreq_device->dyn_power_table_entries; i++)
 370		if (power < pt[i].power)
 371			break;
 372
 373	return pt[i - 1].frequency;
 374}
 375
 376/**
 377 * get_load() - get load for a cpu since last updated
 378 * @cpufreq_device:	&struct cpufreq_cooling_device for this cpu
 379 * @cpu:	cpu number
 380 * @cpu_idx:	index of the cpu in cpufreq_device->allowed_cpus
 381 *
 382 * Return: The average load of cpu @cpu in percentage since this
 383 * function was last called.
 384 */
 385static u32 get_load(struct cpufreq_cooling_device *cpufreq_device, int cpu,
 386		    int cpu_idx)
 387{
 388	u32 load;
 389	u64 now, now_idle, delta_time, delta_idle;
 390
 391	now_idle = get_cpu_idle_time(cpu, &now, 0);
 392	delta_idle = now_idle - cpufreq_device->time_in_idle[cpu_idx];
 393	delta_time = now - cpufreq_device->time_in_idle_timestamp[cpu_idx];
 394
 395	if (delta_time <= delta_idle)
 396		load = 0;
 397	else
 398		load = div64_u64(100 * (delta_time - delta_idle), delta_time);
 399
 400	cpufreq_device->time_in_idle[cpu_idx] = now_idle;
 401	cpufreq_device->time_in_idle_timestamp[cpu_idx] = now;
 402
 403	return load;
 404}
 405
 406/**
 407 * get_static_power() - calculate the static power consumed by the cpus
 408 * @cpufreq_device:	struct &cpufreq_cooling_device for this cpu cdev
 409 * @tz:		thermal zone device in which we're operating
 410 * @freq:	frequency in KHz
 411 * @power:	pointer in which to store the calculated static power
 412 *
 413 * Calculate the static power consumed by the cpus described by
 414 * @cpu_actor running at frequency @freq.  This function relies on a
 415 * platform specific function that should have been provided when the
 416 * actor was registered.  If it wasn't, the static power is assumed to
 417 * be negligible.  The calculated static power is stored in @power.
 418 *
 419 * Return: 0 on success, -E* on failure.
 420 */
 421static int get_static_power(struct cpufreq_cooling_device *cpufreq_device,
 422			    struct thermal_zone_device *tz, unsigned long freq,
 423			    u32 *power)
 424{
 425	struct dev_pm_opp *opp;
 426	unsigned long voltage;
 427	struct cpumask *cpumask = &cpufreq_device->allowed_cpus;
 428	unsigned long freq_hz = freq * 1000;
 429
 430	if (!cpufreq_device->plat_get_static_power ||
 431	    !cpufreq_device->cpu_dev) {
 432		*power = 0;
 433		return 0;
 434	}
 435
 436	rcu_read_lock();
 437
 438	opp = dev_pm_opp_find_freq_exact(cpufreq_device->cpu_dev, freq_hz,
 439					 true);
 440	voltage = dev_pm_opp_get_voltage(opp);
 441
 442	rcu_read_unlock();
 443
 444	if (voltage == 0) {
 445		dev_warn_ratelimited(cpufreq_device->cpu_dev,
 446				     "Failed to get voltage for frequency %lu: %ld\n",
 447				     freq_hz, IS_ERR(opp) ? PTR_ERR(opp) : 0);
 448		return -EINVAL;
 449	}
 450
 451	return cpufreq_device->plat_get_static_power(cpumask, tz->passive_delay,
 452						     voltage, power);
 453}
 454
 455/**
 456 * get_dynamic_power() - calculate the dynamic power
 457 * @cpufreq_device:	&cpufreq_cooling_device for this cdev
 458 * @freq:	current frequency
 459 *
 460 * Return: the dynamic power consumed by the cpus described by
 461 * @cpufreq_device.
 462 */
 463static u32 get_dynamic_power(struct cpufreq_cooling_device *cpufreq_device,
 464			     unsigned long freq)
 465{
 466	u32 raw_cpu_power;
 467
 468	raw_cpu_power = cpu_freq_to_power(cpufreq_device, freq);
 469	return (raw_cpu_power * cpufreq_device->last_load) / 100;
 470}
 471
 472/* cpufreq cooling device callback functions are defined below */
 473
 474/**
 475 * cpufreq_get_max_state - callback function to get the max cooling state.
 476 * @cdev: thermal cooling device pointer.
 477 * @state: fill this variable with the max cooling state.
 478 *
 479 * Callback for the thermal cooling device to return the cpufreq
 480 * max cooling state.
 481 *
 482 * Return: 0 on success, an error code otherwise.
 483 */
 484static int cpufreq_get_max_state(struct thermal_cooling_device *cdev,
 485				 unsigned long *state)
 486{
 487	struct cpufreq_cooling_device *cpufreq_device = cdev->devdata;
 488
 489	*state = cpufreq_device->max_level;
 490	return 0;
 491}
 492
 493/**
 494 * cpufreq_get_cur_state - callback function to get the current cooling state.
 495 * @cdev: thermal cooling device pointer.
 496 * @state: fill this variable with the current cooling state.
 497 *
 498 * Callback for the thermal cooling device to return the cpufreq
 499 * current cooling state.
 500 *
 501 * Return: 0 on success, an error code otherwise.
 502 */
 503static int cpufreq_get_cur_state(struct thermal_cooling_device *cdev,
 504				 unsigned long *state)
 505{
 506	struct cpufreq_cooling_device *cpufreq_device = cdev->devdata;
 507
 508	*state = cpufreq_device->cpufreq_state;
 509
 510	return 0;
 511}
 512
 513/**
 514 * cpufreq_set_cur_state - callback function to set the current cooling state.
 515 * @cdev: thermal cooling device pointer.
 516 * @state: set this variable to the current cooling state.
 517 *
 518 * Callback for the thermal cooling device to change the cpufreq
 519 * current cooling state.
 520 *
 521 * Return: 0 on success, an error code otherwise.
 522 */
 523static int cpufreq_set_cur_state(struct thermal_cooling_device *cdev,
 524				 unsigned long state)
 525{
 526	struct cpufreq_cooling_device *cpufreq_device = cdev->devdata;
 527	unsigned int cpu = cpumask_any(&cpufreq_device->allowed_cpus);
 528	unsigned int clip_freq;
 529
 530	/* Request state should be less than max_level */
 531	if (WARN_ON(state > cpufreq_device->max_level))
 532		return -EINVAL;
 533
 534	/* Check if the old cooling action is same as new cooling action */
 535	if (cpufreq_device->cpufreq_state == state)
 536		return 0;
 537
 538	clip_freq = cpufreq_device->freq_table[state];
 539	cpufreq_device->cpufreq_state = state;
 540	cpufreq_device->clipped_freq = clip_freq;
 541
 542	cpufreq_update_policy(cpu);
 543
 544	return 0;
 545}
 546
 547/**
 548 * cpufreq_get_requested_power() - get the current power
 549 * @cdev:	&thermal_cooling_device pointer
 550 * @tz:		a valid thermal zone device pointer
 551 * @power:	pointer in which to store the resulting power
 552 *
 553 * Calculate the current power consumption of the cpus in milliwatts
 554 * and store it in @power.  This function should actually calculate
 555 * the requested power, but it's hard to get the frequency that
 556 * cpufreq would have assigned if there were no thermal limits.
 557 * Instead, we calculate the current power on the assumption that the
 558 * immediate future will look like the immediate past.
 559 *
 560 * We use the current frequency and the average load since this
 561 * function was last called.  In reality, there could have been
 562 * multiple opps since this function was last called and that affects
 563 * the load calculation.  While it's not perfectly accurate, this
 564 * simplification is good enough and works.  REVISIT this, as more
 565 * complex code may be needed if experiments show that it's not
 566 * accurate enough.
 567 *
 568 * Return: 0 on success, -E* if getting the static power failed.
 569 */
 570static int cpufreq_get_requested_power(struct thermal_cooling_device *cdev,
 571				       struct thermal_zone_device *tz,
 572				       u32 *power)
 573{
 574	unsigned long freq;
 575	int i = 0, cpu, ret;
 576	u32 static_power, dynamic_power, total_load = 0;
 577	struct cpufreq_cooling_device *cpufreq_device = cdev->devdata;
 578	u32 *load_cpu = NULL;
 579
 580	cpu = cpumask_any_and(&cpufreq_device->allowed_cpus, cpu_online_mask);
 581
 582	/*
 583	 * All the CPUs are offline, thus the requested power by
 584	 * the cdev is 0
 585	 */
 586	if (cpu >= nr_cpu_ids) {
 587		*power = 0;
 588		return 0;
 589	}
 590
 591	freq = cpufreq_quick_get(cpu);
 592
 593	if (trace_thermal_power_cpu_get_power_enabled()) {
 594		u32 ncpus = cpumask_weight(&cpufreq_device->allowed_cpus);
 595
 596		load_cpu = kcalloc(ncpus, sizeof(*load_cpu), GFP_KERNEL);
 597	}
 598
 599	for_each_cpu(cpu, &cpufreq_device->allowed_cpus) {
 600		u32 load;
 601
 602		if (cpu_online(cpu))
 603			load = get_load(cpufreq_device, cpu, i);
 604		else
 605			load = 0;
 606
 607		total_load += load;
 608		if (trace_thermal_power_cpu_limit_enabled() && load_cpu)
 609			load_cpu[i] = load;
 610
 611		i++;
 612	}
 613
 614	cpufreq_device->last_load = total_load;
 615
 616	dynamic_power = get_dynamic_power(cpufreq_device, freq);
 617	ret = get_static_power(cpufreq_device, tz, freq, &static_power);
 618	if (ret) {
 619		kfree(load_cpu);
 620		return ret;
 621	}
 622
 623	if (load_cpu) {
 624		trace_thermal_power_cpu_get_power(
 625			&cpufreq_device->allowed_cpus,
 626			freq, load_cpu, i, dynamic_power, static_power);
 627
 628		kfree(load_cpu);
 629	}
 630
 631	*power = static_power + dynamic_power;
 632	return 0;
 633}
 634
 635/**
 636 * cpufreq_state2power() - convert a cpu cdev state to power consumed
 637 * @cdev:	&thermal_cooling_device pointer
 638 * @tz:		a valid thermal zone device pointer
 639 * @state:	cooling device state to be converted
 640 * @power:	pointer in which to store the resulting power
 641 *
 642 * Convert cooling device state @state into power consumption in
 643 * milliwatts assuming 100% load.  Store the calculated power in
 644 * @power.
 645 *
 646 * Return: 0 on success, -EINVAL if the cooling device state could not
 647 * be converted into a frequency or other -E* if there was an error
 648 * when calculating the static power.
 649 */
 650static int cpufreq_state2power(struct thermal_cooling_device *cdev,
 651			       struct thermal_zone_device *tz,
 652			       unsigned long state, u32 *power)
 653{
 654	unsigned int freq, num_cpus;
 655	cpumask_t cpumask;
 656	u32 static_power, dynamic_power;
 657	int ret;
 658	struct cpufreq_cooling_device *cpufreq_device = cdev->devdata;
 659
 660	cpumask_and(&cpumask, &cpufreq_device->allowed_cpus, cpu_online_mask);
 661	num_cpus = cpumask_weight(&cpumask);
 662
 663	/* None of our cpus are online, so no power */
 664	if (num_cpus == 0) {
 665		*power = 0;
 666		return 0;
 667	}
 668
 669	freq = cpufreq_device->freq_table[state];
 670	if (!freq)
 671		return -EINVAL;
 672
 673	dynamic_power = cpu_freq_to_power(cpufreq_device, freq) * num_cpus;
 674	ret = get_static_power(cpufreq_device, tz, freq, &static_power);
 675	if (ret)
 676		return ret;
 677
 678	*power = static_power + dynamic_power;
 679	return 0;
 680}
 681
 682/**
 683 * cpufreq_power2state() - convert power to a cooling device state
 684 * @cdev:	&thermal_cooling_device pointer
 685 * @tz:		a valid thermal zone device pointer
 686 * @power:	power in milliwatts to be converted
 687 * @state:	pointer in which to store the resulting state
 688 *
 689 * Calculate a cooling device state for the cpus described by @cdev
 690 * that would allow them to consume at most @power mW and store it in
 691 * @state.  Note that this calculation depends on external factors
 692 * such as the cpu load or the current static power.  Calling this
 693 * function with the same power as input can yield different cooling
 694 * device states depending on those external factors.
 695 *
 696 * Return: 0 on success, -ENODEV if no cpus are online or -EINVAL if
 697 * the calculated frequency could not be converted to a valid state.
 698 * The latter should not happen unless the frequencies available to
 699 * cpufreq have changed since the initialization of the cpu cooling
 700 * device.
 701 */
 702static int cpufreq_power2state(struct thermal_cooling_device *cdev,
 703			       struct thermal_zone_device *tz, u32 power,
 704			       unsigned long *state)
 705{
 706	unsigned int cpu, cur_freq, target_freq;
 707	int ret;
 708	s32 dyn_power;
 709	u32 last_load, normalised_power, static_power;
 710	struct cpufreq_cooling_device *cpufreq_device = cdev->devdata;
 711
 712	cpu = cpumask_any_and(&cpufreq_device->allowed_cpus, cpu_online_mask);
 713
 714	/* None of our cpus are online */
 715	if (cpu >= nr_cpu_ids)
 716		return -ENODEV;
 717
 718	cur_freq = cpufreq_quick_get(cpu);
 719	ret = get_static_power(cpufreq_device, tz, cur_freq, &static_power);
 720	if (ret)
 721		return ret;
 722
 723	dyn_power = power - static_power;
 724	dyn_power = dyn_power > 0 ? dyn_power : 0;
 725	last_load = cpufreq_device->last_load ?: 1;
 726	normalised_power = (dyn_power * 100) / last_load;
 727	target_freq = cpu_power_to_freq(cpufreq_device, normalised_power);
 728
 729	*state = cpufreq_cooling_get_level(cpu, target_freq);
 730	if (*state == THERMAL_CSTATE_INVALID) {
 731		dev_warn_ratelimited(&cdev->device,
 732				     "Failed to convert %dKHz for cpu %d into a cdev state\n",
 733				     target_freq, cpu);
 734		return -EINVAL;
 735	}
 736
 737	trace_thermal_power_cpu_limit(&cpufreq_device->allowed_cpus,
 738				      target_freq, *state, power);
 739	return 0;
 740}
 741
 742/* Bind cpufreq callbacks to thermal cooling device ops */
 743
 744static struct thermal_cooling_device_ops cpufreq_cooling_ops = {
 745	.get_max_state = cpufreq_get_max_state,
 746	.get_cur_state = cpufreq_get_cur_state,
 747	.set_cur_state = cpufreq_set_cur_state,
 748};
 749
 750static struct thermal_cooling_device_ops cpufreq_power_cooling_ops = {
 751	.get_max_state		= cpufreq_get_max_state,
 752	.get_cur_state		= cpufreq_get_cur_state,
 753	.set_cur_state		= cpufreq_set_cur_state,
 754	.get_requested_power	= cpufreq_get_requested_power,
 755	.state2power		= cpufreq_state2power,
 756	.power2state		= cpufreq_power2state,
 757};
 758
 759/* Notifier for cpufreq policy change */
 760static struct notifier_block thermal_cpufreq_notifier_block = {
 761	.notifier_call = cpufreq_thermal_notifier,
 762};
 763
 764static unsigned int find_next_max(struct cpufreq_frequency_table *table,
 765				  unsigned int prev_max)
 766{
 767	struct cpufreq_frequency_table *pos;
 768	unsigned int max = 0;
 769
 770	cpufreq_for_each_valid_entry(pos, table) {
 771		if (pos->frequency > max && pos->frequency < prev_max)
 772			max = pos->frequency;
 773	}
 774
 775	return max;
 776}
 777
 778/**
 779 * __cpufreq_cooling_register - helper function to create cpufreq cooling device
 780 * @np: a valid struct device_node to the cooling device device tree node
 781 * @clip_cpus: cpumask of cpus where the frequency constraints will happen.
 782 * Normally this should be same as cpufreq policy->related_cpus.
 783 * @capacitance: dynamic power coefficient for these cpus
 784 * @plat_static_func: function to calculate the static power consumed by these
 785 *                    cpus (optional)
 786 *
 787 * This interface function registers the cpufreq cooling device with the name
 788 * "thermal-cpufreq-%x". This api can support multiple instances of cpufreq
 789 * cooling devices. It also gives the opportunity to link the cooling device
 790 * with a device tree node, in order to bind it via the thermal DT code.
 791 *
 792 * Return: a valid struct thermal_cooling_device pointer on success,
 793 * on failure, it returns a corresponding ERR_PTR().
 794 */
 795static struct thermal_cooling_device *
 796__cpufreq_cooling_register(struct device_node *np,
 797			const struct cpumask *clip_cpus, u32 capacitance,
 798			get_static_t plat_static_func)
 799{
 800	struct cpufreq_policy *policy;
 801	struct thermal_cooling_device *cool_dev;
 802	struct cpufreq_cooling_device *cpufreq_dev;
 803	char dev_name[THERMAL_NAME_LENGTH];
 804	struct cpufreq_frequency_table *pos, *table;
 805	struct cpumask temp_mask;
 806	unsigned int freq, i, num_cpus;
 807	int ret;
 808	struct thermal_cooling_device_ops *cooling_ops;
 809
 810	cpumask_and(&temp_mask, clip_cpus, cpu_online_mask);
 811	policy = cpufreq_cpu_get(cpumask_first(&temp_mask));
 812	if (!policy) {
 813		pr_debug("%s: CPUFreq policy not found\n", __func__);
 814		return ERR_PTR(-EPROBE_DEFER);
 815	}
 816
 817	table = policy->freq_table;
 818	if (!table) {
 819		pr_debug("%s: CPUFreq table not found\n", __func__);
 820		cool_dev = ERR_PTR(-ENODEV);
 821		goto put_policy;
 822	}
 823
 824	cpufreq_dev = kzalloc(sizeof(*cpufreq_dev), GFP_KERNEL);
 825	if (!cpufreq_dev) {
 826		cool_dev = ERR_PTR(-ENOMEM);
 827		goto put_policy;
 828	}
 829
 830	num_cpus = cpumask_weight(clip_cpus);
 831	cpufreq_dev->time_in_idle = kcalloc(num_cpus,
 832					    sizeof(*cpufreq_dev->time_in_idle),
 833					    GFP_KERNEL);
 834	if (!cpufreq_dev->time_in_idle) {
 835		cool_dev = ERR_PTR(-ENOMEM);
 836		goto free_cdev;
 837	}
 838
 839	cpufreq_dev->time_in_idle_timestamp =
 840		kcalloc(num_cpus, sizeof(*cpufreq_dev->time_in_idle_timestamp),
 841			GFP_KERNEL);
 842	if (!cpufreq_dev->time_in_idle_timestamp) {
 843		cool_dev = ERR_PTR(-ENOMEM);
 844		goto free_time_in_idle;
 845	}
 846
 847	/* Find max levels */
 848	cpufreq_for_each_valid_entry(pos, table)
 849		cpufreq_dev->max_level++;
 850
 851	cpufreq_dev->freq_table = kmalloc(sizeof(*cpufreq_dev->freq_table) *
 852					  cpufreq_dev->max_level, GFP_KERNEL);
 853	if (!cpufreq_dev->freq_table) {
 854		cool_dev = ERR_PTR(-ENOMEM);
 855		goto free_time_in_idle_timestamp;
 856	}
 857
 858	/* max_level is an index, not a counter */
 859	cpufreq_dev->max_level--;
 860
 861	cpumask_copy(&cpufreq_dev->allowed_cpus, clip_cpus);
 862
 863	if (capacitance) {
 864		cpufreq_dev->plat_get_static_power = plat_static_func;
 865
 866		ret = build_dyn_power_table(cpufreq_dev, capacitance);
 867		if (ret) {
 868			cool_dev = ERR_PTR(ret);
 869			goto free_table;
 870		}
 871
 872		cooling_ops = &cpufreq_power_cooling_ops;
 873	} else {
 874		cooling_ops = &cpufreq_cooling_ops;
 875	}
 876
 877	ret = get_idr(&cpufreq_idr, &cpufreq_dev->id);
 878	if (ret) {
 879		cool_dev = ERR_PTR(ret);
 880		goto free_power_table;
 881	}
 882
 883	/* Fill freq-table in descending order of frequencies */
 884	for (i = 0, freq = -1; i <= cpufreq_dev->max_level; i++) {
 885		freq = find_next_max(table, freq);
 886		cpufreq_dev->freq_table[i] = freq;
 887
 888		/* Warn for duplicate entries */
 889		if (!freq)
 890			pr_warn("%s: table has duplicate entries\n", __func__);
 891		else
 892			pr_debug("%s: freq:%u KHz\n", __func__, freq);
 893	}
 894
 895	snprintf(dev_name, sizeof(dev_name), "thermal-cpufreq-%d",
 896		 cpufreq_dev->id);
 897
 898	cool_dev = thermal_of_cooling_device_register(np, dev_name, cpufreq_dev,
 899						      cooling_ops);
 900	if (IS_ERR(cool_dev))
 901		goto remove_idr;
 902
 903	cpufreq_dev->clipped_freq = cpufreq_dev->freq_table[0];
 904	cpufreq_dev->cool_dev = cool_dev;
 905
 906	mutex_lock(&cooling_cpufreq_lock);
 907
 908	mutex_lock(&cooling_list_lock);
 909	list_add(&cpufreq_dev->node, &cpufreq_dev_list);
 910	mutex_unlock(&cooling_list_lock);
 911
 912	/* Register the notifier for first cpufreq cooling device */
 913	if (!cpufreq_dev_count++)
 914		cpufreq_register_notifier(&thermal_cpufreq_notifier_block,
 915					  CPUFREQ_POLICY_NOTIFIER);
 916	mutex_unlock(&cooling_cpufreq_lock);
 917
 918	goto put_policy;
 919
 920remove_idr:
 921	release_idr(&cpufreq_idr, cpufreq_dev->id);
 922free_power_table:
 923	kfree(cpufreq_dev->dyn_power_table);
 924free_table:
 925	kfree(cpufreq_dev->freq_table);
 926free_time_in_idle_timestamp:
 927	kfree(cpufreq_dev->time_in_idle_timestamp);
 928free_time_in_idle:
 929	kfree(cpufreq_dev->time_in_idle);
 930free_cdev:
 931	kfree(cpufreq_dev);
 932put_policy:
 933	cpufreq_cpu_put(policy);
 934
 935	return cool_dev;
 936}
 937
 938/**
 939 * cpufreq_cooling_register - function to create cpufreq cooling device.
 940 * @clip_cpus: cpumask of cpus where the frequency constraints will happen.
 941 *
 942 * This interface function registers the cpufreq cooling device with the name
 943 * "thermal-cpufreq-%x". This api can support multiple instances of cpufreq
 944 * cooling devices.
 945 *
 946 * Return: a valid struct thermal_cooling_device pointer on success,
 947 * on failure, it returns a corresponding ERR_PTR().
 948 */
 949struct thermal_cooling_device *
 950cpufreq_cooling_register(const struct cpumask *clip_cpus)
 951{
 952	return __cpufreq_cooling_register(NULL, clip_cpus, 0, NULL);
 953}
 954EXPORT_SYMBOL_GPL(cpufreq_cooling_register);
 955
 956/**
 957 * of_cpufreq_cooling_register - function to create cpufreq cooling device.
 958 * @np: a valid struct device_node to the cooling device device tree node
 959 * @clip_cpus: cpumask of cpus where the frequency constraints will happen.
 960 *
 961 * This interface function registers the cpufreq cooling device with the name
 962 * "thermal-cpufreq-%x". This api can support multiple instances of cpufreq
 963 * cooling devices. Using this API, the cpufreq cooling device will be
 964 * linked to the device tree node provided.
 965 *
 966 * Return: a valid struct thermal_cooling_device pointer on success,
 967 * on failure, it returns a corresponding ERR_PTR().
 968 */
 969struct thermal_cooling_device *
 970of_cpufreq_cooling_register(struct device_node *np,
 971			    const struct cpumask *clip_cpus)
 972{
 973	if (!np)
 974		return ERR_PTR(-EINVAL);
 975
 976	return __cpufreq_cooling_register(np, clip_cpus, 0, NULL);
 977}
 978EXPORT_SYMBOL_GPL(of_cpufreq_cooling_register);
 979
 980/**
 981 * cpufreq_power_cooling_register() - create cpufreq cooling device with power extensions
 982 * @clip_cpus:	cpumask of cpus where the frequency constraints will happen
 983 * @capacitance:	dynamic power coefficient for these cpus
 984 * @plat_static_func:	function to calculate the static power consumed by these
 985 *			cpus (optional)
 986 *
 987 * This interface function registers the cpufreq cooling device with
 988 * the name "thermal-cpufreq-%x".  This api can support multiple
 989 * instances of cpufreq cooling devices.  Using this function, the
 990 * cooling device will implement the power extensions by using a
 991 * simple cpu power model.  The cpus must have registered their OPPs
 992 * using the OPP library.
 993 *
 994 * An optional @plat_static_func may be provided to calculate the
 995 * static power consumed by these cpus.  If the platform's static
 996 * power consumption is unknown or negligible, make it NULL.
 997 *
 998 * Return: a valid struct thermal_cooling_device pointer on success,
 999 * on failure, it returns a corresponding ERR_PTR().
1000 */
1001struct thermal_cooling_device *
1002cpufreq_power_cooling_register(const struct cpumask *clip_cpus, u32 capacitance,
1003			       get_static_t plat_static_func)
1004{
1005	return __cpufreq_cooling_register(NULL, clip_cpus, capacitance,
1006				plat_static_func);
1007}
1008EXPORT_SYMBOL(cpufreq_power_cooling_register);
1009
1010/**
1011 * of_cpufreq_power_cooling_register() - create cpufreq cooling device with power extensions
1012 * @np:	a valid struct device_node to the cooling device device tree node
1013 * @clip_cpus:	cpumask of cpus where the frequency constraints will happen
1014 * @capacitance:	dynamic power coefficient for these cpus
1015 * @plat_static_func:	function to calculate the static power consumed by these
1016 *			cpus (optional)
1017 *
1018 * This interface function registers the cpufreq cooling device with
1019 * the name "thermal-cpufreq-%x".  This api can support multiple
1020 * instances of cpufreq cooling devices.  Using this API, the cpufreq
1021 * cooling device will be linked to the device tree node provided.
1022 * Using this function, the cooling device will implement the power
1023 * extensions by using a simple cpu power model.  The cpus must have
1024 * registered their OPPs using the OPP library.
1025 *
1026 * An optional @plat_static_func may be provided to calculate the
1027 * static power consumed by these cpus.  If the platform's static
1028 * power consumption is unknown or negligible, make it NULL.
1029 *
1030 * Return: a valid struct thermal_cooling_device pointer on success,
1031 * on failure, it returns a corresponding ERR_PTR().
1032 */
1033struct thermal_cooling_device *
1034of_cpufreq_power_cooling_register(struct device_node *np,
1035				  const struct cpumask *clip_cpus,
1036				  u32 capacitance,
1037				  get_static_t plat_static_func)
1038{
1039	if (!np)
1040		return ERR_PTR(-EINVAL);
1041
1042	return __cpufreq_cooling_register(np, clip_cpus, capacitance,
1043				plat_static_func);
1044}
1045EXPORT_SYMBOL(of_cpufreq_power_cooling_register);
1046
1047/**
1048 * cpufreq_cooling_unregister - function to remove cpufreq cooling device.
1049 * @cdev: thermal cooling device pointer.
1050 *
1051 * This interface function unregisters the "thermal-cpufreq-%x" cooling device.
1052 */
1053void cpufreq_cooling_unregister(struct thermal_cooling_device *cdev)
1054{
1055	struct cpufreq_cooling_device *cpufreq_dev;
1056
1057	if (!cdev)
1058		return;
1059
1060	cpufreq_dev = cdev->devdata;
1061
1062	/* Unregister the notifier for the last cpufreq cooling device */
1063	mutex_lock(&cooling_cpufreq_lock);
1064	if (!--cpufreq_dev_count)
1065		cpufreq_unregister_notifier(&thermal_cpufreq_notifier_block,
1066					    CPUFREQ_POLICY_NOTIFIER);
1067
1068	mutex_lock(&cooling_list_lock);
1069	list_del(&cpufreq_dev->node);
1070	mutex_unlock(&cooling_list_lock);
1071
1072	mutex_unlock(&cooling_cpufreq_lock);
1073
1074	thermal_cooling_device_unregister(cpufreq_dev->cool_dev);
1075	release_idr(&cpufreq_idr, cpufreq_dev->id);
1076	kfree(cpufreq_dev->dyn_power_table);
1077	kfree(cpufreq_dev->time_in_idle_timestamp);
1078	kfree(cpufreq_dev->time_in_idle);
1079	kfree(cpufreq_dev->freq_table);
1080	kfree(cpufreq_dev);
1081}
1082EXPORT_SYMBOL_GPL(cpufreq_cooling_unregister);