Linux Audio

Check our new training course

Loading...
Note: File does not exist in v3.1.
   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 * Common code for Intel Running Average Power Limit (RAPL) support.
   4 * Copyright (c) 2019, Intel Corporation.
   5 */
   6#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
   7
   8#include <linux/kernel.h>
   9#include <linux/module.h>
  10#include <linux/list.h>
  11#include <linux/types.h>
  12#include <linux/device.h>
  13#include <linux/slab.h>
  14#include <linux/log2.h>
  15#include <linux/bitmap.h>
  16#include <linux/delay.h>
  17#include <linux/sysfs.h>
  18#include <linux/cpu.h>
  19#include <linux/powercap.h>
  20#include <linux/suspend.h>
  21#include <linux/intel_rapl.h>
  22#include <linux/processor.h>
  23#include <linux/platform_device.h>
  24
  25#include <asm/iosf_mbi.h>
  26#include <asm/cpu_device_id.h>
  27#include <asm/intel-family.h>
  28
  29/* bitmasks for RAPL MSRs, used by primitive access functions */
  30#define ENERGY_STATUS_MASK      0xffffffff
  31
  32#define POWER_LIMIT1_MASK       0x7FFF
  33#define POWER_LIMIT1_ENABLE     BIT(15)
  34#define POWER_LIMIT1_CLAMP      BIT(16)
  35
  36#define POWER_LIMIT2_MASK       (0x7FFFULL<<32)
  37#define POWER_LIMIT2_ENABLE     BIT_ULL(47)
  38#define POWER_LIMIT2_CLAMP      BIT_ULL(48)
  39#define POWER_HIGH_LOCK         BIT_ULL(63)
  40#define POWER_LOW_LOCK          BIT(31)
  41
  42#define POWER_LIMIT4_MASK		0x1FFF
  43
  44#define TIME_WINDOW1_MASK       (0x7FULL<<17)
  45#define TIME_WINDOW2_MASK       (0x7FULL<<49)
  46
  47#define POWER_UNIT_OFFSET	0
  48#define POWER_UNIT_MASK		0x0F
  49
  50#define ENERGY_UNIT_OFFSET	0x08
  51#define ENERGY_UNIT_MASK	0x1F00
  52
  53#define TIME_UNIT_OFFSET	0x10
  54#define TIME_UNIT_MASK		0xF0000
  55
  56#define POWER_INFO_MAX_MASK     (0x7fffULL<<32)
  57#define POWER_INFO_MIN_MASK     (0x7fffULL<<16)
  58#define POWER_INFO_MAX_TIME_WIN_MASK     (0x3fULL<<48)
  59#define POWER_INFO_THERMAL_SPEC_MASK     0x7fff
  60
  61#define PERF_STATUS_THROTTLE_TIME_MASK 0xffffffff
  62#define PP_POLICY_MASK         0x1F
  63
  64/*
  65 * SPR has different layout for Psys Domain PowerLimit registers.
  66 * There are 17 bits of PL1 and PL2 instead of 15 bits.
  67 * The Enable bits and TimeWindow bits are also shifted as a result.
  68 */
  69#define PSYS_POWER_LIMIT1_MASK       0x1FFFF
  70#define PSYS_POWER_LIMIT1_ENABLE     BIT(17)
  71
  72#define PSYS_POWER_LIMIT2_MASK       (0x1FFFFULL<<32)
  73#define PSYS_POWER_LIMIT2_ENABLE     BIT_ULL(49)
  74
  75#define PSYS_TIME_WINDOW1_MASK       (0x7FULL<<19)
  76#define PSYS_TIME_WINDOW2_MASK       (0x7FULL<<51)
  77
  78/* Non HW constants */
  79#define RAPL_PRIMITIVE_DERIVED       BIT(1)	/* not from raw data */
  80#define RAPL_PRIMITIVE_DUMMY         BIT(2)
  81
  82#define TIME_WINDOW_MAX_MSEC 40000
  83#define TIME_WINDOW_MIN_MSEC 250
  84#define ENERGY_UNIT_SCALE    1000	/* scale from driver unit to powercap unit */
  85enum unit_type {
  86	ARBITRARY_UNIT,		/* no translation */
  87	POWER_UNIT,
  88	ENERGY_UNIT,
  89	TIME_UNIT,
  90};
  91
  92/* per domain data, some are optional */
  93#define NR_RAW_PRIMITIVES (NR_RAPL_PRIMITIVES - 2)
  94
  95#define	DOMAIN_STATE_INACTIVE           BIT(0)
  96#define	DOMAIN_STATE_POWER_LIMIT_SET    BIT(1)
  97#define DOMAIN_STATE_BIOS_LOCKED        BIT(2)
  98
  99static const char pl1_name[] = "long_term";
 100static const char pl2_name[] = "short_term";
 101static const char pl4_name[] = "peak_power";
 102
 103#define power_zone_to_rapl_domain(_zone) \
 104	container_of(_zone, struct rapl_domain, power_zone)
 105
 106struct rapl_defaults {
 107	u8 floor_freq_reg_addr;
 108	int (*check_unit)(struct rapl_package *rp, int cpu);
 109	void (*set_floor_freq)(struct rapl_domain *rd, bool mode);
 110	u64 (*compute_time_window)(struct rapl_package *rp, u64 val,
 111				    bool to_raw);
 112	unsigned int dram_domain_energy_unit;
 113	unsigned int psys_domain_energy_unit;
 114	bool spr_psys_bits;
 115};
 116static struct rapl_defaults *rapl_defaults;
 117
 118/* Sideband MBI registers */
 119#define IOSF_CPU_POWER_BUDGET_CTL_BYT (0x2)
 120#define IOSF_CPU_POWER_BUDGET_CTL_TNG (0xdf)
 121
 122#define PACKAGE_PLN_INT_SAVED   BIT(0)
 123#define MAX_PRIM_NAME (32)
 124
 125/* per domain data. used to describe individual knobs such that access function
 126 * can be consolidated into one instead of many inline functions.
 127 */
 128struct rapl_primitive_info {
 129	const char *name;
 130	u64 mask;
 131	int shift;
 132	enum rapl_domain_reg_id id;
 133	enum unit_type unit;
 134	u32 flag;
 135};
 136
 137#define PRIMITIVE_INFO_INIT(p, m, s, i, u, f) {	\
 138		.name = #p,			\
 139		.mask = m,			\
 140		.shift = s,			\
 141		.id = i,			\
 142		.unit = u,			\
 143		.flag = f			\
 144	}
 145
 146static void rapl_init_domains(struct rapl_package *rp);
 147static int rapl_read_data_raw(struct rapl_domain *rd,
 148			      enum rapl_primitives prim,
 149			      bool xlate, u64 *data);
 150static int rapl_write_data_raw(struct rapl_domain *rd,
 151			       enum rapl_primitives prim,
 152			       unsigned long long value);
 153static u64 rapl_unit_xlate(struct rapl_domain *rd,
 154			   enum unit_type type, u64 value, int to_raw);
 155static void package_power_limit_irq_save(struct rapl_package *rp);
 156
 157static LIST_HEAD(rapl_packages);	/* guarded by CPU hotplug lock */
 158
 159static const char *const rapl_domain_names[] = {
 160	"package",
 161	"core",
 162	"uncore",
 163	"dram",
 164	"psys",
 165};
 166
 167static int get_energy_counter(struct powercap_zone *power_zone,
 168			      u64 *energy_raw)
 169{
 170	struct rapl_domain *rd;
 171	u64 energy_now;
 172
 173	/* prevent CPU hotplug, make sure the RAPL domain does not go
 174	 * away while reading the counter.
 175	 */
 176	cpus_read_lock();
 177	rd = power_zone_to_rapl_domain(power_zone);
 178
 179	if (!rapl_read_data_raw(rd, ENERGY_COUNTER, true, &energy_now)) {
 180		*energy_raw = energy_now;
 181		cpus_read_unlock();
 182
 183		return 0;
 184	}
 185	cpus_read_unlock();
 186
 187	return -EIO;
 188}
 189
 190static int get_max_energy_counter(struct powercap_zone *pcd_dev, u64 *energy)
 191{
 192	struct rapl_domain *rd = power_zone_to_rapl_domain(pcd_dev);
 193
 194	*energy = rapl_unit_xlate(rd, ENERGY_UNIT, ENERGY_STATUS_MASK, 0);
 195	return 0;
 196}
 197
 198static int release_zone(struct powercap_zone *power_zone)
 199{
 200	struct rapl_domain *rd = power_zone_to_rapl_domain(power_zone);
 201	struct rapl_package *rp = rd->rp;
 202
 203	/* package zone is the last zone of a package, we can free
 204	 * memory here since all children has been unregistered.
 205	 */
 206	if (rd->id == RAPL_DOMAIN_PACKAGE) {
 207		kfree(rd);
 208		rp->domains = NULL;
 209	}
 210
 211	return 0;
 212
 213}
 214
 215static int find_nr_power_limit(struct rapl_domain *rd)
 216{
 217	int i, nr_pl = 0;
 218
 219	for (i = 0; i < NR_POWER_LIMITS; i++) {
 220		if (rd->rpl[i].name)
 221			nr_pl++;
 222	}
 223
 224	return nr_pl;
 225}
 226
 227static int set_domain_enable(struct powercap_zone *power_zone, bool mode)
 228{
 229	struct rapl_domain *rd = power_zone_to_rapl_domain(power_zone);
 230
 231	if (rd->state & DOMAIN_STATE_BIOS_LOCKED)
 232		return -EACCES;
 233
 234	cpus_read_lock();
 235	rapl_write_data_raw(rd, PL1_ENABLE, mode);
 236	if (rapl_defaults->set_floor_freq)
 237		rapl_defaults->set_floor_freq(rd, mode);
 238	cpus_read_unlock();
 239
 240	return 0;
 241}
 242
 243static int get_domain_enable(struct powercap_zone *power_zone, bool *mode)
 244{
 245	struct rapl_domain *rd = power_zone_to_rapl_domain(power_zone);
 246	u64 val;
 247
 248	if (rd->state & DOMAIN_STATE_BIOS_LOCKED) {
 249		*mode = false;
 250		return 0;
 251	}
 252	cpus_read_lock();
 253	if (rapl_read_data_raw(rd, PL1_ENABLE, true, &val)) {
 254		cpus_read_unlock();
 255		return -EIO;
 256	}
 257	*mode = val;
 258	cpus_read_unlock();
 259
 260	return 0;
 261}
 262
 263/* per RAPL domain ops, in the order of rapl_domain_type */
 264static const struct powercap_zone_ops zone_ops[] = {
 265	/* RAPL_DOMAIN_PACKAGE */
 266	{
 267	 .get_energy_uj = get_energy_counter,
 268	 .get_max_energy_range_uj = get_max_energy_counter,
 269	 .release = release_zone,
 270	 .set_enable = set_domain_enable,
 271	 .get_enable = get_domain_enable,
 272	 },
 273	/* RAPL_DOMAIN_PP0 */
 274	{
 275	 .get_energy_uj = get_energy_counter,
 276	 .get_max_energy_range_uj = get_max_energy_counter,
 277	 .release = release_zone,
 278	 .set_enable = set_domain_enable,
 279	 .get_enable = get_domain_enable,
 280	 },
 281	/* RAPL_DOMAIN_PP1 */
 282	{
 283	 .get_energy_uj = get_energy_counter,
 284	 .get_max_energy_range_uj = get_max_energy_counter,
 285	 .release = release_zone,
 286	 .set_enable = set_domain_enable,
 287	 .get_enable = get_domain_enable,
 288	 },
 289	/* RAPL_DOMAIN_DRAM */
 290	{
 291	 .get_energy_uj = get_energy_counter,
 292	 .get_max_energy_range_uj = get_max_energy_counter,
 293	 .release = release_zone,
 294	 .set_enable = set_domain_enable,
 295	 .get_enable = get_domain_enable,
 296	 },
 297	/* RAPL_DOMAIN_PLATFORM */
 298	{
 299	 .get_energy_uj = get_energy_counter,
 300	 .get_max_energy_range_uj = get_max_energy_counter,
 301	 .release = release_zone,
 302	 .set_enable = set_domain_enable,
 303	 .get_enable = get_domain_enable,
 304	 },
 305};
 306
 307/*
 308 * Constraint index used by powercap can be different than power limit (PL)
 309 * index in that some  PLs maybe missing due to non-existent MSRs. So we
 310 * need to convert here by finding the valid PLs only (name populated).
 311 */
 312static int contraint_to_pl(struct rapl_domain *rd, int cid)
 313{
 314	int i, j;
 315
 316	for (i = 0, j = 0; i < NR_POWER_LIMITS; i++) {
 317		if ((rd->rpl[i].name) && j++ == cid) {
 318			pr_debug("%s: index %d\n", __func__, i);
 319			return i;
 320		}
 321	}
 322	pr_err("Cannot find matching power limit for constraint %d\n", cid);
 323
 324	return -EINVAL;
 325}
 326
 327static int set_power_limit(struct powercap_zone *power_zone, int cid,
 328			   u64 power_limit)
 329{
 330	struct rapl_domain *rd;
 331	struct rapl_package *rp;
 332	int ret = 0;
 333	int id;
 334
 335	cpus_read_lock();
 336	rd = power_zone_to_rapl_domain(power_zone);
 337	id = contraint_to_pl(rd, cid);
 338	if (id < 0) {
 339		ret = id;
 340		goto set_exit;
 341	}
 342
 343	rp = rd->rp;
 344
 345	if (rd->state & DOMAIN_STATE_BIOS_LOCKED) {
 346		dev_warn(&power_zone->dev,
 347			 "%s locked by BIOS, monitoring only\n", rd->name);
 348		ret = -EACCES;
 349		goto set_exit;
 350	}
 351
 352	switch (rd->rpl[id].prim_id) {
 353	case PL1_ENABLE:
 354		rapl_write_data_raw(rd, POWER_LIMIT1, power_limit);
 355		break;
 356	case PL2_ENABLE:
 357		rapl_write_data_raw(rd, POWER_LIMIT2, power_limit);
 358		break;
 359	case PL4_ENABLE:
 360		rapl_write_data_raw(rd, POWER_LIMIT4, power_limit);
 361		break;
 362	default:
 363		ret = -EINVAL;
 364	}
 365	if (!ret)
 366		package_power_limit_irq_save(rp);
 367set_exit:
 368	cpus_read_unlock();
 369	return ret;
 370}
 371
 372static int get_current_power_limit(struct powercap_zone *power_zone, int cid,
 373				   u64 *data)
 374{
 375	struct rapl_domain *rd;
 376	u64 val;
 377	int prim;
 378	int ret = 0;
 379	int id;
 380
 381	cpus_read_lock();
 382	rd = power_zone_to_rapl_domain(power_zone);
 383	id = contraint_to_pl(rd, cid);
 384	if (id < 0) {
 385		ret = id;
 386		goto get_exit;
 387	}
 388
 389	switch (rd->rpl[id].prim_id) {
 390	case PL1_ENABLE:
 391		prim = POWER_LIMIT1;
 392		break;
 393	case PL2_ENABLE:
 394		prim = POWER_LIMIT2;
 395		break;
 396	case PL4_ENABLE:
 397		prim = POWER_LIMIT4;
 398		break;
 399	default:
 400		cpus_read_unlock();
 401		return -EINVAL;
 402	}
 403	if (rapl_read_data_raw(rd, prim, true, &val))
 404		ret = -EIO;
 405	else
 406		*data = val;
 407
 408get_exit:
 409	cpus_read_unlock();
 410
 411	return ret;
 412}
 413
 414static int set_time_window(struct powercap_zone *power_zone, int cid,
 415			   u64 window)
 416{
 417	struct rapl_domain *rd;
 418	int ret = 0;
 419	int id;
 420
 421	cpus_read_lock();
 422	rd = power_zone_to_rapl_domain(power_zone);
 423	id = contraint_to_pl(rd, cid);
 424	if (id < 0) {
 425		ret = id;
 426		goto set_time_exit;
 427	}
 428
 429	switch (rd->rpl[id].prim_id) {
 430	case PL1_ENABLE:
 431		rapl_write_data_raw(rd, TIME_WINDOW1, window);
 432		break;
 433	case PL2_ENABLE:
 434		rapl_write_data_raw(rd, TIME_WINDOW2, window);
 435		break;
 436	default:
 437		ret = -EINVAL;
 438	}
 439
 440set_time_exit:
 441	cpus_read_unlock();
 442	return ret;
 443}
 444
 445static int get_time_window(struct powercap_zone *power_zone, int cid,
 446			   u64 *data)
 447{
 448	struct rapl_domain *rd;
 449	u64 val;
 450	int ret = 0;
 451	int id;
 452
 453	cpus_read_lock();
 454	rd = power_zone_to_rapl_domain(power_zone);
 455	id = contraint_to_pl(rd, cid);
 456	if (id < 0) {
 457		ret = id;
 458		goto get_time_exit;
 459	}
 460
 461	switch (rd->rpl[id].prim_id) {
 462	case PL1_ENABLE:
 463		ret = rapl_read_data_raw(rd, TIME_WINDOW1, true, &val);
 464		break;
 465	case PL2_ENABLE:
 466		ret = rapl_read_data_raw(rd, TIME_WINDOW2, true, &val);
 467		break;
 468	case PL4_ENABLE:
 469		/*
 470		 * Time window parameter is not applicable for PL4 entry
 471		 * so assigining '0' as default value.
 472		 */
 473		val = 0;
 474		break;
 475	default:
 476		cpus_read_unlock();
 477		return -EINVAL;
 478	}
 479	if (!ret)
 480		*data = val;
 481
 482get_time_exit:
 483	cpus_read_unlock();
 484
 485	return ret;
 486}
 487
 488static const char *get_constraint_name(struct powercap_zone *power_zone,
 489				       int cid)
 490{
 491	struct rapl_domain *rd;
 492	int id;
 493
 494	rd = power_zone_to_rapl_domain(power_zone);
 495	id = contraint_to_pl(rd, cid);
 496	if (id >= 0)
 497		return rd->rpl[id].name;
 498
 499	return NULL;
 500}
 501
 502static int get_max_power(struct powercap_zone *power_zone, int id, u64 *data)
 503{
 504	struct rapl_domain *rd;
 505	u64 val;
 506	int prim;
 507	int ret = 0;
 508
 509	cpus_read_lock();
 510	rd = power_zone_to_rapl_domain(power_zone);
 511	switch (rd->rpl[id].prim_id) {
 512	case PL1_ENABLE:
 513		prim = THERMAL_SPEC_POWER;
 514		break;
 515	case PL2_ENABLE:
 516		prim = MAX_POWER;
 517		break;
 518	case PL4_ENABLE:
 519		prim = MAX_POWER;
 520		break;
 521	default:
 522		cpus_read_unlock();
 523		return -EINVAL;
 524	}
 525	if (rapl_read_data_raw(rd, prim, true, &val))
 526		ret = -EIO;
 527	else
 528		*data = val;
 529
 530	/* As a generalization rule, PL4 would be around two times PL2. */
 531	if (rd->rpl[id].prim_id == PL4_ENABLE)
 532		*data = *data * 2;
 533
 534	cpus_read_unlock();
 535
 536	return ret;
 537}
 538
 539static const struct powercap_zone_constraint_ops constraint_ops = {
 540	.set_power_limit_uw = set_power_limit,
 541	.get_power_limit_uw = get_current_power_limit,
 542	.set_time_window_us = set_time_window,
 543	.get_time_window_us = get_time_window,
 544	.get_max_power_uw = get_max_power,
 545	.get_name = get_constraint_name,
 546};
 547
 548/* called after domain detection and package level data are set */
 549static void rapl_init_domains(struct rapl_package *rp)
 550{
 551	enum rapl_domain_type i;
 552	enum rapl_domain_reg_id j;
 553	struct rapl_domain *rd = rp->domains;
 554
 555	for (i = 0; i < RAPL_DOMAIN_MAX; i++) {
 556		unsigned int mask = rp->domain_map & (1 << i);
 557
 558		if (!mask)
 559			continue;
 560
 561		rd->rp = rp;
 562
 563		if (i == RAPL_DOMAIN_PLATFORM && rp->id > 0) {
 564			snprintf(rd->name, RAPL_DOMAIN_NAME_LENGTH, "psys-%d",
 565				topology_physical_package_id(rp->lead_cpu));
 566		} else
 567			snprintf(rd->name, RAPL_DOMAIN_NAME_LENGTH, "%s",
 568				rapl_domain_names[i]);
 569
 570		rd->id = i;
 571		rd->rpl[0].prim_id = PL1_ENABLE;
 572		rd->rpl[0].name = pl1_name;
 573
 574		/*
 575		 * The PL2 power domain is applicable for limits two
 576		 * and limits three
 577		 */
 578		if (rp->priv->limits[i] >= 2) {
 579			rd->rpl[1].prim_id = PL2_ENABLE;
 580			rd->rpl[1].name = pl2_name;
 581		}
 582
 583		/* Enable PL4 domain if the total power limits are three */
 584		if (rp->priv->limits[i] == 3) {
 585			rd->rpl[2].prim_id = PL4_ENABLE;
 586			rd->rpl[2].name = pl4_name;
 587		}
 588
 589		for (j = 0; j < RAPL_DOMAIN_REG_MAX; j++)
 590			rd->regs[j] = rp->priv->regs[i][j];
 591
 592		switch (i) {
 593		case RAPL_DOMAIN_DRAM:
 594			rd->domain_energy_unit =
 595			    rapl_defaults->dram_domain_energy_unit;
 596			if (rd->domain_energy_unit)
 597				pr_info("DRAM domain energy unit %dpj\n",
 598					rd->domain_energy_unit);
 599			break;
 600		case RAPL_DOMAIN_PLATFORM:
 601			rd->domain_energy_unit =
 602			    rapl_defaults->psys_domain_energy_unit;
 603			if (rd->domain_energy_unit)
 604				pr_info("Platform domain energy unit %dpj\n",
 605					rd->domain_energy_unit);
 606			break;
 607		default:
 608			break;
 609		}
 610		rd++;
 611	}
 612}
 613
 614static u64 rapl_unit_xlate(struct rapl_domain *rd, enum unit_type type,
 615			   u64 value, int to_raw)
 616{
 617	u64 units = 1;
 618	struct rapl_package *rp = rd->rp;
 619	u64 scale = 1;
 620
 621	switch (type) {
 622	case POWER_UNIT:
 623		units = rp->power_unit;
 624		break;
 625	case ENERGY_UNIT:
 626		scale = ENERGY_UNIT_SCALE;
 627		/* per domain unit takes precedence */
 628		if (rd->domain_energy_unit)
 629			units = rd->domain_energy_unit;
 630		else
 631			units = rp->energy_unit;
 632		break;
 633	case TIME_UNIT:
 634		return rapl_defaults->compute_time_window(rp, value, to_raw);
 635	case ARBITRARY_UNIT:
 636	default:
 637		return value;
 638	}
 639
 640	if (to_raw)
 641		return div64_u64(value, units) * scale;
 642
 643	value *= units;
 644
 645	return div64_u64(value, scale);
 646}
 647
 648/* in the order of enum rapl_primitives */
 649static struct rapl_primitive_info rpi[] = {
 650	/* name, mask, shift, msr index, unit divisor */
 651	PRIMITIVE_INFO_INIT(ENERGY_COUNTER, ENERGY_STATUS_MASK, 0,
 652			    RAPL_DOMAIN_REG_STATUS, ENERGY_UNIT, 0),
 653	PRIMITIVE_INFO_INIT(POWER_LIMIT1, POWER_LIMIT1_MASK, 0,
 654			    RAPL_DOMAIN_REG_LIMIT, POWER_UNIT, 0),
 655	PRIMITIVE_INFO_INIT(POWER_LIMIT2, POWER_LIMIT2_MASK, 32,
 656			    RAPL_DOMAIN_REG_LIMIT, POWER_UNIT, 0),
 657	PRIMITIVE_INFO_INIT(POWER_LIMIT4, POWER_LIMIT4_MASK, 0,
 658				RAPL_DOMAIN_REG_PL4, POWER_UNIT, 0),
 659	PRIMITIVE_INFO_INIT(FW_LOCK, POWER_LOW_LOCK, 31,
 660			    RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0),
 661	PRIMITIVE_INFO_INIT(PL1_ENABLE, POWER_LIMIT1_ENABLE, 15,
 662			    RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0),
 663	PRIMITIVE_INFO_INIT(PL1_CLAMP, POWER_LIMIT1_CLAMP, 16,
 664			    RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0),
 665	PRIMITIVE_INFO_INIT(PL2_ENABLE, POWER_LIMIT2_ENABLE, 47,
 666			    RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0),
 667	PRIMITIVE_INFO_INIT(PL2_CLAMP, POWER_LIMIT2_CLAMP, 48,
 668			    RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0),
 669	PRIMITIVE_INFO_INIT(PL4_ENABLE, POWER_LIMIT4_MASK, 0,
 670				RAPL_DOMAIN_REG_PL4, ARBITRARY_UNIT, 0),
 671	PRIMITIVE_INFO_INIT(TIME_WINDOW1, TIME_WINDOW1_MASK, 17,
 672			    RAPL_DOMAIN_REG_LIMIT, TIME_UNIT, 0),
 673	PRIMITIVE_INFO_INIT(TIME_WINDOW2, TIME_WINDOW2_MASK, 49,
 674			    RAPL_DOMAIN_REG_LIMIT, TIME_UNIT, 0),
 675	PRIMITIVE_INFO_INIT(THERMAL_SPEC_POWER, POWER_INFO_THERMAL_SPEC_MASK,
 676			    0, RAPL_DOMAIN_REG_INFO, POWER_UNIT, 0),
 677	PRIMITIVE_INFO_INIT(MAX_POWER, POWER_INFO_MAX_MASK, 32,
 678			    RAPL_DOMAIN_REG_INFO, POWER_UNIT, 0),
 679	PRIMITIVE_INFO_INIT(MIN_POWER, POWER_INFO_MIN_MASK, 16,
 680			    RAPL_DOMAIN_REG_INFO, POWER_UNIT, 0),
 681	PRIMITIVE_INFO_INIT(MAX_TIME_WINDOW, POWER_INFO_MAX_TIME_WIN_MASK, 48,
 682			    RAPL_DOMAIN_REG_INFO, TIME_UNIT, 0),
 683	PRIMITIVE_INFO_INIT(THROTTLED_TIME, PERF_STATUS_THROTTLE_TIME_MASK, 0,
 684			    RAPL_DOMAIN_REG_PERF, TIME_UNIT, 0),
 685	PRIMITIVE_INFO_INIT(PRIORITY_LEVEL, PP_POLICY_MASK, 0,
 686			    RAPL_DOMAIN_REG_POLICY, ARBITRARY_UNIT, 0),
 687	PRIMITIVE_INFO_INIT(PSYS_POWER_LIMIT1, PSYS_POWER_LIMIT1_MASK, 0,
 688			    RAPL_DOMAIN_REG_LIMIT, POWER_UNIT, 0),
 689	PRIMITIVE_INFO_INIT(PSYS_POWER_LIMIT2, PSYS_POWER_LIMIT2_MASK, 32,
 690			    RAPL_DOMAIN_REG_LIMIT, POWER_UNIT, 0),
 691	PRIMITIVE_INFO_INIT(PSYS_PL1_ENABLE, PSYS_POWER_LIMIT1_ENABLE, 17,
 692			    RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0),
 693	PRIMITIVE_INFO_INIT(PSYS_PL2_ENABLE, PSYS_POWER_LIMIT2_ENABLE, 49,
 694			    RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0),
 695	PRIMITIVE_INFO_INIT(PSYS_TIME_WINDOW1, PSYS_TIME_WINDOW1_MASK, 19,
 696			    RAPL_DOMAIN_REG_LIMIT, TIME_UNIT, 0),
 697	PRIMITIVE_INFO_INIT(PSYS_TIME_WINDOW2, PSYS_TIME_WINDOW2_MASK, 51,
 698			    RAPL_DOMAIN_REG_LIMIT, TIME_UNIT, 0),
 699	/* non-hardware */
 700	PRIMITIVE_INFO_INIT(AVERAGE_POWER, 0, 0, 0, POWER_UNIT,
 701			    RAPL_PRIMITIVE_DERIVED),
 702	{NULL, 0, 0, 0},
 703};
 704
 705static enum rapl_primitives
 706prim_fixups(struct rapl_domain *rd, enum rapl_primitives prim)
 707{
 708	if (!rapl_defaults->spr_psys_bits)
 709		return prim;
 710
 711	if (rd->id != RAPL_DOMAIN_PLATFORM)
 712		return prim;
 713
 714	switch (prim) {
 715	case POWER_LIMIT1:
 716		return PSYS_POWER_LIMIT1;
 717	case POWER_LIMIT2:
 718		return PSYS_POWER_LIMIT2;
 719	case PL1_ENABLE:
 720		return PSYS_PL1_ENABLE;
 721	case PL2_ENABLE:
 722		return PSYS_PL2_ENABLE;
 723	case TIME_WINDOW1:
 724		return PSYS_TIME_WINDOW1;
 725	case TIME_WINDOW2:
 726		return PSYS_TIME_WINDOW2;
 727	default:
 728		return prim;
 729	}
 730}
 731
 732/* Read primitive data based on its related struct rapl_primitive_info.
 733 * if xlate flag is set, return translated data based on data units, i.e.
 734 * time, energy, and power.
 735 * RAPL MSRs are non-architectual and are laid out not consistently across
 736 * domains. Here we use primitive info to allow writing consolidated access
 737 * functions.
 738 * For a given primitive, it is processed by MSR mask and shift. Unit conversion
 739 * is pre-assigned based on RAPL unit MSRs read at init time.
 740 * 63-------------------------- 31--------------------------- 0
 741 * |                           xxxxx (mask)                   |
 742 * |                                |<- shift ----------------|
 743 * 63-------------------------- 31--------------------------- 0
 744 */
 745static int rapl_read_data_raw(struct rapl_domain *rd,
 746			      enum rapl_primitives prim, bool xlate, u64 *data)
 747{
 748	u64 value;
 749	enum rapl_primitives prim_fixed = prim_fixups(rd, prim);
 750	struct rapl_primitive_info *rp = &rpi[prim_fixed];
 751	struct reg_action ra;
 752	int cpu;
 753
 754	if (!rp->name || rp->flag & RAPL_PRIMITIVE_DUMMY)
 755		return -EINVAL;
 756
 757	ra.reg = rd->regs[rp->id];
 758	if (!ra.reg)
 759		return -EINVAL;
 760
 761	cpu = rd->rp->lead_cpu;
 762
 763	/* domain with 2 limits has different bit */
 764	if (prim == FW_LOCK && rd->rp->priv->limits[rd->id] == 2) {
 765		rp->mask = POWER_HIGH_LOCK;
 766		rp->shift = 63;
 767	}
 768	/* non-hardware data are collected by the polling thread */
 769	if (rp->flag & RAPL_PRIMITIVE_DERIVED) {
 770		*data = rd->rdd.primitives[prim];
 771		return 0;
 772	}
 773
 774	ra.mask = rp->mask;
 775
 776	if (rd->rp->priv->read_raw(cpu, &ra)) {
 777		pr_debug("failed to read reg 0x%llx on cpu %d\n", ra.reg, cpu);
 778		return -EIO;
 779	}
 780
 781	value = ra.value >> rp->shift;
 782
 783	if (xlate)
 784		*data = rapl_unit_xlate(rd, rp->unit, value, 0);
 785	else
 786		*data = value;
 787
 788	return 0;
 789}
 790
 791/* Similar use of primitive info in the read counterpart */
 792static int rapl_write_data_raw(struct rapl_domain *rd,
 793			       enum rapl_primitives prim,
 794			       unsigned long long value)
 795{
 796	enum rapl_primitives prim_fixed = prim_fixups(rd, prim);
 797	struct rapl_primitive_info *rp = &rpi[prim_fixed];
 798	int cpu;
 799	u64 bits;
 800	struct reg_action ra;
 801	int ret;
 802
 803	cpu = rd->rp->lead_cpu;
 804	bits = rapl_unit_xlate(rd, rp->unit, value, 1);
 805	bits <<= rp->shift;
 806	bits &= rp->mask;
 807
 808	memset(&ra, 0, sizeof(ra));
 809
 810	ra.reg = rd->regs[rp->id];
 811	ra.mask = rp->mask;
 812	ra.value = bits;
 813
 814	ret = rd->rp->priv->write_raw(cpu, &ra);
 815
 816	return ret;
 817}
 818
 819/*
 820 * Raw RAPL data stored in MSRs are in certain scales. We need to
 821 * convert them into standard units based on the units reported in
 822 * the RAPL unit MSRs. This is specific to CPUs as the method to
 823 * calculate units differ on different CPUs.
 824 * We convert the units to below format based on CPUs.
 825 * i.e.
 826 * energy unit: picoJoules  : Represented in picoJoules by default
 827 * power unit : microWatts  : Represented in milliWatts by default
 828 * time unit  : microseconds: Represented in seconds by default
 829 */
 830static int rapl_check_unit_core(struct rapl_package *rp, int cpu)
 831{
 832	struct reg_action ra;
 833	u32 value;
 834
 835	ra.reg = rp->priv->reg_unit;
 836	ra.mask = ~0;
 837	if (rp->priv->read_raw(cpu, &ra)) {
 838		pr_err("Failed to read power unit REG 0x%llx on CPU %d, exit.\n",
 839		       rp->priv->reg_unit, cpu);
 840		return -ENODEV;
 841	}
 842
 843	value = (ra.value & ENERGY_UNIT_MASK) >> ENERGY_UNIT_OFFSET;
 844	rp->energy_unit = ENERGY_UNIT_SCALE * 1000000 / (1 << value);
 845
 846	value = (ra.value & POWER_UNIT_MASK) >> POWER_UNIT_OFFSET;
 847	rp->power_unit = 1000000 / (1 << value);
 848
 849	value = (ra.value & TIME_UNIT_MASK) >> TIME_UNIT_OFFSET;
 850	rp->time_unit = 1000000 / (1 << value);
 851
 852	pr_debug("Core CPU %s energy=%dpJ, time=%dus, power=%duW\n",
 853		 rp->name, rp->energy_unit, rp->time_unit, rp->power_unit);
 854
 855	return 0;
 856}
 857
 858static int rapl_check_unit_atom(struct rapl_package *rp, int cpu)
 859{
 860	struct reg_action ra;
 861	u32 value;
 862
 863	ra.reg = rp->priv->reg_unit;
 864	ra.mask = ~0;
 865	if (rp->priv->read_raw(cpu, &ra)) {
 866		pr_err("Failed to read power unit REG 0x%llx on CPU %d, exit.\n",
 867		       rp->priv->reg_unit, cpu);
 868		return -ENODEV;
 869	}
 870
 871	value = (ra.value & ENERGY_UNIT_MASK) >> ENERGY_UNIT_OFFSET;
 872	rp->energy_unit = ENERGY_UNIT_SCALE * 1 << value;
 873
 874	value = (ra.value & POWER_UNIT_MASK) >> POWER_UNIT_OFFSET;
 875	rp->power_unit = (1 << value) * 1000;
 876
 877	value = (ra.value & TIME_UNIT_MASK) >> TIME_UNIT_OFFSET;
 878	rp->time_unit = 1000000 / (1 << value);
 879
 880	pr_debug("Atom %s energy=%dpJ, time=%dus, power=%duW\n",
 881		 rp->name, rp->energy_unit, rp->time_unit, rp->power_unit);
 882
 883	return 0;
 884}
 885
 886static void power_limit_irq_save_cpu(void *info)
 887{
 888	u32 l, h = 0;
 889	struct rapl_package *rp = (struct rapl_package *)info;
 890
 891	/* save the state of PLN irq mask bit before disabling it */
 892	rdmsr_safe(MSR_IA32_PACKAGE_THERM_INTERRUPT, &l, &h);
 893	if (!(rp->power_limit_irq & PACKAGE_PLN_INT_SAVED)) {
 894		rp->power_limit_irq = l & PACKAGE_THERM_INT_PLN_ENABLE;
 895		rp->power_limit_irq |= PACKAGE_PLN_INT_SAVED;
 896	}
 897	l &= ~PACKAGE_THERM_INT_PLN_ENABLE;
 898	wrmsr_safe(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
 899}
 900
/* REVISIT:
 * When the package power limit is set artificially low by RAPL, the LVT
 * thermal interrupt for the package power limit should be ignored,
 * since we are not really exceeding the real limit. The intention
 * is to avoid excessive interrupts while we are trying to save power.
 * A useful feature might be routing the package_power_limit interrupt
 * to userspace via eventfd. Once we have a use case, this is simple
 * to do by adding an atomic notifier.
 */
 910
 911static void package_power_limit_irq_save(struct rapl_package *rp)
 912{
 913	if (!boot_cpu_has(X86_FEATURE_PTS) || !boot_cpu_has(X86_FEATURE_PLN))
 914		return;
 915
 916	smp_call_function_single(rp->lead_cpu, power_limit_irq_save_cpu, rp, 1);
 917}
 918
 919/*
 920 * Restore per package power limit interrupt enable state. Called from cpu
 921 * hotplug code on package removal.
 922 */
 923static void package_power_limit_irq_restore(struct rapl_package *rp)
 924{
 925	u32 l, h;
 926
 927	if (!boot_cpu_has(X86_FEATURE_PTS) || !boot_cpu_has(X86_FEATURE_PLN))
 928		return;
 929
 930	/* irq enable state not saved, nothing to restore */
 931	if (!(rp->power_limit_irq & PACKAGE_PLN_INT_SAVED))
 932		return;
 933
 934	rdmsr_safe(MSR_IA32_PACKAGE_THERM_INTERRUPT, &l, &h);
 935
 936	if (rp->power_limit_irq & PACKAGE_THERM_INT_PLN_ENABLE)
 937		l |= PACKAGE_THERM_INT_PLN_ENABLE;
 938	else
 939		l &= ~PACKAGE_THERM_INT_PLN_ENABLE;
 940
 941	wrmsr_safe(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
 942}
 943
 944static void set_floor_freq_default(struct rapl_domain *rd, bool mode)
 945{
 946	int nr_powerlimit = find_nr_power_limit(rd);
 947
 948	/* always enable clamp such that p-state can go below OS requested
 949	 * range. power capping priority over guranteed frequency.
 950	 */
 951	rapl_write_data_raw(rd, PL1_CLAMP, mode);
 952
 953	/* some domains have pl2 */
 954	if (nr_powerlimit > 1) {
 955		rapl_write_data_raw(rd, PL2_ENABLE, mode);
 956		rapl_write_data_raw(rd, PL2_CLAMP, mode);
 957	}
 958}
 959
 960static void set_floor_freq_atom(struct rapl_domain *rd, bool enable)
 961{
 962	static u32 power_ctrl_orig_val;
 963	u32 mdata;
 964
 965	if (!rapl_defaults->floor_freq_reg_addr) {
 966		pr_err("Invalid floor frequency config register\n");
 967		return;
 968	}
 969
 970	if (!power_ctrl_orig_val)
 971		iosf_mbi_read(BT_MBI_UNIT_PMC, MBI_CR_READ,
 972			      rapl_defaults->floor_freq_reg_addr,
 973			      &power_ctrl_orig_val);
 974	mdata = power_ctrl_orig_val;
 975	if (enable) {
 976		mdata &= ~(0x7f << 8);
 977		mdata |= 1 << 8;
 978	}
 979	iosf_mbi_write(BT_MBI_UNIT_PMC, MBI_CR_WRITE,
 980		       rapl_defaults->floor_freq_reg_addr, mdata);
 981}
 982
 983static u64 rapl_compute_time_window_core(struct rapl_package *rp, u64 value,
 984					 bool to_raw)
 985{
 986	u64 f, y;		/* fraction and exp. used for time unit */
 987
 988	/*
 989	 * Special processing based on 2^Y*(1+F/4), refer
 990	 * to Intel Software Developer's manual Vol.3B: CH 14.9.3.
 991	 */
 992	if (!to_raw) {
 993		f = (value & 0x60) >> 5;
 994		y = value & 0x1f;
 995		value = (1 << y) * (4 + f) * rp->time_unit / 4;
 996	} else {
 997		if (value < rp->time_unit)
 998			return 0;
 999
1000		do_div(value, rp->time_unit);
1001		y = ilog2(value);
1002		f = div64_u64(4 * (value - (1 << y)), 1 << y);
1003		value = (y & 0x1f) | ((f & 0x3) << 5);
1004	}
1005	return value;
1006}
1007
1008static u64 rapl_compute_time_window_atom(struct rapl_package *rp, u64 value,
1009					 bool to_raw)
1010{
1011	/*
1012	 * Atom time unit encoding is straight forward val * time_unit,
1013	 * where time_unit is default to 1 sec. Never 0.
1014	 */
1015	if (!to_raw)
1016		return (value) ? value * rp->time_unit : rp->time_unit;
1017
1018	value = div64_u64(value, rp->time_unit);
1019
1020	return value;
1021}
1022
1023static const struct rapl_defaults rapl_defaults_core = {
1024	.floor_freq_reg_addr = 0,
1025	.check_unit = rapl_check_unit_core,
1026	.set_floor_freq = set_floor_freq_default,
1027	.compute_time_window = rapl_compute_time_window_core,
1028};
1029
1030static const struct rapl_defaults rapl_defaults_hsw_server = {
1031	.check_unit = rapl_check_unit_core,
1032	.set_floor_freq = set_floor_freq_default,
1033	.compute_time_window = rapl_compute_time_window_core,
1034	.dram_domain_energy_unit = 15300,
1035};
1036
1037static const struct rapl_defaults rapl_defaults_spr_server = {
1038	.check_unit = rapl_check_unit_core,
1039	.set_floor_freq = set_floor_freq_default,
1040	.compute_time_window = rapl_compute_time_window_core,
1041	.psys_domain_energy_unit = 1000000000,
1042	.spr_psys_bits = true,
1043};
1044
1045static const struct rapl_defaults rapl_defaults_byt = {
1046	.floor_freq_reg_addr = IOSF_CPU_POWER_BUDGET_CTL_BYT,
1047	.check_unit = rapl_check_unit_atom,
1048	.set_floor_freq = set_floor_freq_atom,
1049	.compute_time_window = rapl_compute_time_window_atom,
1050};
1051
1052static const struct rapl_defaults rapl_defaults_tng = {
1053	.floor_freq_reg_addr = IOSF_CPU_POWER_BUDGET_CTL_TNG,
1054	.check_unit = rapl_check_unit_atom,
1055	.set_floor_freq = set_floor_freq_atom,
1056	.compute_time_window = rapl_compute_time_window_atom,
1057};
1058
1059static const struct rapl_defaults rapl_defaults_ann = {
1060	.floor_freq_reg_addr = 0,
1061	.check_unit = rapl_check_unit_atom,
1062	.set_floor_freq = NULL,
1063	.compute_time_window = rapl_compute_time_window_atom,
1064};
1065
1066static const struct rapl_defaults rapl_defaults_cht = {
1067	.floor_freq_reg_addr = 0,
1068	.check_unit = rapl_check_unit_atom,
1069	.set_floor_freq = NULL,
1070	.compute_time_window = rapl_compute_time_window_atom,
1071};
1072
1073static const struct rapl_defaults rapl_defaults_amd = {
1074	.check_unit = rapl_check_unit_core,
1075};
1076
1077static const struct x86_cpu_id rapl_ids[] __initconst = {
1078	X86_MATCH_INTEL_FAM6_MODEL(SANDYBRIDGE,		&rapl_defaults_core),
1079	X86_MATCH_INTEL_FAM6_MODEL(SANDYBRIDGE_X,	&rapl_defaults_core),
1080
1081	X86_MATCH_INTEL_FAM6_MODEL(IVYBRIDGE,		&rapl_defaults_core),
1082	X86_MATCH_INTEL_FAM6_MODEL(IVYBRIDGE_X,		&rapl_defaults_core),
1083
1084	X86_MATCH_INTEL_FAM6_MODEL(HASWELL,		&rapl_defaults_core),
1085	X86_MATCH_INTEL_FAM6_MODEL(HASWELL_L,		&rapl_defaults_core),
1086	X86_MATCH_INTEL_FAM6_MODEL(HASWELL_G,		&rapl_defaults_core),
1087	X86_MATCH_INTEL_FAM6_MODEL(HASWELL_X,		&rapl_defaults_hsw_server),
1088
1089	X86_MATCH_INTEL_FAM6_MODEL(BROADWELL,		&rapl_defaults_core),
1090	X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_G,		&rapl_defaults_core),
1091	X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_D,		&rapl_defaults_core),
1092	X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_X,		&rapl_defaults_hsw_server),
1093
1094	X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE,		&rapl_defaults_core),
1095	X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_L,		&rapl_defaults_core),
1096	X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_X,		&rapl_defaults_hsw_server),
1097	X86_MATCH_INTEL_FAM6_MODEL(KABYLAKE_L,		&rapl_defaults_core),
1098	X86_MATCH_INTEL_FAM6_MODEL(KABYLAKE,		&rapl_defaults_core),
1099	X86_MATCH_INTEL_FAM6_MODEL(CANNONLAKE_L,	&rapl_defaults_core),
1100	X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_L,		&rapl_defaults_core),
1101	X86_MATCH_INTEL_FAM6_MODEL(ICELAKE,		&rapl_defaults_core),
1102	X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_NNPI,	&rapl_defaults_core),
1103	X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X,		&rapl_defaults_hsw_server),
1104	X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D,		&rapl_defaults_hsw_server),
1105	X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE_L,		&rapl_defaults_core),
1106	X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE,		&rapl_defaults_core),
1107	X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE_L,		&rapl_defaults_core),
1108	X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE,		&rapl_defaults_core),
1109	X86_MATCH_INTEL_FAM6_MODEL(ROCKETLAKE,		&rapl_defaults_core),
1110	X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE,		&rapl_defaults_core),
1111	X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L,		&rapl_defaults_core),
1112	X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_N,		&rapl_defaults_core),
1113	X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE,		&rapl_defaults_core),
1114	X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_P,        &rapl_defaults_core),
1115	X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_S,	&rapl_defaults_core),
1116	X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X,	&rapl_defaults_spr_server),
1117	X86_MATCH_INTEL_FAM6_MODEL(LAKEFIELD,		&rapl_defaults_core),
1118
1119	X86_MATCH_INTEL_FAM6_MODEL(ATOM_SILVERMONT,	&rapl_defaults_byt),
1120	X86_MATCH_INTEL_FAM6_MODEL(ATOM_AIRMONT,	&rapl_defaults_cht),
1121	X86_MATCH_INTEL_FAM6_MODEL(ATOM_SILVERMONT_MID,	&rapl_defaults_tng),
1122	X86_MATCH_INTEL_FAM6_MODEL(ATOM_AIRMONT_MID,	&rapl_defaults_ann),
1123	X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT,	&rapl_defaults_core),
1124	X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT_PLUS,	&rapl_defaults_core),
1125	X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT_D,	&rapl_defaults_core),
1126	X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT,	&rapl_defaults_core),
1127	X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_D,	&rapl_defaults_core),
1128	X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_L,	&rapl_defaults_core),
1129
1130	X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNL,	&rapl_defaults_hsw_server),
1131	X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNM,	&rapl_defaults_hsw_server),
1132
1133	X86_MATCH_VENDOR_FAM(AMD, 0x17, &rapl_defaults_amd),
1134	X86_MATCH_VENDOR_FAM(AMD, 0x19, &rapl_defaults_amd),
1135	X86_MATCH_VENDOR_FAM(HYGON, 0x18, &rapl_defaults_amd),
1136	{}
1137};
1138MODULE_DEVICE_TABLE(x86cpu, rapl_ids);
1139
1140/* Read once for all raw primitive data for domains */
1141static void rapl_update_domain_data(struct rapl_package *rp)
1142{
1143	int dmn, prim;
1144	u64 val;
1145
1146	for (dmn = 0; dmn < rp->nr_domains; dmn++) {
1147		pr_debug("update %s domain %s data\n", rp->name,
1148			 rp->domains[dmn].name);
1149		/* exclude non-raw primitives */
1150		for (prim = 0; prim < NR_RAW_PRIMITIVES; prim++) {
1151			if (!rapl_read_data_raw(&rp->domains[dmn], prim,
1152						rpi[prim].unit, &val))
1153				rp->domains[dmn].rdd.primitives[prim] = val;
1154		}
1155	}
1156
1157}
1158
1159static int rapl_package_register_powercap(struct rapl_package *rp)
1160{
1161	struct rapl_domain *rd;
1162	struct powercap_zone *power_zone = NULL;
1163	int nr_pl, ret;
1164
1165	/* Update the domain data of the new package */
1166	rapl_update_domain_data(rp);
1167
1168	/* first we register package domain as the parent zone */
1169	for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++) {
1170		if (rd->id == RAPL_DOMAIN_PACKAGE) {
1171			nr_pl = find_nr_power_limit(rd);
1172			pr_debug("register package domain %s\n", rp->name);
1173			power_zone = powercap_register_zone(&rd->power_zone,
1174					    rp->priv->control_type, rp->name,
1175					    NULL, &zone_ops[rd->id], nr_pl,
1176					    &constraint_ops);
1177			if (IS_ERR(power_zone)) {
1178				pr_debug("failed to register power zone %s\n",
1179					 rp->name);
1180				return PTR_ERR(power_zone);
1181			}
1182			/* track parent zone in per package/socket data */
1183			rp->power_zone = power_zone;
1184			/* done, only one package domain per socket */
1185			break;
1186		}
1187	}
1188	if (!power_zone) {
1189		pr_err("no package domain found, unknown topology!\n");
1190		return -ENODEV;
1191	}
1192	/* now register domains as children of the socket/package */
1193	for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++) {
1194		struct powercap_zone *parent = rp->power_zone;
1195
1196		if (rd->id == RAPL_DOMAIN_PACKAGE)
1197			continue;
1198		if (rd->id == RAPL_DOMAIN_PLATFORM)
1199			parent = NULL;
1200		/* number of power limits per domain varies */
1201		nr_pl = find_nr_power_limit(rd);
1202		power_zone = powercap_register_zone(&rd->power_zone,
1203						    rp->priv->control_type,
1204						    rd->name, parent,
1205						    &zone_ops[rd->id], nr_pl,
1206						    &constraint_ops);
1207
1208		if (IS_ERR(power_zone)) {
1209			pr_debug("failed to register power_zone, %s:%s\n",
1210				 rp->name, rd->name);
1211			ret = PTR_ERR(power_zone);
1212			goto err_cleanup;
1213		}
1214	}
1215	return 0;
1216
1217err_cleanup:
1218	/*
1219	 * Clean up previously initialized domains within the package if we
1220	 * failed after the first domain setup.
1221	 */
1222	while (--rd >= rp->domains) {
1223		pr_debug("unregister %s domain %s\n", rp->name, rd->name);
1224		powercap_unregister_zone(rp->priv->control_type,
1225					 &rd->power_zone);
1226	}
1227
1228	return ret;
1229}
1230
1231static int rapl_check_domain(int cpu, int domain, struct rapl_package *rp)
1232{
1233	struct reg_action ra;
1234
1235	switch (domain) {
1236	case RAPL_DOMAIN_PACKAGE:
1237	case RAPL_DOMAIN_PP0:
1238	case RAPL_DOMAIN_PP1:
1239	case RAPL_DOMAIN_DRAM:
1240	case RAPL_DOMAIN_PLATFORM:
1241		ra.reg = rp->priv->regs[domain][RAPL_DOMAIN_REG_STATUS];
1242		break;
1243	default:
1244		pr_err("invalid domain id %d\n", domain);
1245		return -EINVAL;
1246	}
1247	/* make sure domain counters are available and contains non-zero
1248	 * values, otherwise skip it.
1249	 */
1250
1251	ra.mask = ENERGY_STATUS_MASK;
1252	if (rp->priv->read_raw(cpu, &ra) || !ra.value)
1253		return -ENODEV;
1254
1255	return 0;
1256}
1257
1258/*
1259 * Check if power limits are available. Two cases when they are not available:
1260 * 1. Locked by BIOS, in this case we still provide read-only access so that
1261 *    users can see what limit is set by the BIOS.
1262 * 2. Some CPUs make some domains monitoring only which means PLx MSRs may not
1263 *    exist at all. In this case, we do not show the constraints in powercap.
1264 *
1265 * Called after domains are detected and initialized.
1266 */
1267static void rapl_detect_powerlimit(struct rapl_domain *rd)
1268{
1269	u64 val64;
1270	int i;
1271
1272	/* check if the domain is locked by BIOS, ignore if MSR doesn't exist */
1273	if (!rapl_read_data_raw(rd, FW_LOCK, false, &val64)) {
1274		if (val64) {
1275			pr_info("RAPL %s domain %s locked by BIOS\n",
1276				rd->rp->name, rd->name);
1277			rd->state |= DOMAIN_STATE_BIOS_LOCKED;
1278		}
1279	}
1280	/* check if power limit MSR exists, otherwise domain is monitoring only */
1281	for (i = 0; i < NR_POWER_LIMITS; i++) {
1282		int prim = rd->rpl[i].prim_id;
1283
1284		if (rapl_read_data_raw(rd, prim, false, &val64))
1285			rd->rpl[i].name = NULL;
1286	}
1287}
1288
1289/* Detect active and valid domains for the given CPU, caller must
1290 * ensure the CPU belongs to the targeted package and CPU hotlug is disabled.
1291 */
1292static int rapl_detect_domains(struct rapl_package *rp, int cpu)
1293{
1294	struct rapl_domain *rd;
1295	int i;
1296
1297	for (i = 0; i < RAPL_DOMAIN_MAX; i++) {
1298		/* use physical package id to read counters */
1299		if (!rapl_check_domain(cpu, i, rp)) {
1300			rp->domain_map |= 1 << i;
1301			pr_info("Found RAPL domain %s\n", rapl_domain_names[i]);
1302		}
1303	}
1304	rp->nr_domains = bitmap_weight(&rp->domain_map, RAPL_DOMAIN_MAX);
1305	if (!rp->nr_domains) {
1306		pr_debug("no valid rapl domains found in %s\n", rp->name);
1307		return -ENODEV;
1308	}
1309	pr_debug("found %d domains on %s\n", rp->nr_domains, rp->name);
1310
1311	rp->domains = kcalloc(rp->nr_domains + 1, sizeof(struct rapl_domain),
1312			      GFP_KERNEL);
1313	if (!rp->domains)
1314		return -ENOMEM;
1315
1316	rapl_init_domains(rp);
1317
1318	for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++)
1319		rapl_detect_powerlimit(rd);
1320
1321	return 0;
1322}
1323
1324/* called from CPU hotplug notifier, hotplug lock held */
1325void rapl_remove_package(struct rapl_package *rp)
1326{
1327	struct rapl_domain *rd, *rd_package = NULL;
1328
1329	package_power_limit_irq_restore(rp);
1330
1331	for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++) {
1332		rapl_write_data_raw(rd, PL1_ENABLE, 0);
1333		rapl_write_data_raw(rd, PL1_CLAMP, 0);
1334		if (find_nr_power_limit(rd) > 1) {
1335			rapl_write_data_raw(rd, PL2_ENABLE, 0);
1336			rapl_write_data_raw(rd, PL2_CLAMP, 0);
1337			rapl_write_data_raw(rd, PL4_ENABLE, 0);
1338		}
1339		if (rd->id == RAPL_DOMAIN_PACKAGE) {
1340			rd_package = rd;
1341			continue;
1342		}
1343		pr_debug("remove package, undo power limit on %s: %s\n",
1344			 rp->name, rd->name);
1345		powercap_unregister_zone(rp->priv->control_type,
1346					 &rd->power_zone);
1347	}
1348	/* do parent zone last */
1349	powercap_unregister_zone(rp->priv->control_type,
1350				 &rd_package->power_zone);
1351	list_del(&rp->plist);
1352	kfree(rp);
1353}
1354EXPORT_SYMBOL_GPL(rapl_remove_package);
1355
1356/* caller to ensure CPU hotplug lock is held */
1357struct rapl_package *rapl_find_package_domain(int cpu, struct rapl_if_priv *priv)
1358{
1359	int id = topology_logical_die_id(cpu);
1360	struct rapl_package *rp;
1361
1362	list_for_each_entry(rp, &rapl_packages, plist) {
1363		if (rp->id == id
1364		    && rp->priv->control_type == priv->control_type)
1365			return rp;
1366	}
1367
1368	return NULL;
1369}
1370EXPORT_SYMBOL_GPL(rapl_find_package_domain);
1371
1372/* called from CPU hotplug notifier, hotplug lock held */
1373struct rapl_package *rapl_add_package(int cpu, struct rapl_if_priv *priv)
1374{
1375	int id = topology_logical_die_id(cpu);
1376	struct rapl_package *rp;
1377	int ret;
1378
1379	if (!rapl_defaults)
1380		return ERR_PTR(-ENODEV);
1381
1382	rp = kzalloc(sizeof(struct rapl_package), GFP_KERNEL);
1383	if (!rp)
1384		return ERR_PTR(-ENOMEM);
1385
1386	/* add the new package to the list */
1387	rp->id = id;
1388	rp->lead_cpu = cpu;
1389	rp->priv = priv;
1390
1391	if (topology_max_die_per_package() > 1)
1392		snprintf(rp->name, PACKAGE_DOMAIN_NAME_LENGTH,
1393			 "package-%d-die-%d",
1394			 topology_physical_package_id(cpu), topology_die_id(cpu));
1395	else
1396		snprintf(rp->name, PACKAGE_DOMAIN_NAME_LENGTH, "package-%d",
1397			 topology_physical_package_id(cpu));
1398
1399	/* check if the package contains valid domains */
1400	if (rapl_detect_domains(rp, cpu) || rapl_defaults->check_unit(rp, cpu)) {
1401		ret = -ENODEV;
1402		goto err_free_package;
1403	}
1404	ret = rapl_package_register_powercap(rp);
1405	if (!ret) {
1406		INIT_LIST_HEAD(&rp->plist);
1407		list_add(&rp->plist, &rapl_packages);
1408		return rp;
1409	}
1410
1411err_free_package:
1412	kfree(rp->domains);
1413	kfree(rp);
1414	return ERR_PTR(ret);
1415}
1416EXPORT_SYMBOL_GPL(rapl_add_package);
1417
1418static void power_limit_state_save(void)
1419{
1420	struct rapl_package *rp;
1421	struct rapl_domain *rd;
1422	int nr_pl, ret, i;
1423
1424	cpus_read_lock();
1425	list_for_each_entry(rp, &rapl_packages, plist) {
1426		if (!rp->power_zone)
1427			continue;
1428		rd = power_zone_to_rapl_domain(rp->power_zone);
1429		nr_pl = find_nr_power_limit(rd);
1430		for (i = 0; i < nr_pl; i++) {
1431			switch (rd->rpl[i].prim_id) {
1432			case PL1_ENABLE:
1433				ret = rapl_read_data_raw(rd,
1434						 POWER_LIMIT1, true,
1435						 &rd->rpl[i].last_power_limit);
1436				if (ret)
1437					rd->rpl[i].last_power_limit = 0;
1438				break;
1439			case PL2_ENABLE:
1440				ret = rapl_read_data_raw(rd,
1441						 POWER_LIMIT2, true,
1442						 &rd->rpl[i].last_power_limit);
1443				if (ret)
1444					rd->rpl[i].last_power_limit = 0;
1445				break;
1446			case PL4_ENABLE:
1447				ret = rapl_read_data_raw(rd,
1448						 POWER_LIMIT4, true,
1449						 &rd->rpl[i].last_power_limit);
1450				if (ret)
1451					rd->rpl[i].last_power_limit = 0;
1452				break;
1453			}
1454		}
1455	}
1456	cpus_read_unlock();
1457}
1458
1459static void power_limit_state_restore(void)
1460{
1461	struct rapl_package *rp;
1462	struct rapl_domain *rd;
1463	int nr_pl, i;
1464
1465	cpus_read_lock();
1466	list_for_each_entry(rp, &rapl_packages, plist) {
1467		if (!rp->power_zone)
1468			continue;
1469		rd = power_zone_to_rapl_domain(rp->power_zone);
1470		nr_pl = find_nr_power_limit(rd);
1471		for (i = 0; i < nr_pl; i++) {
1472			switch (rd->rpl[i].prim_id) {
1473			case PL1_ENABLE:
1474				if (rd->rpl[i].last_power_limit)
1475					rapl_write_data_raw(rd, POWER_LIMIT1,
1476					    rd->rpl[i].last_power_limit);
1477				break;
1478			case PL2_ENABLE:
1479				if (rd->rpl[i].last_power_limit)
1480					rapl_write_data_raw(rd, POWER_LIMIT2,
1481					    rd->rpl[i].last_power_limit);
1482				break;
1483			case PL4_ENABLE:
1484				if (rd->rpl[i].last_power_limit)
1485					rapl_write_data_raw(rd, POWER_LIMIT4,
1486					    rd->rpl[i].last_power_limit);
1487				break;
1488			}
1489		}
1490	}
1491	cpus_read_unlock();
1492}
1493
1494static int rapl_pm_callback(struct notifier_block *nb,
1495			    unsigned long mode, void *_unused)
1496{
1497	switch (mode) {
1498	case PM_SUSPEND_PREPARE:
1499		power_limit_state_save();
1500		break;
1501	case PM_POST_SUSPEND:
1502		power_limit_state_restore();
1503		break;
1504	}
1505	return NOTIFY_OK;
1506}
1507
1508static struct notifier_block rapl_pm_notifier = {
1509	.notifier_call = rapl_pm_callback,
1510};
1511
1512static struct platform_device *rapl_msr_platdev;
1513
1514static int __init rapl_init(void)
1515{
1516	const struct x86_cpu_id *id;
1517	int ret;
1518
1519	id = x86_match_cpu(rapl_ids);
1520	if (!id) {
1521		pr_err("driver does not support CPU family %d model %d\n",
1522		       boot_cpu_data.x86, boot_cpu_data.x86_model);
1523
1524		return -ENODEV;
1525	}
1526
1527	rapl_defaults = (struct rapl_defaults *)id->driver_data;
1528
1529	ret = register_pm_notifier(&rapl_pm_notifier);
1530	if (ret)
1531		return ret;
1532
1533	rapl_msr_platdev = platform_device_alloc("intel_rapl_msr", 0);
1534	if (!rapl_msr_platdev) {
1535		ret = -ENOMEM;
1536		goto end;
1537	}
1538
1539	ret = platform_device_add(rapl_msr_platdev);
1540	if (ret)
1541		platform_device_put(rapl_msr_platdev);
1542
1543end:
1544	if (ret)
1545		unregister_pm_notifier(&rapl_pm_notifier);
1546
1547	return ret;
1548}
1549
1550static void __exit rapl_exit(void)
1551{
1552	platform_device_unregister(rapl_msr_platdev);
1553	unregister_pm_notifier(&rapl_pm_notifier);
1554}
1555
1556fs_initcall(rapl_init);
1557module_exit(rapl_exit);
1558
1559MODULE_DESCRIPTION("Intel Runtime Average Power Limit (RAPL) common code");
1560MODULE_AUTHOR("Jacob Pan <jacob.jun.pan@intel.com>");
1561MODULE_LICENSE("GPL v2");