// SPDX-License-Identifier: GPL-2.0-only
/*
 * x86 APERF/MPERF KHz calculation for
 * /sys/.../cpufreq/scaling_cur_freq
 *
 * Copyright (C) 2017 Intel Corp.
 * Author: Len Brown <len.brown@intel.com>
 */

#include <linux/delay.h>
#include <linux/ktime.h>
#include <linux/math64.h>
#include <linux/percpu.h>
#include <linux/cpufreq.h>
#include <linux/smp.h>
#include <linux/sched/isolation.h>

#include "cpu.h"

struct aperfmperf_sample {
	unsigned int	khz;
	ktime_t		time;
	u64		aperf;
	u64		mperf;
};

static DEFINE_PER_CPU(struct aperfmperf_sample, samples);

#define APERFMPERF_CACHE_THRESHOLD_MS	10
#define APERFMPERF_REFRESH_DELAY_MS	10
#define APERFMPERF_STALE_THRESHOLD_MS	1000

/*
 * aperfmperf_snapshot_khz()
 * On the current CPU, snapshot APERF, MPERF and the current time,
 * calculate kHz and save the snapshot. Callers avoid re-invoking
 * this within the 10ms cache threshold.
 */
static void aperfmperf_snapshot_khz(void *dummy)
{
	u64 aperf, aperf_delta;
	u64 mperf, mperf_delta;
	struct aperfmperf_sample *s = this_cpu_ptr(&samples);
	unsigned long flags;

	local_irq_save(flags);
	rdmsrl(MSR_IA32_APERF, aperf);
	rdmsrl(MSR_IA32_MPERF, mperf);
	local_irq_restore(flags);

	aperf_delta = aperf - s->aperf;
	mperf_delta = mperf - s->mperf;

	/*
	 * There is no architectural guarantee that MPERF
	 * increments faster than we can read it.
	 */
	if (mperf_delta == 0)
		return;

	s->time = ktime_get();
	s->aperf = aperf;
	s->mperf = mperf;
	s->khz = div64_u64((cpu_khz * aperf_delta), mperf_delta);
}
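
/*
 * Worked example of the calculation above (hypothetical counter values,
 * for illustration only): with cpu_khz = 2400000 (a 2.4 GHz base clock),
 * aperf_delta = 3000000 and mperf_delta = 2000000, the ratio
 * aperf_delta / mperf_delta is 1.5, so s->khz = 2400000 * 3000000 /
 * 2000000 = 3600000, i.e. an effective 3.6 GHz over the non-idle part
 * of the interval.
 */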

static bool aperfmperf_snapshot_cpu(int cpu, ktime_t now, bool wait)
{
	s64 time_delta = ktime_ms_delta(now, per_cpu(samples.time, cpu));

	/* Don't bother re-computing within the cache threshold time. */
	if (time_delta < APERFMPERF_CACHE_THRESHOLD_MS)
		return true;

	smp_call_function_single(cpu, aperfmperf_snapshot_khz, NULL, wait);

	/* Return false if the previous iteration was too long ago. */
	return time_delta <= APERFMPERF_STALE_THRESHOLD_MS;
}
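
/*
 * For illustration (assumed call pattern, derived from the thresholds
 * above): a caller arriving 5 ms after the last snapshot gets the cached
 * value (true, no IPI). One arriving 50 ms later sends the snapshot IPI
 * and still returns true, so the previous, fresh-enough sample may be
 * reported. One arriving 2000 ms later sends the IPI but returns false,
 * telling the caller the old sample is stale and a fresh one is needed.
 */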

unsigned int aperfmperf_get_khz(int cpu)
{
	if (!cpu_khz)
		return 0;

	if (!boot_cpu_has(X86_FEATURE_APERFMPERF))
		return 0;

	if (!housekeeping_cpu(cpu, HK_FLAG_MISC))
		return 0;

	aperfmperf_snapshot_cpu(cpu, ktime_get(), true);
	return per_cpu(samples.khz, cpu);
}

void arch_freq_prepare_all(void)
{
	ktime_t now = ktime_get();
	bool wait = false;
	int cpu;

	if (!cpu_khz)
		return;

	if (!boot_cpu_has(X86_FEATURE_APERFMPERF))
		return;

	for_each_online_cpu(cpu) {
		if (!housekeeping_cpu(cpu, HK_FLAG_MISC))
			continue;
		if (!aperfmperf_snapshot_cpu(cpu, now, false))
			wait = true;
	}

	if (wait)
		msleep(APERFMPERF_REFRESH_DELAY_MS);
}

unsigned int arch_freq_get_on_cpu(int cpu)
{
	if (!cpu_khz)
		return 0;

	if (!boot_cpu_has(X86_FEATURE_APERFMPERF))
		return 0;

	if (!housekeeping_cpu(cpu, HK_FLAG_MISC))
		return 0;

	if (aperfmperf_snapshot_cpu(cpu, ktime_get(), true))
		return per_cpu(samples.khz, cpu);

	msleep(APERFMPERF_REFRESH_DELAY_MS);
	smp_call_function_single(cpu, aperfmperf_snapshot_khz, NULL, 1);

	return per_cpu(samples.khz, cpu);
}
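
/*
 * The kHz value computed above is what a read of
 * /sys/devices/system/cpu/cpuN/cpufreq/scaling_cur_freq reports. A minimal
 * userspace sketch of consuming it, assuming the conventional sysfs layout
 * (illustrative only, not part of this kernel file):
 */
#if 0
#include <stdio.h>

int main(void)
{
	unsigned int khz;
	/* cpu0 is used purely as an example */
	FILE *f = fopen("/sys/devices/system/cpu/cpu0/cpufreq/scaling_cur_freq", "r");

	if (!f)
		return 1;
	if (fscanf(f, "%u", &khz) == 1)
		printf("cpu0 effective frequency: %u kHz\n", khz);
	fclose(f);
	return 0;
}
#endif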
// SPDX-License-Identifier: GPL-2.0-only
/*
 * x86 APERF/MPERF KHz calculation for
 * /sys/.../cpufreq/scaling_cur_freq
 *
 * Copyright (C) 2017 Intel Corp.
 * Author: Len Brown <len.brown@intel.com>
 */
#include <linux/cpufreq.h>
#include <linux/delay.h>
#include <linux/ktime.h>
#include <linux/math64.h>
#include <linux/percpu.h>
#include <linux/rcupdate.h>
#include <linux/sched/isolation.h>
#include <linux/sched/topology.h>
#include <linux/smp.h>
#include <linux/syscore_ops.h>

#include <asm/cpu.h>
#include <asm/cpu_device_id.h>
#include <asm/intel-family.h>

#include "cpu.h"

struct aperfmperf {
	seqcount_t	seq;
	unsigned long	last_update;
	u64		acnt;
	u64		mcnt;
	u64		aperf;
	u64		mperf;
};

static DEFINE_PER_CPU_SHARED_ALIGNED(struct aperfmperf, cpu_samples) = {
	.seq = SEQCNT_ZERO(cpu_samples.seq)
};

static void init_counter_refs(void)
{
	u64 aperf, mperf;

	rdmsrl(MSR_IA32_APERF, aperf);
	rdmsrl(MSR_IA32_MPERF, mperf);

	this_cpu_write(cpu_samples.aperf, aperf);
	this_cpu_write(cpu_samples.mperf, mperf);
}

#if defined(CONFIG_X86_64) && defined(CONFIG_SMP)
/*
 * APERF/MPERF frequency ratio computation.
 *
 * The scheduler wants to do frequency invariant accounting and needs a <1
 * ratio to account for the 'current' frequency, corresponding to
 * freq_curr / freq_max.
 *
 * Since the frequency freq_curr on x86 is controlled by a micro-controller
 * and our P-state setting is little more than a request/hint, we need to
 * observe the effective frequency 'BusyMHz', i.e. the average frequency over
 * a time interval after discarding idle time. This is given by:
 *
 *            BusyMHz = delta_APERF / delta_MPERF * freq_base
 *
 * where freq_base is the max non-turbo P-state.
 *
 * The freq_max term has to be set to a somewhat arbitrary value, because we
 * can't know which turbo states will be available at a given point in time:
 * it all depends on the thermal headroom of the entire package. We set it to
 * the turbo level with 4 cores active.
 *
 * Benchmarks show that's a good compromise between the 1C turbo ratio
 * (freq_curr/freq_max would rarely reach 1) and something close to freq_base,
 * which would ignore the entire turbo range (a conspicuous part, making
 * freq_curr/freq_max always maxed out).
 *
 * An exception to the heuristic above is the Atom uarch, where we choose the
 * highest turbo level for freq_max since Atoms are generally oriented towards
 * power efficiency.
 *
 * Setting freq_max to anything less than the 1C turbo ratio makes the ratio
 * freq_curr / freq_max eventually grow >1, in which case we clip it to 1.
 */

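/*
 * Worked example of the BusyMHz formula above (hypothetical counter values,
 * for illustration only): with freq_base = 2400 MHz and an interval in which
 * delta_APERF = 18e9 while delta_MPERF = 12e9, BusyMHz = 18e9 / 12e9 * 2400
 * = 3600 MHz, i.e. the CPU averaged 3.6 GHz over the non-idle portion of the
 * interval.
 */
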
DEFINE_STATIC_KEY_FALSE(arch_scale_freq_key);

static u64 arch_turbo_freq_ratio = SCHED_CAPACITY_SCALE;
static u64 arch_max_freq_ratio = SCHED_CAPACITY_SCALE;

void arch_set_max_freq_ratio(bool turbo_disabled)
{
	arch_max_freq_ratio = turbo_disabled ? SCHED_CAPACITY_SCALE :
					arch_turbo_freq_ratio;
}
EXPORT_SYMBOL_GPL(arch_set_max_freq_ratio);

static bool __init turbo_disabled(void)
{
	u64 misc_en;
	int err;

	err = rdmsrl_safe(MSR_IA32_MISC_ENABLE, &misc_en);
	if (err)
		return false;

	return (misc_en & MSR_IA32_MISC_ENABLE_TURBO_DISABLE);
}

static bool __init slv_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq)
{
	int err;

	err = rdmsrl_safe(MSR_ATOM_CORE_RATIOS, base_freq);
	if (err)
		return false;

	err = rdmsrl_safe(MSR_ATOM_CORE_TURBO_RATIOS, turbo_freq);
	if (err)
		return false;

	*base_freq = (*base_freq >> 16) & 0x3F;	/* max P state */
	*turbo_freq = *turbo_freq & 0x3F;	/* 1C turbo */

	return true;
}
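
/*
 * Illustration only (hypothetical register values): with bits 21:16 of
 * MSR_ATOM_CORE_RATIOS holding 0x18, the base ratio is 24; with bits 5:0 of
 * MSR_ATOM_CORE_TURBO_RATIOS holding 0x1E, the 1C turbo ratio is 30, so
 * freq_max/freq_base works out to 30/24.
 */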

#define X86_MATCH(vfm)					\
	X86_MATCH_VFM_FEATURE(vfm, X86_FEATURE_APERFMPERF, NULL)

static const struct x86_cpu_id has_knl_turbo_ratio_limits[] __initconst = {
	X86_MATCH(INTEL_XEON_PHI_KNL),
	X86_MATCH(INTEL_XEON_PHI_KNM),
	{}
};

static const struct x86_cpu_id has_skx_turbo_ratio_limits[] __initconst = {
	X86_MATCH(INTEL_SKYLAKE_X),
	{}
};

static const struct x86_cpu_id has_glm_turbo_ratio_limits[] __initconst = {
	X86_MATCH(INTEL_ATOM_GOLDMONT),
	X86_MATCH(INTEL_ATOM_GOLDMONT_D),
	X86_MATCH(INTEL_ATOM_GOLDMONT_PLUS),
	{}
};

static bool __init knl_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq,
					  int num_delta_fratio)
{
	int fratio, delta_fratio, found;
	int err, i;
	u64 msr;

	err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq);
	if (err)
		return false;

	*base_freq = (*base_freq >> 8) & 0xFF;	/* max P state */

	err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr);
	if (err)
		return false;

	fratio = (msr >> 8) & 0xFF;
	i = 16;
	found = 0;
	do {
		if (found >= num_delta_fratio) {
			*turbo_freq = fratio;
			return true;
		}

		delta_fratio = (msr >> (i + 5)) & 0x7;

		if (delta_fratio) {
			found += 1;
			fratio -= delta_fratio;
		}

		i += 8;
	} while (i < 64);

	return true;
}
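
/*
 * Illustration only (hypothetical MSR contents): with num_delta_fratio == 1,
 * a 1C turbo ratio of 30 in bits 15:8 and a first non-zero 3-bit delta of 2,
 * the walk above subtracts the delta once and reports *turbo_freq = 28;
 * groups with a zero delta are skipped and the walk moves on to the next
 * 8-bit group.
 */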

static bool __init skx_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, int size)
{
	u64 ratios, counts;
	u32 group_size;
	int err, i;

	err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq);
	if (err)
		return false;

	*base_freq = (*base_freq >> 8) & 0xFF;	/* max P state */

	err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &ratios);
	if (err)
		return false;

	err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT1, &counts);
	if (err)
		return false;

	for (i = 0; i < 64; i += 8) {
		group_size = (counts >> i) & 0xFF;
		if (group_size >= size) {
			*turbo_freq = (ratios >> i) & 0xFF;
			return true;
		}
	}

	return false;
}
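
/*
 * Illustration only (hypothetical MSR contents): if the first 8-bit group of
 * MSR_TURBO_RATIO_LIMIT1 holds a core count of 2 and the second holds 4, a
 * call with size == 4 skips the first group and returns the second ratio
 * byte of MSR_TURBO_RATIO_LIMIT, i.e. the turbo ratio advertised for that
 * core-count group.
 */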

static bool __init core_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq)
{
	u64 msr;
	int err;

	err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq);
	if (err)
		return false;

	err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr);
	if (err)
		return false;

	*base_freq = (*base_freq >> 8) & 0xFF;	/* max P state */
	*turbo_freq = (msr >> 24) & 0xFF;	/* 4C turbo */

	/* The CPU may have fewer than 4 cores */
	if (!*turbo_freq)
		*turbo_freq = msr & 0xFF;	/* 1C turbo */

	return true;
}
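
/*
 * Illustration only (hypothetical MSR contents): bits 15:8 of
 * MSR_PLATFORM_INFO holding 0x18 give a base ratio of 24, and bits 31:24 of
 * MSR_TURBO_RATIO_LIMIT holding 0x22 give a 4C turbo ratio of 34; only if
 * that byte is zero does the code fall back to the 1C turbo ratio in
 * bits 7:0.
 */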

static bool __init intel_set_max_freq_ratio(void)
{
	u64 base_freq, turbo_freq;
	u64 turbo_ratio;

	if (slv_set_max_freq_ratio(&base_freq, &turbo_freq))
		goto out;

	if (x86_match_cpu(has_glm_turbo_ratio_limits) &&
	    skx_set_max_freq_ratio(&base_freq, &turbo_freq, 1))
		goto out;

	if (x86_match_cpu(has_knl_turbo_ratio_limits) &&
	    knl_set_max_freq_ratio(&base_freq, &turbo_freq, 1))
		goto out;

	if (x86_match_cpu(has_skx_turbo_ratio_limits) &&
	    skx_set_max_freq_ratio(&base_freq, &turbo_freq, 4))
		goto out;

	if (core_set_max_freq_ratio(&base_freq, &turbo_freq))
		goto out;

	return false;

out:
	/*
	 * Some hypervisors advertise X86_FEATURE_APERFMPERF
	 * but then fill all MSRs with zeroes.
	 * Some CPUs have turbo boost but don't declare any turbo ratio
	 * in MSR_TURBO_RATIO_LIMIT.
	 */
	if (!base_freq || !turbo_freq) {
		pr_debug("Couldn't determine cpu base or turbo frequency, necessary for scale-invariant accounting.\n");
		return false;
	}

	turbo_ratio = div_u64(turbo_freq * SCHED_CAPACITY_SCALE, base_freq);
	if (!turbo_ratio) {
		pr_debug("Non-zero turbo and base frequencies led to a 0 ratio.\n");
		return false;
	}

	arch_turbo_freq_ratio = turbo_ratio;
	arch_set_max_freq_ratio(turbo_disabled());

	return true;
}
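
/*
 * Worked example (hypothetical ratios, for illustration only): with
 * base_freq = 24 and turbo_freq = 34 as read above, turbo_ratio =
 * 34 * 1024 / 24 = 1450 (truncated), so arch_max_freq_ratio ends up 1450
 * when turbo is enabled and SCHED_CAPACITY_SCALE (1024) when it is disabled.
 */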

#ifdef CONFIG_PM_SLEEP
static struct syscore_ops freq_invariance_syscore_ops = {
	.resume = init_counter_refs,
};

static void register_freq_invariance_syscore_ops(void)
{
	register_syscore_ops(&freq_invariance_syscore_ops);
}
#else
static inline void register_freq_invariance_syscore_ops(void) {}
#endif

static void freq_invariance_enable(void)
{
	if (static_branch_unlikely(&arch_scale_freq_key)) {
		WARN_ON_ONCE(1);
		return;
	}
	static_branch_enable_cpuslocked(&arch_scale_freq_key);
	register_freq_invariance_syscore_ops();
	pr_info("Estimated ratio of average max frequency by base frequency (times 1024): %llu\n", arch_max_freq_ratio);
}

void freq_invariance_set_perf_ratio(u64 ratio, bool turbo_disabled)
{
	arch_turbo_freq_ratio = ratio;
	arch_set_max_freq_ratio(turbo_disabled);
	freq_invariance_enable();
}

static void __init bp_init_freq_invariance(void)
{
	if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
		return;

	if (intel_set_max_freq_ratio()) {
		guard(cpus_read_lock)();
		freq_invariance_enable();
	}
}

static void disable_freq_invariance_workfn(struct work_struct *work)
{
	int cpu;

	static_branch_disable(&arch_scale_freq_key);

	/*
	 * Set arch_freq_scale to a default value on all CPUs.
	 * This negates the effect of scaling.
	 */
	for_each_possible_cpu(cpu)
		per_cpu(arch_freq_scale, cpu) = SCHED_CAPACITY_SCALE;
}

static DECLARE_WORK(disable_freq_invariance_work,
		    disable_freq_invariance_workfn);

DEFINE_PER_CPU(unsigned long, arch_freq_scale) = SCHED_CAPACITY_SCALE;
EXPORT_PER_CPU_SYMBOL_GPL(arch_freq_scale);

static DEFINE_STATIC_KEY_FALSE(arch_hybrid_cap_scale_key);

struct arch_hybrid_cpu_scale {
	unsigned long capacity;
	unsigned long freq_ratio;
};

static struct arch_hybrid_cpu_scale __percpu *arch_cpu_scale;

/**
 * arch_enable_hybrid_capacity_scale() - Enable hybrid CPU capacity scaling
 *
 * Allocate memory for per-CPU data used by hybrid CPU capacity scaling,
 * initialize it and set the static key controlling its code paths.
 *
 * Must be called before arch_set_cpu_capacity().
 */
bool arch_enable_hybrid_capacity_scale(void)
{
	int cpu;

	if (static_branch_unlikely(&arch_hybrid_cap_scale_key)) {
		WARN_ONCE(1, "Hybrid CPU capacity scaling already enabled");
		return true;
	}

	arch_cpu_scale = alloc_percpu(struct arch_hybrid_cpu_scale);
	if (!arch_cpu_scale)
		return false;

	for_each_possible_cpu(cpu) {
		per_cpu_ptr(arch_cpu_scale, cpu)->capacity = SCHED_CAPACITY_SCALE;
		per_cpu_ptr(arch_cpu_scale, cpu)->freq_ratio = arch_max_freq_ratio;
	}

	static_branch_enable(&arch_hybrid_cap_scale_key);

	pr_info("Hybrid CPU capacity scaling enabled\n");

	return true;
}

/**
 * arch_set_cpu_capacity() - Set scale-invariance parameters for a CPU
 * @cpu: Target CPU.
 * @cap: Capacity of @cpu at its maximum frequency, relative to @max_cap.
 * @max_cap: System-wide maximum CPU capacity.
 * @cap_freq: Frequency of @cpu corresponding to @cap.
 * @base_freq: Frequency of @cpu at which MPERF counts.
 *
 * The units in which @cap and @max_cap are expressed do not matter, so long
 * as they are consistent, because the former is effectively divided by the
 * latter. Analogously for @cap_freq and @base_freq.
 *
 * After calling this function for all CPUs, call arch_rebuild_sched_domains()
 * to let the scheduler know that capacity-aware scheduling can be used going
 * forward.
 */
void arch_set_cpu_capacity(int cpu, unsigned long cap, unsigned long max_cap,
			   unsigned long cap_freq, unsigned long base_freq)
{
	if (static_branch_likely(&arch_hybrid_cap_scale_key)) {
		WRITE_ONCE(per_cpu_ptr(arch_cpu_scale, cpu)->capacity,
			   div_u64(cap << SCHED_CAPACITY_SHIFT, max_cap));
		WRITE_ONCE(per_cpu_ptr(arch_cpu_scale, cpu)->freq_ratio,
			   div_u64(cap_freq << SCHED_CAPACITY_SHIFT, base_freq));
	} else {
		WARN_ONCE(1, "Hybrid CPU capacity scaling not enabled");
	}
}
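
/*
 * Worked example of the stores above (hypothetical values, for illustration
 * only): cap = 3, max_cap = 4 gives capacity = 3 * 1024 / 4 = 768;
 * cap_freq = 3000 MHz and base_freq = 2000 MHz give freq_ratio =
 * 3000 * 1024 / 2000 = 1536, which scale_freq_tick() then uses in place of
 * arch_max_freq_ratio. A sketch of the intended call sequence from a
 * hypothetical platform driver follows; max_cap, cpu_cap(), cpu_cap_freq()
 * and cpu_base_freq() are placeholders, not real kernel APIs.
 */
#if 0
	/* hypothetical driver code, per-CPU values come from firmware */
	if (arch_enable_hybrid_capacity_scale()) {
		int cpu;

		for_each_online_cpu(cpu)
			arch_set_cpu_capacity(cpu, cpu_cap(cpu), max_cap,
					      cpu_cap_freq(cpu), cpu_base_freq(cpu));

		arch_rebuild_sched_domains();
	}
#endif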

unsigned long arch_scale_cpu_capacity(int cpu)
{
	if (static_branch_unlikely(&arch_hybrid_cap_scale_key))
		return READ_ONCE(per_cpu_ptr(arch_cpu_scale, cpu)->capacity);

	return SCHED_CAPACITY_SCALE;
}
EXPORT_SYMBOL_GPL(arch_scale_cpu_capacity);

static void scale_freq_tick(u64 acnt, u64 mcnt)
{
	u64 freq_scale, freq_ratio;

	if (!arch_scale_freq_invariant())
		return;

	if (check_shl_overflow(acnt, 2*SCHED_CAPACITY_SHIFT, &acnt))
		goto error;

	if (static_branch_unlikely(&arch_hybrid_cap_scale_key))
		freq_ratio = READ_ONCE(this_cpu_ptr(arch_cpu_scale)->freq_ratio);
	else
		freq_ratio = arch_max_freq_ratio;

	if (check_mul_overflow(mcnt, freq_ratio, &mcnt) || !mcnt)
		goto error;

	freq_scale = div64_u64(acnt, mcnt);
	if (!freq_scale)
		goto error;

	if (freq_scale > SCHED_CAPACITY_SCALE)
		freq_scale = SCHED_CAPACITY_SCALE;

	this_cpu_write(arch_freq_scale, freq_scale);
	return;

error:
	pr_warn("Scheduler frequency invariance went wobbly, disabling!\n");
	schedule_work(&disable_freq_invariance_work);
}
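
/*
 * Worked example (hypothetical counts, for illustration only): with
 * freq_ratio = 1450 and a tick in which acnt == mcnt (the CPU ran at the
 * base frequency while busy), freq_scale = (acnt << 20) / (mcnt * 1450) =
 * 1048576 / 1450 = 723, i.e. about 0.71 of SCHED_CAPACITY_SCALE. Running at
 * the 4C turbo ratio instead makes the quotient reach 1024, where it is
 * clipped.
 */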
#else
static inline void bp_init_freq_invariance(void) { }
static inline void scale_freq_tick(u64 acnt, u64 mcnt) { }
#endif /* CONFIG_X86_64 && CONFIG_SMP */

void arch_scale_freq_tick(void)
{
	struct aperfmperf *s = this_cpu_ptr(&cpu_samples);
	u64 acnt, mcnt, aperf, mperf;

	if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF))
		return;

	rdmsrl(MSR_IA32_APERF, aperf);
	rdmsrl(MSR_IA32_MPERF, mperf);
	acnt = aperf - s->aperf;
	mcnt = mperf - s->mperf;

	s->aperf = aperf;
	s->mperf = mperf;

	raw_write_seqcount_begin(&s->seq);
	s->last_update = jiffies;
	s->acnt = acnt;
	s->mcnt = mcnt;
	raw_write_seqcount_end(&s->seq);

	scale_freq_tick(acnt, mcnt);
}

/*
 * Discard samples older than the defined maximum sample age of 20ms. There
 * is no point in sending IPIs in such a case. If the scheduler tick was
 * not running then the CPU is either idle or isolated.
 */
#define MAX_SAMPLE_AGE	((unsigned long)HZ / 50)

unsigned int arch_freq_get_on_cpu(int cpu)
{
	struct aperfmperf *s = per_cpu_ptr(&cpu_samples, cpu);
	unsigned int seq, freq;
	unsigned long last;
	u64 acnt, mcnt;

	if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF))
		goto fallback;

	do {
		seq = raw_read_seqcount_begin(&s->seq);
		last = s->last_update;
		acnt = s->acnt;
		mcnt = s->mcnt;
	} while (read_seqcount_retry(&s->seq, seq));

	/*
	 * Bail on invalid count and when the last update was too long ago,
	 * which covers idle and NOHZ full CPUs.
	 */
	if (!mcnt || (jiffies - last) > MAX_SAMPLE_AGE)
		goto fallback;

	return div64_u64((cpu_khz * acnt), mcnt);

fallback:
	freq = cpufreq_quick_get(cpu);
	return freq ? freq : cpu_khz;
}
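
/*
 * Worked example (hypothetical numbers, for illustration only): with
 * cpu_khz = 2400000 and a last tick that sampled acnt = 3000000 and
 * mcnt = 2000000, the seqcount-protected read above reports
 * 2400000 * 3000000 / 2000000 = 3600000 kHz. If that sample is older than
 * 20ms (idle or NOHZ-full CPU), the code falls back to cpufreq_quick_get()
 * and finally to cpu_khz if that returns 0.
 */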

static int __init bp_init_aperfmperf(void)
{
	if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF))
		return 0;

	init_counter_refs();
	bp_init_freq_invariance();
	return 0;
}
early_initcall(bp_init_aperfmperf);

void ap_init_aperfmperf(void)
{
	if (cpu_feature_enabled(X86_FEATURE_APERFMPERF))
		init_counter_refs();
}