1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
4 *
5 * Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
6 *
7 * Interactivity improvements by Mike Galbraith
8 * (C) 2007 Mike Galbraith <efault@gmx.de>
9 *
10 * Various enhancements by Dmitry Adamushko.
11 * (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
12 *
13 * Group scheduling enhancements by Srivatsa Vaddagiri
14 * Copyright IBM Corporation, 2007
15 * Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
16 *
17 * Scaled math optimizations by Thomas Gleixner
18 * Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
19 *
20 * Adaptive scheduling granularity, math enhancements by Peter Zijlstra
21 * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
22 */
23#include <linux/energy_model.h>
24#include <linux/mmap_lock.h>
25#include <linux/hugetlb_inline.h>
26#include <linux/jiffies.h>
27#include <linux/mm_api.h>
28#include <linux/highmem.h>
29#include <linux/spinlock_api.h>
30#include <linux/cpumask_api.h>
31#include <linux/lockdep_api.h>
32#include <linux/softirq.h>
33#include <linux/refcount_api.h>
34#include <linux/topology.h>
35#include <linux/sched/clock.h>
36#include <linux/sched/cond_resched.h>
37#include <linux/sched/cputime.h>
38#include <linux/sched/isolation.h>
39#include <linux/sched/nohz.h>
40
41#include <linux/cpuidle.h>
42#include <linux/interrupt.h>
43#include <linux/memory-tiers.h>
44#include <linux/mempolicy.h>
45#include <linux/mutex_api.h>
46#include <linux/profile.h>
47#include <linux/psi.h>
48#include <linux/ratelimit.h>
49#include <linux/task_work.h>
50#include <linux/rbtree_augmented.h>
51
52#include <asm/switch_to.h>
53
54#include "sched.h"
55#include "stats.h"
56#include "autogroup.h"
57
58/*
59 * The initial- and re-scaling of tunables is configurable
60 *
61 * Options are:
62 *
63 * SCHED_TUNABLESCALING_NONE - unscaled, always *1
64 * SCHED_TUNABLESCALING_LOG - scaled logarithmically, *1+ilog(ncpus)
65 * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus
66 *
 67 * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)))
68 */
69unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG;
70
71/*
72 * Minimal preemption granularity for CPU-bound tasks:
73 *
74 * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
75 */
76unsigned int sysctl_sched_base_slice = 750000ULL;
77static unsigned int normalized_sysctl_sched_base_slice = 750000ULL;
78
79const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
80
81static int __init setup_sched_thermal_decay_shift(char *str)
82{
83 pr_warn("Ignoring the deprecated sched_thermal_decay_shift= option\n");
84 return 1;
85}
86__setup("sched_thermal_decay_shift=", setup_sched_thermal_decay_shift);
87
88#ifdef CONFIG_SMP
89/*
90 * For asym packing, by default the lower numbered CPU has higher priority.
91 */
92int __weak arch_asym_cpu_priority(int cpu)
93{
94 return -cpu;
95}
96
97/*
98 * The margin used when comparing utilization with CPU capacity.
99 *
100 * (default: ~20%)
101 */
102#define fits_capacity(cap, max) ((cap) * 1280 < (max) * 1024)
103
104/*
105 * The margin used when comparing CPU capacities.
106 * Is 'cap1' noticeably greater than 'cap2'?
107 *
108 * (default: ~5%)
109 */
110#define capacity_greater(cap1, cap2) ((cap1) * 1024 > (cap2) * 1078)
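/*
 * Worked example of the margins above: with max == 1024, fits_capacity()
 * holds only for cap <= 819, i.e. utilization must stay below ~80% of the
 * capacity (roughly 20% headroom); capacity_greater() requires
 * cap1 > cap2 * 1078/1024, i.e. cap1 must be ~5% larger than cap2.
 */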
111#endif
112
113#ifdef CONFIG_CFS_BANDWIDTH
114/*
115 * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
116 * each time a cfs_rq requests quota.
117 *
118 * Note: in the case that the slice exceeds the runtime remaining (either due
119 * to consumption or the quota being specified to be smaller than the slice)
120 * we will always only issue the remaining available time.
121 *
122 * (default: 5 msec, units: microseconds)
123 */
124static unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
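/*
 * Illustration: with a quota of 20 ms per period and the default 5 ms
 * slice, a cfs_rq pulls runtime from the global pool in 5 ms chunks; if
 * only 3 ms of quota remain, only those 3 ms are handed out, as noted
 * above.
 */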
125#endif
126
127#ifdef CONFIG_NUMA_BALANCING
128/* Restrict the NUMA promotion throughput (MB/s) for each target node. */
129static unsigned int sysctl_numa_balancing_promote_rate_limit = 65536;
130#endif
131
132#ifdef CONFIG_SYSCTL
133static struct ctl_table sched_fair_sysctls[] = {
134#ifdef CONFIG_CFS_BANDWIDTH
135 {
136 .procname = "sched_cfs_bandwidth_slice_us",
137 .data = &sysctl_sched_cfs_bandwidth_slice,
138 .maxlen = sizeof(unsigned int),
139 .mode = 0644,
140 .proc_handler = proc_dointvec_minmax,
141 .extra1 = SYSCTL_ONE,
142 },
143#endif
144#ifdef CONFIG_NUMA_BALANCING
145 {
146 .procname = "numa_balancing_promote_rate_limit_MBps",
147 .data = &sysctl_numa_balancing_promote_rate_limit,
148 .maxlen = sizeof(unsigned int),
149 .mode = 0644,
150 .proc_handler = proc_dointvec_minmax,
151 .extra1 = SYSCTL_ZERO,
152 },
153#endif /* CONFIG_NUMA_BALANCING */
154};
155
156static int __init sched_fair_sysctl_init(void)
157{
158 register_sysctl_init("kernel", sched_fair_sysctls);
159 return 0;
160}
161late_initcall(sched_fair_sysctl_init);
162#endif
163
164static inline void update_load_add(struct load_weight *lw, unsigned long inc)
165{
166 lw->weight += inc;
167 lw->inv_weight = 0;
168}
169
170static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
171{
172 lw->weight -= dec;
173 lw->inv_weight = 0;
174}
175
176static inline void update_load_set(struct load_weight *lw, unsigned long w)
177{
178 lw->weight = w;
179 lw->inv_weight = 0;
180}
181
182/*
183 * Increase the granularity value when there are more CPUs,
184 * because with more CPUs the 'effective latency' as visible
185 * to users decreases. But the relationship is not linear,
186 * so pick a second-best guess by going with the log2 of the
187 * number of CPUs.
188 *
189 * This idea comes from the SD scheduler of Con Kolivas:
190 */
191static unsigned int get_update_sysctl_factor(void)
192{
193 unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8);
194 unsigned int factor;
195
196 switch (sysctl_sched_tunable_scaling) {
197 case SCHED_TUNABLESCALING_NONE:
198 factor = 1;
199 break;
200 case SCHED_TUNABLESCALING_LINEAR:
201 factor = cpus;
202 break;
203 case SCHED_TUNABLESCALING_LOG:
204 default:
205 factor = 1 + ilog2(cpus);
206 break;
207 }
208
209 return factor;
210}
211
212static void update_sysctl(void)
213{
214 unsigned int factor = get_update_sysctl_factor();
215
216#define SET_SYSCTL(name) \
217 (sysctl_##name = (factor) * normalized_sysctl_##name)
218 SET_SYSCTL(sched_base_slice);
219#undef SET_SYSCTL
220}
221
222void __init sched_init_granularity(void)
223{
224 update_sysctl();
225}
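/*
 * Worked example of the scaling above: with 8 or more online CPUs and the
 * default SCHED_TUNABLESCALING_LOG, cpus = min(ncpus, 8) = 8, so
 * factor = 1 + ilog2(8) = 4 and sysctl_sched_base_slice becomes
 * 4 * 750000 ns = 3 ms.
 */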
226
227#define WMULT_CONST (~0U)
228#define WMULT_SHIFT 32
229
230static void __update_inv_weight(struct load_weight *lw)
231{
232 unsigned long w;
233
234 if (likely(lw->inv_weight))
235 return;
236
237 w = scale_load_down(lw->weight);
238
239 if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
240 lw->inv_weight = 1;
241 else if (unlikely(!w))
242 lw->inv_weight = WMULT_CONST;
243 else
244 lw->inv_weight = WMULT_CONST / w;
245}
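/*
 * inv_weight approximates 2^32 / weight so that __calc_delta() below can
 * replace a division by a multiply and a 32-bit shift. E.g. for a nice-0
 * weight of 1024 (after scale_load_down()), inv_weight = WMULT_CONST/1024,
 * which is roughly 2^22, and (delta * 1024 * inv_weight) >> 32 ~= delta.
 */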
246
247/*
248 * delta_exec * weight / lw.weight
249 * OR
250 * (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT
251 *
252 * Either weight := NICE_0_LOAD and lw \e sched_prio_to_wmult[], in which case
253 * we're guaranteed shift stays positive because inv_weight is guaranteed to
254 * fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22.
255 *
256 * Or, weight <= lw.weight (because lw.weight is the runqueue weight), thus
257 * weight/lw.weight <= 1, and therefore our shift will also be positive.
258 */
259static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
260{
261 u64 fact = scale_load_down(weight);
262 u32 fact_hi = (u32)(fact >> 32);
263 int shift = WMULT_SHIFT;
264 int fs;
265
266 __update_inv_weight(lw);
267
268 if (unlikely(fact_hi)) {
269 fs = fls(fact_hi);
270 shift -= fs;
271 fact >>= fs;
272 }
273
274 fact = mul_u32_u32(fact, lw->inv_weight);
275
276 fact_hi = (u32)(fact >> 32);
277 if (fact_hi) {
278 fs = fls(fact_hi);
279 shift -= fs;
280 fact >>= fs;
281 }
282
283 return mul_u64_u32_shr(delta_exec, fact, shift);
284}
285
286/*
287 * delta /= w
288 */
289static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
290{
291 if (unlikely(se->load.weight != NICE_0_LOAD))
292 delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
293
294 return delta;
295}
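/*
 * Illustration (weights per sched_prio_to_weight[]): a nice -5 entity has
 * load weight 3121, so calc_delta_fair() scales delta by 1024/3121 and its
 * vruntime advances at roughly a third of the wall-clock rate; a nice +5
 * entity (weight 335) sees its vruntime advance about three times faster
 * than wall clock. A nice-0 entity passes delta through unchanged.
 */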
296
297const struct sched_class fair_sched_class;
298
299/**************************************************************
300 * CFS operations on generic schedulable entities:
301 */
302
303#ifdef CONFIG_FAIR_GROUP_SCHED
304
305/* Walk up scheduling entities hierarchy */
306#define for_each_sched_entity(se) \
307 for (; se; se = se->parent)
308
309static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
310{
311 struct rq *rq = rq_of(cfs_rq);
312 int cpu = cpu_of(rq);
313
314 if (cfs_rq->on_list)
315 return rq->tmp_alone_branch == &rq->leaf_cfs_rq_list;
316
317 cfs_rq->on_list = 1;
318
319 /*
320 * Ensure we either appear before our parent (if already
321 * enqueued) or force our parent to appear after us when it is
322 * enqueued. The fact that we always enqueue bottom-up
323 * reduces this to two cases and a special case for the root
324 * cfs_rq. Furthermore, it also means that we will always reset
325 * tmp_alone_branch either when the branch is connected
326 * to a tree or when we reach the top of the tree
327 */
328 if (cfs_rq->tg->parent &&
329 cfs_rq->tg->parent->cfs_rq[cpu]->on_list) {
330 /*
331 * If parent is already on the list, we add the child
332 * just before. Thanks to circular linked property of
333 * the list, this means to put the child at the tail
334 * of the list that starts by parent.
335 */
336 list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
337 &(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list));
338 /*
339 * The branch is now connected to its tree so we can
340 * reset tmp_alone_branch to the beginning of the
341 * list.
342 */
343 rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
344 return true;
345 }
346
347 if (!cfs_rq->tg->parent) {
348 /*
349 * cfs rq without parent should be put
350 * at the tail of the list.
351 */
352 list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
353 &rq->leaf_cfs_rq_list);
354 /*
355 * We have reached the top of a tree so we can reset
356 * tmp_alone_branch to the beginning of the list.
357 */
358 rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
359 return true;
360 }
361
362 /*
363 * The parent has not already been added so we want to
364 * make sure that it will be put after us.
365 * tmp_alone_branch points to the beginning of the branch
366 * where we will add parent.
367 */
368 list_add_rcu(&cfs_rq->leaf_cfs_rq_list, rq->tmp_alone_branch);
369 /*
370 * update tmp_alone_branch to point to the new beginning
371 * of the branch.
372 */
373 rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list;
374 return false;
375}
376
377static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
378{
379 if (cfs_rq->on_list) {
380 struct rq *rq = rq_of(cfs_rq);
381
382 /*
383 * With cfs_rq being unthrottled/throttled during an enqueue,
384 * it can happen that tmp_alone_branch points to the leaf that
385 * we finally want to delete. In this case, tmp_alone_branch moves
386 * to the prev element but it will point to rq->leaf_cfs_rq_list
387 * at the end of the enqueue.
388 */
389 if (rq->tmp_alone_branch == &cfs_rq->leaf_cfs_rq_list)
390 rq->tmp_alone_branch = cfs_rq->leaf_cfs_rq_list.prev;
391
392 list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
393 cfs_rq->on_list = 0;
394 }
395}
396
397static inline void assert_list_leaf_cfs_rq(struct rq *rq)
398{
399 SCHED_WARN_ON(rq->tmp_alone_branch != &rq->leaf_cfs_rq_list);
400}
401
402/* Iterate through all leaf cfs_rq's on a runqueue */
403#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \
404 list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list, \
405 leaf_cfs_rq_list)
406
407/* Do the two (enqueued) entities belong to the same group ? */
408static inline struct cfs_rq *
409is_same_group(struct sched_entity *se, struct sched_entity *pse)
410{
411 if (se->cfs_rq == pse->cfs_rq)
412 return se->cfs_rq;
413
414 return NULL;
415}
416
417static inline struct sched_entity *parent_entity(const struct sched_entity *se)
418{
419 return se->parent;
420}
421
422static void
423find_matching_se(struct sched_entity **se, struct sched_entity **pse)
424{
425 int se_depth, pse_depth;
426
427 /*
428 * A preemption test can be made between sibling entities that are in the
429 * same cfs_rq, i.e. that have a common parent. Walk up the hierarchy of
430 * both tasks until we find ancestors that are siblings under a common
431 * parent.
432 */
433
434 /* First walk up until both entities are at same depth */
435 se_depth = (*se)->depth;
436 pse_depth = (*pse)->depth;
437
438 while (se_depth > pse_depth) {
439 se_depth--;
440 *se = parent_entity(*se);
441 }
442
443 while (pse_depth > se_depth) {
444 pse_depth--;
445 *pse = parent_entity(*pse);
446 }
447
448 while (!is_same_group(*se, *pse)) {
449 *se = parent_entity(*se);
450 *pse = parent_entity(*pse);
451 }
452}
453
454static int tg_is_idle(struct task_group *tg)
455{
456 return tg->idle > 0;
457}
458
459static int cfs_rq_is_idle(struct cfs_rq *cfs_rq)
460{
461 return cfs_rq->idle > 0;
462}
463
464static int se_is_idle(struct sched_entity *se)
465{
466 if (entity_is_task(se))
467 return task_has_idle_policy(task_of(se));
468 return cfs_rq_is_idle(group_cfs_rq(se));
469}
470
471#else /* !CONFIG_FAIR_GROUP_SCHED */
472
473#define for_each_sched_entity(se) \
474 for (; se; se = NULL)
475
476static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
477{
478 return true;
479}
480
481static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
482{
483}
484
485static inline void assert_list_leaf_cfs_rq(struct rq *rq)
486{
487}
488
489#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \
490 for (cfs_rq = &rq->cfs, pos = NULL; cfs_rq; cfs_rq = pos)
491
492static inline struct sched_entity *parent_entity(struct sched_entity *se)
493{
494 return NULL;
495}
496
497static inline void
498find_matching_se(struct sched_entity **se, struct sched_entity **pse)
499{
500}
501
502static inline int tg_is_idle(struct task_group *tg)
503{
504 return 0;
505}
506
507static int cfs_rq_is_idle(struct cfs_rq *cfs_rq)
508{
509 return 0;
510}
511
512static int se_is_idle(struct sched_entity *se)
513{
514 return task_has_idle_policy(task_of(se));
515}
516
517#endif /* CONFIG_FAIR_GROUP_SCHED */
518
519static __always_inline
520void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
521
522/**************************************************************
523 * Scheduling class tree data structure manipulation methods:
524 */
525
526static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
527{
528 s64 delta = (s64)(vruntime - max_vruntime);
529 if (delta > 0)
530 max_vruntime = vruntime;
531
532 return max_vruntime;
533}
534
535static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
536{
537 s64 delta = (s64)(vruntime - min_vruntime);
538 if (delta < 0)
539 min_vruntime = vruntime;
540
541 return min_vruntime;
542}
543
544static inline bool entity_before(const struct sched_entity *a,
545 const struct sched_entity *b)
546{
547 /*
548 * Tiebreak on vruntime seems unnecessary since it can
549 * hardly happen.
550 */
551 return (s64)(a->deadline - b->deadline) < 0;
552}
553
554static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
555{
556 return (s64)(se->vruntime - cfs_rq->min_vruntime);
557}
558
559#define __node_2_se(node) \
560 rb_entry((node), struct sched_entity, run_node)
561
562/*
563 * Compute virtual time from the per-task service numbers:
564 *
565 * Fair schedulers conserve lag:
566 *
567 * \Sum lag_i = 0
568 *
569 * Where lag_i is given by:
570 *
571 * lag_i = S - s_i = w_i * (V - v_i)
572 *
573 * Where S is the ideal service time and V is its virtual time counterpart.
574 * Therefore:
575 *
576 * \Sum lag_i = 0
577 * \Sum w_i * (V - v_i) = 0
578 * \Sum w_i * V - w_i * v_i = 0
579 *
580 * From which we can solve an expression for V in v_i (which we have in
581 * se->vruntime):
582 *
583 * \Sum v_i * w_i \Sum v_i * w_i
584 * V = -------------- = --------------
585 * \Sum w_i W
586 *
587 * Specifically, this is the weighted average of all entity virtual runtimes.
588 *
589 * [[ NOTE: this is only equal to the ideal scheduler under the condition
590 * that join/leave operations happen at lag_i = 0, otherwise the
591 * virtual time has non-contiguous motion equivalent to:
592 *
593 * V +-= lag_i / W
594 *
595 * Also see the comment in place_entity() that deals with this. ]]
596 *
597 * However, since v_i is u64, and the multiplication could easily overflow
598 * transform it into a relative form that uses smaller quantities:
599 *
600 * Substitute: v_i == (v_i - v0) + v0
601 *
602 * \Sum ((v_i - v0) + v0) * w_i \Sum (v_i - v0) * w_i
603 * V = ---------------------------- = --------------------- + v0
604 * W W
605 *
606 * Which we track using:
607 *
608 * v0 := cfs_rq->min_vruntime
609 * \Sum (v_i - v0) * w_i := cfs_rq->avg_vruntime
610 * \Sum w_i := cfs_rq->avg_load
611 *
612 * Since min_vruntime is a monotonically increasing variable that closely tracks
613 * the per-task service, these deltas: (v_i - v), will be in the order of the
614 * maximal (virtual) lag induced in the system due to quantisation.
615 *
616 * Also, we use scale_load_down() to reduce the size.
617 *
618 * As measured, the max (key * weight) value was ~44 bits for a kernel build.
619 */
620static void
621avg_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
622{
623 unsigned long weight = scale_load_down(se->load.weight);
624 s64 key = entity_key(cfs_rq, se);
625
626 cfs_rq->avg_vruntime += key * weight;
627 cfs_rq->avg_load += weight;
628}
629
630static void
631avg_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se)
632{
633 unsigned long weight = scale_load_down(se->load.weight);
634 s64 key = entity_key(cfs_rq, se);
635
636 cfs_rq->avg_vruntime -= key * weight;
637 cfs_rq->avg_load -= weight;
638}
639
640static inline
641void avg_vruntime_update(struct cfs_rq *cfs_rq, s64 delta)
642{
643 /*
644 * v' = v + d ==> avg_vruntime' = avg_vruntime - d*avg_load
645 */
646 cfs_rq->avg_vruntime -= cfs_rq->avg_load * delta;
647}
648
649/*
650 * Specifically: avg_vruntime() + 0 must result in entity_eligible() := true
651 * For this to be so, the result of this function must have a left bias.
652 */
653u64 avg_vruntime(struct cfs_rq *cfs_rq)
654{
655 struct sched_entity *curr = cfs_rq->curr;
656 s64 avg = cfs_rq->avg_vruntime;
657 long load = cfs_rq->avg_load;
658
659 if (curr && curr->on_rq) {
660 unsigned long weight = scale_load_down(curr->load.weight);
661
662 avg += entity_key(cfs_rq, curr) * weight;
663 load += weight;
664 }
665
666 if (load) {
667 /* sign flips effective floor / ceiling */
668 if (avg < 0)
669 avg -= (load - 1);
670 avg = div_s64(avg, load);
671 }
672
673 return cfs_rq->min_vruntime + avg;
674}
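/*
 * Illustrative numbers: with min_vruntime = 1000 and two queued entities,
 * A (vruntime 1000, scaled weight 1024) and B (vruntime 1300, scaled
 * weight 512), we get avg_vruntime = 0*1024 + 300*512 = 153600 and
 * avg_load = 1536, so V = 1000 + 153600/1536 = 1100. A (1000 <= 1100) is
 * eligible, B (1300 > 1100) is not.
 */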
675
676/*
677 * lag_i = S - s_i = w_i * (V - v_i)
678 *
679 * However, since V is approximated by the weighted average of all entities it
680 * is possible -- by addition/removal/reweight to the tree -- to move V around
681 * and end up with a larger lag than we started with.
682 *
683 * Limit this to double the slice length, with a minimum of TICK_NSEC,
684 * since that is the timing granularity.
685 *
686 * EEVDF gives the following limit for a steady state system:
687 *
688 * -r_max < lag < max(r_max, q)
689 *
690 * XXX could add max_slice to the augmented data to track this.
691 */
692static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
693{
694 s64 vlag, limit;
695
696 SCHED_WARN_ON(!se->on_rq);
697
698 vlag = avg_vruntime(cfs_rq) - se->vruntime;
699 limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se);
700
701 se->vlag = clamp(vlag, -limit, limit);
702}
703
704/*
705 * Entity is eligible once it has received less service than it ought to have,
706 * i.e. lag >= 0.
707 *
708 * lag_i = S - s_i = w_i*(V - v_i)
709 *
710 * lag_i >= 0 -> V >= v_i
711 *
712 * \Sum (v_i - v)*w_i
713 * V = ------------------ + v
714 * \Sum w_i
715 *
716 * lag_i >= 0 -> \Sum (v_i - v)*w_i >= (v_i - v)*(\Sum w_i)
717 *
718 * Note: using 'avg_vruntime() > se->vruntime' is inaccurate due
719 * to the loss in precision caused by the division.
720 */
721static int vruntime_eligible(struct cfs_rq *cfs_rq, u64 vruntime)
722{
723 struct sched_entity *curr = cfs_rq->curr;
724 s64 avg = cfs_rq->avg_vruntime;
725 long load = cfs_rq->avg_load;
726
727 if (curr && curr->on_rq) {
728 unsigned long weight = scale_load_down(curr->load.weight);
729
730 avg += entity_key(cfs_rq, curr) * weight;
731 load += weight;
732 }
733
734 return avg >= (s64)(vruntime - cfs_rq->min_vruntime) * load;
735}
736
737int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se)
738{
739 return vruntime_eligible(cfs_rq, se->vruntime);
740}
741
742static u64 __update_min_vruntime(struct cfs_rq *cfs_rq, u64 vruntime)
743{
744 u64 min_vruntime = cfs_rq->min_vruntime;
745 /*
746 * open coded max_vruntime() to allow updating avg_vruntime
747 */
748 s64 delta = (s64)(vruntime - min_vruntime);
749 if (delta > 0) {
750 avg_vruntime_update(cfs_rq, delta);
751 min_vruntime = vruntime;
752 }
753 return min_vruntime;
754}
755
756static void update_min_vruntime(struct cfs_rq *cfs_rq)
757{
758 struct sched_entity *se = __pick_root_entity(cfs_rq);
759 struct sched_entity *curr = cfs_rq->curr;
760 u64 vruntime = cfs_rq->min_vruntime;
761
762 if (curr) {
763 if (curr->on_rq)
764 vruntime = curr->vruntime;
765 else
766 curr = NULL;
767 }
768
769 if (se) {
770 if (!curr)
771 vruntime = se->min_vruntime;
772 else
773 vruntime = min_vruntime(vruntime, se->min_vruntime);
774 }
775
776 /* ensure we never gain time by being placed backwards. */
777 cfs_rq->min_vruntime = __update_min_vruntime(cfs_rq, vruntime);
778}
779
780static inline u64 cfs_rq_min_slice(struct cfs_rq *cfs_rq)
781{
782 struct sched_entity *root = __pick_root_entity(cfs_rq);
783 struct sched_entity *curr = cfs_rq->curr;
784 u64 min_slice = ~0ULL;
785
786 if (curr && curr->on_rq)
787 min_slice = curr->slice;
788
789 if (root)
790 min_slice = min(min_slice, root->min_slice);
791
792 return min_slice;
793}
794
795static inline bool __entity_less(struct rb_node *a, const struct rb_node *b)
796{
797 return entity_before(__node_2_se(a), __node_2_se(b));
798}
799
800#define vruntime_gt(field, lse, rse) ({ (s64)((lse)->field - (rse)->field) > 0; })
801
802static inline void __min_vruntime_update(struct sched_entity *se, struct rb_node *node)
803{
804 if (node) {
805 struct sched_entity *rse = __node_2_se(node);
806 if (vruntime_gt(min_vruntime, se, rse))
807 se->min_vruntime = rse->min_vruntime;
808 }
809}
810
811static inline void __min_slice_update(struct sched_entity *se, struct rb_node *node)
812{
813 if (node) {
814 struct sched_entity *rse = __node_2_se(node);
815 if (rse->min_slice < se->min_slice)
816 se->min_slice = rse->min_slice;
817 }
818}
819
820/*
821 * se->min_vruntime = min(se->vruntime, {left,right}->min_vruntime)
822 */
823static inline bool min_vruntime_update(struct sched_entity *se, bool exit)
824{
825 u64 old_min_vruntime = se->min_vruntime;
826 u64 old_min_slice = se->min_slice;
827 struct rb_node *node = &se->run_node;
828
829 se->min_vruntime = se->vruntime;
830 __min_vruntime_update(se, node->rb_right);
831 __min_vruntime_update(se, node->rb_left);
832
833 se->min_slice = se->slice;
834 __min_slice_update(se, node->rb_right);
835 __min_slice_update(se, node->rb_left);
836
837 return se->min_vruntime == old_min_vruntime &&
838 se->min_slice == old_min_slice;
839}
840
841RB_DECLARE_CALLBACKS(static, min_vruntime_cb, struct sched_entity,
842 run_node, min_vruntime, min_vruntime_update);
843
844/*
845 * Enqueue an entity into the rb-tree:
846 */
847static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
848{
849 avg_vruntime_add(cfs_rq, se);
850 se->min_vruntime = se->vruntime;
851 se->min_slice = se->slice;
852 rb_add_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline,
853 __entity_less, &min_vruntime_cb);
854}
855
856static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
857{
858 rb_erase_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline,
859 &min_vruntime_cb);
860 avg_vruntime_sub(cfs_rq, se);
861}
862
863struct sched_entity *__pick_root_entity(struct cfs_rq *cfs_rq)
864{
865 struct rb_node *root = cfs_rq->tasks_timeline.rb_root.rb_node;
866
867 if (!root)
868 return NULL;
869
870 return __node_2_se(root);
871}
872
873struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
874{
875 struct rb_node *left = rb_first_cached(&cfs_rq->tasks_timeline);
876
877 if (!left)
878 return NULL;
879
880 return __node_2_se(left);
881}
882
883/*
884 * Earliest Eligible Virtual Deadline First
885 *
886 * In order to provide latency guarantees for different request sizes
887 * EEVDF selects the best runnable task from two criteria:
888 *
889 * 1) the task must be eligible (must be owed service)
890 *
891 * 2) from those tasks that meet 1), we select the one
892 * with the earliest virtual deadline.
893 *
894 * We can do this in O(log n) time due to an augmented RB-tree. The
895 * tree keeps the entries sorted on deadline, but also functions as a
896 * heap based on the vruntime by keeping:
897 *
898 * se->min_vruntime = min(se->vruntime, se->{left,right}->min_vruntime)
899 *
900 * Which allows tree pruning through eligibility.
901 */
902static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
903{
904 struct rb_node *node = cfs_rq->tasks_timeline.rb_root.rb_node;
905 struct sched_entity *se = __pick_first_entity(cfs_rq);
906 struct sched_entity *curr = cfs_rq->curr;
907 struct sched_entity *best = NULL;
908
909 /*
910 * We can safely skip eligibility check if there is only one entity
911 * in this cfs_rq, saving some cycles.
912 */
913 if (cfs_rq->nr_running == 1)
914 return curr && curr->on_rq ? curr : se;
915
916 if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr)))
917 curr = NULL;
918
919 /*
920 * Once selected, run a task until it either becomes non-eligible or
921 * until it gets a new slice. See the HACK in set_next_entity().
922 */
923 if (sched_feat(RUN_TO_PARITY) && curr && curr->vlag == curr->deadline)
924 return curr;
925
926 /* Pick the leftmost entity if it's eligible */
927 if (se && entity_eligible(cfs_rq, se)) {
928 best = se;
929 goto found;
930 }
931
932 /* Heap search for the EEVD entity */
933 while (node) {
934 struct rb_node *left = node->rb_left;
935
936 /*
937 * Eligible entities in left subtree are always better
938 * choices, since they have earlier deadlines.
939 */
940 if (left && vruntime_eligible(cfs_rq,
941 __node_2_se(left)->min_vruntime)) {
942 node = left;
943 continue;
944 }
945
946 se = __node_2_se(node);
947
948 /*
949 * The left subtree either is empty or has no eligible
950 * entity, so check the current node since it is the one
951 * with earliest deadline that might be eligible.
952 */
953 if (entity_eligible(cfs_rq, se)) {
954 best = se;
955 break;
956 }
957
958 node = node->rb_right;
959 }
960found:
961 if (!best || (curr && entity_before(curr, best)))
962 best = curr;
963
964 return best;
965}
966
967#ifdef CONFIG_SCHED_DEBUG
968struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
969{
970 struct rb_node *last = rb_last(&cfs_rq->tasks_timeline.rb_root);
971
972 if (!last)
973 return NULL;
974
975 return __node_2_se(last);
976}
977
978/**************************************************************
979 * Scheduling class statistics methods:
980 */
981#ifdef CONFIG_SMP
982int sched_update_scaling(void)
983{
984 unsigned int factor = get_update_sysctl_factor();
985
986#define WRT_SYSCTL(name) \
987 (normalized_sysctl_##name = sysctl_##name / (factor))
988 WRT_SYSCTL(sched_base_slice);
989#undef WRT_SYSCTL
990
991 return 0;
992}
993#endif
994#endif
995
996static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se);
997
998/*
999 * XXX: strictly: vd_i += N*r_i/w_i such that: vd_i > ve_i
1000 * this is probably good enough.
1001 */
1002static bool update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se)
1003{
1004 if ((s64)(se->vruntime - se->deadline) < 0)
1005 return false;
1006
1007 /*
1008 * For EEVDF the virtual time slope is determined by w_i (iow.
1009 * nice) while the request time r_i is determined by
1010 * sysctl_sched_base_slice.
1011 */
1012 if (!se->custom_slice)
1013 se->slice = sysctl_sched_base_slice;
1014
1015 /*
1016 * EEVDF: vd_i = ve_i + r_i / w_i
1017 */
1018 se->deadline = se->vruntime + calc_delta_fair(se->slice, se);
1019
1020 /*
1021 * The task has consumed its request, reschedule.
1022 */
1023 return true;
1024}
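/*
 * Illustration: on a machine where the tunable factor is 4 (8+ CPUs, LOG
 * scaling), the base slice is 3 ms. A nice-0 entity then gets
 * vd = ve + 3 ms, since calc_delta_fair() is the identity for NICE_0_LOAD;
 * a nice +5 entity with the same slice gets roughly ve + 9 ms, i.e. its
 * virtual deadline is pushed out in proportion to its smaller weight.
 */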
1025
1026#include "pelt.h"
1027#ifdef CONFIG_SMP
1028
1029static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
1030static unsigned long task_h_load(struct task_struct *p);
1031static unsigned long capacity_of(int cpu);
1032
1033/* Give a new sched_entity initial runnable values so it appears as a heavy load during its infancy */
1034void init_entity_runnable_average(struct sched_entity *se)
1035{
1036 struct sched_avg *sa = &se->avg;
1037
1038 memset(sa, 0, sizeof(*sa));
1039
1040 /*
1041 * Tasks are initialized with full load to be seen as heavy tasks until
1042 * they get a chance to stabilize to their real load level.
1043 * Group entities are initialized with zero load to reflect the fact that
1044 * nothing has been attached to the task group yet.
1045 */
1046 if (entity_is_task(se))
1047 sa->load_avg = scale_load_down(se->load.weight);
1048
1049 /* when this task is enqueued, it will contribute to its cfs_rq's load_avg */
1050}
1051
1052/*
1053 * With new tasks being created, their initial util_avgs are extrapolated
1054 * based on the cfs_rq's current util_avg:
1055 *
1056 * util_avg = cfs_rq->avg.util_avg / (cfs_rq->avg.load_avg + 1)
1057 * * se_weight(se)
1058 *
1059 * However, in many cases, the above util_avg does not give a desired
1060 * value. Moreover, the sum of the util_avgs may be divergent, such
1061 * as when the series is a harmonic series.
1062 *
1063 * To solve this problem, we also cap the util_avg of successive tasks to
1064 * only 1/2 of the remaining utilization budget:
1065 *
1066 * util_avg_cap = (cpu_scale - cfs_rq->avg.util_avg) / 2^n
1067 *
1068 * where n denotes the nth task and cpu_scale the CPU capacity.
1069 *
1070 * For example, for a CPU with 1024 of capacity, the simplest series from
1071 * the beginning would be like:
1072 *
1073 * task util_avg: 512, 256, 128, 64, 32, 16, 8, ...
1074 * cfs_rq util_avg: 512, 768, 896, 960, 992, 1008, 1016, ...
1075 *
1076 * Finally, that extrapolated util_avg is clamped to the cap (util_avg_cap)
1077 * if util_avg > util_avg_cap.
1078 */
1079void post_init_entity_util_avg(struct task_struct *p)
1080{
1081 struct sched_entity *se = &p->se;
1082 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1083 struct sched_avg *sa = &se->avg;
1084 long cpu_scale = arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq)));
1085 long cap = (long)(cpu_scale - cfs_rq->avg.util_avg) / 2;
1086
1087 if (p->sched_class != &fair_sched_class) {
1088 /*
1089 * For !fair tasks do:
1090 *
1091 update_cfs_rq_load_avg(now, cfs_rq);
1092 attach_entity_load_avg(cfs_rq, se);
1093 switched_from_fair(rq, p);
1094 *
1095 * such that the next switched_to_fair() has the
1096 * expected state.
1097 */
1098 se->avg.last_update_time = cfs_rq_clock_pelt(cfs_rq);
1099 return;
1100 }
1101
1102 if (cap > 0) {
1103 if (cfs_rq->avg.util_avg != 0) {
1104 sa->util_avg = cfs_rq->avg.util_avg * se_weight(se);
1105 sa->util_avg /= (cfs_rq->avg.load_avg + 1);
1106
1107 if (sa->util_avg > cap)
1108 sa->util_avg = cap;
1109 } else {
1110 sa->util_avg = cap;
1111 }
1112 }
1113
1114 sa->runnable_avg = sa->util_avg;
1115}
1116
1117#else /* !CONFIG_SMP */
1118void init_entity_runnable_average(struct sched_entity *se)
1119{
1120}
1121void post_init_entity_util_avg(struct task_struct *p)
1122{
1123}
1124static void update_tg_load_avg(struct cfs_rq *cfs_rq)
1125{
1126}
1127#endif /* CONFIG_SMP */
1128
1129static s64 update_curr_se(struct rq *rq, struct sched_entity *curr)
1130{
1131 u64 now = rq_clock_task(rq);
1132 s64 delta_exec;
1133
1134 delta_exec = now - curr->exec_start;
1135 if (unlikely(delta_exec <= 0))
1136 return delta_exec;
1137
1138 curr->exec_start = now;
1139 curr->sum_exec_runtime += delta_exec;
1140
1141 if (schedstat_enabled()) {
1142 struct sched_statistics *stats;
1143
1144 stats = __schedstats_from_se(curr);
1145 __schedstat_set(stats->exec_max,
1146 max(delta_exec, stats->exec_max));
1147 }
1148
1149 return delta_exec;
1150}
1151
1152static inline void update_curr_task(struct task_struct *p, s64 delta_exec)
1153{
1154 trace_sched_stat_runtime(p, delta_exec);
1155 account_group_exec_runtime(p, delta_exec);
1156 cgroup_account_cputime(p, delta_exec);
1157}
1158
1159static inline bool did_preempt_short(struct cfs_rq *cfs_rq, struct sched_entity *curr)
1160{
1161 if (!sched_feat(PREEMPT_SHORT))
1162 return false;
1163
1164 if (curr->vlag == curr->deadline)
1165 return false;
1166
1167 return !entity_eligible(cfs_rq, curr);
1168}
1169
1170static inline bool do_preempt_short(struct cfs_rq *cfs_rq,
1171 struct sched_entity *pse, struct sched_entity *se)
1172{
1173 if (!sched_feat(PREEMPT_SHORT))
1174 return false;
1175
1176 if (pse->slice >= se->slice)
1177 return false;
1178
1179 if (!entity_eligible(cfs_rq, pse))
1180 return false;
1181
1182 if (entity_before(pse, se))
1183 return true;
1184
1185 if (!entity_eligible(cfs_rq, se))
1186 return true;
1187
1188 return false;
1189}
1190
1191/*
1192 * Used by other classes to account runtime.
1193 */
1194s64 update_curr_common(struct rq *rq)
1195{
1196 struct task_struct *donor = rq->donor;
1197 s64 delta_exec;
1198
1199 delta_exec = update_curr_se(rq, &donor->se);
1200 if (likely(delta_exec > 0))
1201 update_curr_task(donor, delta_exec);
1202
1203 return delta_exec;
1204}
1205
1206/*
1207 * Update the current task's runtime statistics.
1208 */
1209static void update_curr(struct cfs_rq *cfs_rq)
1210{
1211 struct sched_entity *curr = cfs_rq->curr;
1212 struct rq *rq = rq_of(cfs_rq);
1213 s64 delta_exec;
1214 bool resched;
1215
1216 if (unlikely(!curr))
1217 return;
1218
1219 delta_exec = update_curr_se(rq, curr);
1220 if (unlikely(delta_exec <= 0))
1221 return;
1222
1223 curr->vruntime += calc_delta_fair(delta_exec, curr);
1224 resched = update_deadline(cfs_rq, curr);
1225 update_min_vruntime(cfs_rq);
1226
1227 if (entity_is_task(curr)) {
1228 struct task_struct *p = task_of(curr);
1229
1230 update_curr_task(p, delta_exec);
1231
1232 /*
1233 * If the fair_server is active, we need to account for the
1234 * fair_server time whether or not the task is running on
1235 * behalf of the fair_server:
1236 * - If the task is running on behalf of fair_server, we need
1237 * to limit its time based on the assigned runtime.
1238 * - Fair task that runs outside of fair_server should account
1239 * against fair_server such that it can account for this time
1240 * and possibly avoid running this period.
1241 */
1242 if (dl_server_active(&rq->fair_server))
1243 dl_server_update(&rq->fair_server, delta_exec);
1244 }
1245
1246 account_cfs_rq_runtime(cfs_rq, delta_exec);
1247
1248 if (cfs_rq->nr_running == 1)
1249 return;
1250
1251 if (resched || did_preempt_short(cfs_rq, curr)) {
1252 resched_curr_lazy(rq);
1253 clear_buddies(cfs_rq, curr);
1254 }
1255}
1256
1257static void update_curr_fair(struct rq *rq)
1258{
1259 update_curr(cfs_rq_of(&rq->donor->se));
1260}
1261
1262static inline void
1263update_stats_wait_start_fair(struct cfs_rq *cfs_rq, struct sched_entity *se)
1264{
1265 struct sched_statistics *stats;
1266 struct task_struct *p = NULL;
1267
1268 if (!schedstat_enabled())
1269 return;
1270
1271 stats = __schedstats_from_se(se);
1272
1273 if (entity_is_task(se))
1274 p = task_of(se);
1275
1276 __update_stats_wait_start(rq_of(cfs_rq), p, stats);
1277}
1278
1279static inline void
1280update_stats_wait_end_fair(struct cfs_rq *cfs_rq, struct sched_entity *se)
1281{
1282 struct sched_statistics *stats;
1283 struct task_struct *p = NULL;
1284
1285 if (!schedstat_enabled())
1286 return;
1287
1288 stats = __schedstats_from_se(se);
1289
1290 /*
1291 * When sched_schedstat changes from 0 to 1, some sched entities
1292 * may already be in the runqueue; their se->statistics.wait_start
1293 * will be 0, which would make the computed delta wrong. We need to
1294 * avoid this scenario.
1295 */
1296 if (unlikely(!schedstat_val(stats->wait_start)))
1297 return;
1298
1299 if (entity_is_task(se))
1300 p = task_of(se);
1301
1302 __update_stats_wait_end(rq_of(cfs_rq), p, stats);
1303}
1304
1305static inline void
1306update_stats_enqueue_sleeper_fair(struct cfs_rq *cfs_rq, struct sched_entity *se)
1307{
1308 struct sched_statistics *stats;
1309 struct task_struct *tsk = NULL;
1310
1311 if (!schedstat_enabled())
1312 return;
1313
1314 stats = __schedstats_from_se(se);
1315
1316 if (entity_is_task(se))
1317 tsk = task_of(se);
1318
1319 __update_stats_enqueue_sleeper(rq_of(cfs_rq), tsk, stats);
1320}
1321
1322/*
1323 * Task is being enqueued - update stats:
1324 */
1325static inline void
1326update_stats_enqueue_fair(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1327{
1328 if (!schedstat_enabled())
1329 return;
1330
1331 /*
1332 * Are we enqueueing a waiting task? (for current tasks
1333 * a dequeue/enqueue event is a NOP)
1334 */
1335 if (se != cfs_rq->curr)
1336 update_stats_wait_start_fair(cfs_rq, se);
1337
1338 if (flags & ENQUEUE_WAKEUP)
1339 update_stats_enqueue_sleeper_fair(cfs_rq, se);
1340}
1341
1342static inline void
1343update_stats_dequeue_fair(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1344{
1345
1346 if (!schedstat_enabled())
1347 return;
1348
1349 /*
1350 * Mark the end of the wait period if dequeueing a
1351 * waiting task:
1352 */
1353 if (se != cfs_rq->curr)
1354 update_stats_wait_end_fair(cfs_rq, se);
1355
1356 if ((flags & DEQUEUE_SLEEP) && entity_is_task(se)) {
1357 struct task_struct *tsk = task_of(se);
1358 unsigned int state;
1359
1360 /* XXX racy against TTWU */
1361 state = READ_ONCE(tsk->__state);
1362 if (state & TASK_INTERRUPTIBLE)
1363 __schedstat_set(tsk->stats.sleep_start,
1364 rq_clock(rq_of(cfs_rq)));
1365 if (state & TASK_UNINTERRUPTIBLE)
1366 __schedstat_set(tsk->stats.block_start,
1367 rq_clock(rq_of(cfs_rq)));
1368 }
1369}
1370
1371/*
1372 * We are picking a new current task - update its stats:
1373 */
1374static inline void
1375update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
1376{
1377 /*
1378 * We are starting a new run period:
1379 */
1380 se->exec_start = rq_clock_task(rq_of(cfs_rq));
1381}
1382
1383/**************************************************
1384 * Scheduling class queueing methods:
1385 */
1386
1387static inline bool is_core_idle(int cpu)
1388{
1389#ifdef CONFIG_SCHED_SMT
1390 int sibling;
1391
1392 for_each_cpu(sibling, cpu_smt_mask(cpu)) {
1393 if (cpu == sibling)
1394 continue;
1395
1396 if (!idle_cpu(sibling))
1397 return false;
1398 }
1399#endif
1400
1401 return true;
1402}
1403
1404#ifdef CONFIG_NUMA
1405#define NUMA_IMBALANCE_MIN 2
1406
1407static inline long
1408adjust_numa_imbalance(int imbalance, int dst_running, int imb_numa_nr)
1409{
1410 /*
1411 * Allow a NUMA imbalance if busy CPUs is less than the maximum
1412 * threshold. Above this threshold, individual tasks may be contending
1413 * for both memory bandwidth and any shared HT resources. This is an
1414 * approximation as the number of running tasks may not be related to
1415 * the number of busy CPUs due to sched_setaffinity.
1416 */
1417 if (dst_running > imb_numa_nr)
1418 return imbalance;
1419
1420 /*
1421 * Allow a small imbalance based on a simple pair of communicating
1422 * tasks that remain local when the destination is lightly loaded.
1423 */
1424 if (imbalance <= NUMA_IMBALANCE_MIN)
1425 return 0;
1426
1427 return imbalance;
1428}
1429#endif /* CONFIG_NUMA */
1430
1431#ifdef CONFIG_NUMA_BALANCING
1432/*
1433 * Approximate time to scan a full NUMA task in ms. The task scan period is
1434 * calculated based on the task's virtual memory size and
1435 * numa_balancing_scan_size.
1436 */
1437unsigned int sysctl_numa_balancing_scan_period_min = 1000;
1438unsigned int sysctl_numa_balancing_scan_period_max = 60000;
1439
1440/* Portion of address space to scan in MB */
1441unsigned int sysctl_numa_balancing_scan_size = 256;
1442
1443/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
1444unsigned int sysctl_numa_balancing_scan_delay = 1000;
1445
1446/* The page with hint page fault latency < threshold in ms is considered hot */
1447unsigned int sysctl_numa_balancing_hot_threshold = MSEC_PER_SEC;
1448
1449struct numa_group {
1450 refcount_t refcount;
1451
1452 spinlock_t lock; /* nr_tasks, tasks */
1453 int nr_tasks;
1454 pid_t gid;
1455 int active_nodes;
1456
1457 struct rcu_head rcu;
1458 unsigned long total_faults;
1459 unsigned long max_faults_cpu;
1460 /*
1461 * faults[] array is split into two regions: faults_mem and faults_cpu.
1462 *
1463 * Faults_cpu is used to decide whether memory should move
1464 * towards the CPU. As a consequence, these stats are weighted
1465 * more by CPU use than by memory faults.
1466 */
1467 unsigned long faults[];
1468};
1469
1470/*
1471 * For functions that can be called in multiple contexts that permit reading
1472 * ->numa_group (see struct task_struct for locking rules).
1473 */
1474static struct numa_group *deref_task_numa_group(struct task_struct *p)
1475{
1476 return rcu_dereference_check(p->numa_group, p == current ||
1477 (lockdep_is_held(__rq_lockp(task_rq(p))) && !READ_ONCE(p->on_cpu)));
1478}
1479
1480static struct numa_group *deref_curr_numa_group(struct task_struct *p)
1481{
1482 return rcu_dereference_protected(p->numa_group, p == current);
1483}
1484
1485static inline unsigned long group_faults_priv(struct numa_group *ng);
1486static inline unsigned long group_faults_shared(struct numa_group *ng);
1487
1488static unsigned int task_nr_scan_windows(struct task_struct *p)
1489{
1490 unsigned long rss = 0;
1491 unsigned long nr_scan_pages;
1492
1493 /*
1494 * Calculations are based on RSS, as non-present and empty pages are skipped
1495 * by the PTE scanner and NUMA hinting faults should be trapped based
1496 * on resident pages.
1497 */
1498 nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT);
1499 rss = get_mm_rss(p->mm);
1500 if (!rss)
1501 rss = nr_scan_pages;
1502
1503 rss = round_up(rss, nr_scan_pages);
1504 return rss / nr_scan_pages;
1505}
1506
1507/* For sanity's sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */
1508#define MAX_SCAN_WINDOW 2560
1509
1510static unsigned int task_scan_min(struct task_struct *p)
1511{
1512 unsigned int scan_size = READ_ONCE(sysctl_numa_balancing_scan_size);
1513 unsigned int scan, floor;
1514 unsigned int windows = 1;
1515
1516 if (scan_size < MAX_SCAN_WINDOW)
1517 windows = MAX_SCAN_WINDOW / scan_size;
1518 floor = 1000 / windows;
1519
1520 scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);
1521 return max_t(unsigned int, floor, scan);
1522}
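/*
 * Worked example, assuming 4K pages and the defaults above: scan_size is
 * 256 MB, so windows = 2560/256 = 10 and floor = 100 ms. A task with
 * 2.5 GB of RSS covers task_nr_scan_windows() = 10 windows, giving
 * scan = 1000/10 = 100 ms, so task_scan_min() returns 100 ms.
 */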
1523
1524static unsigned int task_scan_start(struct task_struct *p)
1525{
1526 unsigned long smin = task_scan_min(p);
1527 unsigned long period = smin;
1528 struct numa_group *ng;
1529
1530 /* Scale the maximum scan period with the amount of shared memory. */
1531 rcu_read_lock();
1532 ng = rcu_dereference(p->numa_group);
1533 if (ng) {
1534 unsigned long shared = group_faults_shared(ng);
1535 unsigned long private = group_faults_priv(ng);
1536
1537 period *= refcount_read(&ng->refcount);
1538 period *= shared + 1;
1539 period /= private + shared + 1;
1540 }
1541 rcu_read_unlock();
1542
1543 return max(smin, period);
1544}
1545
1546static unsigned int task_scan_max(struct task_struct *p)
1547{
1548 unsigned long smin = task_scan_min(p);
1549 unsigned long smax;
1550 struct numa_group *ng;
1551
1552 /* Watch for min being lower than max due to floor calculations */
1553 smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
1554
1555 /* Scale the maximum scan period with the amount of shared memory. */
1556 ng = deref_curr_numa_group(p);
1557 if (ng) {
1558 unsigned long shared = group_faults_shared(ng);
1559 unsigned long private = group_faults_priv(ng);
1560 unsigned long period = smax;
1561
1562 period *= refcount_read(&ng->refcount);
1563 period *= shared + 1;
1564 period /= private + shared + 1;
1565
1566 smax = max(smax, period);
1567 }
1568
1569 return max(smin, smax);
1570}
1571
1572static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
1573{
1574 rq->nr_numa_running += (p->numa_preferred_nid != NUMA_NO_NODE);
1575 rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
1576}
1577
1578static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
1579{
1580 rq->nr_numa_running -= (p->numa_preferred_nid != NUMA_NO_NODE);
1581 rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
1582}
1583
1584/* Shared or private faults. */
1585#define NR_NUMA_HINT_FAULT_TYPES 2
1586
1587/* Memory and CPU locality */
1588#define NR_NUMA_HINT_FAULT_STATS (NR_NUMA_HINT_FAULT_TYPES * 2)
1589
1590/* Averaged statistics, and temporary buffers. */
1591#define NR_NUMA_HINT_FAULT_BUCKETS (NR_NUMA_HINT_FAULT_STATS * 2)
1592
1593pid_t task_numa_group_id(struct task_struct *p)
1594{
1595 struct numa_group *ng;
1596 pid_t gid = 0;
1597
1598 rcu_read_lock();
1599 ng = rcu_dereference(p->numa_group);
1600 if (ng)
1601 gid = ng->gid;
1602 rcu_read_unlock();
1603
1604 return gid;
1605}
1606
1607/*
1608 * The averaged statistics, shared & private, memory & CPU,
1609 * occupy the first half of the array. The second half of the
1610 * array is for current counters, which are averaged into the
1611 * first set by task_numa_placement.
1612 */
1613static inline int task_faults_idx(enum numa_faults_stats s, int nid, int priv)
1614{
1615 return NR_NUMA_HINT_FAULT_TYPES * (s * nr_node_ids + nid) + priv;
1616}
1617
1618static inline unsigned long task_faults(struct task_struct *p, int nid)
1619{
1620 if (!p->numa_faults)
1621 return 0;
1622
1623 return p->numa_faults[task_faults_idx(NUMA_MEM, nid, 0)] +
1624 p->numa_faults[task_faults_idx(NUMA_MEM, nid, 1)];
1625}
1626
1627static inline unsigned long group_faults(struct task_struct *p, int nid)
1628{
1629 struct numa_group *ng = deref_task_numa_group(p);
1630
1631 if (!ng)
1632 return 0;
1633
1634 return ng->faults[task_faults_idx(NUMA_MEM, nid, 0)] +
1635 ng->faults[task_faults_idx(NUMA_MEM, nid, 1)];
1636}
1637
1638static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
1639{
1640 return group->faults[task_faults_idx(NUMA_CPU, nid, 0)] +
1641 group->faults[task_faults_idx(NUMA_CPU, nid, 1)];
1642}
1643
1644static inline unsigned long group_faults_priv(struct numa_group *ng)
1645{
1646 unsigned long faults = 0;
1647 int node;
1648
1649 for_each_online_node(node) {
1650 faults += ng->faults[task_faults_idx(NUMA_MEM, node, 1)];
1651 }
1652
1653 return faults;
1654}
1655
1656static inline unsigned long group_faults_shared(struct numa_group *ng)
1657{
1658 unsigned long faults = 0;
1659 int node;
1660
1661 for_each_online_node(node) {
1662 faults += ng->faults[task_faults_idx(NUMA_MEM, node, 0)];
1663 }
1664
1665 return faults;
1666}
1667
1668/*
1669 * A node triggering more than 1/3 as many NUMA faults as the maximum is
1670 * considered part of a numa group's pseudo-interleaving set. Migrations
1671 * between these nodes are slowed down, to allow things to settle down.
1672 */
1673#define ACTIVE_NODE_FRACTION 3
1674
1675static bool numa_is_active_node(int nid, struct numa_group *ng)
1676{
1677 return group_faults_cpu(ng, nid) * ACTIVE_NODE_FRACTION > ng->max_faults_cpu;
1678}
1679
1680/* Handle placement on systems where not all nodes are directly connected. */
1681static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
1682 int lim_dist, bool task)
1683{
1684 unsigned long score = 0;
1685 int node, max_dist;
1686
1687 /*
1688 * All nodes are directly connected, and the same distance
1689 * from each other. No need for fancy placement algorithms.
1690 */
1691 if (sched_numa_topology_type == NUMA_DIRECT)
1692 return 0;
1693
1694 /* sched_max_numa_distance may be changed in parallel. */
1695 max_dist = READ_ONCE(sched_max_numa_distance);
1696 /*
1697 * This code is called for each node, introducing N^2 complexity,
1698 * which should be OK given the number of nodes rarely exceeds 8.
1699 */
1700 for_each_online_node(node) {
1701 unsigned long faults;
1702 int dist = node_distance(nid, node);
1703
1704 /*
1705 * The furthest away nodes in the system are not interesting
1706 * for placement; nid was already counted.
1707 */
1708 if (dist >= max_dist || node == nid)
1709 continue;
1710
1711 /*
1712 * On systems with a backplane NUMA topology, compare groups
1713 * of nodes, and move tasks towards the group with the most
1714 * memory accesses. When comparing two nodes at distance
1715 * "hoplimit", only nodes closer by than "hoplimit" are part
1716 * of each group. Skip other nodes.
1717 */
1718 if (sched_numa_topology_type == NUMA_BACKPLANE && dist >= lim_dist)
1719 continue;
1720
1721 /* Add up the faults from nearby nodes. */
1722 if (task)
1723 faults = task_faults(p, node);
1724 else
1725 faults = group_faults(p, node);
1726
1727 /*
1728 * On systems with a glueless mesh NUMA topology, there are
1729 * no fixed "groups of nodes". Instead, nodes that are not
1730 * directly connected bounce traffic through intermediate
1731 * nodes; a numa_group can occupy any set of nodes.
1732 * The further away a node is, the less the faults count.
1733 * This seems to result in good task placement.
1734 */
1735 if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
1736 faults *= (max_dist - dist);
1737 faults /= (max_dist - LOCAL_DISTANCE);
1738 }
1739
1740 score += faults;
1741 }
1742
1743 return score;
1744}
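/*
 * Illustration of the glueless-mesh scaling above: with max_dist = 30 and
 * LOCAL_DISTANCE = 10, faults on a node at distance 20 from nid contribute
 * only (30 - 20) / (30 - 10) = half of their raw count to the score.
 */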
1745
1746/*
1747 * These return the fraction of accesses done by a particular task, or
1748 * task group, on a particular numa node. The group weight is given a
1749 * larger multiplier, in order to group tasks together that are almost
1750 * evenly spread out between numa nodes.
1751 */
1752static inline unsigned long task_weight(struct task_struct *p, int nid,
1753 int dist)
1754{
1755 unsigned long faults, total_faults;
1756
1757 if (!p->numa_faults)
1758 return 0;
1759
1760 total_faults = p->total_numa_faults;
1761
1762 if (!total_faults)
1763 return 0;
1764
1765 faults = task_faults(p, nid);
1766 faults += score_nearby_nodes(p, nid, dist, true);
1767
1768 return 1000 * faults / total_faults;
1769}
1770
1771static inline unsigned long group_weight(struct task_struct *p, int nid,
1772 int dist)
1773{
1774 struct numa_group *ng = deref_task_numa_group(p);
1775 unsigned long faults, total_faults;
1776
1777 if (!ng)
1778 return 0;
1779
1780 total_faults = ng->total_faults;
1781
1782 if (!total_faults)
1783 return 0;
1784
1785 faults = group_faults(p, nid);
1786 faults += score_nearby_nodes(p, nid, dist, false);
1787
1788 return 1000 * faults / total_faults;
1789}
1790
1791/*
1792 * If memory tiering mode is enabled, cpupid of slow memory page is
1793 * used to record scan time instead of CPU and PID. When tiering mode
1794 * is disabled at run time, the scan time (in cpupid) will be
1795 * interpreted as CPU and PID. So the CPU needs to be checked to avoid
1796 * out-of-bounds array access.
1797 */
1798static inline bool cpupid_valid(int cpupid)
1799{
1800 return cpupid_to_cpu(cpupid) < nr_cpu_ids;
1801}
1802
1803/*
1804 * For memory tiering mode, if there are enough free pages (more than
1805 * the 'enough' watermark defined here) in the fast memory node, then, to
1806 * take full advantage of fast memory capacity, all recently accessed slow
1807 * memory pages will be migrated to the fast memory node without
1808 * considering the hot threshold.
1809 */
1810static bool pgdat_free_space_enough(struct pglist_data *pgdat)
1811{
1812 int z;
1813 unsigned long enough_wmark;
1814
1815 enough_wmark = max(1UL * 1024 * 1024 * 1024 >> PAGE_SHIFT,
1816 pgdat->node_present_pages >> 4);
1817 for (z = pgdat->nr_zones - 1; z >= 0; z--) {
1818 struct zone *zone = pgdat->node_zones + z;
1819
1820 if (!populated_zone(zone))
1821 continue;
1822
1823 if (zone_watermark_ok(zone, 0,
1824 promo_wmark_pages(zone) + enough_wmark,
1825 ZONE_MOVABLE, 0))
1826 return true;
1827 }
1828 return false;
1829}
1830
1831/*
1832 * For memory tiering mode, when page tables are scanned, the scan
1833 * time will be recorded in struct page in addition to making the page
1834 * PROT_NONE for slow memory pages. So when the page is accessed, in the
1835 * hint page fault handler, the hint page fault latency is calculated
1836 * via,
1837 *
1838 * hint page fault latency = hint page fault time - scan time
1839 *
1840 * The smaller the hint page fault latency, the higher the possibility
1841 * for the page to be hot.
1842 */
1843static int numa_hint_fault_latency(struct folio *folio)
1844{
1845 int last_time, time;
1846
1847 time = jiffies_to_msecs(jiffies);
1848 last_time = folio_xchg_access_time(folio, time);
1849
1850 return (time - last_time) & PAGE_ACCESS_TIME_MASK;
1851}
1852
1853/*
1854 * For memory tiering mode, too high promotion/demotion throughput may
1855 * hurt application latency. So we provide a mechanism to rate limit
1856 * the number of pages whose promotion is attempted.
1857 */
1858static bool numa_promotion_rate_limit(struct pglist_data *pgdat,
1859 unsigned long rate_limit, int nr)
1860{
1861 unsigned long nr_cand;
1862 unsigned int now, start;
1863
1864 now = jiffies_to_msecs(jiffies);
1865 mod_node_page_state(pgdat, PGPROMOTE_CANDIDATE, nr);
1866 nr_cand = node_page_state(pgdat, PGPROMOTE_CANDIDATE);
1867 start = pgdat->nbp_rl_start;
1868 if (now - start > MSEC_PER_SEC &&
1869 cmpxchg(&pgdat->nbp_rl_start, start, now) == start)
1870 pgdat->nbp_rl_nr_cand = nr_cand;
1871 if (nr_cand - pgdat->nbp_rl_nr_cand >= rate_limit)
1872 return true;
1873 return false;
1874}
1875
1876#define NUMA_MIGRATION_ADJUST_STEPS 16
1877
1878static void numa_promotion_adjust_threshold(struct pglist_data *pgdat,
1879 unsigned long rate_limit,
1880 unsigned int ref_th)
1881{
1882 unsigned int now, start, th_period, unit_th, th;
1883 unsigned long nr_cand, ref_cand, diff_cand;
1884
1885 now = jiffies_to_msecs(jiffies);
1886 th_period = sysctl_numa_balancing_scan_period_max;
1887 start = pgdat->nbp_th_start;
1888 if (now - start > th_period &&
1889 cmpxchg(&pgdat->nbp_th_start, start, now) == start) {
1890 ref_cand = rate_limit *
1891 sysctl_numa_balancing_scan_period_max / MSEC_PER_SEC;
1892 nr_cand = node_page_state(pgdat, PGPROMOTE_CANDIDATE);
1893 diff_cand = nr_cand - pgdat->nbp_th_nr_cand;
1894 unit_th = ref_th * 2 / NUMA_MIGRATION_ADJUST_STEPS;
1895 th = pgdat->nbp_threshold ? : ref_th;
1896 if (diff_cand > ref_cand * 11 / 10)
1897 th = max(th - unit_th, unit_th);
1898 else if (diff_cand < ref_cand * 9 / 10)
1899 th = min(th + unit_th, ref_th * 2);
1900 pgdat->nbp_th_nr_cand = nr_cand;
1901 pgdat->nbp_threshold = th;
1902 }
1903}
1904
1905bool should_numa_migrate_memory(struct task_struct *p, struct folio *folio,
1906 int src_nid, int dst_cpu)
1907{
1908 struct numa_group *ng = deref_curr_numa_group(p);
1909 int dst_nid = cpu_to_node(dst_cpu);
1910 int last_cpupid, this_cpupid;
1911
1912 /*
1913 * Cannot migrate to memoryless nodes.
1914 */
1915 if (!node_state(dst_nid, N_MEMORY))
1916 return false;
1917
1918 /*
1919 * The pages in slow memory node should be migrated according
1920 * to hot/cold instead of private/shared.
1921 */
1922 if (folio_use_access_time(folio)) {
1923 struct pglist_data *pgdat;
1924 unsigned long rate_limit;
1925 unsigned int latency, th, def_th;
1926
1927 pgdat = NODE_DATA(dst_nid);
1928 if (pgdat_free_space_enough(pgdat)) {
1929 /* workload changed, reset hot threshold */
1930 pgdat->nbp_threshold = 0;
1931 return true;
1932 }
1933
1934 def_th = sysctl_numa_balancing_hot_threshold;
1935 rate_limit = sysctl_numa_balancing_promote_rate_limit << \
1936 (20 - PAGE_SHIFT);
1937 numa_promotion_adjust_threshold(pgdat, rate_limit, def_th);
1938
1939 th = pgdat->nbp_threshold ? : def_th;
1940 latency = numa_hint_fault_latency(folio);
1941 if (latency >= th)
1942 return false;
1943
1944 return !numa_promotion_rate_limit(pgdat, rate_limit,
1945 folio_nr_pages(folio));
1946 }
1947
1948 this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
1949 last_cpupid = folio_xchg_last_cpupid(folio, this_cpupid);
1950
1951 if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) &&
1952 !node_is_toptier(src_nid) && !cpupid_valid(last_cpupid))
1953 return false;
1954
1955 /*
1956 * Allow first faults or private faults to migrate immediately early in
1957 * the lifetime of a task. The magic number 4 is based on waiting for
1958 * two full passes of the "multi-stage node selection" test that is
1959 * executed below.
1960 */
1961 if ((p->numa_preferred_nid == NUMA_NO_NODE || p->numa_scan_seq <= 4) &&
1962 (cpupid_pid_unset(last_cpupid) || cpupid_match_pid(p, last_cpupid)))
1963 return true;
1964
1965 /*
1966 * Multi-stage node selection is used in conjunction with a periodic
1967 * migration fault to build a temporal task<->page relation. By using
1968 * a two-stage filter we remove short/unlikely relations.
1969 *
1970 * Using P(p) ~ n_p / n_t as per frequentist probability, we can equate
1971 * a task's usage of a particular page (n_p) per total usage of this
1972 * page (n_t) (in a given time-span) to a probability.
1973 *
1974 * Our periodic faults will sample this probability and getting the
1975 * same result twice in a row, given these samples are fully
1976 * independent, is then given by P(n)^2, provided our sample period
1977 * is sufficiently short compared to the usage pattern.
1978 *
1979	 * This quadratic squishes small probabilities, making it less likely we
1980 * act on an unlikely task<->page relation.
1981 */
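	/*
	 * Worked example (not part of the original source): if the task
	 * accounts for half of this page's accesses (P = 0.5), observing
	 * the same last CPU/PID twice in a row happens with probability
	 * P^2 = 0.25, while a dominant user (P = 0.9) still passes with
	 * 0.81. Weak task<->page relations are therefore filtered out.
	 */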
1982 if (!cpupid_pid_unset(last_cpupid) &&
1983 cpupid_to_nid(last_cpupid) != dst_nid)
1984 return false;
1985
1986 /* Always allow migrate on private faults */
1987 if (cpupid_match_pid(p, last_cpupid))
1988 return true;
1989
1990 /* A shared fault, but p->numa_group has not been set up yet. */
1991 if (!ng)
1992 return true;
1993
1994 /*
1995 * Destination node is much more heavily used than the source
1996 * node? Allow migration.
1997 */
1998 if (group_faults_cpu(ng, dst_nid) > group_faults_cpu(ng, src_nid) *
1999 ACTIVE_NODE_FRACTION)
2000 return true;
2001
2002 /*
2003 * Distribute memory according to CPU & memory use on each node,
2004 * with 3/4 hysteresis to avoid unnecessary memory migrations:
2005 *
2006 * faults_cpu(dst) 3 faults_cpu(src)
2007 * --------------- * - > ---------------
2008 * faults_mem(dst) 4 faults_mem(src)
2009 */
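	/*
	 * Worked example (not part of the original source): with
	 * faults_cpu(dst) = 60, faults_mem(dst) = 40, faults_cpu(src) = 30
	 * and faults_mem(src) = 40, the cross-multiplied test reads
	 * 60 * 40 * 3 = 7200 > 30 * 40 * 4 = 4800, so migration is allowed.
	 */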
2010 return group_faults_cpu(ng, dst_nid) * group_faults(p, src_nid) * 3 >
2011 group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
2012}
2013
2014/*
2015 * 'numa_type' describes the node at the moment of load balancing.
2016 */
2017enum numa_type {
2018 /* The node has spare capacity that can be used to run more tasks. */
2019 node_has_spare = 0,
2020 /*
2021 * The node is fully used and the tasks don't compete for more CPU
2022 * cycles. Nevertheless, some tasks might wait before running.
2023 */
2024 node_fully_busy,
2025 /*
2026 * The node is overloaded and can't provide expected CPU cycles to all
2027 * tasks.
2028 */
2029 node_overloaded
2030};
2031
2032/* Cached statistics for all CPUs within a node */
2033struct numa_stats {
2034 unsigned long load;
2035 unsigned long runnable;
2036 unsigned long util;
2037 /* Total compute capacity of CPUs on a node */
2038 unsigned long compute_capacity;
2039 unsigned int nr_running;
2040 unsigned int weight;
2041 enum numa_type node_type;
2042 int idle_cpu;
2043};
2044
2045struct task_numa_env {
2046 struct task_struct *p;
2047
2048 int src_cpu, src_nid;
2049 int dst_cpu, dst_nid;
2050 int imb_numa_nr;
2051
2052 struct numa_stats src_stats, dst_stats;
2053
2054 int imbalance_pct;
2055 int dist;
2056
2057 struct task_struct *best_task;
2058 long best_imp;
2059 int best_cpu;
2060};
2061
2062static unsigned long cpu_load(struct rq *rq);
2063static unsigned long cpu_runnable(struct rq *rq);
2064
2065static inline enum
2066numa_type numa_classify(unsigned int imbalance_pct,
2067 struct numa_stats *ns)
2068{
2069 if ((ns->nr_running > ns->weight) &&
2070 (((ns->compute_capacity * 100) < (ns->util * imbalance_pct)) ||
2071 ((ns->compute_capacity * imbalance_pct) < (ns->runnable * 100))))
2072 return node_overloaded;
2073
2074 if ((ns->nr_running < ns->weight) ||
2075 (((ns->compute_capacity * 100) > (ns->util * imbalance_pct)) &&
2076 ((ns->compute_capacity * imbalance_pct) > (ns->runnable * 100))))
2077 return node_has_spare;
2078
2079 return node_fully_busy;
2080}
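/*
 * Illustrative example (not part of the original source): with the
 * imbalance_pct of 112 that task_numa_migrate() typically ends up with,
 * a node is reported overloaded once it runs more tasks than CPUs and
 * either utilization exceeds ~89% of its compute capacity
 * (util * 112 > capacity * 100) or runnable load exceeds ~112% of
 * capacity. Spare capacity requires the opposite: fewer tasks than
 * CPUs, or both metrics comfortably below those bounds.
 */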
2081
2082#ifdef CONFIG_SCHED_SMT
2083/* Forward declarations of select_idle_sibling helpers */
2084static inline bool test_idle_cores(int cpu);
2085static inline int numa_idle_core(int idle_core, int cpu)
2086{
2087 if (!static_branch_likely(&sched_smt_present) ||
2088 idle_core >= 0 || !test_idle_cores(cpu))
2089 return idle_core;
2090
2091 /*
2092 * Prefer cores instead of packing HT siblings
2093 * and triggering future load balancing.
2094 */
2095 if (is_core_idle(cpu))
2096 idle_core = cpu;
2097
2098 return idle_core;
2099}
2100#else
2101static inline int numa_idle_core(int idle_core, int cpu)
2102{
2103 return idle_core;
2104}
2105#endif
2106
2107/*
2108 * Gather all necessary information to make NUMA balancing placement
2109 * decisions that are compatible with the standard load balancer. This
2110 * borrows code and logic from update_sg_lb_stats but sharing a
2111 * common implementation is impractical.
2112 */
2113static void update_numa_stats(struct task_numa_env *env,
2114 struct numa_stats *ns, int nid,
2115 bool find_idle)
2116{
2117 int cpu, idle_core = -1;
2118
2119 memset(ns, 0, sizeof(*ns));
2120 ns->idle_cpu = -1;
2121
2122 rcu_read_lock();
2123 for_each_cpu(cpu, cpumask_of_node(nid)) {
2124 struct rq *rq = cpu_rq(cpu);
2125
2126 ns->load += cpu_load(rq);
2127 ns->runnable += cpu_runnable(rq);
2128 ns->util += cpu_util_cfs(cpu);
2129 ns->nr_running += rq->cfs.h_nr_running;
2130 ns->compute_capacity += capacity_of(cpu);
2131
2132 if (find_idle && idle_core < 0 && !rq->nr_running && idle_cpu(cpu)) {
2133 if (READ_ONCE(rq->numa_migrate_on) ||
2134 !cpumask_test_cpu(cpu, env->p->cpus_ptr))
2135 continue;
2136
2137 if (ns->idle_cpu == -1)
2138 ns->idle_cpu = cpu;
2139
2140 idle_core = numa_idle_core(idle_core, cpu);
2141 }
2142 }
2143 rcu_read_unlock();
2144
2145 ns->weight = cpumask_weight(cpumask_of_node(nid));
2146
2147 ns->node_type = numa_classify(env->imbalance_pct, ns);
2148
2149 if (idle_core >= 0)
2150 ns->idle_cpu = idle_core;
2151}
2152
2153static void task_numa_assign(struct task_numa_env *env,
2154 struct task_struct *p, long imp)
2155{
2156 struct rq *rq = cpu_rq(env->dst_cpu);
2157
2158	/* Check if the run-queue is part of an active NUMA balance. */
2159 if (env->best_cpu != env->dst_cpu && xchg(&rq->numa_migrate_on, 1)) {
2160 int cpu;
2161 int start = env->dst_cpu;
2162
2163 /* Find alternative idle CPU. */
2164 for_each_cpu_wrap(cpu, cpumask_of_node(env->dst_nid), start + 1) {
2165 if (cpu == env->best_cpu || !idle_cpu(cpu) ||
2166 !cpumask_test_cpu(cpu, env->p->cpus_ptr)) {
2167 continue;
2168 }
2169
2170 env->dst_cpu = cpu;
2171 rq = cpu_rq(env->dst_cpu);
2172 if (!xchg(&rq->numa_migrate_on, 1))
2173 goto assign;
2174 }
2175
2176 /* Failed to find an alternative idle CPU */
2177 return;
2178 }
2179
2180assign:
2181 /*
2182	 * Clear the previous best_cpu/rq numa-migrate flag, since the task
2183	 * has now found a better CPU to move/swap to.
2184 */
2185 if (env->best_cpu != -1 && env->best_cpu != env->dst_cpu) {
2186 rq = cpu_rq(env->best_cpu);
2187 WRITE_ONCE(rq->numa_migrate_on, 0);
2188 }
2189
2190 if (env->best_task)
2191 put_task_struct(env->best_task);
2192 if (p)
2193 get_task_struct(p);
2194
2195 env->best_task = p;
2196 env->best_imp = imp;
2197 env->best_cpu = env->dst_cpu;
2198}
2199
2200static bool load_too_imbalanced(long src_load, long dst_load,
2201 struct task_numa_env *env)
2202{
2203 long imb, old_imb;
2204 long orig_src_load, orig_dst_load;
2205 long src_capacity, dst_capacity;
2206
2207 /*
2208 * The load is corrected for the CPU capacity available on each node.
2209 *
2210 * src_load dst_load
2211 * ------------ vs ---------
2212 * src_capacity dst_capacity
2213 */
2214 src_capacity = env->src_stats.compute_capacity;
2215 dst_capacity = env->dst_stats.compute_capacity;
2216
2217 imb = abs(dst_load * src_capacity - src_load * dst_capacity);
2218
2219 orig_src_load = env->src_stats.load;
2220 orig_dst_load = env->dst_stats.load;
2221
2222 old_imb = abs(orig_dst_load * src_capacity - orig_src_load * dst_capacity);
2223
2224 /* Would this change make things worse? */
2225 return (imb > old_imb);
2226}
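/*
 * Worked example (not part of the original source): with equal
 * capacities of 1024 on both nodes, orig_src_load = 600 and
 * orig_dst_load = 200 give old_imb = |200 - 600| * 1024. Moving 100
 * units of load yields src_load = 500, dst_load = 300 and
 * imb = |300 - 500| * 1024, which is smaller, so the move is not
 * considered too imbalanced.
 */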
2227
2228/*
2229 * Maximum NUMA importance can be 1998 (2*999);
2230 * SMALLIMP @ 30 would be close to 1998/64.
2231 * Used to deter task migration.
2232 */
2233#define SMALLIMP 30
2234
2235/*
2236 * This checks if the overall compute and NUMA accesses of the system would
2237 * be improved if the source task was migrated to the target dst_cpu, taking
2238 * into account that it might be best if the task running on the dst_cpu is
2239 * exchanged with the source task.
2240 */
2241static bool task_numa_compare(struct task_numa_env *env,
2242 long taskimp, long groupimp, bool maymove)
2243{
2244 struct numa_group *cur_ng, *p_ng = deref_curr_numa_group(env->p);
2245 struct rq *dst_rq = cpu_rq(env->dst_cpu);
2246 long imp = p_ng ? groupimp : taskimp;
2247 struct task_struct *cur;
2248 long src_load, dst_load;
2249 int dist = env->dist;
2250 long moveimp = imp;
2251 long load;
2252 bool stopsearch = false;
2253
2254 if (READ_ONCE(dst_rq->numa_migrate_on))
2255 return false;
2256
2257 rcu_read_lock();
2258 cur = rcu_dereference(dst_rq->curr);
2259 if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur)))
2260 cur = NULL;
2261
2262 /*
2263	 * Because we have preemption enabled we can get migrated around and
2264	 * end up trying to select ourselves (current == env->p) as a swap candidate.
2265 */
2266 if (cur == env->p) {
2267 stopsearch = true;
2268 goto unlock;
2269 }
2270
2271 if (!cur) {
2272 if (maymove && moveimp >= env->best_imp)
2273 goto assign;
2274 else
2275 goto unlock;
2276 }
2277
2278	/* Skip this swap candidate if it cannot move to the source CPU. */
2279 if (!cpumask_test_cpu(env->src_cpu, cur->cpus_ptr))
2280 goto unlock;
2281
2282 /*
2283 * Skip this swap candidate if it is not moving to its preferred
2284 * node and the best task is.
2285 */
2286 if (env->best_task &&
2287 env->best_task->numa_preferred_nid == env->src_nid &&
2288 cur->numa_preferred_nid != env->src_nid) {
2289 goto unlock;
2290 }
2291
2292 /*
2293 * "imp" is the fault differential for the source task between the
2294 * source and destination node. Calculate the total differential for
2295	 * the source task and potential destination task. The more negative
2296	 * the value is, the more remote accesses would be expected to be
2297	 * incurred if the tasks were swapped.
2298 *
2299 * If dst and source tasks are in the same NUMA group, or not
2300 * in any group then look only at task weights.
2301 */
2302 cur_ng = rcu_dereference(cur->numa_group);
2303 if (cur_ng == p_ng) {
2304 /*
2305 * Do not swap within a group or between tasks that have
2306 * no group if there is spare capacity. Swapping does
2307 * not address the load imbalance and helps one task at
2308 * the cost of punishing another.
2309 */
2310 if (env->dst_stats.node_type == node_has_spare)
2311 goto unlock;
2312
2313 imp = taskimp + task_weight(cur, env->src_nid, dist) -
2314 task_weight(cur, env->dst_nid, dist);
2315 /*
2316 * Add some hysteresis to prevent swapping the
2317 * tasks within a group over tiny differences.
2318 */
2319 if (cur_ng)
2320 imp -= imp / 16;
2321 } else {
2322 /*
2323 * Compare the group weights. If a task is all by itself
2324 * (not part of a group), use the task weight instead.
2325 */
2326 if (cur_ng && p_ng)
2327 imp += group_weight(cur, env->src_nid, dist) -
2328 group_weight(cur, env->dst_nid, dist);
2329 else
2330 imp += task_weight(cur, env->src_nid, dist) -
2331 task_weight(cur, env->dst_nid, dist);
2332 }
2333
2334 /* Discourage picking a task already on its preferred node */
2335 if (cur->numa_preferred_nid == env->dst_nid)
2336 imp -= imp / 16;
2337
2338 /*
2339 * Encourage picking a task that moves to its preferred node.
2340	 * This potentially makes imp larger than its maximum of
2341 * 1998 (see SMALLIMP and task_weight for why) but in this
2342 * case, it does not matter.
2343 */
2344 if (cur->numa_preferred_nid == env->src_nid)
2345 imp += imp / 8;
2346
2347 if (maymove && moveimp > imp && moveimp > env->best_imp) {
2348 imp = moveimp;
2349 cur = NULL;
2350 goto assign;
2351 }
2352
2353 /*
2354 * Prefer swapping with a task moving to its preferred node over a
2355 * task that is not.
2356 */
2357 if (env->best_task && cur->numa_preferred_nid == env->src_nid &&
2358 env->best_task->numa_preferred_nid != env->src_nid) {
2359 goto assign;
2360 }
2361
2362 /*
2363 * If the NUMA importance is less than SMALLIMP,
2364 * task migration might only result in ping pong
2365 * of tasks and also hurt performance due to cache
2366 * misses.
2367 */
2368 if (imp < SMALLIMP || imp <= env->best_imp + SMALLIMP / 2)
2369 goto unlock;
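	/*
	 * Worked example (not part of the original source): with
	 * env->best_imp = 100, a new swap candidate is only considered
	 * if its importance is at least SMALLIMP (30) and strictly
	 * greater than 100 + 15 = 115.
	 */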
2370
2371 /*
2372 * In the overloaded case, try and keep the load balanced.
2373 */
2374 load = task_h_load(env->p) - task_h_load(cur);
2375 if (!load)
2376 goto assign;
2377
2378 dst_load = env->dst_stats.load + load;
2379 src_load = env->src_stats.load - load;
2380
2381 if (load_too_imbalanced(src_load, dst_load, env))
2382 goto unlock;
2383
2384assign:
2385 /* Evaluate an idle CPU for a task numa move. */
2386 if (!cur) {
2387 int cpu = env->dst_stats.idle_cpu;
2388
2389 /* Nothing cached so current CPU went idle since the search. */
2390 if (cpu < 0)
2391 cpu = env->dst_cpu;
2392
2393 /*
2394 * If the CPU is no longer truly idle and the previous best CPU
2395 * is, keep using it.
2396 */
2397 if (!idle_cpu(cpu) && env->best_cpu >= 0 &&
2398 idle_cpu(env->best_cpu)) {
2399 cpu = env->best_cpu;
2400 }
2401
2402 env->dst_cpu = cpu;
2403 }
2404
2405 task_numa_assign(env, cur, imp);
2406
2407	 * If a move to idle is allowed because there is capacity or the load
2408	 * balance improves, then stop the search. While a better swap
2409	 * candidate may exist, a search is not free.
2410 * candidate may exist, a search is not free.
2411 */
2412 if (maymove && !cur && env->best_cpu >= 0 && idle_cpu(env->best_cpu))
2413 stopsearch = true;
2414
2415 /*
2416 * If a swap candidate must be identified and the current best task
2417	 * moves to its preferred node, then stop the search.
2418 */
2419 if (!maymove && env->best_task &&
2420 env->best_task->numa_preferred_nid == env->src_nid) {
2421 stopsearch = true;
2422 }
2423unlock:
2424 rcu_read_unlock();
2425
2426 return stopsearch;
2427}
2428
2429static void task_numa_find_cpu(struct task_numa_env *env,
2430 long taskimp, long groupimp)
2431{
2432 bool maymove = false;
2433 int cpu;
2434
2435 /*
2436 * If dst node has spare capacity, then check if there is an
2437 * imbalance that would be overruled by the load balancer.
2438 */
2439 if (env->dst_stats.node_type == node_has_spare) {
2440 unsigned int imbalance;
2441 int src_running, dst_running;
2442
2443 /*
2444 * Would movement cause an imbalance? Note that if src has
2445		 * more running tasks, the imbalance is ignored as the
2446		 * move improves the imbalance from the perspective of the
2447		 * CPU load balancer.
2448		 */
2449 src_running = env->src_stats.nr_running - 1;
2450 dst_running = env->dst_stats.nr_running + 1;
2451 imbalance = max(0, dst_running - src_running);
2452 imbalance = adjust_numa_imbalance(imbalance, dst_running,
2453 env->imb_numa_nr);
2454
2455 /* Use idle CPU if there is no imbalance */
2456 if (!imbalance) {
2457 maymove = true;
2458 if (env->dst_stats.idle_cpu >= 0) {
2459 env->dst_cpu = env->dst_stats.idle_cpu;
2460 task_numa_assign(env, NULL, 0);
2461 return;
2462 }
2463 }
2464 } else {
2465 long src_load, dst_load, load;
2466 /*
2467		 * If the improvement from just moving env->p to the dst node is better
2468 * than swapping tasks around, check if a move is possible.
2469 */
2470 load = task_h_load(env->p);
2471 dst_load = env->dst_stats.load + load;
2472 src_load = env->src_stats.load - load;
2473 maymove = !load_too_imbalanced(src_load, dst_load, env);
2474 }
2475
2476 for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
2477 /* Skip this CPU if the source task cannot migrate */
2478 if (!cpumask_test_cpu(cpu, env->p->cpus_ptr))
2479 continue;
2480
2481 env->dst_cpu = cpu;
2482 if (task_numa_compare(env, taskimp, groupimp, maymove))
2483 break;
2484 }
2485}
2486
2487static int task_numa_migrate(struct task_struct *p)
2488{
2489 struct task_numa_env env = {
2490 .p = p,
2491
2492 .src_cpu = task_cpu(p),
2493 .src_nid = task_node(p),
2494
2495 .imbalance_pct = 112,
2496
2497 .best_task = NULL,
2498 .best_imp = 0,
2499 .best_cpu = -1,
2500 };
2501 unsigned long taskweight, groupweight;
2502 struct sched_domain *sd;
2503 long taskimp, groupimp;
2504 struct numa_group *ng;
2505 struct rq *best_rq;
2506 int nid, ret, dist;
2507
2508 /*
2509 * Pick the lowest SD_NUMA domain, as that would have the smallest
2510 * imbalance and would be the first to start moving tasks about.
2511 *
2512 * And we want to avoid any moving of tasks about, as that would create
2513	 * random movement of tasks -- counter to the numa conditions we're trying
2514 * to satisfy here.
2515 */
2516 rcu_read_lock();
2517 sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
2518 if (sd) {
2519 env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
2520 env.imb_numa_nr = sd->imb_numa_nr;
2521 }
2522 rcu_read_unlock();
2523
2524 /*
2525 * Cpusets can break the scheduler domain tree into smaller
2526 * balance domains, some of which do not cross NUMA boundaries.
2527 * Tasks that are "trapped" in such domains cannot be migrated
2528 * elsewhere, so there is no point in (re)trying.
2529 */
2530 if (unlikely(!sd)) {
2531 sched_setnuma(p, task_node(p));
2532 return -EINVAL;
2533 }
2534
2535 env.dst_nid = p->numa_preferred_nid;
2536 dist = env.dist = node_distance(env.src_nid, env.dst_nid);
2537 taskweight = task_weight(p, env.src_nid, dist);
2538 groupweight = group_weight(p, env.src_nid, dist);
2539 update_numa_stats(&env, &env.src_stats, env.src_nid, false);
2540 taskimp = task_weight(p, env.dst_nid, dist) - taskweight;
2541 groupimp = group_weight(p, env.dst_nid, dist) - groupweight;
2542 update_numa_stats(&env, &env.dst_stats, env.dst_nid, true);
2543
2544 /* Try to find a spot on the preferred nid. */
2545 task_numa_find_cpu(&env, taskimp, groupimp);
2546
2547 /*
2548 * Look at other nodes in these cases:
2549 * - there is no space available on the preferred_nid
2550 * - the task is part of a numa_group that is interleaved across
2551 * multiple NUMA nodes; in order to better consolidate the group,
2552 * we need to check other locations.
2553 */
2554 ng = deref_curr_numa_group(p);
2555 if (env.best_cpu == -1 || (ng && ng->active_nodes > 1)) {
2556 for_each_node_state(nid, N_CPU) {
2557 if (nid == env.src_nid || nid == p->numa_preferred_nid)
2558 continue;
2559
2560 dist = node_distance(env.src_nid, env.dst_nid);
2561 if (sched_numa_topology_type == NUMA_BACKPLANE &&
2562 dist != env.dist) {
2563 taskweight = task_weight(p, env.src_nid, dist);
2564 groupweight = group_weight(p, env.src_nid, dist);
2565 }
2566
2567 /* Only consider nodes where both task and groups benefit */
2568 taskimp = task_weight(p, nid, dist) - taskweight;
2569 groupimp = group_weight(p, nid, dist) - groupweight;
2570 if (taskimp < 0 && groupimp < 0)
2571 continue;
2572
2573 env.dist = dist;
2574 env.dst_nid = nid;
2575 update_numa_stats(&env, &env.dst_stats, env.dst_nid, true);
2576 task_numa_find_cpu(&env, taskimp, groupimp);
2577 }
2578 }
2579
2580 /*
2581 * If the task is part of a workload that spans multiple NUMA nodes,
2582 * and is migrating into one of the workload's active nodes, remember
2583 * this node as the task's preferred numa node, so the workload can
2584 * settle down.
2585 * A task that migrated to a second choice node will be better off
2586 * trying for a better one later. Do not set the preferred node here.
2587 */
2588 if (ng) {
2589 if (env.best_cpu == -1)
2590 nid = env.src_nid;
2591 else
2592 nid = cpu_to_node(env.best_cpu);
2593
2594 if (nid != p->numa_preferred_nid)
2595 sched_setnuma(p, nid);
2596 }
2597
2598 /* No better CPU than the current one was found. */
2599 if (env.best_cpu == -1) {
2600 trace_sched_stick_numa(p, env.src_cpu, NULL, -1);
2601 return -EAGAIN;
2602 }
2603
2604 best_rq = cpu_rq(env.best_cpu);
2605 if (env.best_task == NULL) {
2606 ret = migrate_task_to(p, env.best_cpu);
2607 WRITE_ONCE(best_rq->numa_migrate_on, 0);
2608 if (ret != 0)
2609 trace_sched_stick_numa(p, env.src_cpu, NULL, env.best_cpu);
2610 return ret;
2611 }
2612
2613 ret = migrate_swap(p, env.best_task, env.best_cpu, env.src_cpu);
2614 WRITE_ONCE(best_rq->numa_migrate_on, 0);
2615
2616 if (ret != 0)
2617 trace_sched_stick_numa(p, env.src_cpu, env.best_task, env.best_cpu);
2618 put_task_struct(env.best_task);
2619 return ret;
2620}
2621
2622/* Attempt to migrate a task to a CPU on the preferred node. */
2623static void numa_migrate_preferred(struct task_struct *p)
2624{
2625 unsigned long interval = HZ;
2626
2627 /* This task has no NUMA fault statistics yet */
2628 if (unlikely(p->numa_preferred_nid == NUMA_NO_NODE || !p->numa_faults))
2629 return;
2630
2631 /* Periodically retry migrating the task to the preferred node */
2632 interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16);
2633 p->numa_migrate_retry = jiffies + interval;
2634
2635 /* Success if task is already running on preferred CPU */
2636 if (task_node(p) == p->numa_preferred_nid)
2637 return;
2638
2639 /* Otherwise, try migrate to a CPU on the preferred node */
2640 task_numa_migrate(p);
2641}
2642
2643/*
2644 * Find out how many nodes the workload is actively running on. Do this by
2645 * tracking the nodes from which NUMA hinting faults are triggered. This can
2646 * be different from the set of nodes where the workload's memory is currently
2647 * located.
2648 */
2649static void numa_group_count_active_nodes(struct numa_group *numa_group)
2650{
2651 unsigned long faults, max_faults = 0;
2652 int nid, active_nodes = 0;
2653
2654 for_each_node_state(nid, N_CPU) {
2655 faults = group_faults_cpu(numa_group, nid);
2656 if (faults > max_faults)
2657 max_faults = faults;
2658 }
2659
2660 for_each_node_state(nid, N_CPU) {
2661 faults = group_faults_cpu(numa_group, nid);
2662 if (faults * ACTIVE_NODE_FRACTION > max_faults)
2663 active_nodes++;
2664 }
2665
2666 numa_group->max_faults_cpu = max_faults;
2667 numa_group->active_nodes = active_nodes;
2668}
2669
2670/*
2671 * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS
2672 * increments. The more local the fault statistics are, the higher the scan
2673 * period will be for the next scan window. If the local/(local+remote) ratio is
2674 * below NUMA_PERIOD_THRESHOLD (the ratio ranges over 1..NUMA_PERIOD_SLOTS),
2675 * the scan period will decrease. Aim for 70% local accesses.
2676 */
2677#define NUMA_PERIOD_SLOTS 10
2678#define NUMA_PERIOD_THRESHOLD 7
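/*
 * Worked example (not part of the original source): with a scan period
 * of 1000ms, period_slot = 100ms. If 60% of faults are local and only
 * 30% are private, both lr_ratio (6) and ps_ratio (3) fall below the
 * threshold of 7, so the period shrinks by (7 - 6) * 100ms to 900ms,
 * subject to the task_scan_min()/task_scan_max() clamp.
 */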
2679
2680/*
2681 * Increase the scan period (slow down scanning) if the majority of
2682 * our memory is already on our local node, or if the majority of
2683 * the page accesses are shared with other processes.
2684 * Otherwise, decrease the scan period.
2685 */
2686static void update_task_scan_period(struct task_struct *p,
2687 unsigned long shared, unsigned long private)
2688{
2689 unsigned int period_slot;
2690 int lr_ratio, ps_ratio;
2691 int diff;
2692
2693 unsigned long remote = p->numa_faults_locality[0];
2694 unsigned long local = p->numa_faults_locality[1];
2695
2696 /*
2697	 * If there were no recorded hinting faults then either the task is
2698	 * completely idle or all activity is in areas that are not of interest
2699	 * to automatic numa balancing. Related to that, if there were failed
2700	 * migrations, it implies we are migrating too quickly or the local
2701	 * node is overloaded. In either case, scan slower.
2702 */
2703 if (local + shared == 0 || p->numa_faults_locality[2]) {
2704 p->numa_scan_period = min(p->numa_scan_period_max,
2705 p->numa_scan_period << 1);
2706
2707 p->mm->numa_next_scan = jiffies +
2708 msecs_to_jiffies(p->numa_scan_period);
2709
2710 return;
2711 }
2712
2713 /*
2714 * Prepare to scale scan period relative to the current period.
2715 * == NUMA_PERIOD_THRESHOLD scan period stays the same
2716 * < NUMA_PERIOD_THRESHOLD scan period decreases (scan faster)
2717 * >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower)
2718 */
2719 period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS);
2720 lr_ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
2721 ps_ratio = (private * NUMA_PERIOD_SLOTS) / (private + shared);
2722
2723 if (ps_ratio >= NUMA_PERIOD_THRESHOLD) {
2724 /*
2725 * Most memory accesses are local. There is no need to
2726 * do fast NUMA scanning, since memory is already local.
2727 */
2728 int slot = ps_ratio - NUMA_PERIOD_THRESHOLD;
2729 if (!slot)
2730 slot = 1;
2731 diff = slot * period_slot;
2732 } else if (lr_ratio >= NUMA_PERIOD_THRESHOLD) {
2733 /*
2734 * Most memory accesses are shared with other tasks.
2735 * There is no point in continuing fast NUMA scanning,
2736 * since other tasks may just move the memory elsewhere.
2737 */
2738 int slot = lr_ratio - NUMA_PERIOD_THRESHOLD;
2739 if (!slot)
2740 slot = 1;
2741 diff = slot * period_slot;
2742 } else {
2743 /*
2744 * Private memory faults exceed (SLOTS-THRESHOLD)/SLOTS,
2745 * yet they are not on the local NUMA node. Speed up
2746 * NUMA scanning to get the memory moved over.
2747 */
2748 int ratio = max(lr_ratio, ps_ratio);
2749 diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
2750 }
2751
2752 p->numa_scan_period = clamp(p->numa_scan_period + diff,
2753 task_scan_min(p), task_scan_max(p));
2754 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
2755}
2756
2757/*
2758 * Get the fraction of time the task has been running since the last
2759 * NUMA placement cycle. The scheduler keeps similar statistics, but
2760 * decays those on a 32ms period, which is orders of magnitude off
2761 * from the dozens-of-seconds NUMA balancing period. Use the scheduler
2762 * stats only if the task is so new there are no NUMA statistics yet.
2763 */
2764static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
2765{
2766 u64 runtime, delta, now;
2767 /* Use the start of this time slice to avoid calculations. */
2768 now = p->se.exec_start;
2769 runtime = p->se.sum_exec_runtime;
2770
2771 if (p->last_task_numa_placement) {
2772 delta = runtime - p->last_sum_exec_runtime;
2773 *period = now - p->last_task_numa_placement;
2774
2775 /* Avoid time going backwards, prevent potential divide error: */
2776 if (unlikely((s64)*period < 0))
2777 *period = 0;
2778 } else {
2779 delta = p->se.avg.load_sum;
2780 *period = LOAD_AVG_MAX;
2781 }
2782
2783 p->last_sum_exec_runtime = runtime;
2784 p->last_task_numa_placement = now;
2785
2786 return delta;
2787}
2788
2789/*
2790 * Determine the preferred nid for a task in a numa_group. This needs to
2791 * be done in a way that produces consistent results with group_weight,
2792 * otherwise workloads might not converge.
2793 */
2794static int preferred_group_nid(struct task_struct *p, int nid)
2795{
2796 nodemask_t nodes;
2797 int dist;
2798
2799 /* Direct connections between all NUMA nodes. */
2800 if (sched_numa_topology_type == NUMA_DIRECT)
2801 return nid;
2802
2803 /*
2804 * On a system with glueless mesh NUMA topology, group_weight
2805 * scores nodes according to the number of NUMA hinting faults on
2806 * both the node itself, and on nearby nodes.
2807 */
2808 if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
2809 unsigned long score, max_score = 0;
2810 int node, max_node = nid;
2811
2812 dist = sched_max_numa_distance;
2813
2814 for_each_node_state(node, N_CPU) {
2815 score = group_weight(p, node, dist);
2816 if (score > max_score) {
2817 max_score = score;
2818 max_node = node;
2819 }
2820 }
2821 return max_node;
2822 }
2823
2824 /*
2825 * Finding the preferred nid in a system with NUMA backplane
2826 * interconnect topology is more involved. The goal is to locate
2827 * tasks from numa_groups near each other in the system, and
2828 * untangle workloads from different sides of the system. This requires
2829 * searching down the hierarchy of node groups, recursively searching
2830 * inside the highest scoring group of nodes. The nodemask tricks
2831 * keep the complexity of the search down.
2832 */
2833 nodes = node_states[N_CPU];
2834 for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) {
2835 unsigned long max_faults = 0;
2836 nodemask_t max_group = NODE_MASK_NONE;
2837 int a, b;
2838
2839 /* Are there nodes at this distance from each other? */
2840 if (!find_numa_distance(dist))
2841 continue;
2842
2843 for_each_node_mask(a, nodes) {
2844 unsigned long faults = 0;
2845 nodemask_t this_group;
2846 nodes_clear(this_group);
2847
2848 /* Sum group's NUMA faults; includes a==b case. */
2849 for_each_node_mask(b, nodes) {
2850 if (node_distance(a, b) < dist) {
2851 faults += group_faults(p, b);
2852 node_set(b, this_group);
2853 node_clear(b, nodes);
2854 }
2855 }
2856
2857 /* Remember the top group. */
2858 if (faults > max_faults) {
2859 max_faults = faults;
2860 max_group = this_group;
2861 /*
2862 * subtle: at the smallest distance there is
2863 * just one node left in each "group", the
2864 * winner is the preferred nid.
2865 */
2866 nid = a;
2867 }
2868 }
2869 /* Next round, evaluate the nodes within max_group. */
2870 if (!max_faults)
2871 break;
2872 nodes = max_group;
2873 }
2874 return nid;
2875}
2876
2877static void task_numa_placement(struct task_struct *p)
2878{
2879 int seq, nid, max_nid = NUMA_NO_NODE;
2880 unsigned long max_faults = 0;
2881 unsigned long fault_types[2] = { 0, 0 };
2882 unsigned long total_faults;
2883 u64 runtime, period;
2884 spinlock_t *group_lock = NULL;
2885 struct numa_group *ng;
2886
2887 /*
2888 * The p->mm->numa_scan_seq field gets updated without
2889 * exclusive access. Use READ_ONCE() here to ensure
2890 * that the field is read in a single access:
2891 */
2892 seq = READ_ONCE(p->mm->numa_scan_seq);
2893 if (p->numa_scan_seq == seq)
2894 return;
2895 p->numa_scan_seq = seq;
2896 p->numa_scan_period_max = task_scan_max(p);
2897
2898 total_faults = p->numa_faults_locality[0] +
2899 p->numa_faults_locality[1];
2900 runtime = numa_get_avg_runtime(p, &period);
2901
2902 /* If the task is part of a group prevent parallel updates to group stats */
2903 ng = deref_curr_numa_group(p);
2904 if (ng) {
2905 group_lock = &ng->lock;
2906 spin_lock_irq(group_lock);
2907 }
2908
2909 /* Find the node with the highest number of faults */
2910 for_each_online_node(nid) {
2911 /* Keep track of the offsets in numa_faults array */
2912 int mem_idx, membuf_idx, cpu_idx, cpubuf_idx;
2913 unsigned long faults = 0, group_faults = 0;
2914 int priv;
2915
2916 for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) {
2917 long diff, f_diff, f_weight;
2918
2919 mem_idx = task_faults_idx(NUMA_MEM, nid, priv);
2920 membuf_idx = task_faults_idx(NUMA_MEMBUF, nid, priv);
2921 cpu_idx = task_faults_idx(NUMA_CPU, nid, priv);
2922 cpubuf_idx = task_faults_idx(NUMA_CPUBUF, nid, priv);
2923
2924 /* Decay existing window, copy faults since last scan */
2925 diff = p->numa_faults[membuf_idx] - p->numa_faults[mem_idx] / 2;
2926 fault_types[priv] += p->numa_faults[membuf_idx];
2927 p->numa_faults[membuf_idx] = 0;
2928
2929 /*
2930 * Normalize the faults_from, so all tasks in a group
2931 * count according to CPU use, instead of by the raw
2932 * number of faults. Tasks with little runtime have
2933 * little over-all impact on throughput, and thus their
2934 * faults are less important.
2935 */
2936 f_weight = div64_u64(runtime << 16, period + 1);
2937 f_weight = (f_weight * p->numa_faults[cpubuf_idx]) /
2938 (total_faults + 1);
2939 f_diff = f_weight - p->numa_faults[cpu_idx] / 2;
2940 p->numa_faults[cpubuf_idx] = 0;
2941
2942 p->numa_faults[mem_idx] += diff;
2943 p->numa_faults[cpu_idx] += f_diff;
2944 faults += p->numa_faults[mem_idx];
2945 p->total_numa_faults += diff;
2946 if (ng) {
2947 /*
2948 * safe because we can only change our own group
2949 *
2950 * mem_idx represents the offset for a given
2951 * nid and priv in a specific region because it
2952 * is at the beginning of the numa_faults array.
2953 */
2954 ng->faults[mem_idx] += diff;
2955 ng->faults[cpu_idx] += f_diff;
2956 ng->total_faults += diff;
2957 group_faults += ng->faults[mem_idx];
2958 }
2959 }
2960
2961 if (!ng) {
2962 if (faults > max_faults) {
2963 max_faults = faults;
2964 max_nid = nid;
2965 }
2966 } else if (group_faults > max_faults) {
2967 max_faults = group_faults;
2968 max_nid = nid;
2969 }
2970 }
2971
2972 /* Cannot migrate task to CPU-less node */
2973 max_nid = numa_nearest_node(max_nid, N_CPU);
2974
2975 if (ng) {
2976 numa_group_count_active_nodes(ng);
2977 spin_unlock_irq(group_lock);
2978 max_nid = preferred_group_nid(p, max_nid);
2979 }
2980
2981 if (max_faults) {
2982 /* Set the new preferred node */
2983 if (max_nid != p->numa_preferred_nid)
2984 sched_setnuma(p, max_nid);
2985 }
2986
2987 update_task_scan_period(p, fault_types[0], fault_types[1]);
2988}
2989
2990static inline int get_numa_group(struct numa_group *grp)
2991{
2992 return refcount_inc_not_zero(&grp->refcount);
2993}
2994
2995static inline void put_numa_group(struct numa_group *grp)
2996{
2997 if (refcount_dec_and_test(&grp->refcount))
2998 kfree_rcu(grp, rcu);
2999}
3000
3001static void task_numa_group(struct task_struct *p, int cpupid, int flags,
3002 int *priv)
3003{
3004 struct numa_group *grp, *my_grp;
3005 struct task_struct *tsk;
3006 bool join = false;
3007 int cpu = cpupid_to_cpu(cpupid);
3008 int i;
3009
3010 if (unlikely(!deref_curr_numa_group(p))) {
3011 unsigned int size = sizeof(struct numa_group) +
3012 NR_NUMA_HINT_FAULT_STATS *
3013 nr_node_ids * sizeof(unsigned long);
3014
3015 grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
3016 if (!grp)
3017 return;
3018
3019 refcount_set(&grp->refcount, 1);
3020 grp->active_nodes = 1;
3021 grp->max_faults_cpu = 0;
3022 spin_lock_init(&grp->lock);
3023 grp->gid = p->pid;
3024
3025 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
3026 grp->faults[i] = p->numa_faults[i];
3027
3028 grp->total_faults = p->total_numa_faults;
3029
3030 grp->nr_tasks++;
3031 rcu_assign_pointer(p->numa_group, grp);
3032 }
3033
3034 rcu_read_lock();
3035 tsk = READ_ONCE(cpu_rq(cpu)->curr);
3036
3037 if (!cpupid_match_pid(tsk, cpupid))
3038 goto no_join;
3039
3040 grp = rcu_dereference(tsk->numa_group);
3041 if (!grp)
3042 goto no_join;
3043
3044 my_grp = deref_curr_numa_group(p);
3045 if (grp == my_grp)
3046 goto no_join;
3047
3048 /*
3049	 * Only join the other group if it's bigger; if we're the bigger group,
3050 * the other task will join us.
3051 */
3052 if (my_grp->nr_tasks > grp->nr_tasks)
3053 goto no_join;
3054
3055 /*
3056 * Tie-break on the grp address.
3057 */
3058 if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp)
3059 goto no_join;
3060
3061 /* Always join threads in the same process. */
3062 if (tsk->mm == current->mm)
3063 join = true;
3064
3065 /* Simple filter to avoid false positives due to PID collisions */
3066 if (flags & TNF_SHARED)
3067 join = true;
3068
3069 /* Update priv based on whether false sharing was detected */
3070 *priv = !join;
3071
3072 if (join && !get_numa_group(grp))
3073 goto no_join;
3074
3075 rcu_read_unlock();
3076
3077 if (!join)
3078 return;
3079
3080 WARN_ON_ONCE(irqs_disabled());
3081 double_lock_irq(&my_grp->lock, &grp->lock);
3082
3083 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {
3084 my_grp->faults[i] -= p->numa_faults[i];
3085 grp->faults[i] += p->numa_faults[i];
3086 }
3087 my_grp->total_faults -= p->total_numa_faults;
3088 grp->total_faults += p->total_numa_faults;
3089
3090 my_grp->nr_tasks--;
3091 grp->nr_tasks++;
3092
3093 spin_unlock(&my_grp->lock);
3094 spin_unlock_irq(&grp->lock);
3095
3096 rcu_assign_pointer(p->numa_group, grp);
3097
3098 put_numa_group(my_grp);
3099 return;
3100
3101no_join:
3102 rcu_read_unlock();
3103 return;
3104}
3105
3106/*
3107 * Get rid of NUMA statistics associated with a task (either current or dead).
3108 * If @final is set, the task is dead and has reached refcount zero, so we can
3109 * safely free all relevant data structures. Otherwise, there might be
3110 * concurrent reads from places like load balancing and procfs, and we should
3111 * reset the data back to default state without freeing ->numa_faults.
3112 */
3113void task_numa_free(struct task_struct *p, bool final)
3114{
3115 /* safe: p either is current or is being freed by current */
3116 struct numa_group *grp = rcu_dereference_raw(p->numa_group);
3117 unsigned long *numa_faults = p->numa_faults;
3118 unsigned long flags;
3119 int i;
3120
3121 if (!numa_faults)
3122 return;
3123
3124 if (grp) {
3125 spin_lock_irqsave(&grp->lock, flags);
3126 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
3127 grp->faults[i] -= p->numa_faults[i];
3128 grp->total_faults -= p->total_numa_faults;
3129
3130 grp->nr_tasks--;
3131 spin_unlock_irqrestore(&grp->lock, flags);
3132 RCU_INIT_POINTER(p->numa_group, NULL);
3133 put_numa_group(grp);
3134 }
3135
3136 if (final) {
3137 p->numa_faults = NULL;
3138 kfree(numa_faults);
3139 } else {
3140 p->total_numa_faults = 0;
3141 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
3142 numa_faults[i] = 0;
3143 }
3144}
3145
3146/*
3147 * Got a PROT_NONE fault for a page on @node.
3148 */
3149void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
3150{
3151 struct task_struct *p = current;
3152 bool migrated = flags & TNF_MIGRATED;
3153 int cpu_node = task_node(current);
3154 int local = !!(flags & TNF_FAULT_LOCAL);
3155 struct numa_group *ng;
3156 int priv;
3157
3158 if (!static_branch_likely(&sched_numa_balancing))
3159 return;
3160
3161 /* for example, ksmd faulting in a user's mm */
3162 if (!p->mm)
3163 return;
3164
3165 /*
3166	 * NUMA fault statistics are unnecessary for the slow memory
3167	 * node in memory tiering mode.
3168 */
3169 if (!node_is_toptier(mem_node) &&
3170 (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING ||
3171 !cpupid_valid(last_cpupid)))
3172 return;
3173
3174 /* Allocate buffer to track faults on a per-node basis */
3175 if (unlikely(!p->numa_faults)) {
3176 int size = sizeof(*p->numa_faults) *
3177 NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids;
3178
3179 p->numa_faults = kzalloc(size, GFP_KERNEL|__GFP_NOWARN);
3180 if (!p->numa_faults)
3181 return;
3182
3183 p->total_numa_faults = 0;
3184 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
3185 }
3186
3187 /*
3188	 * First accesses are treated as private; otherwise, consider accesses
3189	 * to be private if the accessing pid has not changed.
3190 */
3191 if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) {
3192 priv = 1;
3193 } else {
3194 priv = cpupid_match_pid(p, last_cpupid);
3195 if (!priv && !(flags & TNF_NO_GROUP))
3196 task_numa_group(p, last_cpupid, flags, &priv);
3197 }
3198
3199 /*
3200 * If a workload spans multiple NUMA nodes, a shared fault that
3201 * occurs wholly within the set of nodes that the workload is
3202 * actively using should be counted as local. This allows the
3203 * scan rate to slow down when a workload has settled down.
3204 */
3205 ng = deref_curr_numa_group(p);
3206 if (!priv && !local && ng && ng->active_nodes > 1 &&
3207 numa_is_active_node(cpu_node, ng) &&
3208 numa_is_active_node(mem_node, ng))
3209 local = 1;
3210
3211 /*
3212 * Retry to migrate task to preferred node periodically, in case it
3213 * previously failed, or the scheduler moved us.
3214 */
3215 if (time_after(jiffies, p->numa_migrate_retry)) {
3216 task_numa_placement(p);
3217 numa_migrate_preferred(p);
3218 }
3219
3220 if (migrated)
3221 p->numa_pages_migrated += pages;
3222 if (flags & TNF_MIGRATE_FAIL)
3223 p->numa_faults_locality[2] += pages;
3224
3225 p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages;
3226 p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages;
3227 p->numa_faults_locality[local] += pages;
3228}
3229
3230static void reset_ptenuma_scan(struct task_struct *p)
3231{
3232 /*
3233 * We only did a read acquisition of the mmap sem, so
3234 * p->mm->numa_scan_seq is written to without exclusive access
3235 * and the update is not guaranteed to be atomic. That's not
3236 * much of an issue though, since this is just used for
3237 * statistical sampling. Use READ_ONCE/WRITE_ONCE, which are not
3238 * expensive, to avoid any form of compiler optimizations:
3239 */
3240 WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1);
3241 p->mm->numa_scan_offset = 0;
3242}
3243
3244static bool vma_is_accessed(struct mm_struct *mm, struct vm_area_struct *vma)
3245{
3246 unsigned long pids;
3247 /*
3248	 * Allow unconditional access for the first two scans, so that all the
3249	 * pages of the VMA get a prot_none fault introduced irrespective of accesses.
3250	 * This is also done to avoid any side effect of task scanning
3251	 * amplifying the unfairness of a disjoint set of VMA accesses.
3252 */
3253 if ((READ_ONCE(current->mm->numa_scan_seq) - vma->numab_state->start_scan_seq) < 2)
3254 return true;
3255
3256 pids = vma->numab_state->pids_active[0] | vma->numab_state->pids_active[1];
3257 if (test_bit(hash_32(current->pid, ilog2(BITS_PER_LONG)), &pids))
3258 return true;
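	/*
	 * Not part of the original source: hash_32() folds the PID into an
	 * ilog2(BITS_PER_LONG)-bit index (0..63 on 64-bit kernels), so this
	 * test asks whether a task hashing to the same bit faulted in this
	 * VMA during the last two reset windows. Hash collisions can cause
	 * an occasional unnecessary scan, which is harmless.
	 */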
3259
3260 /*
3261 * Complete a scan that has already started regardless of PID access, or
3262 * some VMAs may never be scanned in multi-threaded applications:
3263 */
3264 if (mm->numa_scan_offset > vma->vm_start) {
3265 trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_IGNORE_PID);
3266 return true;
3267 }
3268
3269 /*
3270	 * This vma has not been accessed for a while; if the number of
3271	 * threads in the same process is low, meaning no other threads
3272	 * can help scan this vma, force a vma scan.
3273 */
3274 if (READ_ONCE(mm->numa_scan_seq) >
3275 (vma->numab_state->prev_scan_seq + get_nr_threads(current)))
3276 return true;
3277
3278 return false;
3279}
3280
3281#define VMA_PID_RESET_PERIOD (4 * sysctl_numa_balancing_scan_delay)
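/*
 * Illustrative note (not part of the original source): with the default
 * scan delay of 1000ms, the per-VMA access PIDs are reset roughly every
 * 4 seconds, rotating pids_active[1] into pids_active[0].
 */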
3282
3283/*
3284 * The expensive part of numa migration is done from task_work context.
3285 * Triggered from task_tick_numa().
3286 */
3287static void task_numa_work(struct callback_head *work)
3288{
3289 unsigned long migrate, next_scan, now = jiffies;
3290 struct task_struct *p = current;
3291 struct mm_struct *mm = p->mm;
3292 u64 runtime = p->se.sum_exec_runtime;
3293 struct vm_area_struct *vma;
3294 unsigned long start, end;
3295 unsigned long nr_pte_updates = 0;
3296 long pages, virtpages;
3297 struct vma_iterator vmi;
3298 bool vma_pids_skipped;
3299 bool vma_pids_forced = false;
3300
3301 SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work));
3302
3303 work->next = work;
3304 /*
3305 * Who cares about NUMA placement when they're dying.
3306 *
3307 * NOTE: make sure not to dereference p->mm before this check,
3308 * exit_task_work() happens _after_ exit_mm() so we could be called
3309 * without p->mm even though we still had it when we enqueued this
3310 * work.
3311 */
3312 if (p->flags & PF_EXITING)
3313 return;
3314
3315 if (!mm->numa_next_scan) {
3316 mm->numa_next_scan = now +
3317 msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
3318 }
3319
3320 /*
3321	 * Enforce maximal scan/migration frequency.
3322 */
3323 migrate = mm->numa_next_scan;
3324 if (time_before(now, migrate))
3325 return;
3326
3327 if (p->numa_scan_period == 0) {
3328 p->numa_scan_period_max = task_scan_max(p);
3329 p->numa_scan_period = task_scan_start(p);
3330 }
3331
3332 next_scan = now + msecs_to_jiffies(p->numa_scan_period);
3333 if (!try_cmpxchg(&mm->numa_next_scan, &migrate, next_scan))
3334 return;
3335
3336 /*
3337 * Delay this task enough that another task of this mm will likely win
3338 * the next time around.
3339 */
3340 p->node_stamp += 2 * TICK_NSEC;
3341
3342 pages = sysctl_numa_balancing_scan_size;
3343 pages <<= 20 - PAGE_SHIFT; /* MB in pages */
3344 virtpages = pages * 8; /* Scan up to this much virtual space */
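	/*
	 * Illustrative example (not part of the original source): with the
	 * default scan size of 256MB and 4KB pages this is 65536 pages per
	 * pass, and up to 8x that (2GB) of virtual address space may be
	 * walked when ranges turn out to be empty or already marked.
	 */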
3345 if (!pages)
3346 return;
3347
3348
3349 if (!mmap_read_trylock(mm))
3350 return;
3351
3352 /*
3353 * VMAs are skipped if the current PID has not trapped a fault within
3354 * the VMA recently. Allow scanning to be forced if there is no
3355 * suitable VMA remaining.
3356 */
3357 vma_pids_skipped = false;
3358
3359retry_pids:
3360 start = mm->numa_scan_offset;
3361 vma_iter_init(&vmi, mm, start);
3362 vma = vma_next(&vmi);
3363 if (!vma) {
3364 reset_ptenuma_scan(p);
3365 start = 0;
3366 vma_iter_set(&vmi, start);
3367 vma = vma_next(&vmi);
3368 }
3369
3370 for (; vma; vma = vma_next(&vmi)) {
3371 if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
3372 is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) {
3373 trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_UNSUITABLE);
3374 continue;
3375 }
3376
3377 /*
3378 * Shared library pages mapped by multiple processes are not
3379 * migrated as it is expected they are cache replicated. Avoid
3380 * hinting faults in read-only file-backed mappings or the vDSO
3381 * as migrating the pages will be of marginal benefit.
3382 */
3383 if (!vma->vm_mm ||
3384 (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ))) {
3385 trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_SHARED_RO);
3386 continue;
3387 }
3388
3389 /*
3390 * Skip inaccessible VMAs to avoid any confusion between
3391 * PROT_NONE and NUMA hinting PTEs
3392 */
3393 if (!vma_is_accessible(vma)) {
3394 trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_INACCESSIBLE);
3395 continue;
3396 }
3397
3398 /* Initialise new per-VMA NUMAB state. */
3399 if (!vma->numab_state) {
3400 struct vma_numab_state *ptr;
3401
3402 ptr = kzalloc(sizeof(*ptr), GFP_KERNEL);
3403 if (!ptr)
3404 continue;
3405
3406 if (cmpxchg(&vma->numab_state, NULL, ptr)) {
3407 kfree(ptr);
3408 continue;
3409 }
3410
3411 vma->numab_state->start_scan_seq = mm->numa_scan_seq;
3412
3413 vma->numab_state->next_scan = now +
3414 msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
3415
3416			/* Reset happens 4 scan delays after the scan start */
3417 vma->numab_state->pids_active_reset = vma->numab_state->next_scan +
3418 msecs_to_jiffies(VMA_PID_RESET_PERIOD);
3419
3420 /*
3421 * Ensure prev_scan_seq does not match numa_scan_seq,
3422 * to prevent VMAs being skipped prematurely on the
3423 * first scan:
3424 */
3425 vma->numab_state->prev_scan_seq = mm->numa_scan_seq - 1;
3426 }
3427
3428 /*
3429		 * Scanning the VMAs of short-lived tasks adds more overhead. So
3430 * delay the scan for new VMAs.
3431 */
3432 if (mm->numa_scan_seq && time_before(jiffies,
3433 vma->numab_state->next_scan)) {
3434 trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_SCAN_DELAY);
3435 continue;
3436 }
3437
3438 /* RESET access PIDs regularly for old VMAs. */
3439 if (mm->numa_scan_seq &&
3440 time_after(jiffies, vma->numab_state->pids_active_reset)) {
3441 vma->numab_state->pids_active_reset = vma->numab_state->pids_active_reset +
3442 msecs_to_jiffies(VMA_PID_RESET_PERIOD);
3443 vma->numab_state->pids_active[0] = READ_ONCE(vma->numab_state->pids_active[1]);
3444 vma->numab_state->pids_active[1] = 0;
3445 }
3446
3447 /* Do not rescan VMAs twice within the same sequence. */
3448 if (vma->numab_state->prev_scan_seq == mm->numa_scan_seq) {
3449 mm->numa_scan_offset = vma->vm_end;
3450 trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_SEQ_COMPLETED);
3451 continue;
3452 }
3453
3454 /*
3455 * Do not scan the VMA if task has not accessed it, unless no other
3456 * VMA candidate exists.
3457 */
3458 if (!vma_pids_forced && !vma_is_accessed(mm, vma)) {
3459 vma_pids_skipped = true;
3460 trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_PID_INACTIVE);
3461 continue;
3462 }
3463
3464 do {
3465 start = max(start, vma->vm_start);
3466 end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
3467 end = min(end, vma->vm_end);
3468 nr_pte_updates = change_prot_numa(vma, start, end);
3469
3470 /*
3471			 * Try to scan sysctl_numa_balancing_scan_size worth of
3472 * hpages that have at least one present PTE that
3473 * is not already PTE-numa. If the VMA contains
3474 * areas that are unused or already full of prot_numa
3475 * PTEs, scan up to virtpages, to skip through those
3476 * areas faster.
3477 */
3478 if (nr_pte_updates)
3479 pages -= (end - start) >> PAGE_SHIFT;
3480 virtpages -= (end - start) >> PAGE_SHIFT;
3481
3482 start = end;
3483 if (pages <= 0 || virtpages <= 0)
3484 goto out;
3485
3486 cond_resched();
3487 } while (end != vma->vm_end);
3488
3489 /* VMA scan is complete, do not scan until next sequence. */
3490 vma->numab_state->prev_scan_seq = mm->numa_scan_seq;
3491
3492 /*
3493 * Only force scan within one VMA at a time, to limit the
3494 * cost of scanning a potentially uninteresting VMA.
3495 */
3496 if (vma_pids_forced)
3497 break;
3498 }
3499
3500 /*
3501 * If no VMAs are remaining and VMAs were skipped due to the PID
3502 * not accessing the VMA previously, then force a scan to ensure
3503 * forward progress:
3504 */
3505 if (!vma && !vma_pids_forced && vma_pids_skipped) {
3506 vma_pids_forced = true;
3507 goto retry_pids;
3508 }
3509
3510out:
3511 /*
3512 * It is possible to reach the end of the VMA list but the last few
3513	 * VMAs are not guaranteed to be vma_migratable. If they are not, we
3514 * would find the !migratable VMA on the next scan but not reset the
3515 * scanner to the start so check it now.
3516 */
3517 if (vma)
3518 mm->numa_scan_offset = start;
3519 else
3520 reset_ptenuma_scan(p);
3521 mmap_read_unlock(mm);
3522
3523 /*
3524 * Make sure tasks use at least 32x as much time to run other code
3525 * than they used here, to limit NUMA PTE scanning overhead to 3% max.
3526 * Usually update_task_scan_period slows down scanning enough; on an
3527 * overloaded system we need to limit overhead on a per task basis.
3528 */
3529 if (unlikely(p->se.sum_exec_runtime != runtime)) {
3530 u64 diff = p->se.sum_exec_runtime - runtime;
3531 p->node_stamp += 32 * diff;
3532 }
3533}
3534
3535void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
3536{
3537 int mm_users = 0;
3538 struct mm_struct *mm = p->mm;
3539
3540 if (mm) {
3541 mm_users = atomic_read(&mm->mm_users);
3542 if (mm_users == 1) {
3543 mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
3544 mm->numa_scan_seq = 0;
3545 }
3546 }
3547 p->node_stamp = 0;
3548 p->numa_scan_seq = mm ? mm->numa_scan_seq : 0;
3549 p->numa_scan_period = sysctl_numa_balancing_scan_delay;
3550 p->numa_migrate_retry = 0;
3551 /* Protect against double add, see task_tick_numa and task_numa_work */
3552 p->numa_work.next = &p->numa_work;
3553 p->numa_faults = NULL;
3554 p->numa_pages_migrated = 0;
3555 p->total_numa_faults = 0;
3556 RCU_INIT_POINTER(p->numa_group, NULL);
3557 p->last_task_numa_placement = 0;
3558 p->last_sum_exec_runtime = 0;
3559
3560 init_task_work(&p->numa_work, task_numa_work);
3561
3562 /* New address space, reset the preferred nid */
3563 if (!(clone_flags & CLONE_VM)) {
3564 p->numa_preferred_nid = NUMA_NO_NODE;
3565 return;
3566 }
3567
3568 /*
3569 * New thread, keep existing numa_preferred_nid which should be copied
3570 * already by arch_dup_task_struct but stagger when scans start.
3571 */
3572 if (mm) {
3573 unsigned int delay;
3574
3575 delay = min_t(unsigned int, task_scan_max(current),
3576 current->numa_scan_period * mm_users * NSEC_PER_MSEC);
3577 delay += 2 * TICK_NSEC;
3578 p->node_stamp = delay;
3579 }
3580}
3581
3582/*
3583 * Drive the periodic memory faults..
3584 */
3585static void task_tick_numa(struct rq *rq, struct task_struct *curr)
3586{
3587 struct callback_head *work = &curr->numa_work;
3588 u64 period, now;
3589
3590 /*
3591 * We don't care about NUMA placement if we don't have memory.
3592 */
3593 if (!curr->mm || (curr->flags & (PF_EXITING | PF_KTHREAD)) || work->next != work)
3594 return;
3595
3596 /*
3597 * Using runtime rather than walltime has the dual advantage that
3598 * we (mostly) drive the selection from busy threads and that the
3599 * task needs to have done some actual work before we bother with
3600 * NUMA placement.
3601 */
3602 now = curr->se.sum_exec_runtime;
3603 period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
3604
3605 if (now > curr->node_stamp + period) {
3606 if (!curr->node_stamp)
3607 curr->numa_scan_period = task_scan_start(curr);
3608 curr->node_stamp += period;
3609
3610 if (!time_before(jiffies, curr->mm->numa_next_scan))
3611 task_work_add(curr, work, TWA_RESUME);
3612 }
3613}
3614
3615static void update_scan_period(struct task_struct *p, int new_cpu)
3616{
3617 int src_nid = cpu_to_node(task_cpu(p));
3618 int dst_nid = cpu_to_node(new_cpu);
3619
3620 if (!static_branch_likely(&sched_numa_balancing))
3621 return;
3622
3623 if (!p->mm || !p->numa_faults || (p->flags & PF_EXITING))
3624 return;
3625
3626 if (src_nid == dst_nid)
3627 return;
3628
3629 /*
3630 * Allow resets if faults have been trapped before one scan
3631 * has completed. This is most likely due to a new task that
3632 * is pulled cross-node due to wakeups or load balancing.
3633 */
3634 if (p->numa_scan_seq) {
3635 /*
3636 * Avoid scan adjustments if moving to the preferred
3637 * node or if the task was not previously running on
3638 * the preferred node.
3639 */
3640 if (dst_nid == p->numa_preferred_nid ||
3641 (p->numa_preferred_nid != NUMA_NO_NODE &&
3642 src_nid != p->numa_preferred_nid))
3643 return;
3644 }
3645
3646 p->numa_scan_period = task_scan_start(p);
3647}
3648
3649#else
3650static void task_tick_numa(struct rq *rq, struct task_struct *curr)
3651{
3652}
3653
3654static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p)
3655{
3656}
3657
3658static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
3659{
3660}
3661
3662static inline void update_scan_period(struct task_struct *p, int new_cpu)
3663{
3664}
3665
3666#endif /* CONFIG_NUMA_BALANCING */
3667
3668static void
3669account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
3670{
3671 update_load_add(&cfs_rq->load, se->load.weight);
3672#ifdef CONFIG_SMP
3673 if (entity_is_task(se)) {
3674 struct rq *rq = rq_of(cfs_rq);
3675
3676 account_numa_enqueue(rq, task_of(se));
3677 list_add(&se->group_node, &rq->cfs_tasks);
3678 }
3679#endif
3680 cfs_rq->nr_running++;
3681 if (se_is_idle(se))
3682 cfs_rq->idle_nr_running++;
3683}
3684
3685static void
3686account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
3687{
3688 update_load_sub(&cfs_rq->load, se->load.weight);
3689#ifdef CONFIG_SMP
3690 if (entity_is_task(se)) {
3691 account_numa_dequeue(rq_of(cfs_rq), task_of(se));
3692 list_del_init(&se->group_node);
3693 }
3694#endif
3695 cfs_rq->nr_running--;
3696 if (se_is_idle(se))
3697 cfs_rq->idle_nr_running--;
3698}
3699
3700/*
3701 * Signed add and clamp on underflow.
3702 *
3703 * Explicitly do a load-store to ensure the intermediate value never hits
3704 * memory. This allows lockless observations without ever seeing the negative
3705 * values.
3706 */
3707#define add_positive(_ptr, _val) do { \
3708 typeof(_ptr) ptr = (_ptr); \
3709 typeof(_val) val = (_val); \
3710 typeof(*ptr) res, var = READ_ONCE(*ptr); \
3711 \
3712 res = var + val; \
3713 \
3714 if (val < 0 && res > var) \
3715 res = 0; \
3716 \
3717 WRITE_ONCE(*ptr, res); \
3718} while (0)
3719
3720/*
3721 * Unsigned subtract and clamp on underflow.
3722 *
3723 * Explicitly do a load-store to ensure the intermediate value never hits
3724 * memory. This allows lockless observations without ever seeing the negative
3725 * values.
3726 */
3727#define sub_positive(_ptr, _val) do { \
3728 typeof(_ptr) ptr = (_ptr); \
3729 typeof(*ptr) val = (_val); \
3730 typeof(*ptr) res, var = READ_ONCE(*ptr); \
3731 res = var - val; \
3732 if (res > var) \
3733 res = 0; \
3734 WRITE_ONCE(*ptr, res); \
3735} while (0)
3736
3737/*
3738 * Remove and clamp on negative, from a local variable.
3739 *
3740 * A variant of sub_positive(), which does not use explicit load-store
3741 * and is thus optimized for local variable updates.
3742 */
3743#define lsub_positive(_ptr, _val) do { \
3744 typeof(_ptr) ptr = (_ptr); \
3745 *ptr -= min_t(typeof(*ptr), *ptr, _val); \
3746} while (0)
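/*
 * Illustrative example (not part of the original source): for an
 * unsigned counter holding 5, sub_positive(&x, 7) clamps the result to
 * 0 instead of wrapping around, and lsub_positive(&x, 7) subtracts at
 * most the current value for the same effect on a local variable.
 * add_positive() behaves likewise when a negative delta would drive a
 * value below zero.
 */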
3747
3748#ifdef CONFIG_SMP
3749static inline void
3750enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
3751{
3752 cfs_rq->avg.load_avg += se->avg.load_avg;
3753 cfs_rq->avg.load_sum += se_weight(se) * se->avg.load_sum;
3754}
3755
3756static inline void
3757dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
3758{
3759 sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
3760 sub_positive(&cfs_rq->avg.load_sum, se_weight(se) * se->avg.load_sum);
3761 /* See update_cfs_rq_load_avg() */
3762 cfs_rq->avg.load_sum = max_t(u32, cfs_rq->avg.load_sum,
3763 cfs_rq->avg.load_avg * PELT_MIN_DIVIDER);
3764}
3765#else
3766static inline void
3767enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
3768static inline void
3769dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
3770#endif
3771
3772static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags);
3773
3774static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
3775 unsigned long weight)
3776{
3777 bool curr = cfs_rq->curr == se;
3778
3779 if (se->on_rq) {
3780 /* commit outstanding execution time */
3781 update_curr(cfs_rq);
3782 update_entity_lag(cfs_rq, se);
3783 se->deadline -= se->vruntime;
3784 se->rel_deadline = 1;
3785 if (!curr)
3786 __dequeue_entity(cfs_rq, se);
3787 update_load_sub(&cfs_rq->load, se->load.weight);
3788 }
3789 dequeue_load_avg(cfs_rq, se);
3790
3791 /*
3792 * Because we keep se->vlag = V - v_i, while: lag_i = w_i*(V - v_i),
3793 * we need to scale se->vlag when w_i changes.
3794 */
3795 se->vlag = div_s64(se->vlag * se->load.weight, weight);
3796 if (se->rel_deadline)
3797 se->deadline = div_s64(se->deadline * se->load.weight, weight);
3798
3799 update_load_set(&se->load, weight);
3800
3801#ifdef CONFIG_SMP
3802 do {
3803 u32 divider = get_pelt_divider(&se->avg);
3804
3805 se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider);
3806 } while (0);
3807#endif
3808
3809 enqueue_load_avg(cfs_rq, se);
3810 if (se->on_rq) {
3811 update_load_add(&cfs_rq->load, se->load.weight);
3812 place_entity(cfs_rq, se, 0);
3813 if (!curr)
3814 __enqueue_entity(cfs_rq, se);
3815
3816 /*
3817 * The entity's vruntime has been adjusted, so let's check
3818		 * whether the rq-wide min_vruntime needs to be updated too. Since
3819		 * the calculations above require a stable min_vruntime rather
3820		 * than an up-to-date one, we do the update at the end of the
3821 * reweight process.
3822 */
3823 update_min_vruntime(cfs_rq);
3824 }
3825}
3826
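/*
 * Illustrative sketch (not part of the scheduler): the vlag rescaling done by
 * reweight_entity() above. Keeping lag_i = w_i * vlag_i constant across a
 * weight change means vlag must be scaled by w_old / w_new.
 * example_rescale_vlag() is hypothetical and only mirrors that division.
 */
static inline s64 example_rescale_vlag(s64 vlag, unsigned long w_old,
				       unsigned long w_new)
{
	/* e.g. vlag = 4000, w_old = 1024, w_new = 2048 -> new vlag = 2000 */
	return div_s64(vlag * w_old, w_new);
}
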
3827static void reweight_task_fair(struct rq *rq, struct task_struct *p,
3828 const struct load_weight *lw)
3829{
3830 struct sched_entity *se = &p->se;
3831 struct cfs_rq *cfs_rq = cfs_rq_of(se);
3832 struct load_weight *load = &se->load;
3833
3834 reweight_entity(cfs_rq, se, lw->weight);
3835 load->inv_weight = lw->inv_weight;
3836}
3837
3838static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
3839
3840#ifdef CONFIG_FAIR_GROUP_SCHED
3841#ifdef CONFIG_SMP
3842/*
3843 * All this does is approximate the hierarchical proportion which includes that
3844 * global sum we all love to hate.
3845 *
3846 * That is, the weight of a group entity is the proportional share of the
3847 * group weight based on the group runqueue weights. That is:
3848 *
3849 * tg->weight * grq->load.weight
3850 * ge->load.weight = ----------------------------- (1)
3851 * \Sum grq->load.weight
3852 *
3853 * Now, because that sum is prohibitively expensive to compute (been
3854 * there, done that) we approximate it with this average stuff. The average
3855 * moves slower and therefore the approximation is cheaper and more stable.
3856 *
3857 * So instead of the above, we substitute:
3858 *
3859 * grq->load.weight -> grq->avg.load_avg (2)
3860 *
3861 * which yields the following:
3862 *
3863 * tg->weight * grq->avg.load_avg
3864 * ge->load.weight = ------------------------------ (3)
3865 * tg->load_avg
3866 *
3867 * Where: tg->load_avg ~= \Sum grq->avg.load_avg
3868 *
3869 * That is shares_avg, and it is right (given the approximation (2)).
3870 *
3871 * The problem with it is that because the average is slow -- it was designed
3872 * to be exactly that of course -- this leads to transients in boundary
3873 * conditions. Specifically, the case where the group was idle and we start
3874 * one task. It takes time for our CPU's grq->avg.load_avg to build up,
3875 * yielding bad latency etc..
3876 *
3877 * Now, in that special case (1) reduces to:
3878 *
3879 * tg->weight * grq->load.weight
3880 * ge->load.weight = ----------------------------- = tg->weight (4)
3881 *                        grq->load.weight
3882 *
3883 * That is, the sum collapses because all other CPUs are idle; the UP scenario.
3884 *
3885 * So what we do is modify our approximation (3) to approach (4) in the (near)
3886 * UP case, like:
3887 *
3888 * ge->load.weight =
3889 *
3890 * tg->weight * grq->load.weight
3891 * --------------------------------------------------- (5)
3892 * tg->load_avg - grq->avg.load_avg + grq->load.weight
3893 *
3894 * But because grq->load.weight can drop to 0, resulting in a divide by zero,
3895 * we need to use grq->avg.load_avg as its lower bound, which then gives:
3896 *
3897 *
3898 * tg->weight * grq->load.weight
3899 * ge->load.weight = ----------------------------- (6)
3900 * tg_load_avg'
3901 *
3902 * Where:
3903 *
3904 * tg_load_avg' = tg->load_avg - grq->avg.load_avg +
3905 * max(grq->load.weight, grq->avg.load_avg)
3906 *
3907 * And that is shares_weight and is icky. In the (near) UP case it approaches
3908 * (4) while in the normal case it approaches (3). It consistently
3909 * overestimates the ge->load.weight and therefore:
3910 *
3911 * \Sum ge->load.weight >= tg->weight
3912 *
3913 * hence icky!
3914 */
3915static long calc_group_shares(struct cfs_rq *cfs_rq)
3916{
3917 long tg_weight, tg_shares, load, shares;
3918 struct task_group *tg = cfs_rq->tg;
3919
3920 tg_shares = READ_ONCE(tg->shares);
3921
3922 load = max(scale_load_down(cfs_rq->load.weight), cfs_rq->avg.load_avg);
3923
3924 tg_weight = atomic_long_read(&tg->load_avg);
3925
3926 /* Ensure tg_weight >= load */
3927 tg_weight -= cfs_rq->tg_load_avg_contrib;
3928 tg_weight += load;
3929
3930 shares = (tg_shares * load);
3931 if (tg_weight)
3932 shares /= tg_weight;
3933
3934 /*
3935 * MIN_SHARES has to be unscaled here to support per-CPU partitioning
3936 * of a group with small tg->shares value. It is a floor value which is
3937 * assigned as a minimum load.weight to the sched_entity representing
3938 * the group on a CPU.
3939 *
3940 * E.g. on 64-bit for a group with tg->shares of scale_load(15)=15*1024
3941 * on an 8-core system with 8 tasks each runnable on one CPU shares has
3942	 * on an 8-core system with 8 tasks (each runnable on one CPU), shares has
3943 * case no task is runnable on a CPU MIN_SHARES=2 should be returned
3944 * instead of 0.
3945 */
3946 return clamp_t(long, shares, MIN_SHARES, tg_shares);
3947}
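
/*
 * Illustrative sketch (not part of the scheduler): the proportional split that
 * calc_group_shares() performs, using the numbers from the comment above.
 * example_group_share() is hypothetical; with tg_shares = 15*1024 and eight
 * CPUs each contributing an equal load, every CPU gets 15*1024/8 = 1920,
 * clamped to at least MIN_SHARES.
 */
static inline long example_group_share(long tg_shares, long this_cpu_load,
				       long sum_of_cpu_loads)
{
	long shares = tg_shares * this_cpu_load;

	if (sum_of_cpu_loads)
		shares /= sum_of_cpu_loads;

	return clamp_t(long, shares, MIN_SHARES, tg_shares);
}
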
3948#endif /* CONFIG_SMP */
3949
3950/*
3951 * Recomputes the group entity based on the current state of its group
3952 * runqueue.
3953 */
3954static void update_cfs_group(struct sched_entity *se)
3955{
3956 struct cfs_rq *gcfs_rq = group_cfs_rq(se);
3957 long shares;
3958
3959 /*
3960 * When a group becomes empty, preserve its weight. This matters for
3961 * DELAY_DEQUEUE.
3962 */
3963 if (!gcfs_rq || !gcfs_rq->load.weight)
3964 return;
3965
3966 if (throttled_hierarchy(gcfs_rq))
3967 return;
3968
3969#ifndef CONFIG_SMP
3970 shares = READ_ONCE(gcfs_rq->tg->shares);
3971#else
3972 shares = calc_group_shares(gcfs_rq);
3973#endif
3974 if (unlikely(se->load.weight != shares))
3975 reweight_entity(cfs_rq_of(se), se, shares);
3976}
3977
3978#else /* CONFIG_FAIR_GROUP_SCHED */
3979static inline void update_cfs_group(struct sched_entity *se)
3980{
3981}
3982#endif /* CONFIG_FAIR_GROUP_SCHED */
3983
3984static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags)
3985{
3986 struct rq *rq = rq_of(cfs_rq);
3987
3988 if (&rq->cfs == cfs_rq) {
3989 /*
3990 * There are a few boundary cases this might miss but it should
3991 * get called often enough that that should (hopefully) not be
3992 * a real problem.
3993 *
3994 * It will not get called when we go idle, because the idle
3995 * thread is a different class (!fair), nor will the utilization
3996 * number include things like RT tasks.
3997 *
3998 * As is, the util number is not freq-invariant (we'd have to
3999 * implement arch_scale_freq_capacity() for that).
4000 *
4001 * See cpu_util_cfs().
4002 */
4003 cpufreq_update_util(rq, flags);
4004 }
4005}
4006
4007#ifdef CONFIG_SMP
4008static inline bool load_avg_is_decayed(struct sched_avg *sa)
4009{
4010 if (sa->load_sum)
4011 return false;
4012
4013 if (sa->util_sum)
4014 return false;
4015
4016 if (sa->runnable_sum)
4017 return false;
4018
4019 /*
4020	 * _avg must be null when the _sum fields are null, since _avg = _sum / divider
4021 * Make sure that rounding and/or propagation of PELT values never
4022 * break this.
4023 */
4024 SCHED_WARN_ON(sa->load_avg ||
4025 sa->util_avg ||
4026 sa->runnable_avg);
4027
4028 return true;
4029}
4030
4031static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
4032{
4033 return u64_u32_load_copy(cfs_rq->avg.last_update_time,
4034 cfs_rq->last_update_time_copy);
4035}
4036#ifdef CONFIG_FAIR_GROUP_SCHED
4037/*
4038 * Because list_add_leaf_cfs_rq always places a child cfs_rq on the list
4039 * immediately before a parent cfs_rq, and cfs_rqs are removed from the list
4040 * bottom-up, we only have to test whether the cfs_rq before us on the list
4041 * is our child.
4042 * If cfs_rq is not on the list, test whether a child needs to be added to
4043 * connect a branch to the tree (see list_add_leaf_cfs_rq() for details).
4044 */
4045static inline bool child_cfs_rq_on_list(struct cfs_rq *cfs_rq)
4046{
4047 struct cfs_rq *prev_cfs_rq;
4048 struct list_head *prev;
4049 struct rq *rq = rq_of(cfs_rq);
4050
4051 if (cfs_rq->on_list) {
4052 prev = cfs_rq->leaf_cfs_rq_list.prev;
4053 } else {
4054 prev = rq->tmp_alone_branch;
4055 }
4056
4057 if (prev == &rq->leaf_cfs_rq_list)
4058 return false;
4059
4060 prev_cfs_rq = container_of(prev, struct cfs_rq, leaf_cfs_rq_list);
4061
4062 return (prev_cfs_rq->tg->parent == cfs_rq->tg);
4063}
4064
4065static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
4066{
4067 if (cfs_rq->load.weight)
4068 return false;
4069
4070 if (!load_avg_is_decayed(&cfs_rq->avg))
4071 return false;
4072
4073 if (child_cfs_rq_on_list(cfs_rq))
4074 return false;
4075
4076 return true;
4077}
4078
4079/**
4080 * update_tg_load_avg - update the tg's load avg
4081 * @cfs_rq: the cfs_rq whose avg changed
4082 *
4083 * This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load.
4084 * However, because tg->load_avg is a global value there are performance
4085 * considerations.
4086 *
4087 * In order to avoid having to look at the other cfs_rq's, we use a
4088 * differential update where we store the last value we propagated. This in
4089 * turn allows skipping updates if the differential is 'small'.
4090 *
4091 * Updating tg's load_avg is necessary before update_cfs_share().
4092 */
4093static inline void update_tg_load_avg(struct cfs_rq *cfs_rq)
4094{
4095 long delta;
4096 u64 now;
4097
4098 /*
4099 * No need to update load_avg for root_task_group as it is not used.
4100 */
4101 if (cfs_rq->tg == &root_task_group)
4102 return;
4103
4104 /* rq has been offline and doesn't contribute to the share anymore: */
4105 if (!cpu_active(cpu_of(rq_of(cfs_rq))))
4106 return;
4107
4108 /*
4109 * For migration heavy workloads, access to tg->load_avg can be
4110 * unbound. Limit the update rate to at most once per ms.
4111	 * unbounded. Limit the update rate to at most once per ms.
4112 now = sched_clock_cpu(cpu_of(rq_of(cfs_rq)));
4113 if (now - cfs_rq->last_update_tg_load_avg < NSEC_PER_MSEC)
4114 return;
4115
4116 delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
4117 if (abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
4118 atomic_long_add(delta, &cfs_rq->tg->load_avg);
4119 cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
4120 cfs_rq->last_update_tg_load_avg = now;
4121 }
4122}
4123
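/*
 * Illustrative sketch (not part of the scheduler): the differential,
 * rate-limited propagation pattern used by update_tg_load_avg() above.
 * example_should_propagate() is hypothetical; it restates the two gates:
 * at most one update per millisecond, and only when the value has moved by
 * more than 1/64th (~1.5%) of what was last published.
 */
static inline bool example_should_propagate(u64 now, u64 last_update,
					    long new_val, long published)
{
	long delta = new_val - published;

	if (now - last_update < NSEC_PER_MSEC)
		return false;

	return abs(delta) > published / 64;
}
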
4124static inline void clear_tg_load_avg(struct cfs_rq *cfs_rq)
4125{
4126 long delta;
4127 u64 now;
4128
4129 /*
4130 * No need to update load_avg for root_task_group, as it is not used.
4131 */
4132 if (cfs_rq->tg == &root_task_group)
4133 return;
4134
4135 now = sched_clock_cpu(cpu_of(rq_of(cfs_rq)));
4136 delta = 0 - cfs_rq->tg_load_avg_contrib;
4137 atomic_long_add(delta, &cfs_rq->tg->load_avg);
4138 cfs_rq->tg_load_avg_contrib = 0;
4139 cfs_rq->last_update_tg_load_avg = now;
4140}
4141
4142/* CPU offline callback: */
4143static void __maybe_unused clear_tg_offline_cfs_rqs(struct rq *rq)
4144{
4145 struct task_group *tg;
4146
4147 lockdep_assert_rq_held(rq);
4148
4149 /*
4150 * The rq clock has already been updated in
4151 * set_rq_offline(), so we should skip updating
4152 * the rq clock again in unthrottle_cfs_rq().
4153 */
4154 rq_clock_start_loop_update(rq);
4155
4156 rcu_read_lock();
4157 list_for_each_entry_rcu(tg, &task_groups, list) {
4158 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
4159
4160 clear_tg_load_avg(cfs_rq);
4161 }
4162 rcu_read_unlock();
4163
4164 rq_clock_stop_loop_update(rq);
4165}
4166
4167/*
4168 * Called within set_task_rq() right before setting a task's CPU. The
4169 * caller only guarantees p->pi_lock is held; no other assumptions,
4170 * including the state of rq->lock, should be made.
4171 */
4172void set_task_rq_fair(struct sched_entity *se,
4173 struct cfs_rq *prev, struct cfs_rq *next)
4174{
4175 u64 p_last_update_time;
4176 u64 n_last_update_time;
4177
4178 if (!sched_feat(ATTACH_AGE_LOAD))
4179 return;
4180
4181 /*
4182	 * We are supposed to update the task to "current" time, so that it is up
4183	 * to date and ready to go to the new CPU/cfs_rq. But determining what the
4184	 * current time is can be difficult, so simply throw away the out-of-date
4185	 * time. This will result in the wakee task being less decayed, but giving
4186	 * the wakee more load is not a bad thing.
4187 */
4188 if (!(se->avg.last_update_time && prev))
4189 return;
4190
4191 p_last_update_time = cfs_rq_last_update_time(prev);
4192 n_last_update_time = cfs_rq_last_update_time(next);
4193
4194 __update_load_avg_blocked_se(p_last_update_time, se);
4195 se->avg.last_update_time = n_last_update_time;
4196}
4197
4198/*
4199 * When on migration a sched_entity joins/leaves the PELT hierarchy, we need to
4200 * propagate its contribution. The key to this propagation is the invariant
4201 * that for each group:
4202 *
4203 * ge->avg == grq->avg (1)
4204 *
4205 * _IFF_ we look at the pure running and runnable sums. Because they
4206 * represent the very same entity, just at different points in the hierarchy.
4207 *
4208 * Per the above update_tg_cfs_util() and update_tg_cfs_runnable() are trivial
4209 * and simply copies the running/runnable sum over (but still wrong, because
4210 * the group entity and group rq do not have their PELT windows aligned).
4211 *
4212 * However, update_tg_cfs_load() is more complex. So we have:
4213 *
4214 * ge->avg.load_avg = ge->load.weight * ge->avg.runnable_avg (2)
4215 *
4216 * And since, like util, the runnable part should be directly transferable,
4217 * the following would _appear_ to be the straightforward approach:
4218 *
4219 * grq->avg.load_avg = grq->load.weight * grq->avg.runnable_avg (3)
4220 *
4221 * And per (1) we have:
4222 *
4223 * ge->avg.runnable_avg == grq->avg.runnable_avg
4224 *
4225 * Which gives:
4226 *
4227 * ge->load.weight * grq->avg.load_avg
4228 * ge->avg.load_avg = ----------------------------------- (4)
4229 * grq->load.weight
4230 *
4231 * Except that is wrong!
4232 *
4233 * Because while for entities historical weight is not important and we
4234 * really only care about our future and therefore can consider a pure
4235 * runnable sum, runqueues can NOT do this.
4236 *
4237 * We specifically want runqueues to have a load_avg that includes
4238 * historical weights. Those represent the blocked load, the load we expect
4239 * to (shortly) return to us. This only works by keeping the weights as
4240 * integral part of the sum. We therefore cannot decompose as per (3).
4241 *
4242 * Another reason this doesn't work is that runnable isn't a 0-sum entity.
4243 * Imagine a rq with 2 tasks that each are runnable 2/3 of the time. Then the
4244 * rq itself is runnable anywhere between 2/3 and 1 depending on how the
4245 * runnable section of these tasks overlap (or not). If they were to perfectly
4246 * align the rq as a whole would be runnable 2/3 of the time. If however we
4247 * always have at least 1 runnable task, the rq as a whole is always runnable.
4248 *
4249 * So we'll have to approximate.. :/
4250 *
4251 * Given the constraint:
4252 *
4253 * ge->avg.running_sum <= ge->avg.runnable_sum <= LOAD_AVG_MAX
4254 *
4255 * We can construct a rule that adds runnable to a rq by assuming minimal
4256 * overlap.
4257 *
4258 * On removal, we'll assume each task is equally runnable; which yields:
4259 *
4260 * grq->avg.runnable_sum = grq->avg.load_sum / grq->load.weight
4261 *
4262 * XXX: only do this for the part of runnable > running ?
4263 *
4264 */
4265static inline void
4266update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
4267{
4268 long delta_sum, delta_avg = gcfs_rq->avg.util_avg - se->avg.util_avg;
4269 u32 new_sum, divider;
4270
4271 /* Nothing to update */
4272 if (!delta_avg)
4273 return;
4274
4275 /*
4276 * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
4277 * See ___update_load_avg() for details.
4278 */
4279 divider = get_pelt_divider(&cfs_rq->avg);
4280
4281
4282 /* Set new sched_entity's utilization */
4283 se->avg.util_avg = gcfs_rq->avg.util_avg;
4284 new_sum = se->avg.util_avg * divider;
4285 delta_sum = (long)new_sum - (long)se->avg.util_sum;
4286 se->avg.util_sum = new_sum;
4287
4288 /* Update parent cfs_rq utilization */
4289 add_positive(&cfs_rq->avg.util_avg, delta_avg);
4290 add_positive(&cfs_rq->avg.util_sum, delta_sum);
4291
4292 /* See update_cfs_rq_load_avg() */
4293 cfs_rq->avg.util_sum = max_t(u32, cfs_rq->avg.util_sum,
4294 cfs_rq->avg.util_avg * PELT_MIN_DIVIDER);
4295}
4296
4297static inline void
4298update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
4299{
4300 long delta_sum, delta_avg = gcfs_rq->avg.runnable_avg - se->avg.runnable_avg;
4301 u32 new_sum, divider;
4302
4303 /* Nothing to update */
4304 if (!delta_avg)
4305 return;
4306
4307 /*
4308 * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
4309 * See ___update_load_avg() for details.
4310 */
4311 divider = get_pelt_divider(&cfs_rq->avg);
4312
4313 /* Set new sched_entity's runnable */
4314 se->avg.runnable_avg = gcfs_rq->avg.runnable_avg;
4315 new_sum = se->avg.runnable_avg * divider;
4316 delta_sum = (long)new_sum - (long)se->avg.runnable_sum;
4317 se->avg.runnable_sum = new_sum;
4318
4319 /* Update parent cfs_rq runnable */
4320 add_positive(&cfs_rq->avg.runnable_avg, delta_avg);
4321 add_positive(&cfs_rq->avg.runnable_sum, delta_sum);
4322 /* See update_cfs_rq_load_avg() */
4323 cfs_rq->avg.runnable_sum = max_t(u32, cfs_rq->avg.runnable_sum,
4324 cfs_rq->avg.runnable_avg * PELT_MIN_DIVIDER);
4325}
4326
4327static inline void
4328update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
4329{
4330 long delta_avg, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum;
4331 unsigned long load_avg;
4332 u64 load_sum = 0;
4333 s64 delta_sum;
4334 u32 divider;
4335
4336 if (!runnable_sum)
4337 return;
4338
4339 gcfs_rq->prop_runnable_sum = 0;
4340
4341 /*
4342 * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
4343 * See ___update_load_avg() for details.
4344 */
4345 divider = get_pelt_divider(&cfs_rq->avg);
4346
4347 if (runnable_sum >= 0) {
4348 /*
4349 * Add runnable; clip at LOAD_AVG_MAX. Reflects that until
4350 * the CPU is saturated running == runnable.
4351 */
4352 runnable_sum += se->avg.load_sum;
4353 runnable_sum = min_t(long, runnable_sum, divider);
4354 } else {
4355 /*
4356 * Estimate the new unweighted runnable_sum of the gcfs_rq by
4357 * assuming all tasks are equally runnable.
4358 */
4359 if (scale_load_down(gcfs_rq->load.weight)) {
4360 load_sum = div_u64(gcfs_rq->avg.load_sum,
4361 scale_load_down(gcfs_rq->load.weight));
4362 }
4363
4364 /* But make sure to not inflate se's runnable */
4365 runnable_sum = min(se->avg.load_sum, load_sum);
4366 }
4367
4368 /*
4369 * runnable_sum can't be lower than running_sum
4370 * Rescale running sum to be in the same range as runnable sum
4371 * running_sum is in [0 : LOAD_AVG_MAX << SCHED_CAPACITY_SHIFT]
4372 * runnable_sum is in [0 : LOAD_AVG_MAX]
4373 */
4374 running_sum = se->avg.util_sum >> SCHED_CAPACITY_SHIFT;
4375 runnable_sum = max(runnable_sum, running_sum);
4376
4377 load_sum = se_weight(se) * runnable_sum;
4378 load_avg = div_u64(load_sum, divider);
4379
4380 delta_avg = load_avg - se->avg.load_avg;
4381 if (!delta_avg)
4382 return;
4383
4384 delta_sum = load_sum - (s64)se_weight(se) * se->avg.load_sum;
4385
4386 se->avg.load_sum = runnable_sum;
4387 se->avg.load_avg = load_avg;
4388 add_positive(&cfs_rq->avg.load_avg, delta_avg);
4389 add_positive(&cfs_rq->avg.load_sum, delta_sum);
4390 /* See update_cfs_rq_load_avg() */
4391 cfs_rq->avg.load_sum = max_t(u32, cfs_rq->avg.load_sum,
4392 cfs_rq->avg.load_avg * PELT_MIN_DIVIDER);
4393}
4394
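/*
 * Illustrative sketch (not part of the scheduler): the "assume all tasks are
 * equally runnable" estimate used on removal by update_tg_cfs_load() above.
 * example_unweighted_runnable() is hypothetical; it simply divides the
 * weighted load_sum back down by the queue weight.
 */
static inline u64 example_unweighted_runnable(u64 grq_load_sum,
					      unsigned long grq_weight)
{
	if (!grq_weight)
		return 0;

	/* e.g. load_sum = 40000 with weight 2048 -> 19 "unweighted" units */
	return div_u64(grq_load_sum, grq_weight);
}
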
4395static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum)
4396{
4397 cfs_rq->propagate = 1;
4398 cfs_rq->prop_runnable_sum += runnable_sum;
4399}
4400
4401/* Update task and its cfs_rq load average */
4402static inline int propagate_entity_load_avg(struct sched_entity *se)
4403{
4404 struct cfs_rq *cfs_rq, *gcfs_rq;
4405
4406 if (entity_is_task(se))
4407 return 0;
4408
4409 gcfs_rq = group_cfs_rq(se);
4410 if (!gcfs_rq->propagate)
4411 return 0;
4412
4413 gcfs_rq->propagate = 0;
4414
4415 cfs_rq = cfs_rq_of(se);
4416
4417 add_tg_cfs_propagate(cfs_rq, gcfs_rq->prop_runnable_sum);
4418
4419 update_tg_cfs_util(cfs_rq, se, gcfs_rq);
4420 update_tg_cfs_runnable(cfs_rq, se, gcfs_rq);
4421 update_tg_cfs_load(cfs_rq, se, gcfs_rq);
4422
4423 trace_pelt_cfs_tp(cfs_rq);
4424 trace_pelt_se_tp(se);
4425
4426 return 1;
4427}
4428
4429/*
4430 * Check if we need to update the load and the utilization of a blocked
4431 * group_entity:
4432 */
4433static inline bool skip_blocked_update(struct sched_entity *se)
4434{
4435 struct cfs_rq *gcfs_rq = group_cfs_rq(se);
4436
4437 /*
4438	 * If the sched_entity still has non-zero load or utilization, we have to
4439 * decay it:
4440 */
4441 if (se->avg.load_avg || se->avg.util_avg)
4442 return false;
4443
4444 /*
4445 * If there is a pending propagation, we have to update the load and
4446 * the utilization of the sched_entity:
4447 */
4448 if (gcfs_rq->propagate)
4449 return false;
4450
4451 /*
4452	 * Otherwise, the load and the utilization of the sched_entity are
4453 * already zero and there is no pending propagation, so it will be a
4454 * waste of time to try to decay it:
4455 */
4456 return true;
4457}
4458
4459#else /* CONFIG_FAIR_GROUP_SCHED */
4460
4461static inline void update_tg_load_avg(struct cfs_rq *cfs_rq) {}
4462
4463static inline void clear_tg_offline_cfs_rqs(struct rq *rq) {}
4464
4465static inline int propagate_entity_load_avg(struct sched_entity *se)
4466{
4467 return 0;
4468}
4469
4470static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) {}
4471
4472#endif /* CONFIG_FAIR_GROUP_SCHED */
4473
4474#ifdef CONFIG_NO_HZ_COMMON
4475static inline void migrate_se_pelt_lag(struct sched_entity *se)
4476{
4477 u64 throttled = 0, now, lut;
4478 struct cfs_rq *cfs_rq;
4479 struct rq *rq;
4480 bool is_idle;
4481
4482 if (load_avg_is_decayed(&se->avg))
4483 return;
4484
4485 cfs_rq = cfs_rq_of(se);
4486 rq = rq_of(cfs_rq);
4487
4488 rcu_read_lock();
4489 is_idle = is_idle_task(rcu_dereference(rq->curr));
4490 rcu_read_unlock();
4491
4492 /*
4493 * The lag estimation comes with a cost we don't want to pay all the
4494	 * time. Hence, limit it to the case where the source CPU is idle and
4495	 * we know we are at the greatest risk of having an outdated clock.
4496 */
4497 if (!is_idle)
4498 return;
4499
4500 /*
4501 * Estimated "now" is: last_update_time + cfs_idle_lag + rq_idle_lag, where:
4502 *
4503 * last_update_time (the cfs_rq's last_update_time)
4504 * = cfs_rq_clock_pelt()@cfs_rq_idle
4505 * = rq_clock_pelt()@cfs_rq_idle
4506 * - cfs->throttled_clock_pelt_time@cfs_rq_idle
4507 *
4508 * cfs_idle_lag (delta between rq's update and cfs_rq's update)
4509 * = rq_clock_pelt()@rq_idle - rq_clock_pelt()@cfs_rq_idle
4510 *
4511 * rq_idle_lag (delta between now and rq's update)
4512 * = sched_clock_cpu() - rq_clock()@rq_idle
4513 *
4514 * We can then write:
4515 *
4516 * now = rq_clock_pelt()@rq_idle - cfs->throttled_clock_pelt_time +
4517 * sched_clock_cpu() - rq_clock()@rq_idle
4518 * Where:
4519 * rq_clock_pelt()@rq_idle is rq->clock_pelt_idle
4520 * rq_clock()@rq_idle is rq->clock_idle
4521 * cfs->throttled_clock_pelt_time@cfs_rq_idle
4522 * is cfs_rq->throttled_pelt_idle
4523 */
4524
4525#ifdef CONFIG_CFS_BANDWIDTH
4526 throttled = u64_u32_load(cfs_rq->throttled_pelt_idle);
4527 /* The clock has been stopped for throttling */
4528 if (throttled == U64_MAX)
4529 return;
4530#endif
4531 now = u64_u32_load(rq->clock_pelt_idle);
4532 /*
4533	 * Paired with _update_idle_rq_clock_pelt(). It ensures that, at worst, we
4534	 * observe the old clock_pelt_idle value together with the new clock_idle,
4535	 * which leads to an underestimation. The opposite pairing would lead to an
4536	 * overestimation.
4537 */
4538 smp_rmb();
4539 lut = cfs_rq_last_update_time(cfs_rq);
4540
4541 now -= throttled;
4542 if (now < lut)
4543 /*
4544 * cfs_rq->avg.last_update_time is more recent than our
4545 * estimation, let's use it.
4546 */
4547 now = lut;
4548 else
4549 now += sched_clock_cpu(cpu_of(rq)) - u64_u32_load(rq->clock_idle);
4550
4551 __update_load_avg_blocked_se(now, se);
4552}
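
/*
 * Illustrative sketch (not part of the scheduler): the clock estimate that
 * migrate_se_pelt_lag() above assembles. example_estimate_now() is
 * hypothetical; it shows only the arithmetic, without the ordering
 * (smp_rmb()) and bound checks the real code needs.
 */
static inline u64 example_estimate_now(u64 clock_pelt_idle,
				       u64 throttled_pelt_idle,
				       u64 sched_clock_now, u64 clock_idle)
{
	/* rq_clock_pelt()@rq_idle - throttled time + wall time since idle */
	return clock_pelt_idle - throttled_pelt_idle +
	       (sched_clock_now - clock_idle);
}
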
4553#else
4554static void migrate_se_pelt_lag(struct sched_entity *se) {}
4555#endif
4556
4557/**
4558 * update_cfs_rq_load_avg - update the cfs_rq's load/util averages
4559 * @now: current time, as per cfs_rq_clock_pelt()
4560 * @cfs_rq: cfs_rq to update
4561 *
4562 * The cfs_rq avg is the direct sum of all its entities (blocked and runnable)
4563 * avg. The immediate corollary is that all (fair) tasks must be attached.
4564 *
4565 * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example.
4566 *
4567 * Return: true if the load decayed or we removed load.
4568 *
4569 * Since both these conditions indicate a changed cfs_rq->avg.load we should
4570 * call update_tg_load_avg() when this function returns true.
4571 */
4572static inline int
4573update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
4574{
4575 unsigned long removed_load = 0, removed_util = 0, removed_runnable = 0;
4576 struct sched_avg *sa = &cfs_rq->avg;
4577 int decayed = 0;
4578
4579 if (cfs_rq->removed.nr) {
4580 unsigned long r;
4581 u32 divider = get_pelt_divider(&cfs_rq->avg);
4582
4583 raw_spin_lock(&cfs_rq->removed.lock);
4584 swap(cfs_rq->removed.util_avg, removed_util);
4585 swap(cfs_rq->removed.load_avg, removed_load);
4586 swap(cfs_rq->removed.runnable_avg, removed_runnable);
4587 cfs_rq->removed.nr = 0;
4588 raw_spin_unlock(&cfs_rq->removed.lock);
4589
4590 r = removed_load;
4591 sub_positive(&sa->load_avg, r);
4592 sub_positive(&sa->load_sum, r * divider);
4593 /* See sa->util_sum below */
4594 sa->load_sum = max_t(u32, sa->load_sum, sa->load_avg * PELT_MIN_DIVIDER);
4595
4596 r = removed_util;
4597 sub_positive(&sa->util_avg, r);
4598 sub_positive(&sa->util_sum, r * divider);
4599 /*
4600		 * Because of rounding, se->util_sum might end up being +1 more than
4601		 * cfs->util_sum. Although this is not a problem by itself, detaching
4602		 * a lot of tasks with the rounding problem between 2 updates of
4603		 * util_avg (~1ms) can make cfs->util_sum become null whereas
4604		 * cfs->util_avg is not.
4605 * Check that util_sum is still above its lower bound for the new
4606 * util_avg. Given that period_contrib might have moved since the last
4607 * sync, we are only sure that util_sum must be above or equal to
4608		 * util_avg * the minimum possible divider.
4609 */
4610 sa->util_sum = max_t(u32, sa->util_sum, sa->util_avg * PELT_MIN_DIVIDER);
4611
4612 r = removed_runnable;
4613 sub_positive(&sa->runnable_avg, r);
4614 sub_positive(&sa->runnable_sum, r * divider);
4615 /* See sa->util_sum above */
4616 sa->runnable_sum = max_t(u32, sa->runnable_sum,
4617 sa->runnable_avg * PELT_MIN_DIVIDER);
4618
4619 /*
4620 * removed_runnable is the unweighted version of removed_load so we
4621 * can use it to estimate removed_load_sum.
4622 */
4623 add_tg_cfs_propagate(cfs_rq,
4624 -(long)(removed_runnable * divider) >> SCHED_CAPACITY_SHIFT);
4625
4626 decayed = 1;
4627 }
4628
4629 decayed |= __update_load_avg_cfs_rq(now, cfs_rq);
4630 u64_u32_store_copy(sa->last_update_time,
4631 cfs_rq->last_update_time_copy,
4632 sa->last_update_time);
4633 return decayed;
4634}
4635
4636/**
4637 * attach_entity_load_avg - attach this entity to its cfs_rq load avg
4638 * @cfs_rq: cfs_rq to attach to
4639 * @se: sched_entity to attach
4640 *
4641 * Must call update_cfs_rq_load_avg() before this, since we rely on
4642 * cfs_rq->avg.last_update_time being current.
4643 */
4644static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
4645{
4646 /*
4647 * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
4648 * See ___update_load_avg() for details.
4649 */
4650 u32 divider = get_pelt_divider(&cfs_rq->avg);
4651
4652 /*
4653 * When we attach the @se to the @cfs_rq, we must align the decay
4654 * window because without that, really weird and wonderful things can
4655 * happen.
4656 *
4657 * XXX illustrate
4658 */
4659 se->avg.last_update_time = cfs_rq->avg.last_update_time;
4660 se->avg.period_contrib = cfs_rq->avg.period_contrib;
4661
4662 /*
4663 * Hell(o) Nasty stuff.. we need to recompute _sum based on the new
4664 * period_contrib. This isn't strictly correct, but since we're
4665 * entirely outside of the PELT hierarchy, nobody cares if we truncate
4666 * _sum a little.
4667 */
4668 se->avg.util_sum = se->avg.util_avg * divider;
4669
4670 se->avg.runnable_sum = se->avg.runnable_avg * divider;
4671
4672 se->avg.load_sum = se->avg.load_avg * divider;
4673 if (se_weight(se) < se->avg.load_sum)
4674 se->avg.load_sum = div_u64(se->avg.load_sum, se_weight(se));
4675 else
4676 se->avg.load_sum = 1;
4677
4678 enqueue_load_avg(cfs_rq, se);
4679 cfs_rq->avg.util_avg += se->avg.util_avg;
4680 cfs_rq->avg.util_sum += se->avg.util_sum;
4681 cfs_rq->avg.runnable_avg += se->avg.runnable_avg;
4682 cfs_rq->avg.runnable_sum += se->avg.runnable_sum;
4683
4684 add_tg_cfs_propagate(cfs_rq, se->avg.load_sum);
4685
4686 cfs_rq_util_change(cfs_rq, 0);
4687
4688 trace_pelt_cfs_tp(cfs_rq);
4689}
4690
4691/**
4692 * detach_entity_load_avg - detach this entity from its cfs_rq load avg
4693 * @cfs_rq: cfs_rq to detach from
4694 * @se: sched_entity to detach
4695 *
4696 * Must call update_cfs_rq_load_avg() before this, since we rely on
4697 * cfs_rq->avg.last_update_time being current.
4698 */
4699static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
4700{
4701 dequeue_load_avg(cfs_rq, se);
4702 sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
4703 sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
4704 /* See update_cfs_rq_load_avg() */
4705 cfs_rq->avg.util_sum = max_t(u32, cfs_rq->avg.util_sum,
4706 cfs_rq->avg.util_avg * PELT_MIN_DIVIDER);
4707
4708 sub_positive(&cfs_rq->avg.runnable_avg, se->avg.runnable_avg);
4709 sub_positive(&cfs_rq->avg.runnable_sum, se->avg.runnable_sum);
4710 /* See update_cfs_rq_load_avg() */
4711 cfs_rq->avg.runnable_sum = max_t(u32, cfs_rq->avg.runnable_sum,
4712 cfs_rq->avg.runnable_avg * PELT_MIN_DIVIDER);
4713
4714 add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum);
4715
4716 cfs_rq_util_change(cfs_rq, 0);
4717
4718 trace_pelt_cfs_tp(cfs_rq);
4719}
4720
4721/*
4722 * Optional action to be done while updating the load average
4723 */
4724#define UPDATE_TG 0x1
4725#define SKIP_AGE_LOAD 0x2
4726#define DO_ATTACH 0x4
4727#define DO_DETACH 0x8
4728
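/*
 * Illustrative sketch (not part of the scheduler): how callers in this file
 * combine the flags above. enqueue_entity() passes UPDATE_TG | DO_ATTACH,
 * while dequeue_entity() starts from UPDATE_TG and adds DO_DETACH only when
 * the task is being migrated away. example_dequeue_flags() is hypothetical.
 */
static inline int example_dequeue_flags(bool migrating)
{
	return migrating ? (UPDATE_TG | DO_DETACH) : UPDATE_TG;
}
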
4729/* Update task and its cfs_rq load average */
4730static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
4731{
4732 u64 now = cfs_rq_clock_pelt(cfs_rq);
4733 int decayed;
4734
4735 /*
4736	 * Track task load average for carrying it to the new CPU after migration,
4737	 * and track group sched_entity load average for task_h_load() calculations.
4738 */
4739 if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
4740 __update_load_avg_se(now, cfs_rq, se);
4741
4742 decayed = update_cfs_rq_load_avg(now, cfs_rq);
4743 decayed |= propagate_entity_load_avg(se);
4744
4745 if (!se->avg.last_update_time && (flags & DO_ATTACH)) {
4746
4747 /*
4748 * DO_ATTACH means we're here from enqueue_entity().
4749 * !last_update_time means we've passed through
4750 * migrate_task_rq_fair() indicating we migrated.
4751 *
4752 * IOW we're enqueueing a task on a new CPU.
4753 */
4754 attach_entity_load_avg(cfs_rq, se);
4755 update_tg_load_avg(cfs_rq);
4756
4757 } else if (flags & DO_DETACH) {
4758 /*
4759 * DO_DETACH means we're here from dequeue_entity()
4760 * and we are migrating task out of the CPU.
4761		 * and we are migrating the task out of the CPU.
4762 detach_entity_load_avg(cfs_rq, se);
4763 update_tg_load_avg(cfs_rq);
4764 } else if (decayed) {
4765 cfs_rq_util_change(cfs_rq, 0);
4766
4767 if (flags & UPDATE_TG)
4768 update_tg_load_avg(cfs_rq);
4769 }
4770}
4771
4772/*
4773 * Synchronize entity load avg of dequeued entity without locking
4774 * the previous rq.
4775 */
4776static void sync_entity_load_avg(struct sched_entity *se)
4777{
4778 struct cfs_rq *cfs_rq = cfs_rq_of(se);
4779 u64 last_update_time;
4780
4781 last_update_time = cfs_rq_last_update_time(cfs_rq);
4782 __update_load_avg_blocked_se(last_update_time, se);
4783}
4784
4785/*
4786 * Task first catches up with the cfs_rq, and then subtracts
4787 * itself from the cfs_rq (task must be off the queue now).
4788 */
4789static void remove_entity_load_avg(struct sched_entity *se)
4790{
4791 struct cfs_rq *cfs_rq = cfs_rq_of(se);
4792 unsigned long flags;
4793
4794 /*
4795 * tasks cannot exit without having gone through wake_up_new_task() ->
4796 * enqueue_task_fair() which will have added things to the cfs_rq,
4797 * so we can remove unconditionally.
4798 */
4799
4800 sync_entity_load_avg(se);
4801
4802 raw_spin_lock_irqsave(&cfs_rq->removed.lock, flags);
4803 ++cfs_rq->removed.nr;
4804 cfs_rq->removed.util_avg += se->avg.util_avg;
4805 cfs_rq->removed.load_avg += se->avg.load_avg;
4806 cfs_rq->removed.runnable_avg += se->avg.runnable_avg;
4807 raw_spin_unlock_irqrestore(&cfs_rq->removed.lock, flags);
4808}
4809
4810static inline unsigned long cfs_rq_runnable_avg(struct cfs_rq *cfs_rq)
4811{
4812 return cfs_rq->avg.runnable_avg;
4813}
4814
4815static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
4816{
4817 return cfs_rq->avg.load_avg;
4818}
4819
4820static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf);
4821
4822static inline unsigned long task_util(struct task_struct *p)
4823{
4824 return READ_ONCE(p->se.avg.util_avg);
4825}
4826
4827static inline unsigned long task_runnable(struct task_struct *p)
4828{
4829 return READ_ONCE(p->se.avg.runnable_avg);
4830}
4831
4832static inline unsigned long _task_util_est(struct task_struct *p)
4833{
4834 return READ_ONCE(p->se.avg.util_est) & ~UTIL_AVG_UNCHANGED;
4835}
4836
4837static inline unsigned long task_util_est(struct task_struct *p)
4838{
4839 return max(task_util(p), _task_util_est(p));
4840}
4841
4842static inline void util_est_enqueue(struct cfs_rq *cfs_rq,
4843 struct task_struct *p)
4844{
4845 unsigned int enqueued;
4846
4847 if (!sched_feat(UTIL_EST))
4848 return;
4849
4850 /* Update root cfs_rq's estimated utilization */
4851 enqueued = cfs_rq->avg.util_est;
4852 enqueued += _task_util_est(p);
4853 WRITE_ONCE(cfs_rq->avg.util_est, enqueued);
4854
4855 trace_sched_util_est_cfs_tp(cfs_rq);
4856}
4857
4858static inline void util_est_dequeue(struct cfs_rq *cfs_rq,
4859 struct task_struct *p)
4860{
4861 unsigned int enqueued;
4862
4863 if (!sched_feat(UTIL_EST))
4864 return;
4865
4866 /* Update root cfs_rq's estimated utilization */
4867 enqueued = cfs_rq->avg.util_est;
4868 enqueued -= min_t(unsigned int, enqueued, _task_util_est(p));
4869 WRITE_ONCE(cfs_rq->avg.util_est, enqueued);
4870
4871 trace_sched_util_est_cfs_tp(cfs_rq);
4872}
4873
4874#define UTIL_EST_MARGIN (SCHED_CAPACITY_SCALE / 100)
4875
4876static inline void util_est_update(struct cfs_rq *cfs_rq,
4877 struct task_struct *p,
4878 bool task_sleep)
4879{
4880 unsigned int ewma, dequeued, last_ewma_diff;
4881
4882 if (!sched_feat(UTIL_EST))
4883 return;
4884
4885 /*
4886 * Skip update of task's estimated utilization when the task has not
4887 * yet completed an activation, e.g. being migrated.
4888 */
4889 if (!task_sleep)
4890 return;
4891
4892 /* Get current estimate of utilization */
4893 ewma = READ_ONCE(p->se.avg.util_est);
4894
4895 /*
4896 * If the PELT values haven't changed since enqueue time,
4897 * skip the util_est update.
4898 */
4899 if (ewma & UTIL_AVG_UNCHANGED)
4900 return;
4901
4902 /* Get utilization at dequeue */
4903 dequeued = task_util(p);
4904
4905 /*
4906 * Reset EWMA on utilization increases, the moving average is used only
4907 * to smooth utilization decreases.
4908 */
4909 if (ewma <= dequeued) {
4910 ewma = dequeued;
4911 goto done;
4912 }
4913
4914 /*
4915 * Skip update of task's estimated utilization when its members are
4916	 * Skip updating the task's estimated utilization when it is already
4917	 * within ~1% of its last activation value.
4918 last_ewma_diff = ewma - dequeued;
4919 if (last_ewma_diff < UTIL_EST_MARGIN)
4920 goto done;
4921
4922 /*
4923 * To avoid overestimation of actual task utilization, skip updates if
4924	 * we cannot guarantee there is idle time on this CPU.
4925 */
4926 if (dequeued > arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq))))
4927 return;
4928
4929 /*
4930	 * To avoid underestimating task utilization, skip updates of the EWMA if
4931	 * we cannot guarantee that the thread got all the CPU time it wanted.
4932 */
4933 if ((dequeued + UTIL_EST_MARGIN) < task_runnable(p))
4934 goto done;
4935
4936
4937 /*
4938 * Update Task's estimated utilization
4939 *
4940 * When *p completes an activation we can consolidate another sample
4941 * of the task size. This is done by using this value to update the
4942 * Exponential Weighted Moving Average (EWMA):
4943 *
4944 * ewma(t) = w * task_util(p) + (1-w) * ewma(t-1)
4945 * = w * task_util(p) + ewma(t-1) - w * ewma(t-1)
4946 * = w * (task_util(p) - ewma(t-1)) + ewma(t-1)
4947 * = w * ( -last_ewma_diff ) + ewma(t-1)
4948 * = w * (-last_ewma_diff + ewma(t-1) / w)
4949 *
4950 * Where 'w' is the weight of new samples, which is configured to be
4951 * 0.25, thus making w=1/4 ( >>= UTIL_EST_WEIGHT_SHIFT)
4952 */
4953 ewma <<= UTIL_EST_WEIGHT_SHIFT;
4954 ewma -= last_ewma_diff;
4955 ewma >>= UTIL_EST_WEIGHT_SHIFT;
4956done:
4957 ewma |= UTIL_AVG_UNCHANGED;
4958 WRITE_ONCE(p->se.avg.util_est, ewma);
4959
4960 trace_sched_util_est_se_tp(&p->se);
4961}
4962
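/*
 * Illustrative sketch (not part of the scheduler): the w = 1/4 fixed-point
 * EWMA step from util_est_update() above, ignoring the UTIL_AVG_UNCHANGED
 * flag. example_ewma_step() is hypothetical; e.g. ewma = 400, dequeued = 200
 * gives (400*4 - 200) >> 2 = 350, i.e. a quarter of the way down.
 */
static inline unsigned int example_ewma_step(unsigned int ewma,
					     unsigned int dequeued)
{
	unsigned int last_ewma_diff = ewma - dequeued; /* assumes ewma >= dequeued */

	ewma <<= UTIL_EST_WEIGHT_SHIFT;
	ewma  -= last_ewma_diff;
	ewma >>= UTIL_EST_WEIGHT_SHIFT;

	return ewma;
}
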
4963static inline unsigned long get_actual_cpu_capacity(int cpu)
4964{
4965 unsigned long capacity = arch_scale_cpu_capacity(cpu);
4966
4967 capacity -= max(hw_load_avg(cpu_rq(cpu)), cpufreq_get_pressure(cpu));
4968
4969 return capacity;
4970}
4971
4972static inline int util_fits_cpu(unsigned long util,
4973 unsigned long uclamp_min,
4974 unsigned long uclamp_max,
4975 int cpu)
4976{
4977 unsigned long capacity = capacity_of(cpu);
4978 unsigned long capacity_orig;
4979 bool fits, uclamp_max_fits;
4980
4981 /*
4982 * Check if the real util fits without any uclamp boost/cap applied.
4983 */
4984 fits = fits_capacity(util, capacity);
4985
4986 if (!uclamp_is_used())
4987 return fits;
4988
4989 /*
4990 * We must use arch_scale_cpu_capacity() for comparing against uclamp_min and
4991 * uclamp_max. We only care about capacity pressure (by using
4992 * capacity_of()) for comparing against the real util.
4993 *
4994 * If a task is boosted to 1024 for example, we don't want a tiny
4995 * pressure to skew the check whether it fits a CPU or not.
4996 *
4997 * Similarly if a task is capped to arch_scale_cpu_capacity(little_cpu), it
4998 * should fit a little cpu even if there's some pressure.
4999 *
5000 * Only exception is for HW or cpufreq pressure since it has a direct impact
5001 * on available OPP of the system.
5002 *
5003 * We honour it for uclamp_min only as a drop in performance level
5004 * could result in not getting the requested minimum performance level.
5005 *
5006 * For uclamp_max, we can tolerate a drop in performance level as the
5007 * goal is to cap the task. So it's okay if it's getting less.
5008 */
5009 capacity_orig = arch_scale_cpu_capacity(cpu);
5010
5011 /*
5012 * We want to force a task to fit a cpu as implied by uclamp_max.
5013 * But we do have some corner cases to cater for..
5014 *
5015 *
5016 * C=z
5017 * | ___
5018 * | C=y | |
5019 * |_ _ _ _ _ _ _ _ _ ___ _ _ _ | _ | _ _ _ _ _ uclamp_max
5020 * | C=x | | | |
5021 * | ___ | | | |
5022 * | | | | | | | (util somewhere in this region)
5023 * | | | | | | |
5024 * | | | | | | |
5025 * +----------------------------------------
5026 * CPU0 CPU1 CPU2
5027 *
5028 * In the above example if a task is capped to a specific performance
5029 * point, y, then when:
5030 *
5031 * * util = 80% of x then it does not fit on CPU0 and should migrate
5032 * to CPU1
5033 * * util = 80% of y then it is forced to fit on CPU1 to honour
5034 * uclamp_max request.
5035 *
5036 * which is what we're enforcing here. A task always fits if
5037 * uclamp_max <= capacity_orig. But when uclamp_max > capacity_orig,
5038	 * the normal upmigration rules should still hold.
5039 *
5040 * Only exception is when we are on max capacity, then we need to be
5041 * careful not to block overutilized state. This is so because:
5042 *
5043 * 1. There's no concept of capping at max_capacity! We can't go
5044 * beyond this performance level anyway.
5045 * 2. The system is being saturated when we're operating near
5046 * max capacity, it doesn't make sense to block overutilized.
5047 */
5048 uclamp_max_fits = (capacity_orig == SCHED_CAPACITY_SCALE) && (uclamp_max == SCHED_CAPACITY_SCALE);
5049 uclamp_max_fits = !uclamp_max_fits && (uclamp_max <= capacity_orig);
5050 fits = fits || uclamp_max_fits;
5051
5052 /*
5053 *
5054 * C=z
5055 * | ___ (region a, capped, util >= uclamp_max)
5056 * | C=y | |
5057 * |_ _ _ _ _ _ _ _ _ ___ _ _ _ | _ | _ _ _ _ _ uclamp_max
5058 * | C=x | | | |
5059 * | ___ | | | | (region b, uclamp_min <= util <= uclamp_max)
5060 * |_ _ _|_ _|_ _ _ _| _ | _ _ _| _ | _ _ _ _ _ uclamp_min
5061 * | | | | | | |
5062 * | | | | | | | (region c, boosted, util < uclamp_min)
5063 * +----------------------------------------
5064 * CPU0 CPU1 CPU2
5065 *
5066 * a) If util > uclamp_max, then we're capped, we don't care about
5067 * actual fitness value here. We only care if uclamp_max fits
5068 * capacity without taking margin/pressure into account.
5069 * See comment above.
5070 *
5071 * b) If uclamp_min <= util <= uclamp_max, then the normal
5072 * fits_capacity() rules apply. Except we need to ensure that we
5073 * enforce we remain within uclamp_max, see comment above.
5074 *
5075 * c) If util < uclamp_min, then we are boosted. Same as (b) but we
5076 * need to take into account the boosted value fits the CPU without
5077 * taking margin/pressure into account.
5078 *
5079 * Cases (a) and (b) are handled in the 'fits' variable already. We
5080 * just need to consider an extra check for case (c) after ensuring we
5081 * handle the case uclamp_min > uclamp_max.
5082 */
5083 uclamp_min = min(uclamp_min, uclamp_max);
5084 if (fits && (util < uclamp_min) &&
5085 (uclamp_min > get_actual_cpu_capacity(cpu)))
5086 return -1;
5087
5088 return fits;
5089}
5090
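/*
 * Illustrative sketch (not part of the scheduler): how the return value of
 * util_fits_cpu() above is meant to be read. example_read_fit() is
 * hypothetical; a positive value means the CPU fits both the utilization and
 * the uclamp hints, 0 means it does not fit, and -1 means everything fits
 * except that uclamp_min cannot currently be met due to HW/cpufreq pressure.
 */
static inline bool example_read_fit(int fits, bool accept_degraded_boost)
{
	if (fits > 0)
		return true;
	if (fits < 0)
		return accept_degraded_boost;

	return false;
}
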
5091static inline int task_fits_cpu(struct task_struct *p, int cpu)
5092{
5093 unsigned long uclamp_min = uclamp_eff_value(p, UCLAMP_MIN);
5094 unsigned long uclamp_max = uclamp_eff_value(p, UCLAMP_MAX);
5095 unsigned long util = task_util_est(p);
5096 /*
5097 * Return true only if the cpu fully fits the task requirements, which
5098 * include the utilization but also the performance hints.
5099 */
5100 return (util_fits_cpu(util, uclamp_min, uclamp_max, cpu) > 0);
5101}
5102
5103static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
5104{
5105 int cpu = cpu_of(rq);
5106
5107 if (!sched_asym_cpucap_active())
5108 return;
5109
5110 /*
5111	 * Does affinity allow us to go somewhere higher? Or are we on the biggest
5112	 * available CPU already? Or do we fit into this CPU?
5113 */
5114 if (!p || (p->nr_cpus_allowed == 1) ||
5115 (arch_scale_cpu_capacity(cpu) == p->max_allowed_capacity) ||
5116 task_fits_cpu(p, cpu)) {
5117
5118 rq->misfit_task_load = 0;
5119 return;
5120 }
5121
5122 /*
5123 * Make sure that misfit_task_load will not be null even if
5124 * task_h_load() returns 0.
5125 */
5126 rq->misfit_task_load = max_t(unsigned long, task_h_load(p), 1);
5127}
5128
5129#else /* CONFIG_SMP */
5130
5131static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
5132{
5133 return !cfs_rq->nr_running;
5134}
5135
5136#define UPDATE_TG 0x0
5137#define SKIP_AGE_LOAD 0x0
5138#define DO_ATTACH 0x0
5139#define DO_DETACH 0x0
5140
5141static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int not_used1)
5142{
5143 cfs_rq_util_change(cfs_rq, 0);
5144}
5145
5146static inline void remove_entity_load_avg(struct sched_entity *se) {}
5147
5148static inline void
5149attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
5150static inline void
5151detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
5152
5153static inline int sched_balance_newidle(struct rq *rq, struct rq_flags *rf)
5154{
5155 return 0;
5156}
5157
5158static inline void
5159util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {}
5160
5161static inline void
5162util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p) {}
5163
5164static inline void
5165util_est_update(struct cfs_rq *cfs_rq, struct task_struct *p,
5166 bool task_sleep) {}
5167static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {}
5168
5169#endif /* CONFIG_SMP */
5170
5171static void
5172place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
5173{
5174 u64 vslice, vruntime = avg_vruntime(cfs_rq);
5175 s64 lag = 0;
5176
5177 if (!se->custom_slice)
5178 se->slice = sysctl_sched_base_slice;
5179 vslice = calc_delta_fair(se->slice, se);
5180
5181 /*
5182 * Due to how V is constructed as the weighted average of entities,
5183 * adding tasks with positive lag, or removing tasks with negative lag
5184	 * will move 'time' backwards; this can screw around with the lag of
5185 * other tasks.
5186 *
5187 * EEVDF: placement strategy #1 / #2
5188 */
5189 if (sched_feat(PLACE_LAG) && cfs_rq->nr_running && se->vlag) {
5190 struct sched_entity *curr = cfs_rq->curr;
5191 unsigned long load;
5192
5193 lag = se->vlag;
5194
5195 /*
5196 * If we want to place a task and preserve lag, we have to
5197 * consider the effect of the new entity on the weighted
5198 * average and compensate for this, otherwise lag can quickly
5199 * evaporate.
5200 *
5201 * Lag is defined as:
5202 *
5203 * lag_i = S - s_i = w_i * (V - v_i)
5204 *
5205 * To avoid the 'w_i' term all over the place, we only track
5206 * the virtual lag:
5207 *
5208 * vl_i = V - v_i <=> v_i = V - vl_i
5209 *
5210 * And we take V to be the weighted average of all v:
5211 *
5212 * V = (\Sum w_j*v_j) / W
5213 *
5214 * Where W is: \Sum w_j
5215 *
5216 * Then, the weighted average after adding an entity with lag
5217 * vl_i is given by:
5218 *
5219 * V' = (\Sum w_j*v_j + w_i*v_i) / (W + w_i)
5220 * = (W*V + w_i*(V - vl_i)) / (W + w_i)
5221 * = (W*V + w_i*V - w_i*vl_i) / (W + w_i)
5222 * = (V*(W + w_i) - w_i*l) / (W + w_i)
5223 * = V - w_i*vl_i / (W + w_i)
5224 *
5225 * And the actual lag after adding an entity with vl_i is:
5226 *
5227 * vl'_i = V' - v_i
5228 * = V - w_i*vl_i / (W + w_i) - (V - vl_i)
5229 * = vl_i - w_i*vl_i / (W + w_i)
5230 *
5231 * Which is strictly less than vl_i. So in order to preserve lag
5232 * we should inflate the lag before placement such that the
5233 * effective lag after placement comes out right.
5234 *
5235 * As such, invert the above relation for vl'_i to get the vl_i
5236 * we need to use such that the lag after placement is the lag
5237 * we computed before dequeue.
5238 *
5239 * vl'_i = vl_i - w_i*vl_i / (W + w_i)
5240 * = ((W + w_i)*vl_i - w_i*vl_i) / (W + w_i)
5241 *
5242 * (W + w_i)*vl'_i = (W + w_i)*vl_i - w_i*vl_i
5243 * = W*vl_i
5244 *
5245 * vl_i = (W + w_i)*vl'_i / W
5246 */
5247 load = cfs_rq->avg_load;
5248 if (curr && curr->on_rq)
5249 load += scale_load_down(curr->load.weight);
5250
5251 lag *= load + scale_load_down(se->load.weight);
5252 if (WARN_ON_ONCE(!load))
5253 load = 1;
5254 lag = div_s64(lag, load);
5255 }
5256
5257 se->vruntime = vruntime - lag;
5258
5259 if (se->rel_deadline) {
5260 se->deadline += se->vruntime;
5261 se->rel_deadline = 0;
5262 return;
5263 }
5264
5265 /*
5266	 * When joining the competition, the existing tasks will be,
5267	 * on average, halfway through their slice; as such, start tasks
5268 * off with half a slice to ease into the competition.
5269 */
5270 if (sched_feat(PLACE_DEADLINE_INITIAL) && (flags & ENQUEUE_INITIAL))
5271 vslice /= 2;
5272
5273 /*
5274 * EEVDF: vd_i = ve_i + r_i/w_i
5275 */
5276 se->deadline = se->vruntime + vslice;
5277}
5278
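/*
 * Illustrative sketch (not part of the scheduler): the lag inflation derived
 * in the comment inside place_entity() above. For the post-placement lag to
 * come out as vl'_i, the lag is scaled up by (W + w_i) / W before placement.
 * example_inflate_vlag() is hypothetical.
 */
static inline s64 example_inflate_vlag(s64 vlag, unsigned long w_total,
				       unsigned long w_i)
{
	if (!w_total)
		w_total = 1;

	/* e.g. vlag = 1000, w_total = 3072, w_i = 1024 -> inflated vlag = 1333 */
	return div_s64(vlag * (w_total + w_i), w_total);
}
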
5279static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
5280static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq);
5281
5282static inline bool cfs_bandwidth_used(void);
5283
5284static void
5285requeue_delayed_entity(struct sched_entity *se);
5286
5287static void
5288enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
5289{
5290 bool curr = cfs_rq->curr == se;
5291
5292 /*
5293 * If we're the current task, we must renormalise before calling
5294 * update_curr().
5295 */
5296 if (curr)
5297 place_entity(cfs_rq, se, flags);
5298
5299 update_curr(cfs_rq);
5300
5301 /*
5302 * When enqueuing a sched_entity, we must:
5303 * - Update loads to have both entity and cfs_rq synced with now.
5304 * - For group_entity, update its runnable_weight to reflect the new
5305 * h_nr_running of its group cfs_rq.
5306 * - For group_entity, update its weight to reflect the new share of
5307 * its group cfs_rq
5308 * - Add its new weight to cfs_rq->load.weight
5309 */
5310 update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH);
5311 se_update_runnable(se);
5312 /*
5313 * XXX update_load_avg() above will have attached us to the pelt sum;
5314 * but update_cfs_group() here will re-adjust the weight and have to
5315 * undo/redo all that. Seems wasteful.
5316 */
5317 update_cfs_group(se);
5318
5319 /*
5320	 * XXX now that the entity has been re-weighted, and its lag adjusted,
5321 * we can place the entity.
5322 */
5323 if (!curr)
5324 place_entity(cfs_rq, se, flags);
5325
5326 account_entity_enqueue(cfs_rq, se);
5327
5328 /* Entity has migrated, no longer consider this task hot */
5329 if (flags & ENQUEUE_MIGRATED)
5330 se->exec_start = 0;
5331
5332 check_schedstat_required();
5333 update_stats_enqueue_fair(cfs_rq, se, flags);
5334 if (!curr)
5335 __enqueue_entity(cfs_rq, se);
5336 se->on_rq = 1;
5337
5338 if (cfs_rq->nr_running == 1) {
5339 check_enqueue_throttle(cfs_rq);
5340 if (!throttled_hierarchy(cfs_rq)) {
5341 list_add_leaf_cfs_rq(cfs_rq);
5342 } else {
5343#ifdef CONFIG_CFS_BANDWIDTH
5344 struct rq *rq = rq_of(cfs_rq);
5345
5346 if (cfs_rq_throttled(cfs_rq) && !cfs_rq->throttled_clock)
5347 cfs_rq->throttled_clock = rq_clock(rq);
5348 if (!cfs_rq->throttled_clock_self)
5349 cfs_rq->throttled_clock_self = rq_clock(rq);
5350#endif
5351 }
5352 }
5353}
5354
5355static void __clear_buddies_next(struct sched_entity *se)
5356{
5357 for_each_sched_entity(se) {
5358 struct cfs_rq *cfs_rq = cfs_rq_of(se);
5359 if (cfs_rq->next != se)
5360 break;
5361
5362 cfs_rq->next = NULL;
5363 }
5364}
5365
5366static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
5367{
5368 if (cfs_rq->next == se)
5369 __clear_buddies_next(se);
5370}
5371
5372static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
5373
5374static void set_delayed(struct sched_entity *se)
5375{
5376 se->sched_delayed = 1;
5377
5378 /*
5379 * Delayed se of cfs_rq have no tasks queued on them.
5380 * Do not adjust h_nr_runnable since dequeue_entities()
5381 * will account it for blocked tasks.
5382 */
5383 if (!entity_is_task(se))
5384 return;
5385
5386 for_each_sched_entity(se) {
5387 struct cfs_rq *cfs_rq = cfs_rq_of(se);
5388
5389 cfs_rq->h_nr_delayed++;
5390 if (cfs_rq_throttled(cfs_rq))
5391 break;
5392 }
5393}
5394
5395static void clear_delayed(struct sched_entity *se)
5396{
5397 se->sched_delayed = 0;
5398
5399 /*
5400 * Delayed se of cfs_rq have no tasks queued on them.
5401 * Do not adjust h_nr_runnable since a dequeue has
5402 * already accounted for it or an enqueue of a task
5403 * below it will account for it in enqueue_task_fair().
5404 */
5405 if (!entity_is_task(se))
5406 return;
5407
5408 for_each_sched_entity(se) {
5409 struct cfs_rq *cfs_rq = cfs_rq_of(se);
5410
5411 cfs_rq->h_nr_delayed--;
5412 if (cfs_rq_throttled(cfs_rq))
5413 break;
5414 }
5415}
5416
5417static inline void finish_delayed_dequeue_entity(struct sched_entity *se)
5418{
5419 clear_delayed(se);
5420 if (sched_feat(DELAY_ZERO) && se->vlag > 0)
5421 se->vlag = 0;
5422}
5423
5424static bool
5425dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
5426{
5427 bool sleep = flags & DEQUEUE_SLEEP;
5428
5429 update_curr(cfs_rq);
5430 clear_buddies(cfs_rq, se);
5431
5432 if (flags & DEQUEUE_DELAYED) {
5433 SCHED_WARN_ON(!se->sched_delayed);
5434 } else {
5435 bool delay = sleep;
5436 /*
5437		 * DELAY_DEQUEUE relies on spurious wakeups; special task
5438		 * states must not suffer spurious wakeups, so exempt them.
5439 */
5440 if (flags & DEQUEUE_SPECIAL)
5441 delay = false;
5442
5443 SCHED_WARN_ON(delay && se->sched_delayed);
5444
5445 if (sched_feat(DELAY_DEQUEUE) && delay &&
5446 !entity_eligible(cfs_rq, se)) {
5447 update_load_avg(cfs_rq, se, 0);
5448 set_delayed(se);
5449 return false;
5450 }
5451 }
5452
5453 int action = UPDATE_TG;
5454 if (entity_is_task(se) && task_on_rq_migrating(task_of(se)))
5455 action |= DO_DETACH;
5456
5457 /*
5458 * When dequeuing a sched_entity, we must:
5459 * - Update loads to have both entity and cfs_rq synced with now.
5460 * - For group_entity, update its runnable_weight to reflect the new
5461 * h_nr_running of its group cfs_rq.
5462 * - Subtract its previous weight from cfs_rq->load.weight.
5463 * - For group entity, update its weight to reflect the new share
5464 * of its group cfs_rq.
5465 */
5466 update_load_avg(cfs_rq, se, action);
5467 se_update_runnable(se);
5468
5469 update_stats_dequeue_fair(cfs_rq, se, flags);
5470
5471 update_entity_lag(cfs_rq, se);
5472 if (sched_feat(PLACE_REL_DEADLINE) && !sleep) {
5473 se->deadline -= se->vruntime;
5474 se->rel_deadline = 1;
5475 }
5476
5477 if (se != cfs_rq->curr)
5478 __dequeue_entity(cfs_rq, se);
5479 se->on_rq = 0;
5480 account_entity_dequeue(cfs_rq, se);
5481
5482 /* return excess runtime on last dequeue */
5483 return_cfs_rq_runtime(cfs_rq);
5484
5485 update_cfs_group(se);
5486
5487 /*
5488 * Now advance min_vruntime if @se was the entity holding it back,
5489 * except when: DEQUEUE_SAVE && !DEQUEUE_MOVE, in this case we'll be
5490 * put back on, and if we advance min_vruntime, we'll be placed back
5491 * further than we started -- i.e. we'll be penalized.
5492 */
5493 if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE)
5494 update_min_vruntime(cfs_rq);
5495
5496 if (flags & DEQUEUE_DELAYED)
5497 finish_delayed_dequeue_entity(se);
5498
5499 if (cfs_rq->nr_running == 0)
5500 update_idle_cfs_rq_clock_pelt(cfs_rq);
5501
5502 return true;
5503}
5504
5505static void
5506set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
5507{
5508 clear_buddies(cfs_rq, se);
5509
5510 /* 'current' is not kept within the tree. */
5511 if (se->on_rq) {
5512 /*
5513		 * Any task has to be enqueued before it gets to execute on
5514 * a CPU. So account for the time it spent waiting on the
5515 * runqueue.
5516 */
5517 update_stats_wait_end_fair(cfs_rq, se);
5518 __dequeue_entity(cfs_rq, se);
5519 update_load_avg(cfs_rq, se, UPDATE_TG);
5520 /*
5521 * HACK, stash a copy of deadline at the point of pick in vlag,
5522 * which isn't used until dequeue.
5523 */
5524 se->vlag = se->deadline;
5525 }
5526
5527 update_stats_curr_start(cfs_rq, se);
5528 SCHED_WARN_ON(cfs_rq->curr);
5529 cfs_rq->curr = se;
5530
5531 /*
5532 * Track our maximum slice length, if the CPU's load is at
5533 * least twice that of our own weight (i.e. don't track it
5534 * when there are only lesser-weight tasks around):
5535 */
5536 if (schedstat_enabled() &&
5537 rq_of(cfs_rq)->cfs.load.weight >= 2*se->load.weight) {
5538 struct sched_statistics *stats;
5539
5540 stats = __schedstats_from_se(se);
5541 __schedstat_set(stats->slice_max,
5542 max((u64)stats->slice_max,
5543 se->sum_exec_runtime - se->prev_sum_exec_runtime));
5544 }
5545
5546 se->prev_sum_exec_runtime = se->sum_exec_runtime;
5547}
5548
5549static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags);
5550
5551/*
5552 * Pick the next process, keeping these things in mind, in this order:
5553 * 1) keep things fair between processes/task groups
5554 * 2) pick the "next" process, since someone really wants that to run
5555 * 3) pick the "last" process, for cache locality
5556 * 4) do not run the "skip" process, if something else is available
5557 */
5558static struct sched_entity *
5559pick_next_entity(struct rq *rq, struct cfs_rq *cfs_rq)
5560{
5561 /*
5562 * Picking the ->next buddy will affect latency but not fairness.
5563 */
5564 if (sched_feat(PICK_BUDDY) &&
5565 cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next)) {
5566 /* ->next will never be delayed */
5567 SCHED_WARN_ON(cfs_rq->next->sched_delayed);
5568 return cfs_rq->next;
5569 }
5570
5571 struct sched_entity *se = pick_eevdf(cfs_rq);
5572 if (se->sched_delayed) {
5573 dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
5574 /*
5575 * Must not reference @se again, see __block_task().
5576 */
5577 return NULL;
5578 }
5579 return se;
5580}
5581
5582static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
5583
5584static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
5585{
5586 /*
5587 * If still on the runqueue then deactivate_task()
5588 * was not called and update_curr() has to be done:
5589 */
5590 if (prev->on_rq)
5591 update_curr(cfs_rq);
5592
5593 /* throttle cfs_rqs exceeding runtime */
5594 check_cfs_rq_runtime(cfs_rq);
5595
5596 if (prev->on_rq) {
5597 update_stats_wait_start_fair(cfs_rq, prev);
5598 /* Put 'current' back into the tree. */
5599 __enqueue_entity(cfs_rq, prev);
5600 /* in !on_rq case, update occurred at dequeue */
5601 update_load_avg(cfs_rq, prev, 0);
5602 }
5603 SCHED_WARN_ON(cfs_rq->curr != prev);
5604 cfs_rq->curr = NULL;
5605}
5606
5607static void
5608entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
5609{
5610 /*
5611 * Update run-time statistics of the 'current'.
5612 */
5613 update_curr(cfs_rq);
5614
5615 /*
5616 * Ensure that runnable average is periodically updated.
5617 */
5618 update_load_avg(cfs_rq, curr, UPDATE_TG);
5619 update_cfs_group(curr);
5620
5621#ifdef CONFIG_SCHED_HRTICK
5622 /*
5623 * queued ticks are scheduled to match the slice, so don't bother
5624 * validating it and just reschedule.
5625 */
5626 if (queued) {
5627 resched_curr_lazy(rq_of(cfs_rq));
5628 return;
5629 }
5630#endif
5631}
5632
5633
5634/**************************************************
5635 * CFS bandwidth control machinery
5636 */
5637
5638#ifdef CONFIG_CFS_BANDWIDTH
5639
5640#ifdef CONFIG_JUMP_LABEL
5641static struct static_key __cfs_bandwidth_used;
5642
5643static inline bool cfs_bandwidth_used(void)
5644{
5645 return static_key_false(&__cfs_bandwidth_used);
5646}
5647
5648void cfs_bandwidth_usage_inc(void)
5649{
5650 static_key_slow_inc_cpuslocked(&__cfs_bandwidth_used);
5651}
5652
5653void cfs_bandwidth_usage_dec(void)
5654{
5655 static_key_slow_dec_cpuslocked(&__cfs_bandwidth_used);
5656}
5657#else /* CONFIG_JUMP_LABEL */
5658static bool cfs_bandwidth_used(void)
5659{
5660 return true;
5661}
5662
5663void cfs_bandwidth_usage_inc(void) {}
5664void cfs_bandwidth_usage_dec(void) {}
5665#endif /* CONFIG_JUMP_LABEL */
5666
5667/*
5668 * default period for cfs group bandwidth.
5669 * default: 0.1s, units: nanoseconds
5670 */
5671static inline u64 default_cfs_period(void)
5672{
5673 return 100000000ULL;
5674}
5675
5676static inline u64 sched_cfs_bandwidth_slice(void)
5677{
5678 return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
5679}
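
/*
 * For illustration (example values, not requirements): with the
 * default 100ms period and a quota of 50ms (cgroup v2: "50000 100000"
 * written to cpu.max), the group may consume at most 50ms of CPU time
 * per 100ms period, summed over all CPUs.  Each per-CPU cfs_rq pulls
 * that runtime from the global pool one slice at a time
 * (sysctl_sched_cfs_bandwidth_slice, 5ms by default), which limits how
 * much of the pool any single CPU grabs in one go.
 */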
5680
5681/*
5682 * Replenish runtime according to assigned quota. We use sched_clock_cpu
5683 * directly instead of rq->clock to avoid adding additional synchronization
5684 * around rq->lock.
5685 *
5686 * requires cfs_b->lock
5687 */
5688void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
5689{
5690 s64 runtime;
5691
5692 if (unlikely(cfs_b->quota == RUNTIME_INF))
5693 return;
5694
5695 cfs_b->runtime += cfs_b->quota;
5696 runtime = cfs_b->runtime_snap - cfs_b->runtime;
5697 if (runtime > 0) {
5698 cfs_b->burst_time += runtime;
5699 cfs_b->nr_burst++;
5700 }
5701
5702 cfs_b->runtime = min(cfs_b->runtime, cfs_b->quota + cfs_b->burst);
5703 cfs_b->runtime_snap = cfs_b->runtime;
5704}
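
/*
 * Example of the clamp above (illustrative numbers): with quota = 50ms
 * and burst = 20ms, a period in which only 30ms was consumed leaves
 * 20ms unused; after adding the fresh quota the pool holds 70ms, which
 * is exactly the quota + burst ceiling, so up to 20ms of "burst" may
 * be spent on top of the quota in the following period.  Any surplus
 * beyond that ceiling is discarded by the min() above.
 */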
5705
5706static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
5707{
5708 return &tg->cfs_bandwidth;
5709}
5710
5711/* returns 0 on failure to allocate runtime */
5712static int __assign_cfs_rq_runtime(struct cfs_bandwidth *cfs_b,
5713 struct cfs_rq *cfs_rq, u64 target_runtime)
5714{
5715 u64 min_amount, amount = 0;
5716
5717 lockdep_assert_held(&cfs_b->lock);
5718
5719 /* note: this is a positive sum as runtime_remaining <= 0 */
5720 min_amount = target_runtime - cfs_rq->runtime_remaining;
5721
5722 if (cfs_b->quota == RUNTIME_INF)
5723 amount = min_amount;
5724 else {
5725 start_cfs_bandwidth(cfs_b);
5726
5727 if (cfs_b->runtime > 0) {
5728 amount = min(cfs_b->runtime, min_amount);
5729 cfs_b->runtime -= amount;
5730 cfs_b->idle = 0;
5731 }
5732 }
5733
5734 cfs_rq->runtime_remaining += amount;
5735
5736 return cfs_rq->runtime_remaining > 0;
5737}
5738
5739/* returns 0 on failure to allocate runtime */
5740static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
5741{
5742 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
5743 int ret;
5744
5745 raw_spin_lock(&cfs_b->lock);
5746 ret = __assign_cfs_rq_runtime(cfs_b, cfs_rq, sched_cfs_bandwidth_slice());
5747 raw_spin_unlock(&cfs_b->lock);
5748
5749 return ret;
5750}
5751
5752static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
5753{
5754 /* dock delta_exec before expiring quota (as it could span periods) */
5755 cfs_rq->runtime_remaining -= delta_exec;
5756
5757 if (likely(cfs_rq->runtime_remaining > 0))
5758 return;
5759
5760 if (cfs_rq->throttled)
5761 return;
5762 /*
5763 * if we're unable to extend our runtime we resched so that the active
5764 * hierarchy can be throttled
5765 */
5766 if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
5767 resched_curr(rq_of(cfs_rq));
5768}
5769
5770static __always_inline
5771void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
5772{
5773 if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
5774 return;
5775
5776 __account_cfs_rq_runtime(cfs_rq, delta_exec);
5777}
5778
5779static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
5780{
5781 return cfs_bandwidth_used() && cfs_rq->throttled;
5782}
5783
5784/* check whether cfs_rq, or any parent, is throttled */
5785static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
5786{
5787 return cfs_bandwidth_used() && cfs_rq->throttle_count;
5788}
5789
5790/*
5791 * Ensure that neither of the group entities corresponding to src_cpu or
5792 * dest_cpu are members of a throttled hierarchy when performing group
5793 * load-balance operations.
5794 */
5795static inline int throttled_lb_pair(struct task_group *tg,
5796 int src_cpu, int dest_cpu)
5797{
5798 struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
5799
5800 src_cfs_rq = tg->cfs_rq[src_cpu];
5801 dest_cfs_rq = tg->cfs_rq[dest_cpu];
5802
5803 return throttled_hierarchy(src_cfs_rq) ||
5804 throttled_hierarchy(dest_cfs_rq);
5805}
5806
5807static int tg_unthrottle_up(struct task_group *tg, void *data)
5808{
5809 struct rq *rq = data;
5810 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
5811
5812 cfs_rq->throttle_count--;
5813 if (!cfs_rq->throttle_count) {
5814 cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) -
5815 cfs_rq->throttled_clock_pelt;
5816
5817 /* Add cfs_rq with load or one or more already running entities to the list */
5818 if (!cfs_rq_is_decayed(cfs_rq))
5819 list_add_leaf_cfs_rq(cfs_rq);
5820
5821 if (cfs_rq->throttled_clock_self) {
5822 u64 delta = rq_clock(rq) - cfs_rq->throttled_clock_self;
5823
5824 cfs_rq->throttled_clock_self = 0;
5825
5826 if (SCHED_WARN_ON((s64)delta < 0))
5827 delta = 0;
5828
5829 cfs_rq->throttled_clock_self_time += delta;
5830 }
5831 }
5832
5833 return 0;
5834}
5835
5836static int tg_throttle_down(struct task_group *tg, void *data)
5837{
5838 struct rq *rq = data;
5839 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
5840
5841 /* group is entering throttled state, stop time */
5842 if (!cfs_rq->throttle_count) {
5843 cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
5844 list_del_leaf_cfs_rq(cfs_rq);
5845
5846 SCHED_WARN_ON(cfs_rq->throttled_clock_self);
5847 if (cfs_rq->nr_running)
5848 cfs_rq->throttled_clock_self = rq_clock(rq);
5849 }
5850 cfs_rq->throttle_count++;
5851
5852 return 0;
5853}
5854
5855static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
5856{
5857 struct rq *rq = rq_of(cfs_rq);
5858 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
5859 struct sched_entity *se;
5860 long task_delta, idle_task_delta, delayed_delta, dequeue = 1;
5861 long rq_h_nr_running = rq->cfs.h_nr_running;
5862
5863 raw_spin_lock(&cfs_b->lock);
5864 /* This will start the period timer if necessary */
5865 if (__assign_cfs_rq_runtime(cfs_b, cfs_rq, 1)) {
5866 /*
5867 * We have raced with bandwidth becoming available, and if we
5868 * actually throttled the timer might not unthrottle us for an
5869 * entire period. We additionally needed to make sure that any
5870 * subsequent check_cfs_rq_runtime calls agree not to throttle
5871 * us, as we may commit to do cfs put_prev+pick_next, so we ask
5872 * for 1ns of runtime rather than just check cfs_b.
5873 */
5874 dequeue = 0;
5875 } else {
5876 list_add_tail_rcu(&cfs_rq->throttled_list,
5877 &cfs_b->throttled_cfs_rq);
5878 }
5879 raw_spin_unlock(&cfs_b->lock);
5880
5881 if (!dequeue)
5882 return false; /* Throttle no longer required. */
5883
5884 se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
5885
5886 /* freeze hierarchy runnable averages while throttled */
5887 rcu_read_lock();
5888 walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
5889 rcu_read_unlock();
5890
5891 task_delta = cfs_rq->h_nr_running;
5892 idle_task_delta = cfs_rq->idle_h_nr_running;
5893 delayed_delta = cfs_rq->h_nr_delayed;
5894 for_each_sched_entity(se) {
5895 struct cfs_rq *qcfs_rq = cfs_rq_of(se);
5896 int flags;
5897
5898 /* throttled entity or throttle-on-deactivate */
5899 if (!se->on_rq)
5900 goto done;
5901
5902 /*
5903 * Abuse SPECIAL to avoid delayed dequeue in this instance.
5904 * This avoids teaching dequeue_entities() about throttled
5905 * entities and keeps things relatively simple.
5906 */
5907 flags = DEQUEUE_SLEEP | DEQUEUE_SPECIAL;
5908 if (se->sched_delayed)
5909 flags |= DEQUEUE_DELAYED;
5910 dequeue_entity(qcfs_rq, se, flags);
5911
5912 if (cfs_rq_is_idle(group_cfs_rq(se)))
5913 idle_task_delta = cfs_rq->h_nr_running;
5914
5915 qcfs_rq->h_nr_running -= task_delta;
5916 qcfs_rq->idle_h_nr_running -= idle_task_delta;
5917 qcfs_rq->h_nr_delayed -= delayed_delta;
5918
5919 if (qcfs_rq->load.weight) {
5920 /* Avoid re-evaluating load for this entity: */
5921 se = parent_entity(se);
5922 break;
5923 }
5924 }
5925
5926 for_each_sched_entity(se) {
5927 struct cfs_rq *qcfs_rq = cfs_rq_of(se);
5928 /* throttled entity or throttle-on-deactivate */
5929 if (!se->on_rq)
5930 goto done;
5931
5932 update_load_avg(qcfs_rq, se, 0);
5933 se_update_runnable(se);
5934
5935 if (cfs_rq_is_idle(group_cfs_rq(se)))
5936 idle_task_delta = cfs_rq->h_nr_running;
5937
5938 qcfs_rq->h_nr_running -= task_delta;
5939 qcfs_rq->idle_h_nr_running -= idle_task_delta;
5940 qcfs_rq->h_nr_delayed -= delayed_delta;
5941 }
5942
5943	/* At this point se is NULL and we are at root level */
5944 sub_nr_running(rq, task_delta);
5945
5946 /* Stop the fair server if throttling resulted in no runnable tasks */
5947 if (rq_h_nr_running && !rq->cfs.h_nr_running)
5948 dl_server_stop(&rq->fair_server);
5949done:
5950 /*
5951 * Note: distribution will already see us throttled via the
5952 * throttled-list. rq->lock protects completion.
5953 */
5954 cfs_rq->throttled = 1;
5955 SCHED_WARN_ON(cfs_rq->throttled_clock);
5956 if (cfs_rq->nr_running)
5957 cfs_rq->throttled_clock = rq_clock(rq);
5958 return true;
5959}
5960
5961void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
5962{
5963 struct rq *rq = rq_of(cfs_rq);
5964 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
5965 struct sched_entity *se;
5966 long task_delta, idle_task_delta, delayed_delta;
5967 long rq_h_nr_running = rq->cfs.h_nr_running;
5968
5969 se = cfs_rq->tg->se[cpu_of(rq)];
5970
5971 cfs_rq->throttled = 0;
5972
5973 update_rq_clock(rq);
5974
5975 raw_spin_lock(&cfs_b->lock);
5976 if (cfs_rq->throttled_clock) {
5977 cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;
5978 cfs_rq->throttled_clock = 0;
5979 }
5980 list_del_rcu(&cfs_rq->throttled_list);
5981 raw_spin_unlock(&cfs_b->lock);
5982
5983 /* update hierarchical throttle state */
5984 walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
5985
5986 if (!cfs_rq->load.weight) {
5987 if (!cfs_rq->on_list)
5988 return;
5989 /*
5990 * Nothing to run but something to decay (on_list)?
5991 * Complete the branch.
5992 */
5993 for_each_sched_entity(se) {
5994 if (list_add_leaf_cfs_rq(cfs_rq_of(se)))
5995 break;
5996 }
5997 goto unthrottle_throttle;
5998 }
5999
6000 task_delta = cfs_rq->h_nr_running;
6001 idle_task_delta = cfs_rq->idle_h_nr_running;
6002 delayed_delta = cfs_rq->h_nr_delayed;
6003 for_each_sched_entity(se) {
6004 struct cfs_rq *qcfs_rq = cfs_rq_of(se);
6005
6006 /* Handle any unfinished DELAY_DEQUEUE business first. */
6007 if (se->sched_delayed) {
6008 int flags = DEQUEUE_SLEEP | DEQUEUE_DELAYED;
6009
6010 dequeue_entity(qcfs_rq, se, flags);
6011 } else if (se->on_rq)
6012 break;
6013 enqueue_entity(qcfs_rq, se, ENQUEUE_WAKEUP);
6014
6015 if (cfs_rq_is_idle(group_cfs_rq(se)))
6016 idle_task_delta = cfs_rq->h_nr_running;
6017
6018 qcfs_rq->h_nr_running += task_delta;
6019 qcfs_rq->idle_h_nr_running += idle_task_delta;
6020 qcfs_rq->h_nr_delayed += delayed_delta;
6021
6022 /* end evaluation on encountering a throttled cfs_rq */
6023 if (cfs_rq_throttled(qcfs_rq))
6024 goto unthrottle_throttle;
6025 }
6026
6027 for_each_sched_entity(se) {
6028 struct cfs_rq *qcfs_rq = cfs_rq_of(se);
6029
6030 update_load_avg(qcfs_rq, se, UPDATE_TG);
6031 se_update_runnable(se);
6032
6033 if (cfs_rq_is_idle(group_cfs_rq(se)))
6034 idle_task_delta = cfs_rq->h_nr_running;
6035
6036 qcfs_rq->h_nr_running += task_delta;
6037 qcfs_rq->idle_h_nr_running += idle_task_delta;
6038 qcfs_rq->h_nr_delayed += delayed_delta;
6039
6040 /* end evaluation on encountering a throttled cfs_rq */
6041 if (cfs_rq_throttled(qcfs_rq))
6042 goto unthrottle_throttle;
6043 }
6044
6045 /* Start the fair server if un-throttling resulted in new runnable tasks */
6046 if (!rq_h_nr_running && rq->cfs.h_nr_running)
6047 dl_server_start(&rq->fair_server);
6048
6049	/* At this point se is NULL and we are at root level */
6050 add_nr_running(rq, task_delta);
6051
6052unthrottle_throttle:
6053 assert_list_leaf_cfs_rq(rq);
6054
6055 /* Determine whether we need to wake up potentially idle CPU: */
6056 if (rq->curr == rq->idle && rq->cfs.nr_running)
6057 resched_curr(rq);
6058}
6059
6060#ifdef CONFIG_SMP
6061static void __cfsb_csd_unthrottle(void *arg)
6062{
6063 struct cfs_rq *cursor, *tmp;
6064 struct rq *rq = arg;
6065 struct rq_flags rf;
6066
6067 rq_lock(rq, &rf);
6068
6069 /*
6070	 * Iterating over the list can trigger several calls to
6071 * update_rq_clock() in unthrottle_cfs_rq().
6072 * Do it once and skip the potential next ones.
6073 */
6074 update_rq_clock(rq);
6075 rq_clock_start_loop_update(rq);
6076
6077 /*
6078 * Since we hold rq lock we're safe from concurrent manipulation of
6079 * the CSD list. However, this RCU critical section annotates the
6080 * fact that we pair with sched_free_group_rcu(), so that we cannot
6081 * race with group being freed in the window between removing it
6082 * from the list and advancing to the next entry in the list.
6083 */
6084 rcu_read_lock();
6085
6086 list_for_each_entry_safe(cursor, tmp, &rq->cfsb_csd_list,
6087 throttled_csd_list) {
6088 list_del_init(&cursor->throttled_csd_list);
6089
6090 if (cfs_rq_throttled(cursor))
6091 unthrottle_cfs_rq(cursor);
6092 }
6093
6094 rcu_read_unlock();
6095
6096 rq_clock_stop_loop_update(rq);
6097 rq_unlock(rq, &rf);
6098}
6099
6100static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq)
6101{
6102 struct rq *rq = rq_of(cfs_rq);
6103 bool first;
6104
6105 if (rq == this_rq()) {
6106 unthrottle_cfs_rq(cfs_rq);
6107 return;
6108 }
6109
6110 /* Already enqueued */
6111 if (SCHED_WARN_ON(!list_empty(&cfs_rq->throttled_csd_list)))
6112 return;
6113
6114 first = list_empty(&rq->cfsb_csd_list);
6115 list_add_tail(&cfs_rq->throttled_csd_list, &rq->cfsb_csd_list);
6116 if (first)
6117 smp_call_function_single_async(cpu_of(rq), &rq->cfsb_csd);
6118}
6119#else
6120static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq)
6121{
6122 unthrottle_cfs_rq(cfs_rq);
6123}
6124#endif
6125
6126static void unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq)
6127{
6128 lockdep_assert_rq_held(rq_of(cfs_rq));
6129
6130 if (SCHED_WARN_ON(!cfs_rq_throttled(cfs_rq) ||
6131 cfs_rq->runtime_remaining <= 0))
6132 return;
6133
6134 __unthrottle_cfs_rq_async(cfs_rq);
6135}
6136
6137static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
6138{
6139 int this_cpu = smp_processor_id();
6140 u64 runtime, remaining = 1;
6141 bool throttled = false;
6142 struct cfs_rq *cfs_rq, *tmp;
6143 struct rq_flags rf;
6144 struct rq *rq;
6145 LIST_HEAD(local_unthrottle);
6146
6147 rcu_read_lock();
6148 list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
6149 throttled_list) {
6150 rq = rq_of(cfs_rq);
6151
6152 if (!remaining) {
6153 throttled = true;
6154 break;
6155 }
6156
6157 rq_lock_irqsave(rq, &rf);
6158 if (!cfs_rq_throttled(cfs_rq))
6159 goto next;
6160
6161 /* Already queued for async unthrottle */
6162 if (!list_empty(&cfs_rq->throttled_csd_list))
6163 goto next;
6164
6165 /* By the above checks, this should never be true */
6166 SCHED_WARN_ON(cfs_rq->runtime_remaining > 0);
6167
6168 raw_spin_lock(&cfs_b->lock);
6169 runtime = -cfs_rq->runtime_remaining + 1;
6170 if (runtime > cfs_b->runtime)
6171 runtime = cfs_b->runtime;
6172 cfs_b->runtime -= runtime;
6173 remaining = cfs_b->runtime;
6174 raw_spin_unlock(&cfs_b->lock);
6175
6176 cfs_rq->runtime_remaining += runtime;
6177
6178 /* we check whether we're throttled above */
6179 if (cfs_rq->runtime_remaining > 0) {
6180 if (cpu_of(rq) != this_cpu) {
6181 unthrottle_cfs_rq_async(cfs_rq);
6182 } else {
6183 /*
6184 * We currently only expect to be unthrottling
6185 * a single cfs_rq locally.
6186 */
6187 SCHED_WARN_ON(!list_empty(&local_unthrottle));
6188 list_add_tail(&cfs_rq->throttled_csd_list,
6189 &local_unthrottle);
6190 }
6191 } else {
6192 throttled = true;
6193 }
6194
6195next:
6196 rq_unlock_irqrestore(rq, &rf);
6197 }
6198
6199 list_for_each_entry_safe(cfs_rq, tmp, &local_unthrottle,
6200 throttled_csd_list) {
6201 struct rq *rq = rq_of(cfs_rq);
6202
6203 rq_lock_irqsave(rq, &rf);
6204
6205 list_del_init(&cfs_rq->throttled_csd_list);
6206
6207 if (cfs_rq_throttled(cfs_rq))
6208 unthrottle_cfs_rq(cfs_rq);
6209
6210 rq_unlock_irqrestore(rq, &rf);
6211 }
6212 SCHED_WARN_ON(!list_empty(&local_unthrottle));
6213
6214 rcu_read_unlock();
6215
6216 return throttled;
6217}
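
/*
 * In short: each throttled cfs_rq on the list is topped up with just
 * enough runtime from the global pool to reach a positive
 * runtime_remaining (one extra nanosecond), then unthrottled: after
 * the RCU walk for the local CPU, via the CSD mechanism for remote
 * CPUs.  The walk stops early once the global pool runs dry, in which
 * case the function reports that throttled cfs_rqs remain.
 */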
6218
6219/*
6220 * Responsible for refilling a task_group's bandwidth and unthrottling its
6221 * cfs_rqs as appropriate. If there has been no activity within the last
6222 * period the timer is deactivated until scheduling resumes; cfs_b->idle is
6223 * used to track this state.
6224 */
6225static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, unsigned long flags)
6226{
6227 int throttled;
6228
6229 /* no need to continue the timer with no bandwidth constraint */
6230 if (cfs_b->quota == RUNTIME_INF)
6231 goto out_deactivate;
6232
6233 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
6234 cfs_b->nr_periods += overrun;
6235
6236 /* Refill extra burst quota even if cfs_b->idle */
6237 __refill_cfs_bandwidth_runtime(cfs_b);
6238
6239 /*
6240 * idle depends on !throttled (for the case of a large deficit), and if
6241 * we're going inactive then everything else can be deferred
6242 */
6243 if (cfs_b->idle && !throttled)
6244 goto out_deactivate;
6245
6246 if (!throttled) {
6247 /* mark as potentially idle for the upcoming period */
6248 cfs_b->idle = 1;
6249 return 0;
6250 }
6251
6252 /* account preceding periods in which throttling occurred */
6253 cfs_b->nr_throttled += overrun;
6254
6255 /*
6256 * This check is repeated as we release cfs_b->lock while we unthrottle.
6257 */
6258 while (throttled && cfs_b->runtime > 0) {
6259 raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
6260 /* we can't nest cfs_b->lock while distributing bandwidth */
6261 throttled = distribute_cfs_runtime(cfs_b);
6262 raw_spin_lock_irqsave(&cfs_b->lock, flags);
6263 }
6264
6265 /*
6266 * While we are ensured activity in the period following an
6267 * unthrottle, this also covers the case in which the new bandwidth is
6268 * insufficient to cover the existing bandwidth deficit. (Forcing the
6269 * timer to remain active while there are any throttled entities.)
6270 */
6271 cfs_b->idle = 0;
6272
6273 return 0;
6274
6275out_deactivate:
6276 return 1;
6277}
6278
6279/* a cfs_rq won't donate quota below this amount */
6280static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
6281/* minimum remaining period time to redistribute slack quota */
6282static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
6283/* how long we wait to gather additional slack before distributing */
6284static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
6285
6286/*
6287 * Are we near the end of the current quota period?
6288 *
6289 * Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the
6290 * hrtimer base being cleared by hrtimer_start. In the case of
6291 * migrate_hrtimers, base is never cleared, so we are fine.
6292 */
6293static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
6294{
6295 struct hrtimer *refresh_timer = &cfs_b->period_timer;
6296 s64 remaining;
6297
6298 /* if the call-back is running a quota refresh is already occurring */
6299 if (hrtimer_callback_running(refresh_timer))
6300 return 1;
6301
6302 /* is a quota refresh about to occur? */
6303 remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
6304 if (remaining < (s64)min_expire)
6305 return 1;
6306
6307 return 0;
6308}
6309
6310static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
6311{
6312 u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
6313
6314 /* if there's a quota refresh soon don't bother with slack */
6315 if (runtime_refresh_within(cfs_b, min_left))
6316 return;
6317
6318 /* don't push forwards an existing deferred unthrottle */
6319 if (cfs_b->slack_started)
6320 return;
6321 cfs_b->slack_started = true;
6322
6323 hrtimer_start(&cfs_b->slack_timer,
6324 ns_to_ktime(cfs_bandwidth_slack_period),
6325 HRTIMER_MODE_REL);
6326}
6327
6328/* we know any runtime found here is valid as update_curr() precedes return */
6329static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
6330{
6331 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
6332 s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
6333
6334 if (slack_runtime <= 0)
6335 return;
6336
6337 raw_spin_lock(&cfs_b->lock);
6338 if (cfs_b->quota != RUNTIME_INF) {
6339 cfs_b->runtime += slack_runtime;
6340
6341 /* we are under rq->lock, defer unthrottling using a timer */
6342 if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
6343 !list_empty(&cfs_b->throttled_cfs_rq))
6344 start_cfs_slack_bandwidth(cfs_b);
6345 }
6346 raw_spin_unlock(&cfs_b->lock);
6347
6348 /* even if it's not valid for return we don't want to try again */
6349 cfs_rq->runtime_remaining -= slack_runtime;
6350}
6351
6352static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
6353{
6354 if (!cfs_bandwidth_used())
6355 return;
6356
6357 if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
6358 return;
6359
6360 __return_cfs_rq_runtime(cfs_rq);
6361}
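
/*
 * Putting the above together (illustrative numbers): a cfs_rq that
 * goes idle with, say, 8ms of runtime_remaining keeps min_cfs_rq_runtime
 * (1ms) for itself and returns the other 7ms to the global pool.  If
 * that leaves the pool holding more than one slice (5ms by default)
 * while other cfs_rqs sit throttled, the 5ms slack timer is armed to
 * redistribute it, unless the regular period refresh is due within the
 * next 7ms (slack period + minimum remaining period time), in which
 * case the refresh takes care of it instead.
 */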
6362
6363/*
6364 * This is done with a timer (instead of inline with bandwidth return) since
6365 * it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.
6366 */
6367static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
6368{
6369 u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
6370 unsigned long flags;
6371
6372 /* confirm we're still not at a refresh boundary */
6373 raw_spin_lock_irqsave(&cfs_b->lock, flags);
6374 cfs_b->slack_started = false;
6375
6376 if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
6377 raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
6378 return;
6379 }
6380
6381 if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice)
6382 runtime = cfs_b->runtime;
6383
6384 raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
6385
6386 if (!runtime)
6387 return;
6388
6389 distribute_cfs_runtime(cfs_b);
6390}
6391
6392/*
6393 * When a group wakes up we want to make sure that its quota is not already
6394 * expired/exceeded, otherwise it may be allowed to steal additional ticks of
6395 * runtime as update_curr() throttling cannot trigger until it's on-rq.
6396 */
6397static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
6398{
6399 if (!cfs_bandwidth_used())
6400 return;
6401
6402 /* an active group must be handled by the update_curr()->put() path */
6403 if (!cfs_rq->runtime_enabled || cfs_rq->curr)
6404 return;
6405
6406 /* ensure the group is not already throttled */
6407 if (cfs_rq_throttled(cfs_rq))
6408 return;
6409
6410 /* update runtime allocation */
6411 account_cfs_rq_runtime(cfs_rq, 0);
6412 if (cfs_rq->runtime_remaining <= 0)
6413 throttle_cfs_rq(cfs_rq);
6414}
6415
6416static void sync_throttle(struct task_group *tg, int cpu)
6417{
6418 struct cfs_rq *pcfs_rq, *cfs_rq;
6419
6420 if (!cfs_bandwidth_used())
6421 return;
6422
6423 if (!tg->parent)
6424 return;
6425
6426 cfs_rq = tg->cfs_rq[cpu];
6427 pcfs_rq = tg->parent->cfs_rq[cpu];
6428
6429 cfs_rq->throttle_count = pcfs_rq->throttle_count;
6430 cfs_rq->throttled_clock_pelt = rq_clock_pelt(cpu_rq(cpu));
6431}
6432
6433/* conditionally throttle active cfs_rq's from put_prev_entity() */
6434static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
6435{
6436 if (!cfs_bandwidth_used())
6437 return false;
6438
6439 if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
6440 return false;
6441
6442 /*
6443 * it's possible for a throttled entity to be forced into a running
6444 * state (e.g. set_curr_task), in this case we're finished.
6445 */
6446 if (cfs_rq_throttled(cfs_rq))
6447 return true;
6448
6449 return throttle_cfs_rq(cfs_rq);
6450}
6451
6452static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
6453{
6454 struct cfs_bandwidth *cfs_b =
6455 container_of(timer, struct cfs_bandwidth, slack_timer);
6456
6457 do_sched_cfs_slack_timer(cfs_b);
6458
6459 return HRTIMER_NORESTART;
6460}
6461
6462extern const u64 max_cfs_quota_period;
6463
6464static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
6465{
6466 struct cfs_bandwidth *cfs_b =
6467 container_of(timer, struct cfs_bandwidth, period_timer);
6468 unsigned long flags;
6469 int overrun;
6470 int idle = 0;
6471 int count = 0;
6472
6473 raw_spin_lock_irqsave(&cfs_b->lock, flags);
6474 for (;;) {
6475 overrun = hrtimer_forward_now(timer, cfs_b->period);
6476 if (!overrun)
6477 break;
6478
6479 idle = do_sched_cfs_period_timer(cfs_b, overrun, flags);
6480
6481 if (++count > 3) {
6482 u64 new, old = ktime_to_ns(cfs_b->period);
6483
6484			/*
6485			 * Grow period by a factor of 2 to avoid losing precision.
6486			 * Precision loss in the quota/period ratio can cause __cfs_schedulable
6487			 * to fail; quota and burst are doubled below so the ratio is unchanged.
6488			 */
6489 new = old * 2;
6490 if (new < max_cfs_quota_period) {
6491 cfs_b->period = ns_to_ktime(new);
6492 cfs_b->quota *= 2;
6493 cfs_b->burst *= 2;
6494
6495 pr_warn_ratelimited(
6496 "cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us = %lld, cfs_quota_us = %lld)\n",
6497 smp_processor_id(),
6498 div_u64(new, NSEC_PER_USEC),
6499 div_u64(cfs_b->quota, NSEC_PER_USEC));
6500 } else {
6501 pr_warn_ratelimited(
6502 "cfs_period_timer[cpu%d]: period too short, but cannot scale up without losing precision (cfs_period_us = %lld, cfs_quota_us = %lld)\n",
6503 smp_processor_id(),
6504 div_u64(old, NSEC_PER_USEC),
6505 div_u64(cfs_b->quota, NSEC_PER_USEC));
6506 }
6507
6508 /* reset count so we don't come right back in here */
6509 count = 0;
6510 }
6511 }
6512 if (idle)
6513 cfs_b->period_active = 0;
6514 raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
6515
6516 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
6517}
6518
6519void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b, struct cfs_bandwidth *parent)
6520{
6521 raw_spin_lock_init(&cfs_b->lock);
6522 cfs_b->runtime = 0;
6523 cfs_b->quota = RUNTIME_INF;
6524 cfs_b->period = ns_to_ktime(default_cfs_period());
6525 cfs_b->burst = 0;
6526 cfs_b->hierarchical_quota = parent ? parent->hierarchical_quota : RUNTIME_INF;
6527
6528 INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
6529 hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
6530 cfs_b->period_timer.function = sched_cfs_period_timer;
6531
6532 /* Add a random offset so that timers interleave */
6533 hrtimer_set_expires(&cfs_b->period_timer,
6534 get_random_u32_below(cfs_b->period));
6535 hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
6536 cfs_b->slack_timer.function = sched_cfs_slack_timer;
6537 cfs_b->slack_started = false;
6538}
6539
6540static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
6541{
6542 cfs_rq->runtime_enabled = 0;
6543 INIT_LIST_HEAD(&cfs_rq->throttled_list);
6544 INIT_LIST_HEAD(&cfs_rq->throttled_csd_list);
6545}
6546
6547void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
6548{
6549 lockdep_assert_held(&cfs_b->lock);
6550
6551 if (cfs_b->period_active)
6552 return;
6553
6554 cfs_b->period_active = 1;
6555 hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);
6556 hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED);
6557}
6558
6559static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
6560{
6561 int __maybe_unused i;
6562
6563 /* init_cfs_bandwidth() was not called */
6564 if (!cfs_b->throttled_cfs_rq.next)
6565 return;
6566
6567 hrtimer_cancel(&cfs_b->period_timer);
6568 hrtimer_cancel(&cfs_b->slack_timer);
6569
6570 /*
6571 * It is possible that we still have some cfs_rq's pending on a CSD
6572 * list, though this race is very rare. In order for this to occur, we
6573 * must have raced with the last task leaving the group while there
6574 * exist throttled cfs_rq(s), and the period_timer must have queued the
6575 * CSD item but the remote cpu has not yet processed it. To handle this,
6576 * we can simply flush all pending CSD work inline here. We're
6577 * guaranteed at this point that no additional cfs_rq of this group can
6578 * join a CSD list.
6579 */
6580#ifdef CONFIG_SMP
6581 for_each_possible_cpu(i) {
6582 struct rq *rq = cpu_rq(i);
6583 unsigned long flags;
6584
6585 if (list_empty(&rq->cfsb_csd_list))
6586 continue;
6587
6588 local_irq_save(flags);
6589 __cfsb_csd_unthrottle(rq);
6590 local_irq_restore(flags);
6591 }
6592#endif
6593}
6594
6595/*
6596 * Both these CPU hotplug callbacks race against unregister_fair_sched_group()
6597 *
6598 * The race is harmless, since modifying the bandwidth settings of an
6599 * unhooked group doesn't do much.
6600 */
6601
6602/* cpu online callback */
6603static void __maybe_unused update_runtime_enabled(struct rq *rq)
6604{
6605 struct task_group *tg;
6606
6607 lockdep_assert_rq_held(rq);
6608
6609 rcu_read_lock();
6610 list_for_each_entry_rcu(tg, &task_groups, list) {
6611 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
6612 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
6613
6614 raw_spin_lock(&cfs_b->lock);
6615 cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF;
6616 raw_spin_unlock(&cfs_b->lock);
6617 }
6618 rcu_read_unlock();
6619}
6620
6621/* cpu offline callback */
6622static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
6623{
6624 struct task_group *tg;
6625
6626 lockdep_assert_rq_held(rq);
6627
6628 /*
6629 * The rq clock has already been updated in the
6630 * set_rq_offline(), so we should skip updating
6631 * the rq clock again in unthrottle_cfs_rq().
6632 */
6633 rq_clock_start_loop_update(rq);
6634
6635 rcu_read_lock();
6636 list_for_each_entry_rcu(tg, &task_groups, list) {
6637 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
6638
6639 if (!cfs_rq->runtime_enabled)
6640 continue;
6641
6642 /*
6643 * clock_task is not advancing so we just need to make sure
6644 * there's some valid quota amount
6645 */
6646 cfs_rq->runtime_remaining = 1;
6647 /*
6648 * Offline rq is schedulable till CPU is completely disabled
6649 * in take_cpu_down(), so we prevent new cfs throttling here.
6650 */
6651 cfs_rq->runtime_enabled = 0;
6652
6653 if (cfs_rq_throttled(cfs_rq))
6654 unthrottle_cfs_rq(cfs_rq);
6655 }
6656 rcu_read_unlock();
6657
6658 rq_clock_stop_loop_update(rq);
6659}
6660
6661bool cfs_task_bw_constrained(struct task_struct *p)
6662{
6663 struct cfs_rq *cfs_rq = task_cfs_rq(p);
6664
6665 if (!cfs_bandwidth_used())
6666 return false;
6667
6668 if (cfs_rq->runtime_enabled ||
6669 tg_cfs_bandwidth(cfs_rq->tg)->hierarchical_quota != RUNTIME_INF)
6670 return true;
6671
6672 return false;
6673}
6674
6675#ifdef CONFIG_NO_HZ_FULL
6676/* called from pick_next_task_fair() */
6677static void sched_fair_update_stop_tick(struct rq *rq, struct task_struct *p)
6678{
6679 int cpu = cpu_of(rq);
6680
6681 if (!cfs_bandwidth_used())
6682 return;
6683
6684 if (!tick_nohz_full_cpu(cpu))
6685 return;
6686
6687 if (rq->nr_running != 1)
6688 return;
6689
6690 /*
6691 * We know there is only one task runnable and we've just picked it. The
6692 * normal enqueue path will have cleared TICK_DEP_BIT_SCHED if we will
6693 * be otherwise able to stop the tick. Just need to check if we are using
6694 * bandwidth control.
6695 */
6696 if (cfs_task_bw_constrained(p))
6697 tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED);
6698}
6699#endif
6700
6701#else /* CONFIG_CFS_BANDWIDTH */
6702
6703static inline bool cfs_bandwidth_used(void)
6704{
6705 return false;
6706}
6707
6708static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
6709static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
6710static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
6711static inline void sync_throttle(struct task_group *tg, int cpu) {}
6712static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
6713
6714static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
6715{
6716 return 0;
6717}
6718
6719static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
6720{
6721 return 0;
6722}
6723
6724static inline int throttled_lb_pair(struct task_group *tg,
6725 int src_cpu, int dest_cpu)
6726{
6727 return 0;
6728}
6729
6730#ifdef CONFIG_FAIR_GROUP_SCHED
6731void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b, struct cfs_bandwidth *parent) {}
6732static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
6733#endif
6734
6735static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
6736{
6737 return NULL;
6738}
6739static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
6740static inline void update_runtime_enabled(struct rq *rq) {}
6741static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
6742#ifdef CONFIG_CGROUP_SCHED
6743bool cfs_task_bw_constrained(struct task_struct *p)
6744{
6745 return false;
6746}
6747#endif
6748#endif /* CONFIG_CFS_BANDWIDTH */
6749
6750#if !defined(CONFIG_CFS_BANDWIDTH) || !defined(CONFIG_NO_HZ_FULL)
6751static inline void sched_fair_update_stop_tick(struct rq *rq, struct task_struct *p) {}
6752#endif
6753
6754/**************************************************
6755 * CFS operations on tasks:
6756 */
6757
6758#ifdef CONFIG_SCHED_HRTICK
6759static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
6760{
6761 struct sched_entity *se = &p->se;
6762
6763 SCHED_WARN_ON(task_rq(p) != rq);
6764
6765 if (rq->cfs.h_nr_running > 1) {
6766 u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
6767 u64 slice = se->slice;
6768 s64 delta = slice - ran;
6769
6770 if (delta < 0) {
6771 if (task_current_donor(rq, p))
6772 resched_curr(rq);
6773 return;
6774 }
6775 hrtick_start(rq, delta);
6776 }
6777}
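
/*
 * For example (illustrative numbers): a task with a 3ms slice that has
 * run for 1ms since it was last picked gets an hrtick programmed 2ms
 * out, so it is preempted close to the end of its slice rather than at
 * the next regular tick.  If the slice has already been overrun, a
 * reschedule is requested right away (when p is the CPU's current
 * donor).  All of this only applies when more than one fair task is
 * queued on the CPU.
 */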
6778
6779/*
6780 * called from enqueue/dequeue and updates the hrtick when the
6781 * current task is from our class and nr_running is low enough
6782 * to matter.
6783 */
6784static void hrtick_update(struct rq *rq)
6785{
6786 struct task_struct *donor = rq->donor;
6787
6788 if (!hrtick_enabled_fair(rq) || donor->sched_class != &fair_sched_class)
6789 return;
6790
6791 hrtick_start_fair(rq, donor);
6792}
6793#else /* !CONFIG_SCHED_HRTICK */
6794static inline void
6795hrtick_start_fair(struct rq *rq, struct task_struct *p)
6796{
6797}
6798
6799static inline void hrtick_update(struct rq *rq)
6800{
6801}
6802#endif
6803
6804#ifdef CONFIG_SMP
6805static inline bool cpu_overutilized(int cpu)
6806{
6807 unsigned long rq_util_min, rq_util_max;
6808
6809 if (!sched_energy_enabled())
6810 return false;
6811
6812 rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN);
6813 rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX);
6814
6815 /* Return true only if the utilization doesn't fit CPU's capacity */
6816 return !util_fits_cpu(cpu_util_cfs(cpu), rq_util_min, rq_util_max, cpu);
6817}
6818
6819/*
6820 * The overutilized value makes sense only if EAS is enabled
6821 */
6822static inline bool is_rd_overutilized(struct root_domain *rd)
6823{
6824 return !sched_energy_enabled() || READ_ONCE(rd->overutilized);
6825}
6826
6827static inline void set_rd_overutilized(struct root_domain *rd, bool flag)
6828{
6829 if (!sched_energy_enabled())
6830 return;
6831
6832 WRITE_ONCE(rd->overutilized, flag);
6833 trace_sched_overutilized_tp(rd, flag);
6834}
6835
6836static inline void check_update_overutilized_status(struct rq *rq)
6837{
6838 /*
6839 * overutilized field is used for load balancing decisions only
6840 * if energy aware scheduler is being used
6841 */
6842
6843 if (!is_rd_overutilized(rq->rd) && cpu_overutilized(rq->cpu))
6844 set_rd_overutilized(rq->rd, 1);
6845}
6846#else
6847static inline void check_update_overutilized_status(struct rq *rq) { }
6848#endif
6849
6850/* Runqueue only has SCHED_IDLE tasks enqueued */
6851static int sched_idle_rq(struct rq *rq)
6852{
6853 return unlikely(rq->nr_running == rq->cfs.idle_h_nr_running &&
6854 rq->nr_running);
6855}
6856
6857#ifdef CONFIG_SMP
6858static int sched_idle_cpu(int cpu)
6859{
6860 return sched_idle_rq(cpu_rq(cpu));
6861}
6862#endif
6863
6864static void
6865requeue_delayed_entity(struct sched_entity *se)
6866{
6867 struct cfs_rq *cfs_rq = cfs_rq_of(se);
6868
6869 /*
6870 * se->sched_delayed should imply: se->on_rq == 1.
6871 * Because a delayed entity is one that is still on
6872	 * the runqueue competing until eligibility.
6873 */
6874 SCHED_WARN_ON(!se->sched_delayed);
6875 SCHED_WARN_ON(!se->on_rq);
6876
6877 if (sched_feat(DELAY_ZERO)) {
6878 update_entity_lag(cfs_rq, se);
6879 if (se->vlag > 0) {
6880 cfs_rq->nr_running--;
6881 if (se != cfs_rq->curr)
6882 __dequeue_entity(cfs_rq, se);
6883 se->vlag = 0;
6884 place_entity(cfs_rq, se, 0);
6885 if (se != cfs_rq->curr)
6886 __enqueue_entity(cfs_rq, se);
6887 cfs_rq->nr_running++;
6888 }
6889 }
6890
6891 update_load_avg(cfs_rq, se, 0);
6892 clear_delayed(se);
6893}
6894
6895/*
6896 * The enqueue_task method is called before nr_running is
6897 * increased. Here we update the fair scheduling stats and
6898 * then put the task into the rbtree:
6899 */
6900static void
6901enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
6902{
6903 struct cfs_rq *cfs_rq;
6904 struct sched_entity *se = &p->se;
6905 int idle_h_nr_running = task_has_idle_policy(p);
6906 int h_nr_delayed = 0;
6907 int task_new = !(flags & ENQUEUE_WAKEUP);
6908 int rq_h_nr_running = rq->cfs.h_nr_running;
6909 u64 slice = 0;
6910
6911 /*
6912 * The code below (indirectly) updates schedutil which looks at
6913 * the cfs_rq utilization to select a frequency.
6914 * Let's add the task's estimated utilization to the cfs_rq's
6915 * estimated utilization, before we update schedutil.
6916 */
6917 if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & ENQUEUE_RESTORE))))
6918 util_est_enqueue(&rq->cfs, p);
6919
6920 if (flags & ENQUEUE_DELAYED) {
6921 requeue_delayed_entity(se);
6922 return;
6923 }
6924
6925 /*
6926 * If in_iowait is set, the code below may not trigger any cpufreq
6927 * utilization updates, so do it here explicitly with the IOWAIT flag
6928 * passed.
6929 */
6930 if (p->in_iowait)
6931 cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT);
6932
6933 if (task_new)
6934 h_nr_delayed = !!se->sched_delayed;
6935
6936 for_each_sched_entity(se) {
6937 if (se->on_rq) {
6938 if (se->sched_delayed)
6939 requeue_delayed_entity(se);
6940 break;
6941 }
6942 cfs_rq = cfs_rq_of(se);
6943
6944 /*
6945		 * Basically set the slice of group entities to the min_slice of
6946 * their respective cfs_rq. This ensures the group can service
6947 * its entities in the desired time-frame.
6948 */
6949 if (slice) {
6950 se->slice = slice;
6951 se->custom_slice = 1;
6952 }
6953 enqueue_entity(cfs_rq, se, flags);
6954 slice = cfs_rq_min_slice(cfs_rq);
6955
6956 cfs_rq->h_nr_running++;
6957 cfs_rq->idle_h_nr_running += idle_h_nr_running;
6958 cfs_rq->h_nr_delayed += h_nr_delayed;
6959
6960 if (cfs_rq_is_idle(cfs_rq))
6961 idle_h_nr_running = 1;
6962
6963 /* end evaluation on encountering a throttled cfs_rq */
6964 if (cfs_rq_throttled(cfs_rq))
6965 goto enqueue_throttle;
6966
6967 flags = ENQUEUE_WAKEUP;
6968 }
6969
6970 for_each_sched_entity(se) {
6971 cfs_rq = cfs_rq_of(se);
6972
6973 update_load_avg(cfs_rq, se, UPDATE_TG);
6974 se_update_runnable(se);
6975 update_cfs_group(se);
6976
6977 se->slice = slice;
6978 slice = cfs_rq_min_slice(cfs_rq);
6979
6980 cfs_rq->h_nr_running++;
6981 cfs_rq->idle_h_nr_running += idle_h_nr_running;
6982 cfs_rq->h_nr_delayed += h_nr_delayed;
6983
6984 if (cfs_rq_is_idle(cfs_rq))
6985 idle_h_nr_running = 1;
6986
6987 /* end evaluation on encountering a throttled cfs_rq */
6988 if (cfs_rq_throttled(cfs_rq))
6989 goto enqueue_throttle;
6990 }
6991
6992 if (!rq_h_nr_running && rq->cfs.h_nr_running) {
6993 /* Account for idle runtime */
6994 if (!rq->nr_running)
6995 dl_server_update_idle_time(rq, rq->curr);
6996 dl_server_start(&rq->fair_server);
6997 }
6998
6999	/* At this point se is NULL and we are at root level */
7000 add_nr_running(rq, 1);
7001
7002 /*
7003 * Since new tasks are assigned an initial util_avg equal to
7004 * half of the spare capacity of their CPU, tiny tasks have the
7005 * ability to cross the overutilized threshold, which will
7006 * result in the load balancer ruining all the task placement
7007 * done by EAS. As a way to mitigate that effect, do not account
7008 * for the first enqueue operation of new tasks during the
7009 * overutilized flag detection.
7010 *
7011 * A better way of solving this problem would be to wait for
7012 * the PELT signals of tasks to converge before taking them
7013 * into account, but that is not straightforward to implement,
7014 * and the following generally works well enough in practice.
7015 */
7016 if (!task_new)
7017 check_update_overutilized_status(rq);
7018
7019enqueue_throttle:
7020 assert_list_leaf_cfs_rq(rq);
7021
7022 hrtick_update(rq);
7023}
7024
7025static void set_next_buddy(struct sched_entity *se);
7026
7027/*
7028 * Basically dequeue_task_fair(), except it can deal with dequeue_entity()
7029 * failing half-way through and resume the dequeue later.
7030 *
7031 * Returns:
7032 * -1 - dequeue delayed
7033 * 0 - dequeue throttled
7034 * 1 - dequeue complete
7035 */
7036static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
7037{
7038 bool was_sched_idle = sched_idle_rq(rq);
7039 int rq_h_nr_running = rq->cfs.h_nr_running;
7040 bool task_sleep = flags & DEQUEUE_SLEEP;
7041 bool task_delayed = flags & DEQUEUE_DELAYED;
7042 struct task_struct *p = NULL;
7043 int idle_h_nr_running = 0;
7044 int h_nr_running = 0;
7045 int h_nr_delayed = 0;
7046 struct cfs_rq *cfs_rq;
7047 u64 slice = 0;
7048
7049 if (entity_is_task(se)) {
7050 p = task_of(se);
7051 h_nr_running = 1;
7052 idle_h_nr_running = task_has_idle_policy(p);
7053 if (!task_sleep && !task_delayed)
7054 h_nr_delayed = !!se->sched_delayed;
7055 } else {
7056 cfs_rq = group_cfs_rq(se);
7057 slice = cfs_rq_min_slice(cfs_rq);
7058 }
7059
7060 for_each_sched_entity(se) {
7061 cfs_rq = cfs_rq_of(se);
7062
7063 if (!dequeue_entity(cfs_rq, se, flags)) {
7064 if (p && &p->se == se)
7065 return -1;
7066
7067 break;
7068 }
7069
7070 cfs_rq->h_nr_running -= h_nr_running;
7071 cfs_rq->idle_h_nr_running -= idle_h_nr_running;
7072 cfs_rq->h_nr_delayed -= h_nr_delayed;
7073
7074 if (cfs_rq_is_idle(cfs_rq))
7075 idle_h_nr_running = h_nr_running;
7076
7077 /* end evaluation on encountering a throttled cfs_rq */
7078 if (cfs_rq_throttled(cfs_rq))
7079 return 0;
7080
7081 /* Don't dequeue parent if it has other entities besides us */
7082 if (cfs_rq->load.weight) {
7083 slice = cfs_rq_min_slice(cfs_rq);
7084
7085 /* Avoid re-evaluating load for this entity: */
7086 se = parent_entity(se);
7087 /*
7088 * Bias pick_next to pick a task from this cfs_rq, as
7089 * p is sleeping when it is within its sched_slice.
7090 */
7091 if (task_sleep && se && !throttled_hierarchy(cfs_rq))
7092 set_next_buddy(se);
7093 break;
7094 }
7095 flags |= DEQUEUE_SLEEP;
7096 flags &= ~(DEQUEUE_DELAYED | DEQUEUE_SPECIAL);
7097 }
7098
7099 for_each_sched_entity(se) {
7100 cfs_rq = cfs_rq_of(se);
7101
7102 update_load_avg(cfs_rq, se, UPDATE_TG);
7103 se_update_runnable(se);
7104 update_cfs_group(se);
7105
7106 se->slice = slice;
7107 slice = cfs_rq_min_slice(cfs_rq);
7108
7109 cfs_rq->h_nr_running -= h_nr_running;
7110 cfs_rq->idle_h_nr_running -= idle_h_nr_running;
7111 cfs_rq->h_nr_delayed -= h_nr_delayed;
7112
7113 if (cfs_rq_is_idle(cfs_rq))
7114 idle_h_nr_running = h_nr_running;
7115
7116 /* end evaluation on encountering a throttled cfs_rq */
7117 if (cfs_rq_throttled(cfs_rq))
7118 return 0;
7119 }
7120
7121 sub_nr_running(rq, h_nr_running);
7122
7123 if (rq_h_nr_running && !rq->cfs.h_nr_running)
7124 dl_server_stop(&rq->fair_server);
7125
7126 /* balance early to pull high priority tasks */
7127 if (unlikely(!was_sched_idle && sched_idle_rq(rq)))
7128 rq->next_balance = jiffies;
7129
7130 if (p && task_delayed) {
7131 SCHED_WARN_ON(!task_sleep);
7132 SCHED_WARN_ON(p->on_rq != 1);
7133
7134 /* Fix-up what dequeue_task_fair() skipped */
7135 hrtick_update(rq);
7136
7137 /*
7138 * Fix-up what block_task() skipped.
7139 *
7140 * Must be last, @p might not be valid after this.
7141 */
7142 __block_task(rq, p);
7143 }
7144
7145 return 1;
7146}
7147
7148/*
7149 * The dequeue_task method is called before nr_running is
7150 * decreased. We remove the task from the rbtree and
7151 * update the fair scheduling stats:
7152 */
7153static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
7154{
7155 if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & DEQUEUE_SAVE))))
7156 util_est_dequeue(&rq->cfs, p);
7157
7158 util_est_update(&rq->cfs, p, flags & DEQUEUE_SLEEP);
7159 if (dequeue_entities(rq, &p->se, flags) < 0)
7160 return false;
7161
7162 /*
7163 * Must not reference @p after dequeue_entities(DEQUEUE_DELAYED).
7164 */
7165
7166 hrtick_update(rq);
7167 return true;
7168}
7169
7170#ifdef CONFIG_SMP
7171
7172/* Working cpumask for: sched_balance_rq(), sched_balance_newidle(). */
7173static DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
7174static DEFINE_PER_CPU(cpumask_var_t, select_rq_mask);
7175static DEFINE_PER_CPU(cpumask_var_t, should_we_balance_tmpmask);
7176
7177#ifdef CONFIG_NO_HZ_COMMON
7178
7179static struct {
7180 cpumask_var_t idle_cpus_mask;
7181 atomic_t nr_cpus;
7182	int has_blocked; /* Idle CPUs have blocked load */
7183 int needs_update; /* Newly idle CPUs need their next_balance collated */
7184 unsigned long next_balance; /* in jiffy units */
7185 unsigned long next_blocked; /* Next update of blocked load in jiffies */
7186} nohz ____cacheline_aligned;
7187
7188#endif /* CONFIG_NO_HZ_COMMON */
7189
7190static unsigned long cpu_load(struct rq *rq)
7191{
7192 return cfs_rq_load_avg(&rq->cfs);
7193}
7194
7195/*
7196 * cpu_load_without - compute CPU load without any contributions from *p
7197 * @cpu: the CPU whose load is requested
7198 * @p: the task which load should be discounted
7199 *
7200 * The load of a CPU is defined by the load of tasks currently enqueued on that
7201 * CPU as well as tasks which are currently sleeping after an execution on that
7202 * CPU.
7203 *
7204 * This method returns the load of the specified CPU by discounting the load of
7205 * the specified task, whenever the task is currently contributing to the CPU
7206 * load.
7207 */
7208static unsigned long cpu_load_without(struct rq *rq, struct task_struct *p)
7209{
7210 struct cfs_rq *cfs_rq;
7211 unsigned int load;
7212
7213 /* Task has no contribution or is new */
7214 if (cpu_of(rq) != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
7215 return cpu_load(rq);
7216
7217 cfs_rq = &rq->cfs;
7218 load = READ_ONCE(cfs_rq->avg.load_avg);
7219
7220 /* Discount task's util from CPU's util */
7221 lsub_positive(&load, task_h_load(p));
7222
7223 return load;
7224}
7225
7226static unsigned long cpu_runnable(struct rq *rq)
7227{
7228 return cfs_rq_runnable_avg(&rq->cfs);
7229}
7230
7231static unsigned long cpu_runnable_without(struct rq *rq, struct task_struct *p)
7232{
7233 struct cfs_rq *cfs_rq;
7234 unsigned int runnable;
7235
7236 /* Task has no contribution or is new */
7237 if (cpu_of(rq) != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
7238 return cpu_runnable(rq);
7239
7240 cfs_rq = &rq->cfs;
7241 runnable = READ_ONCE(cfs_rq->avg.runnable_avg);
7242
7243 /* Discount task's runnable from CPU's runnable */
7244 lsub_positive(&runnable, p->se.avg.runnable_avg);
7245
7246 return runnable;
7247}
7248
7249static unsigned long capacity_of(int cpu)
7250{
7251 return cpu_rq(cpu)->cpu_capacity;
7252}
7253
7254static void record_wakee(struct task_struct *p)
7255{
7256 /*
7257	 * Only decay a single time; tasks that have less than one wakeup per
7258 * jiffy will not have built up many flips.
7259 */
7260 if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) {
7261 current->wakee_flips >>= 1;
7262 current->wakee_flip_decay_ts = jiffies;
7263 }
7264
7265 if (current->last_wakee != p) {
7266 current->last_wakee = p;
7267 current->wakee_flips++;
7268 }
7269}
7270
7271/*
7272 * Detect M:N waker/wakee relationships via a switching-frequency heuristic.
7273 *
7274 * A waker of many should wake a different task than the one last awakened
7275 * at a frequency roughly N times higher than one of its wakees.
7276 *
7277 * In order to determine whether we should let the load spread vs consolidate
7278 * to shared cache, we look for a minimum 'flip' frequency of llc_size in one
7279 * partner, and a factor of llc_size higher frequency in the other.
7280 *
7281 * With both conditions met, we can be relatively sure that the relationship is
7282 * non-monogamous, with partner count exceeding socket size.
7283 *
7284 * Waker/wakee being client/server, worker/dispatcher, interrupt source or
7285 * whatever is irrelevant, spread criteria is apparent partner count exceeds
7286 * socket size.
7287 */
7288static int wake_wide(struct task_struct *p)
7289{
7290 unsigned int master = current->wakee_flips;
7291 unsigned int slave = p->wakee_flips;
7292 int factor = __this_cpu_read(sd_llc_size);
7293
7294 if (master < slave)
7295 swap(master, slave);
7296 if (slave < factor || master < slave * factor)
7297 return 0;
7298 return 1;
7299}
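
/*
 * Worked example (illustrative): with an LLC of 8 CPUs (factor = 8), a
 * dispatcher whose wakee_flips have decayed to 100 waking a worker
 * with wakee_flips = 10 passes both tests (10 >= 8 and 100 >= 10 * 8),
 * so the wakeup is treated as "wide" and the load is allowed to spread
 * instead of being pulled towards the waker's cache.  A worker that
 * only ever wakes its dispatcher back (wakee_flips well below 8) keeps
 * wake_wide() returning 0 and the affine path is considered.
 */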
7300
7301/*
7302 * The purpose of wake_affine() is to quickly determine on which CPU we can run
7303 * soonest. For the purpose of speed we only consider the waking and previous
7304 * CPU.
7305 *
7306 * wake_affine_idle() - only considers 'now', it checks whether the waking CPU is
7307 * cache-affine and is (or will be) idle.
7308 *
7309 * wake_affine_weight() - considers the weight to reflect the average
7310 * scheduling latency of the CPUs. This seems to work
7311 * for the overloaded case.
7312 */
7313static int
7314wake_affine_idle(int this_cpu, int prev_cpu, int sync)
7315{
7316 /*
7317 * If this_cpu is idle, it implies the wakeup is from interrupt
7318 * context. Only allow the move if cache is shared. Otherwise an
7319 * interrupt intensive workload could force all tasks onto one
7320 * node depending on the IO topology or IRQ affinity settings.
7321 *
7322 * If the prev_cpu is idle and cache affine then avoid a migration.
7323 * There is no guarantee that the cache hot data from an interrupt
7324 * is more important than cache hot data on the prev_cpu and from
7325 * a cpufreq perspective, it's better to have higher utilisation
7326 * on one CPU.
7327 */
7328 if (available_idle_cpu(this_cpu) && cpus_share_cache(this_cpu, prev_cpu))
7329 return available_idle_cpu(prev_cpu) ? prev_cpu : this_cpu;
7330
7331 if (sync && cpu_rq(this_cpu)->nr_running == 1)
7332 return this_cpu;
7333
7334 if (available_idle_cpu(prev_cpu))
7335 return prev_cpu;
7336
7337 return nr_cpumask_bits;
7338}
7339
7340static int
7341wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
7342 int this_cpu, int prev_cpu, int sync)
7343{
7344 s64 this_eff_load, prev_eff_load;
7345 unsigned long task_load;
7346
7347 this_eff_load = cpu_load(cpu_rq(this_cpu));
7348
7349 if (sync) {
7350 unsigned long current_load = task_h_load(current);
7351
7352 if (current_load > this_eff_load)
7353 return this_cpu;
7354
7355 this_eff_load -= current_load;
7356 }
7357
7358 task_load = task_h_load(p);
7359
7360 this_eff_load += task_load;
7361 if (sched_feat(WA_BIAS))
7362 this_eff_load *= 100;
7363 this_eff_load *= capacity_of(prev_cpu);
7364
7365 prev_eff_load = cpu_load(cpu_rq(prev_cpu));
7366 prev_eff_load -= task_load;
7367 if (sched_feat(WA_BIAS))
7368 prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2;
7369 prev_eff_load *= capacity_of(this_cpu);
7370
7371 /*
7372 * If sync, adjust the weight of prev_eff_load such that if
7373 * prev_eff == this_eff that select_idle_sibling() will consider
7374 * stacking the wakee on top of the waker if no other CPU is
7375 * idle.
7376 */
7377 if (sync)
7378 prev_eff_load += 1;
7379
7380 return this_eff_load < prev_eff_load ? this_cpu : nr_cpumask_bits;
7381}
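
/*
 * Written out (approximately, with the default WA_BIAS feature), the
 * waking CPU is chosen when
 *
 *   (load(this) [- load(curr) if sync] + load(p)) * capacity(prev) * 100
 *     < (load(prev) - load(p)) * capacity(this) * (100 + (imbalance_pct - 100) / 2)
 *
 * i.e. each side is scaled by the other CPU's capacity, and the
 * previous CPU's side is inflated by roughly half the domain's
 * imbalance percentage, so the wakee is pulled towards the waking CPU
 * unless that CPU is noticeably busier.
 */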
7382
7383static int wake_affine(struct sched_domain *sd, struct task_struct *p,
7384 int this_cpu, int prev_cpu, int sync)
7385{
7386 int target = nr_cpumask_bits;
7387
7388 if (sched_feat(WA_IDLE))
7389 target = wake_affine_idle(this_cpu, prev_cpu, sync);
7390
7391 if (sched_feat(WA_WEIGHT) && target == nr_cpumask_bits)
7392 target = wake_affine_weight(sd, p, this_cpu, prev_cpu, sync);
7393
7394 schedstat_inc(p->stats.nr_wakeups_affine_attempts);
7395 if (target != this_cpu)
7396 return prev_cpu;
7397
7398 schedstat_inc(sd->ttwu_move_affine);
7399 schedstat_inc(p->stats.nr_wakeups_affine);
7400 return target;
7401}
7402
7403static struct sched_group *
7404sched_balance_find_dst_group(struct sched_domain *sd, struct task_struct *p, int this_cpu);
7405
7406/*
7407 * sched_balance_find_dst_group_cpu - find the idlest CPU among the CPUs in the group.
7408 */
7409static int
7410sched_balance_find_dst_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
7411{
7412 unsigned long load, min_load = ULONG_MAX;
7413 unsigned int min_exit_latency = UINT_MAX;
7414 u64 latest_idle_timestamp = 0;
7415 int least_loaded_cpu = this_cpu;
7416 int shallowest_idle_cpu = -1;
7417 int i;
7418
7419 /* Check if we have any choice: */
7420 if (group->group_weight == 1)
7421 return cpumask_first(sched_group_span(group));
7422
7423 /* Traverse only the allowed CPUs */
7424 for_each_cpu_and(i, sched_group_span(group), p->cpus_ptr) {
7425 struct rq *rq = cpu_rq(i);
7426
7427 if (!sched_core_cookie_match(rq, p))
7428 continue;
7429
7430 if (sched_idle_cpu(i))
7431 return i;
7432
7433 if (available_idle_cpu(i)) {
7434 struct cpuidle_state *idle = idle_get_state(rq);
7435 if (idle && idle->exit_latency < min_exit_latency) {
7436 /*
7437 * We give priority to a CPU whose idle state
7438 * has the smallest exit latency irrespective
7439 * of any idle timestamp.
7440 */
7441 min_exit_latency = idle->exit_latency;
7442 latest_idle_timestamp = rq->idle_stamp;
7443 shallowest_idle_cpu = i;
7444 } else if ((!idle || idle->exit_latency == min_exit_latency) &&
7445 rq->idle_stamp > latest_idle_timestamp) {
7446 /*
7447 * If equal or no active idle state, then
7448 * the most recently idled CPU might have
7449 * a warmer cache.
7450 */
7451 latest_idle_timestamp = rq->idle_stamp;
7452 shallowest_idle_cpu = i;
7453 }
7454 } else if (shallowest_idle_cpu == -1) {
7455 load = cpu_load(cpu_rq(i));
7456 if (load < min_load) {
7457 min_load = load;
7458 least_loaded_cpu = i;
7459 }
7460 }
7461 }
7462
7463 return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
7464}
7465
7466static inline int sched_balance_find_dst_cpu(struct sched_domain *sd, struct task_struct *p,
7467 int cpu, int prev_cpu, int sd_flag)
7468{
7469 int new_cpu = cpu;
7470
7471 if (!cpumask_intersects(sched_domain_span(sd), p->cpus_ptr))
7472 return prev_cpu;
7473
7474 /*
7475	 * We need the task's util for cpu_util_without(); sync it up to
7476	 * prev_cpu's last_update_time.
7477 */
7478 if (!(sd_flag & SD_BALANCE_FORK))
7479 sync_entity_load_avg(&p->se);
7480
7481 while (sd) {
7482 struct sched_group *group;
7483 struct sched_domain *tmp;
7484 int weight;
7485
7486 if (!(sd->flags & sd_flag)) {
7487 sd = sd->child;
7488 continue;
7489 }
7490
7491 group = sched_balance_find_dst_group(sd, p, cpu);
7492 if (!group) {
7493 sd = sd->child;
7494 continue;
7495 }
7496
7497 new_cpu = sched_balance_find_dst_group_cpu(group, p, cpu);
7498 if (new_cpu == cpu) {
7499 /* Now try balancing at a lower domain level of 'cpu': */
7500 sd = sd->child;
7501 continue;
7502 }
7503
7504 /* Now try balancing at a lower domain level of 'new_cpu': */
7505 cpu = new_cpu;
7506 weight = sd->span_weight;
7507 sd = NULL;
7508 for_each_domain(cpu, tmp) {
7509 if (weight <= tmp->span_weight)
7510 break;
7511 if (tmp->flags & sd_flag)
7512 sd = tmp;
7513 }
7514 }
7515
7516 return new_cpu;
7517}
7518
7519static inline int __select_idle_cpu(int cpu, struct task_struct *p)
7520{
7521 if ((available_idle_cpu(cpu) || sched_idle_cpu(cpu)) &&
7522 sched_cpu_cookie_match(cpu_rq(cpu), p))
7523 return cpu;
7524
7525 return -1;
7526}
7527
7528#ifdef CONFIG_SCHED_SMT
7529DEFINE_STATIC_KEY_FALSE(sched_smt_present);
7530EXPORT_SYMBOL_GPL(sched_smt_present);
7531
7532static inline void set_idle_cores(int cpu, int val)
7533{
7534 struct sched_domain_shared *sds;
7535
7536 sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
7537 if (sds)
7538 WRITE_ONCE(sds->has_idle_cores, val);
7539}
7540
7541static inline bool test_idle_cores(int cpu)
7542{
7543 struct sched_domain_shared *sds;
7544
7545 sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
7546 if (sds)
7547 return READ_ONCE(sds->has_idle_cores);
7548
7549 return false;
7550}
7551
7552/*
7553 * Scans the local SMT mask to see if the entire core is idle, and records this
7554 * information in sd_llc_shared->has_idle_cores.
7555 *
7556 * Since SMT siblings share all cache levels, inspecting this limited remote
7557 * state should be fairly cheap.
7558 */
7559void __update_idle_core(struct rq *rq)
7560{
7561 int core = cpu_of(rq);
7562 int cpu;
7563
7564 rcu_read_lock();
7565 if (test_idle_cores(core))
7566 goto unlock;
7567
7568 for_each_cpu(cpu, cpu_smt_mask(core)) {
7569 if (cpu == core)
7570 continue;
7571
7572 if (!available_idle_cpu(cpu))
7573 goto unlock;
7574 }
7575
7576 set_idle_cores(core, 1);
7577unlock:
7578 rcu_read_unlock();
7579}
7580
7581/*
7582 * Scan the entire LLC domain for idle cores; this dynamically switches off if
7583 * there are no idle cores left in the system; tracked through
7584 * sd_llc->shared->has_idle_cores and enabled through update_idle_core() above.
7585 */
7586static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpus, int *idle_cpu)
7587{
7588 bool idle = true;
7589 int cpu;
7590
7591 for_each_cpu(cpu, cpu_smt_mask(core)) {
7592 if (!available_idle_cpu(cpu)) {
7593 idle = false;
7594 if (*idle_cpu == -1) {
7595 if (sched_idle_cpu(cpu) && cpumask_test_cpu(cpu, cpus)) {
7596 *idle_cpu = cpu;
7597 break;
7598 }
7599 continue;
7600 }
7601 break;
7602 }
7603 if (*idle_cpu == -1 && cpumask_test_cpu(cpu, cpus))
7604 *idle_cpu = cpu;
7605 }
7606
7607 if (idle)
7608 return core;
7609
7610 cpumask_andnot(cpus, cpus, cpu_smt_mask(core));
7611 return -1;
7612}
7613
7614/*
7615 * Scan the local SMT mask for idle CPUs.
7616 */
7617static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
7618{
7619 int cpu;
7620
7621 for_each_cpu_and(cpu, cpu_smt_mask(target), p->cpus_ptr) {
7622 if (cpu == target)
7623 continue;
7624 /*
7625 * Check if the CPU is in the LLC scheduling domain of @target.
7626 * Due to isolcpus, there is no guarantee that all the siblings are in the domain.
7627 */
7628 if (!cpumask_test_cpu(cpu, sched_domain_span(sd)))
7629 continue;
7630 if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
7631 return cpu;
7632 }
7633
7634 return -1;
7635}
7636
7637#else /* CONFIG_SCHED_SMT */
7638
7639static inline void set_idle_cores(int cpu, int val)
7640{
7641}
7642
7643static inline bool test_idle_cores(int cpu)
7644{
7645 return false;
7646}
7647
7648static inline int select_idle_core(struct task_struct *p, int core, struct cpumask *cpus, int *idle_cpu)
7649{
7650 return __select_idle_cpu(core, p);
7651}
7652
7653static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
7654{
7655 return -1;
7656}
7657
7658#endif /* CONFIG_SCHED_SMT */
7659
7660/*
7661 * Scan the LLC domain for idle CPUs; this is dynamically regulated by
7662 * comparing the average scan cost (tracked in sd->avg_scan_cost) against the
7663 * average idle time for this rq (as found in rq->avg_idle).
7664 */
7665static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool has_idle_core, int target)
7666{
7667 struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask);
7668 int i, cpu, idle_cpu = -1, nr = INT_MAX;
7669 struct sched_domain_shared *sd_share;
7670
7671 cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
7672
7673 if (sched_feat(SIS_UTIL)) {
7674 sd_share = rcu_dereference(per_cpu(sd_llc_shared, target));
7675 if (sd_share) {
7676 /* because !--nr is the condition to stop scan */
7677 nr = READ_ONCE(sd_share->nr_idle_scan) + 1;
7678 /* overloaded LLC is unlikely to have idle cpu/core */
7679 if (nr == 1)
7680 return -1;
7681 }
7682 }
7683
7684 if (static_branch_unlikely(&sched_cluster_active)) {
7685 struct sched_group *sg = sd->groups;
7686
7687 if (sg->flags & SD_CLUSTER) {
7688 for_each_cpu_wrap(cpu, sched_group_span(sg), target + 1) {
7689 if (!cpumask_test_cpu(cpu, cpus))
7690 continue;
7691
7692 if (has_idle_core) {
7693 i = select_idle_core(p, cpu, cpus, &idle_cpu);
7694 if ((unsigned int)i < nr_cpumask_bits)
7695 return i;
7696 } else {
7697 if (--nr <= 0)
7698 return -1;
7699 idle_cpu = __select_idle_cpu(cpu, p);
7700 if ((unsigned int)idle_cpu < nr_cpumask_bits)
7701 return idle_cpu;
7702 }
7703 }
7704 cpumask_andnot(cpus, cpus, sched_group_span(sg));
7705 }
7706 }
7707
7708 for_each_cpu_wrap(cpu, cpus, target + 1) {
7709 if (has_idle_core) {
7710 i = select_idle_core(p, cpu, cpus, &idle_cpu);
7711 if ((unsigned int)i < nr_cpumask_bits)
7712 return i;
7713
7714 } else {
7715 if (--nr <= 0)
7716 return -1;
7717 idle_cpu = __select_idle_cpu(cpu, p);
7718 if ((unsigned int)idle_cpu < nr_cpumask_bits)
7719 break;
7720 }
7721 }
7722
7723 if (has_idle_core)
7724 set_idle_cores(target, false);
7725
7726 return idle_cpu;
7727}
7728
7729/*
7730 * Scan the asym_capacity domain for idle CPUs; pick the first idle one on which
7731 * the task fits. If no CPU is big enough, but there are idle ones, try to
7732 * maximize capacity.
7733 */
7734static int
7735select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
7736{
7737 unsigned long task_util, util_min, util_max, best_cap = 0;
7738 int fits, best_fits = 0;
7739 int cpu, best_cpu = -1;
7740 struct cpumask *cpus;
7741
7742 cpus = this_cpu_cpumask_var_ptr(select_rq_mask);
7743 cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
7744
7745 task_util = task_util_est(p);
7746 util_min = uclamp_eff_value(p, UCLAMP_MIN);
7747 util_max = uclamp_eff_value(p, UCLAMP_MAX);
7748
7749 for_each_cpu_wrap(cpu, cpus, target) {
7750 unsigned long cpu_cap = capacity_of(cpu);
7751
7752 if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu))
7753 continue;
7754
7755 fits = util_fits_cpu(task_util, util_min, util_max, cpu);
7756
7757 /* This CPU fits with all requirements */
7758 if (fits > 0)
7759 return cpu;
7760 /*
7761 * Only the min performance hint (i.e. uclamp_min) doesn't fit.
7762 * Look for the CPU with best capacity.
7763 */
7764 else if (fits < 0)
7765 cpu_cap = get_actual_cpu_capacity(cpu);
7766
7767 /*
7768 * First, select CPU which fits better (-1 being better than 0).
7769 * Then, select the one with best capacity at same level.
7770 */
7771 if ((fits < best_fits) ||
7772 ((fits == best_fits) && (cpu_cap > best_cap))) {
7773 best_cap = cpu_cap;
7774 best_cpu = cpu;
7775 best_fits = fits;
7776 }
7777 }
7778
7779 return best_cpu;
7780}
7781
7782static inline bool asym_fits_cpu(unsigned long util,
7783 unsigned long util_min,
7784 unsigned long util_max,
7785 int cpu)
7786{
7787 if (sched_asym_cpucap_active())
7788 /*
7789 * Return true only if the cpu fully fits the task requirements
7790 * which include the utilization and the performance hints.
7791 */
7792 return (util_fits_cpu(util, util_min, util_max, cpu) > 0);
7793
7794 return true;
7795}
7796
7797/*
7798 * Try and locate an idle core/thread in the LLC cache domain.
7799 */
7800static int select_idle_sibling(struct task_struct *p, int prev, int target)
7801{
7802 bool has_idle_core = false;
7803 struct sched_domain *sd;
7804 unsigned long task_util, util_min, util_max;
7805 int i, recent_used_cpu, prev_aff = -1;
7806
7807 /*
7808	 * On asymmetric systems, update the task utilization because we will
7809	 * check whether the task fits the CPU's capacity.
7810 */
7811 if (sched_asym_cpucap_active()) {
7812 sync_entity_load_avg(&p->se);
7813 task_util = task_util_est(p);
7814 util_min = uclamp_eff_value(p, UCLAMP_MIN);
7815 util_max = uclamp_eff_value(p, UCLAMP_MAX);
7816 }
7817
7818 /*
7819	 * per-CPU select_rq_mask usage; IRQs must stay disabled while the mask is in use below.
7820 */
7821 lockdep_assert_irqs_disabled();
7822
7823 if ((available_idle_cpu(target) || sched_idle_cpu(target)) &&
7824 asym_fits_cpu(task_util, util_min, util_max, target))
7825 return target;
7826
7827 /*
7828 * If the previous CPU is cache affine and idle, don't be stupid:
7829 */
7830 if (prev != target && cpus_share_cache(prev, target) &&
7831 (available_idle_cpu(prev) || sched_idle_cpu(prev)) &&
7832 asym_fits_cpu(task_util, util_min, util_max, prev)) {
7833
7834 if (!static_branch_unlikely(&sched_cluster_active) ||
7835 cpus_share_resources(prev, target))
7836 return prev;
7837
7838 prev_aff = prev;
7839 }
7840
7841 /*
7842	 * Allow a per-cpu kthread to stack with the wakee if the
7843	 * kworker thread's CPU and the task's previous CPU are the same.
7844 * The assumption is that the wakee queued work for the
7845 * per-cpu kthread that is now complete and the wakeup is
7846 * essentially a sync wakeup. An obvious example of this
7847 * pattern is IO completions.
7848 */
7849 if (is_per_cpu_kthread(current) &&
7850 in_task() &&
7851 prev == smp_processor_id() &&
7852 this_rq()->nr_running <= 1 &&
7853 asym_fits_cpu(task_util, util_min, util_max, prev)) {
7854 return prev;
7855 }
7856
7857 /* Check a recently used CPU as a potential idle candidate: */
7858 recent_used_cpu = p->recent_used_cpu;
7859 p->recent_used_cpu = prev;
7860 if (recent_used_cpu != prev &&
7861 recent_used_cpu != target &&
7862 cpus_share_cache(recent_used_cpu, target) &&
7863 (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) &&
7864 cpumask_test_cpu(recent_used_cpu, p->cpus_ptr) &&
7865 asym_fits_cpu(task_util, util_min, util_max, recent_used_cpu)) {
7866
7867 if (!static_branch_unlikely(&sched_cluster_active) ||
7868 cpus_share_resources(recent_used_cpu, target))
7869 return recent_used_cpu;
7870
7871 } else {
7872 recent_used_cpu = -1;
7873 }
7874
7875 /*
7876 * For asymmetric CPU capacity systems, our domain of interest is
7877 * sd_asym_cpucapacity rather than sd_llc.
7878 */
7879 if (sched_asym_cpucap_active()) {
7880 sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, target));
7881 /*
7882 * On an asymmetric CPU capacity system where an exclusive
7883 * cpuset defines a symmetric island (i.e. one unique
7884 * capacity_orig value through the cpuset), the key will be set
7885 * but the CPUs within that cpuset will not have a domain with
7886 * SD_ASYM_CPUCAPACITY. These should follow the usual symmetric
7887 * capacity path.
7888 */
7889 if (sd) {
7890 i = select_idle_capacity(p, sd, target);
7891 return ((unsigned)i < nr_cpumask_bits) ? i : target;
7892 }
7893 }
7894
7895 sd = rcu_dereference(per_cpu(sd_llc, target));
7896 if (!sd)
7897 return target;
7898
7899 if (sched_smt_active()) {
7900 has_idle_core = test_idle_cores(target);
7901
7902 if (!has_idle_core && cpus_share_cache(prev, target)) {
7903 i = select_idle_smt(p, sd, prev);
7904 if ((unsigned int)i < nr_cpumask_bits)
7905 return i;
7906 }
7907 }
7908
7909 i = select_idle_cpu(p, sd, has_idle_core, target);
7910 if ((unsigned)i < nr_cpumask_bits)
7911 return i;
7912
7913 /*
7914	 * For cluster machines which share a lower-level cache, such as L2 or
7915	 * the LLC tag, we tend to look for an idle CPU in the target's cluster
7916	 * first. But prev_cpu or recent_used_cpu may also be a good candidate;
7917	 * use them if possible when no idle CPU was found in select_idle_cpu().
7918 */
7919 if ((unsigned int)prev_aff < nr_cpumask_bits)
7920 return prev_aff;
7921 if ((unsigned int)recent_used_cpu < nr_cpumask_bits)
7922 return recent_used_cpu;
7923
7924 return target;
7925}
7926
7927/**
7928 * cpu_util() - Estimates the amount of CPU capacity used by CFS tasks.
7929 * @cpu: the CPU to get the utilization for
7930 * @p: task for which the CPU utilization should be predicted or NULL
7931 * @dst_cpu: CPU @p migrates to, -1 if @p moves from @cpu or @p == NULL
7932 * @boost: 1 to enable boosting, otherwise 0
7933 *
7934 * The unit of the return value must be the same as the one of CPU capacity
7935 * so that CPU utilization can be compared with CPU capacity.
7936 *
7937 * CPU utilization is the sum of running time of runnable tasks plus the
7938 * recent utilization of currently non-runnable tasks on that CPU.
7939 * It represents the amount of CPU capacity currently used by CFS tasks in
7940 * the range [0..max CPU capacity] with max CPU capacity being the CPU
7941 * capacity at f_max.
7942 *
7943 * The estimated CPU utilization is defined as the maximum between CPU
7944 * utilization and sum of the estimated utilization of the currently
7945 * runnable tasks on that CPU. It preserves a utilization "snapshot" of
7946 * previously-executed tasks, which helps better deduce how busy a CPU will
7947 * be when a long-sleeping task wakes up. The contribution to CPU utilization
7948 * of such a task would be significantly decayed at this point of time.
7949 *
7950 * Boosted CPU utilization is defined as max(CPU runnable, CPU utilization).
7951 * CPU contention for CFS tasks can be detected by CPU runnable > CPU
7952 * utilization. Boosting is implemented in cpu_util() so that internal
7953 * users (e.g. EAS) can use it next to external users (e.g. schedutil),
7954 * the latter via cpu_util_cfs_boost().
7955 *
7956 * CPU utilization can be higher than the current CPU capacity
7957 * (f_curr/f_max * max CPU capacity) or even the max CPU capacity because
7958 * of rounding errors as well as task migrations or wakeups of new tasks.
7959 * CPU utilization has to be capped to fit into the [0..max CPU capacity]
7960 * range. Otherwise a group of CPUs (CPU0 util = 121% + CPU1 util = 80%)
7961 * could be seen as over-utilized even though CPU1 has 20% of spare CPU
7962 * capacity. CPU utilization is allowed to overshoot current CPU capacity
7963 * though since this is useful for predicting the CPU capacity required
7964 * after task migrations (scheduler-driven DVFS).
7965 *
7966 * Return: (Boosted) (estimated) utilization for the specified CPU.
7967 */
7968static unsigned long
7969cpu_util(int cpu, struct task_struct *p, int dst_cpu, int boost)
7970{
7971 struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
7972 unsigned long util = READ_ONCE(cfs_rq->avg.util_avg);
7973 unsigned long runnable;
7974
7975 if (boost) {
7976 runnable = READ_ONCE(cfs_rq->avg.runnable_avg);
7977 util = max(util, runnable);
7978 }
7979
7980 /*
7981 * If @dst_cpu is -1 or @p migrates from @cpu to @dst_cpu remove its
7982 * contribution. If @p migrates from another CPU to @cpu add its
7983 * contribution. In all the other cases @cpu is not impacted by the
7984 * migration so its util_avg is already correct.
7985 */
7986 if (p && task_cpu(p) == cpu && dst_cpu != cpu)
7987 lsub_positive(&util, task_util(p));
7988 else if (p && task_cpu(p) != cpu && dst_cpu == cpu)
7989 util += task_util(p);
7990
7991 if (sched_feat(UTIL_EST)) {
7992 unsigned long util_est;
7993
7994 util_est = READ_ONCE(cfs_rq->avg.util_est);
7995
7996 /*
7997 * During wake-up @p isn't enqueued yet and doesn't contribute
7998 * to any cpu_rq(cpu)->cfs.avg.util_est.
7999 * If @dst_cpu == @cpu add it to "simulate" cpu_util after @p
8000 * has been enqueued.
8001 *
8002 * During exec (@dst_cpu = -1) @p is enqueued and does
8003 * contribute to cpu_rq(cpu)->cfs.util_est.
8004 * Remove it to "simulate" cpu_util without @p's contribution.
8005 *
8006 * Despite the task_on_rq_queued(@p) check there is still a
8007 * small window for a possible race when an exec
8008 * select_task_rq_fair() races with LB's detach_task().
8009 *
8010 * detach_task()
8011 * deactivate_task()
8012 * p->on_rq = TASK_ON_RQ_MIGRATING;
8013 * -------------------------------- A
8014 * dequeue_task() \
8015 * dequeue_task_fair() + Race Time
8016 * util_est_dequeue() /
8017 * -------------------------------- B
8018 *
8019 * The additional check "current == p" is required to further
8020 * reduce the race window.
8021 */
8022 if (dst_cpu == cpu)
8023 util_est += _task_util_est(p);
8024 else if (p && unlikely(task_on_rq_queued(p) || current == p))
8025 lsub_positive(&util_est, _task_util_est(p));
8026
8027 util = max(util, util_est);
8028 }
8029
8030 return min(util, arch_scale_cpu_capacity(cpu));
8031}
8032
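/*
 * Wrappers for external users: cpu_util_cfs() returns the plain estimated
 * utilization, cpu_util_cfs_boost() additionally takes the runnable average
 * into account (see the @boost description above).
 */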
8033unsigned long cpu_util_cfs(int cpu)
8034{
8035 return cpu_util(cpu, NULL, -1, 0);
8036}
8037
8038unsigned long cpu_util_cfs_boost(int cpu)
8039{
8040 return cpu_util(cpu, NULL, -1, 1);
8041}
8042
8043/*
8044 * cpu_util_without: compute cpu utilization without any contributions from *p
8045 * @cpu: the CPU whose utilization is requested
8046 * @p: the task whose utilization should be discounted
8047 *
8048 * The utilization of a CPU is defined by the utilization of tasks currently
8049 * enqueued on that CPU as well as tasks which are currently sleeping after an
8050 * execution on that CPU.
8051 *
8052 * This method returns the utilization of the specified CPU by discounting the
8053 * utilization of the specified task, whenever the task is currently
8054 * contributing to the CPU utilization.
8055 */
8056static unsigned long cpu_util_without(int cpu, struct task_struct *p)
8057{
8058 /* Task has no contribution or is new */
8059 if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
8060 p = NULL;
8061
8062 return cpu_util(cpu, p, -1, 0);
8063}
8064
8065/*
8066 * This function computes an effective utilization for the given CPU, to be
8067 * used for frequency selection given the linear relation: f = u * f_max.
8068 *
8069 * The scheduler tracks the following metrics:
8070 *
8071 * cpu_util_{cfs,rt,dl,irq}()
8072 * cpu_bw_dl()
8073 *
8074 * Where the cfs,rt and dl util numbers are tracked with the same metric and
8075 * synchronized windows and are thus directly comparable.
8076 *
8077 * The cfs,rt,dl utilization are the running times measured with rq->clock_task
8078 * which excludes things like IRQ and steal-time. These latter are then accrued
8079 * in the IRQ utilization.
8080 *
8081 * The DL bandwidth number OTOH is not a measured metric but a value computed
8082 * based on the task model parameters and gives the minimal utilization
8083 * required to meet deadlines.
8084 */
8085unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
8086 unsigned long *min,
8087 unsigned long *max)
8088{
8089 unsigned long util, irq, scale;
8090 struct rq *rq = cpu_rq(cpu);
8091
8092 scale = arch_scale_cpu_capacity(cpu);
8093
8094 /*
8095 * Early check to see if IRQ/steal time saturates the CPU, can be
8096 * because of inaccuracies in how we track these -- see
8097 * update_irq_load_avg().
8098 */
8099 irq = cpu_util_irq(rq);
8100 if (unlikely(irq >= scale)) {
8101 if (min)
8102 *min = scale;
8103 if (max)
8104 *max = scale;
8105 return scale;
8106 }
8107
8108 if (min) {
8109 /*
8110 * The minimum utilization returns the highest level between:
8111 * - the computed DL bandwidth needed with the IRQ pressure which
8112		 *   steals time from the deadline task.
8113		 * - the minimum performance requirement for CFS and/or RT.
8114 */
8115 *min = max(irq + cpu_bw_dl(rq), uclamp_rq_get(rq, UCLAMP_MIN));
8116
8117 /*
8118 * When an RT task is runnable and uclamp is not used, we must
8119 * ensure that the task will run at maximum compute capacity.
8120 */
8121 if (!uclamp_is_used() && rt_rq_is_runnable(&rq->rt))
8122 *min = max(*min, scale);
8123 }
8124
8125 /*
8126	 * Because the time spent on RT/DL tasks is visible as 'lost' time to
8127 * CFS tasks and we use the same metric to track the effective
8128 * utilization (PELT windows are synchronized) we can directly add them
8129 * to obtain the CPU's actual utilization.
8130 */
8131 util = util_cfs + cpu_util_rt(rq);
8132 util += cpu_util_dl(rq);
8133
8134 /*
8135 * The maximum hint is a soft bandwidth requirement, which can be lower
8136 * than the actual utilization because of uclamp_max requirements.
8137 */
8138 if (max)
8139 *max = min(scale, uclamp_rq_get(rq, UCLAMP_MAX));
8140
8141 if (util >= scale)
8142 return scale;
8143
8144 /*
8145 * There is still idle time; further improve the number by using the
8146 * IRQ metric. Because IRQ/steal time is hidden from the task clock we
8147 * need to scale the task numbers:
8148 *
8149 * max - irq
8150 * U' = irq + --------- * U
8151 * max
8152 */
8153 util = scale_irq_capacity(util, irq, scale);
8154 util += irq;
8155
8156 return min(scale, util);
8157}
8158
8159unsigned long sched_cpu_util(int cpu)
8160{
8161 return effective_cpu_util(cpu, cpu_util_cfs(cpu), NULL, NULL);
8162}
8163
8164/*
8165 * energy_env - Utilization landscape for energy estimation.
8166 * @task_busy_time: Utilization contribution by the task for which we test the
8167 * placement. Given by eenv_task_busy_time().
8168 * @pd_busy_time: Utilization of the whole perf domain without the task
8169 * contribution. Given by eenv_pd_busy_time().
8170 * @cpu_cap: Maximum CPU capacity for the perf domain.
8171 * @pd_cap: Entire perf domain capacity. (pd->nr_cpus * cpu_cap).
8172 */
8173struct energy_env {
8174 unsigned long task_busy_time;
8175 unsigned long pd_busy_time;
8176 unsigned long cpu_cap;
8177 unsigned long pd_cap;
8178};
8179
8180/*
8181 * Compute the task busy time for compute_energy(). This time cannot be
8182 * injected directly into effective_cpu_util() because of the IRQ scaling.
8183 * The latter only makes sense with the most recent CPUs where the task has
8184 * run.
8185 */
8186static inline void eenv_task_busy_time(struct energy_env *eenv,
8187 struct task_struct *p, int prev_cpu)
8188{
8189 unsigned long busy_time, max_cap = arch_scale_cpu_capacity(prev_cpu);
8190 unsigned long irq = cpu_util_irq(cpu_rq(prev_cpu));
8191
8192 if (unlikely(irq >= max_cap))
8193 busy_time = max_cap;
8194 else
8195 busy_time = scale_irq_capacity(task_util_est(p), irq, max_cap);
8196
8197 eenv->task_busy_time = busy_time;
8198}
8199
8200/*
8201 * Compute the perf_domain (PD) busy time for compute_energy(). It is based
8202 * on the utilization of each CPU in @pd_cpus; however, it doesn't take
8203 * clamping into account since the ratio (utilization / cpu_capacity) is
8204 * already enough to scale the EM-reported power consumption at the
8205 * (eventually clamped) cpu_capacity.
8206 *
8207 * The contribution of the task @p for which we want to estimate the
8208 * energy cost is removed (by cpu_util()) and must be calculated
8209 * separately (see eenv_task_busy_time). This ensures:
8210 *
8211 * - A stable PD utilization, no matter which CPU of that PD we want to place
8212 * the task on.
8213 *
8214 * - A fair comparison between CPUs as the task contribution (task_util())
8215 * will always be the same no matter which CPU utilization we rely on
8216 * (util_avg or util_est).
8217 *
8218 * Set @eenv busy time for the PD that spans @pd_cpus. This busy time can't
8219 * exceed @eenv->pd_cap.
8220 */
8221static inline void eenv_pd_busy_time(struct energy_env *eenv,
8222 struct cpumask *pd_cpus,
8223 struct task_struct *p)
8224{
8225 unsigned long busy_time = 0;
8226 int cpu;
8227
8228 for_each_cpu(cpu, pd_cpus) {
8229 unsigned long util = cpu_util(cpu, p, -1, 0);
8230
8231 busy_time += effective_cpu_util(cpu, util, NULL, NULL);
8232 }
8233
8234 eenv->pd_busy_time = min(eenv->pd_cap, busy_time);
8235}
8236
8237/*
8238 * Compute the maximum utilization for compute_energy() when the task @p
8239 * is placed on the cpu @dst_cpu.
8240 *
8241 * Returns the maximum utilization among @eenv->cpus. This utilization can't
8242 * exceed @eenv->cpu_cap.
8243 */
8244static inline unsigned long
8245eenv_pd_max_util(struct energy_env *eenv, struct cpumask *pd_cpus,
8246 struct task_struct *p, int dst_cpu)
8247{
8248 unsigned long max_util = 0;
8249 int cpu;
8250
8251 for_each_cpu(cpu, pd_cpus) {
8252 struct task_struct *tsk = (cpu == dst_cpu) ? p : NULL;
8253 unsigned long util = cpu_util(cpu, p, dst_cpu, 1);
8254 unsigned long eff_util, min, max;
8255
8256 /*
8257 * Performance domain frequency: utilization clamping
8258 * must be considered since it affects the selection
8259 * of the performance domain frequency.
8260 * NOTE: in case RT tasks are running, by default the min
8261 * utilization can be max OPP.
8262 */
8263 eff_util = effective_cpu_util(cpu, util, &min, &max);
8264
8265 /* Task's uclamp can modify min and max value */
8266 if (tsk && uclamp_is_used()) {
8267 min = max(min, uclamp_eff_value(p, UCLAMP_MIN));
8268
8269 /*
8270 * If there is no active max uclamp constraint,
8271 * directly use task's one, otherwise keep max.
8272 */
8273 if (uclamp_rq_is_idle(cpu_rq(cpu)))
8274 max = uclamp_eff_value(p, UCLAMP_MAX);
8275 else
8276 max = max(max, uclamp_eff_value(p, UCLAMP_MAX));
8277 }
8278
8279 eff_util = sugov_effective_cpu_perf(cpu, eff_util, min, max);
8280 max_util = max(max_util, eff_util);
8281 }
8282
8283 return min(max_util, eenv->cpu_cap);
8284}
8285
8286/*
8287 * compute_energy(): Use the Energy Model to estimate the energy that @pd would
8288 * consume for a given utilization landscape @eenv. When @dst_cpu < 0, the task
8289 * contribution is ignored.
8290 */
8291static inline unsigned long
8292compute_energy(struct energy_env *eenv, struct perf_domain *pd,
8293 struct cpumask *pd_cpus, struct task_struct *p, int dst_cpu)
8294{
8295 unsigned long max_util = eenv_pd_max_util(eenv, pd_cpus, p, dst_cpu);
8296 unsigned long busy_time = eenv->pd_busy_time;
8297 unsigned long energy;
8298
8299 if (dst_cpu >= 0)
8300 busy_time = min(eenv->pd_cap, busy_time + eenv->task_busy_time);
8301
8302 energy = em_cpu_energy(pd->em_pd, max_util, busy_time, eenv->cpu_cap);
8303
8304 trace_sched_compute_energy_tp(p, dst_cpu, energy, max_util, busy_time);
8305
8306 return energy;
8307}
8308
8309/*
8310 * find_energy_efficient_cpu(): Find most energy-efficient target CPU for the
8311 * waking task. find_energy_efficient_cpu() looks for the CPU with maximum
8312 * spare capacity in each performance domain and uses it as a potential
8313 * candidate to execute the task. Then, it uses the Energy Model to figure
8314 * out which of the CPU candidates is the most energy-efficient.
8315 *
8316 * The rationale for this heuristic is as follows. In a performance domain,
8317 * all the most energy efficient CPU candidates (according to the Energy
8318 * Model) are those for which we'll request a low frequency. When there are
8319 * several CPUs for which the frequency request will be the same, we don't
8320 * have enough data to break the tie between them, because the Energy Model
8321 * only includes active power costs. With this model, if we assume that
8322 * frequency requests follow utilization (e.g. using schedutil), the CPU with
8323 * the maximum spare capacity in a performance domain is guaranteed to be among
8324 * the best candidates of the performance domain.
8325 *
8326 * In practice, it could be preferable from an energy standpoint to pack
8327 * small tasks on a CPU in order to let other CPUs go in deeper idle states,
8328 * but that could also hurt our chances to go cluster idle, and we have no
8329 * ways to tell with the current Energy Model if this is actually a good
8330 * idea or not. So, find_energy_efficient_cpu() basically favors
8331 * cluster-packing, and spreading inside a cluster. That should at least be
8332 * a good thing for latency, and this is consistent with the idea that most
8333 * of the energy savings of EAS come from the asymmetry of the system, and
8334 * not so much from breaking the tie between identical CPUs. That's also the
8335 * reason why EAS is enabled in the topology code only for systems where
8336 * SD_ASYM_CPUCAPACITY is set.
8337 *
8338 * NOTE: Forkees are not accepted in the energy-aware wake-up path because
8339 * they don't have any useful utilization data yet and it's not possible to
8340 * forecast their impact on energy consumption. Consequently, they will be
8341 * placed by sched_balance_find_dst_cpu() on the least loaded CPU, which might turn out
8342 * to be energy-inefficient in some use-cases. The alternative would be to
8343 * bias new tasks towards specific types of CPUs first, or to try to infer
8344 * their util_avg from the parent task, but those heuristics could hurt
8345 * other use-cases too. So, until someone finds a better way to solve this,
8346 * let's keep things simple by re-using the existing slow path.
8347 */
8348static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
8349{
8350 struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask);
8351 unsigned long prev_delta = ULONG_MAX, best_delta = ULONG_MAX;
8352 unsigned long p_util_min = uclamp_is_used() ? uclamp_eff_value(p, UCLAMP_MIN) : 0;
8353 unsigned long p_util_max = uclamp_is_used() ? uclamp_eff_value(p, UCLAMP_MAX) : 1024;
8354 struct root_domain *rd = this_rq()->rd;
8355 int cpu, best_energy_cpu, target = -1;
8356 int prev_fits = -1, best_fits = -1;
8357 unsigned long best_actual_cap = 0;
8358 unsigned long prev_actual_cap = 0;
8359 struct sched_domain *sd;
8360 struct perf_domain *pd;
8361 struct energy_env eenv;
8362
8363 rcu_read_lock();
8364 pd = rcu_dereference(rd->pd);
8365 if (!pd)
8366 goto unlock;
8367
8368 /*
8369 * Energy-aware wake-up happens on the lowest sched_domain starting
8370 * from sd_asym_cpucapacity spanning over this_cpu and prev_cpu.
8371 */
8372 sd = rcu_dereference(*this_cpu_ptr(&sd_asym_cpucapacity));
8373 while (sd && !cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
8374 sd = sd->parent;
8375 if (!sd)
8376 goto unlock;
8377
8378 target = prev_cpu;
8379
8380 sync_entity_load_avg(&p->se);
8381 if (!task_util_est(p) && p_util_min == 0)
8382 goto unlock;
8383
8384 eenv_task_busy_time(&eenv, p, prev_cpu);
8385
8386 for (; pd; pd = pd->next) {
8387 unsigned long util_min = p_util_min, util_max = p_util_max;
8388 unsigned long cpu_cap, cpu_actual_cap, util;
8389 long prev_spare_cap = -1, max_spare_cap = -1;
8390 unsigned long rq_util_min, rq_util_max;
8391 unsigned long cur_delta, base_energy;
8392 int max_spare_cap_cpu = -1;
8393 int fits, max_fits = -1;
8394
8395 cpumask_and(cpus, perf_domain_span(pd), cpu_online_mask);
8396
8397 if (cpumask_empty(cpus))
8398 continue;
8399
8400 /* Account external pressure for the energy estimation */
8401 cpu = cpumask_first(cpus);
8402 cpu_actual_cap = get_actual_cpu_capacity(cpu);
8403
8404 eenv.cpu_cap = cpu_actual_cap;
8405 eenv.pd_cap = 0;
8406
8407 for_each_cpu(cpu, cpus) {
8408 struct rq *rq = cpu_rq(cpu);
8409
8410 eenv.pd_cap += cpu_actual_cap;
8411
8412 if (!cpumask_test_cpu(cpu, sched_domain_span(sd)))
8413 continue;
8414
8415 if (!cpumask_test_cpu(cpu, p->cpus_ptr))
8416 continue;
8417
8418 util = cpu_util(cpu, p, cpu, 0);
8419 cpu_cap = capacity_of(cpu);
8420
8421 /*
8422 * Skip CPUs that cannot satisfy the capacity request.
8423 * IOW, placing the task there would make the CPU
8424 * overutilized. Take uclamp into account to see how
8425 * much capacity we can get out of the CPU; this is
8426 * aligned with sched_cpu_util().
8427 */
8428 if (uclamp_is_used() && !uclamp_rq_is_idle(rq)) {
8429 /*
8430				 * Open code uclamp_rq_util_with() except for
8431				 * the clamp() part, i.e. apply max aggregation
8432				 * only. The util_fits_cpu() logic requires
8433				 * operating on the non-clamped util but must
8434				 * use the max-aggregated uclamp_{min, max}.
8435 */
8436 rq_util_min = uclamp_rq_get(rq, UCLAMP_MIN);
8437 rq_util_max = uclamp_rq_get(rq, UCLAMP_MAX);
8438
8439 util_min = max(rq_util_min, p_util_min);
8440 util_max = max(rq_util_max, p_util_max);
8441 }
8442
8443 fits = util_fits_cpu(util, util_min, util_max, cpu);
8444 if (!fits)
8445 continue;
8446
8447 lsub_positive(&cpu_cap, util);
8448
8449 if (cpu == prev_cpu) {
8450 /* Always use prev_cpu as a candidate. */
8451 prev_spare_cap = cpu_cap;
8452 prev_fits = fits;
8453 } else if ((fits > max_fits) ||
8454 ((fits == max_fits) && ((long)cpu_cap > max_spare_cap))) {
8455 /*
8456 * Find the CPU with the maximum spare capacity
8457 * among the remaining CPUs in the performance
8458 * domain.
8459 */
8460 max_spare_cap = cpu_cap;
8461 max_spare_cap_cpu = cpu;
8462 max_fits = fits;
8463 }
8464 }
8465
8466 if (max_spare_cap_cpu < 0 && prev_spare_cap < 0)
8467 continue;
8468
8469 eenv_pd_busy_time(&eenv, cpus, p);
8470 /* Compute the 'base' energy of the pd, without @p */
8471 base_energy = compute_energy(&eenv, pd, cpus, p, -1);
8472
8473 /* Evaluate the energy impact of using prev_cpu. */
8474 if (prev_spare_cap > -1) {
8475 prev_delta = compute_energy(&eenv, pd, cpus, p,
8476 prev_cpu);
8477 /* CPU utilization has changed */
8478 if (prev_delta < base_energy)
8479 goto unlock;
8480 prev_delta -= base_energy;
8481 prev_actual_cap = cpu_actual_cap;
8482 best_delta = min(best_delta, prev_delta);
8483 }
8484
8485 /* Evaluate the energy impact of using max_spare_cap_cpu. */
8486 if (max_spare_cap_cpu >= 0 && max_spare_cap > prev_spare_cap) {
8487 /* Current best energy cpu fits better */
8488 if (max_fits < best_fits)
8489 continue;
8490
8491 /*
8492 * Both don't fit performance hint (i.e. uclamp_min)
8493 * but best energy cpu has better capacity.
8494 */
8495 if ((max_fits < 0) &&
8496 (cpu_actual_cap <= best_actual_cap))
8497 continue;
8498
8499 cur_delta = compute_energy(&eenv, pd, cpus, p,
8500 max_spare_cap_cpu);
8501 /* CPU utilization has changed */
8502 if (cur_delta < base_energy)
8503 goto unlock;
8504 cur_delta -= base_energy;
8505
8506 /*
8507 * Both fit for the task but best energy cpu has lower
8508 * energy impact.
8509 */
8510 if ((max_fits > 0) && (best_fits > 0) &&
8511 (cur_delta >= best_delta))
8512 continue;
8513
8514 best_delta = cur_delta;
8515 best_energy_cpu = max_spare_cap_cpu;
8516 best_fits = max_fits;
8517 best_actual_cap = cpu_actual_cap;
8518 }
8519 }
8520 rcu_read_unlock();
8521
8522 if ((best_fits > prev_fits) ||
8523 ((best_fits > 0) && (best_delta < prev_delta)) ||
8524 ((best_fits < 0) && (best_actual_cap > prev_actual_cap)))
8525 target = best_energy_cpu;
8526
8527 return target;
8528
8529unlock:
8530 rcu_read_unlock();
8531
8532 return target;
8533}
8534
8535/*
8536 * select_task_rq_fair: Select target runqueue for the waking task in domains
8537 * that have the relevant SD flag set. In practice, this is SD_BALANCE_WAKE,
8538 * SD_BALANCE_FORK, or SD_BALANCE_EXEC.
8539 *
8540 * Balances load by selecting the idlest CPU in the idlest group, or under
8541 * certain conditions an idle sibling CPU if the domain has SD_WAKE_AFFINE set.
8542 *
8543 * Returns the target CPU number.
8544 */
8545static int
8546select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
8547{
8548 int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING);
8549 struct sched_domain *tmp, *sd = NULL;
8550 int cpu = smp_processor_id();
8551 int new_cpu = prev_cpu;
8552 int want_affine = 0;
8553 /* SD_flags and WF_flags share the first nibble */
8554 int sd_flag = wake_flags & 0xF;
8555
8556 /*
8557 * required for stable ->cpus_allowed
8558 */
8559 lockdep_assert_held(&p->pi_lock);
8560 if (wake_flags & WF_TTWU) {
8561 record_wakee(p);
8562
8563 if ((wake_flags & WF_CURRENT_CPU) &&
8564 cpumask_test_cpu(cpu, p->cpus_ptr))
8565 return cpu;
8566
8567 if (!is_rd_overutilized(this_rq()->rd)) {
8568 new_cpu = find_energy_efficient_cpu(p, prev_cpu);
8569 if (new_cpu >= 0)
8570 return new_cpu;
8571 new_cpu = prev_cpu;
8572 }
8573
8574 want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, p->cpus_ptr);
8575 }
8576
8577 rcu_read_lock();
8578 for_each_domain(cpu, tmp) {
8579 /*
8580 * If both 'cpu' and 'prev_cpu' are part of this domain,
8581 * cpu is a valid SD_WAKE_AFFINE target.
8582 */
8583 if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
8584 cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
8585 if (cpu != prev_cpu)
8586 new_cpu = wake_affine(tmp, p, cpu, prev_cpu, sync);
8587
8588 sd = NULL; /* Prefer wake_affine over balance flags */
8589 break;
8590 }
8591
8592 /*
8593 * Usually only true for WF_EXEC and WF_FORK, as sched_domains
8594 * usually do not have SD_BALANCE_WAKE set. That means wakeup
8595 * will usually go to the fast path.
8596 */
8597 if (tmp->flags & sd_flag)
8598 sd = tmp;
8599 else if (!want_affine)
8600 break;
8601 }
8602
8603 if (unlikely(sd)) {
8604 /* Slow path */
8605 new_cpu = sched_balance_find_dst_cpu(sd, p, cpu, prev_cpu, sd_flag);
8606 } else if (wake_flags & WF_TTWU) { /* XXX always ? */
8607 /* Fast path */
8608 new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
8609 }
8610 rcu_read_unlock();
8611
8612 return new_cpu;
8613}
8614
8615/*
8616 * Called immediately before a task is migrated to a new CPU; task_cpu(p) and
8617 * cfs_rq_of(p) references at time of call are still valid and identify the
8618 * previous CPU. The caller guarantees p->pi_lock or task_rq(p)->lock is held.
8619 */
8620static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
8621{
8622 struct sched_entity *se = &p->se;
8623
8624 if (!task_on_rq_migrating(p)) {
8625 remove_entity_load_avg(se);
8626
8627 /*
8628 * Here, the task's PELT values have been updated according to
8629 * the current rq's clock. But if that clock hasn't been
8630 * updated in a while, a substantial idle time will be missed,
8631 * leading to an inflation after wake-up on the new rq.
8632 *
8633 * Estimate the missing time from the cfs_rq last_update_time
8634 * and update sched_avg to improve the PELT continuity after
8635 * migration.
8636 */
8637 migrate_se_pelt_lag(se);
8638 }
8639
8640 /* Tell new CPU we are migrated */
8641 se->avg.last_update_time = 0;
8642
8643 update_scan_period(p, new_cpu);
8644}
8645
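/*
 * A dying task may still be on the runqueue as a delayed dequeue; flush that
 * first, then drop its load-average contribution from the cfs_rq.
 */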
8646static void task_dead_fair(struct task_struct *p)
8647{
8648 struct sched_entity *se = &p->se;
8649
8650 if (se->sched_delayed) {
8651 struct rq_flags rf;
8652 struct rq *rq;
8653
8654 rq = task_rq_lock(p, &rf);
8655 if (se->sched_delayed) {
8656 update_rq_clock(rq);
8657 dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
8658 }
8659 task_rq_unlock(rq, p, &rf);
8660 }
8661
8662 remove_entity_load_avg(se);
8663}
8664
8665/*
8666 * Set the max capacity the task is allowed to run at for misfit detection.
8667 */
8668static void set_task_max_allowed_capacity(struct task_struct *p)
8669{
8670 struct asym_cap_data *entry;
8671
8672 if (!sched_asym_cpucap_active())
8673 return;
8674
8675 rcu_read_lock();
8676 list_for_each_entry_rcu(entry, &asym_cap_list, link) {
8677 cpumask_t *cpumask;
8678
8679 cpumask = cpu_capacity_span(entry);
8680 if (!cpumask_intersects(p->cpus_ptr, cpumask))
8681 continue;
8682
8683 p->max_allowed_capacity = entry->capacity;
8684 break;
8685 }
8686 rcu_read_unlock();
8687}
8688
8689static void set_cpus_allowed_fair(struct task_struct *p, struct affinity_context *ctx)
8690{
8691 set_cpus_allowed_common(p, ctx);
8692 set_task_max_allowed_capacity(p);
8693}
8694
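/*
 * sched_class::balance() callback: report whether fair tasks are runnable,
 * otherwise try a newidle balance to pull some in.
 */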
8695static int
8696balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
8697{
8698 if (sched_fair_runnable(rq))
8699 return 1;
8700
8701 return sched_balance_newidle(rq, rf) != 0;
8702}
8703#else
8704static inline void set_task_max_allowed_capacity(struct task_struct *p) {}
8705#endif /* CONFIG_SMP */
8706
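/*
 * Mark @se, and each of its parent entities in the cgroup hierarchy, as the
 * preferred "next" pick of its cfs_rq so that pick_next_entity() favours it
 * (used for wakeup preemption and yield_to()).
 */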
8707static void set_next_buddy(struct sched_entity *se)
8708{
8709 for_each_sched_entity(se) {
8710 if (SCHED_WARN_ON(!se->on_rq))
8711 return;
8712 if (se_is_idle(se))
8713 return;
8714 cfs_rq_of(se)->next = se;
8715 }
8716}
8717
8718/*
8719 * Preempt the current task with a newly woken task if needed:
8720 */
8721static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int wake_flags)
8722{
8723 struct task_struct *donor = rq->donor;
8724 struct sched_entity *se = &donor->se, *pse = &p->se;
8725 struct cfs_rq *cfs_rq = task_cfs_rq(donor);
8726 int cse_is_idle, pse_is_idle;
8727
8728 if (unlikely(se == pse))
8729 return;
8730
8731 /*
8732 * This is possible from callers such as attach_tasks(), in which we
8733	 * unconditionally wakeup_preempt() after an enqueue (which may have
8734	 * led to a throttle). This both saves work and prevents false
8735 * next-buddy nomination below.
8736 */
8737 if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
8738 return;
8739
8740 if (sched_feat(NEXT_BUDDY) && !(wake_flags & WF_FORK) && !pse->sched_delayed) {
8741 set_next_buddy(pse);
8742 }
8743
8744 /*
8745	 * We can come here with TIF_NEED_RESCHED already set from the new task
8746	 * wake-up path.
8747 *
8748 * Note: this also catches the edge-case of curr being in a throttled
8749 * group (e.g. via set_curr_task), since update_curr() (in the
8750 * enqueue of curr) will have resulted in resched being set. This
8751 * prevents us from potentially nominating it as a false LAST_BUDDY
8752 * below.
8753 */
8754 if (test_tsk_need_resched(rq->curr))
8755 return;
8756
8757 if (!sched_feat(WAKEUP_PREEMPTION))
8758 return;
8759
8760 find_matching_se(&se, &pse);
8761 WARN_ON_ONCE(!pse);
8762
8763 cse_is_idle = se_is_idle(se);
8764 pse_is_idle = se_is_idle(pse);
8765
8766 /*
8767 * Preempt an idle entity in favor of a non-idle entity (and don't preempt
8768 * in the inverse case).
8769 */
8770 if (cse_is_idle && !pse_is_idle)
8771 goto preempt;
8772 if (cse_is_idle != pse_is_idle)
8773 return;
8774
8775 /*
8776 * BATCH and IDLE tasks do not preempt others.
8777 */
8778 if (unlikely(!normal_policy(p->policy)))
8779 return;
8780
8781 cfs_rq = cfs_rq_of(se);
8782 update_curr(cfs_rq);
8783 /*
8784 * If @p has a shorter slice than current and @p is eligible, override
8785 * current's slice protection in order to allow preemption.
8786 *
8787 * Note that even if @p does not turn out to be the most eligible
8788 * task at this moment, current's slice protection will be lost.
8789 */
8790 if (do_preempt_short(cfs_rq, pse, se) && se->vlag == se->deadline)
8791 se->vlag = se->deadline + 1;
8792
8793 /*
8794 * If @p has become the most eligible task, force preemption.
8795 */
8796 if (pick_eevdf(cfs_rq) == pse)
8797 goto preempt;
8798
8799 return;
8800
8801preempt:
8802 resched_curr_lazy(rq);
8803}
8804
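/*
 * Pick the next fair task: walk down the cfs_rq hierarchy, selecting the
 * most eligible entity at each level until a task entity is reached.
 * Returns NULL when no fair tasks are runnable, and restarts the walk when
 * bandwidth throttling empties the runqueue mid-pick.
 */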
8805static struct task_struct *pick_task_fair(struct rq *rq)
8806{
8807 struct sched_entity *se;
8808 struct cfs_rq *cfs_rq;
8809
8810again:
8811 cfs_rq = &rq->cfs;
8812 if (!cfs_rq->nr_running)
8813 return NULL;
8814
8815 do {
8816 /* Might not have done put_prev_entity() */
8817 if (cfs_rq->curr && cfs_rq->curr->on_rq)
8818 update_curr(cfs_rq);
8819
8820 if (unlikely(check_cfs_rq_runtime(cfs_rq)))
8821 goto again;
8822
8823 se = pick_next_entity(rq, cfs_rq);
8824 if (!se)
8825 goto again;
8826 cfs_rq = group_cfs_rq(se);
8827 } while (cfs_rq);
8828
8829 return task_of(se);
8830}
8831
8832static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool first);
8833static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first);
8834
8835struct task_struct *
8836pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
8837{
8838 struct sched_entity *se;
8839 struct task_struct *p;
8840 int new_tasks;
8841
8842again:
8843 p = pick_task_fair(rq);
8844 if (!p)
8845 goto idle;
8846 se = &p->se;
8847
8848#ifdef CONFIG_FAIR_GROUP_SCHED
8849 if (prev->sched_class != &fair_sched_class)
8850 goto simple;
8851
8852 __put_prev_set_next_dl_server(rq, prev, p);
8853
8854 /*
8855	 * Because of the set_next_buddy() in dequeue_task_fair() it is rather
8856	 * likely that the next task is from the same cgroup as the current one.
8857	 *
8858	 * Therefore attempt to avoid putting and setting the entire cgroup
8859	 * hierarchy; only change the part that actually changes.
8860	 *
8861	 * Since we haven't yet done put_prev_entity(), if the selected task
8862	 * is a different task than the one we started out with, try to touch
8863	 * the least amount of cfs_rqs.
8864 */
8865 if (prev != p) {
8866 struct sched_entity *pse = &prev->se;
8867 struct cfs_rq *cfs_rq;
8868
8869 while (!(cfs_rq = is_same_group(se, pse))) {
8870 int se_depth = se->depth;
8871 int pse_depth = pse->depth;
8872
8873 if (se_depth <= pse_depth) {
8874 put_prev_entity(cfs_rq_of(pse), pse);
8875 pse = parent_entity(pse);
8876 }
8877 if (se_depth >= pse_depth) {
8878 set_next_entity(cfs_rq_of(se), se);
8879 se = parent_entity(se);
8880 }
8881 }
8882
8883 put_prev_entity(cfs_rq, pse);
8884 set_next_entity(cfs_rq, se);
8885
8886 __set_next_task_fair(rq, p, true);
8887 }
8888
8889 return p;
8890
8891simple:
8892#endif
8893 put_prev_set_next_task(rq, prev, p);
8894 return p;
8895
8896idle:
8897 if (!rf)
8898 return NULL;
8899
8900 new_tasks = sched_balance_newidle(rq, rf);
8901
8902 /*
8903 * Because sched_balance_newidle() releases (and re-acquires) rq->lock, it is
8904 * possible for any higher priority task to appear. In that case we
8905 * must re-start the pick_next_entity() loop.
8906 */
8907 if (new_tasks < 0)
8908 return RETRY_TASK;
8909
8910 if (new_tasks > 0)
8911 goto again;
8912
8913 /*
8914 * rq is about to be idle, check if we need to update the
8915 * lost_idle_time of clock_pelt
8916 */
8917 update_idle_rq_clock_pelt(rq);
8918
8919 return NULL;
8920}
8921
8922static struct task_struct *__pick_next_task_fair(struct rq *rq, struct task_struct *prev)
8923{
8924 return pick_next_task_fair(rq, prev, NULL);
8925}
8926
8927static bool fair_server_has_tasks(struct sched_dl_entity *dl_se)
8928{
8929 return !!dl_se->rq->cfs.nr_running;
8930}
8931
8932static struct task_struct *fair_server_pick_task(struct sched_dl_entity *dl_se)
8933{
8934 return pick_task_fair(dl_se->rq);
8935}
8936
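/*
 * Set up the per-runqueue deadline-server entity through which fair tasks
 * receive their bandwidth; see dl_server_init() and the two callbacks above.
 */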
8937void fair_server_init(struct rq *rq)
8938{
8939 struct sched_dl_entity *dl_se = &rq->fair_server;
8940
8941 init_dl_entity(dl_se);
8942
8943 dl_server_init(dl_se, rq, fair_server_has_tasks, fair_server_pick_task);
8944}
8945
8946/*
8947 * Account for a descheduled task:
8948 */
8949static void put_prev_task_fair(struct rq *rq, struct task_struct *prev, struct task_struct *next)
8950{
8951 struct sched_entity *se = &prev->se;
8952 struct cfs_rq *cfs_rq;
8953
8954 for_each_sched_entity(se) {
8955 cfs_rq = cfs_rq_of(se);
8956 put_prev_entity(cfs_rq, se);
8957 }
8958}
8959
8960/*
8961 * sched_yield() is very simple
8962 */
8963static void yield_task_fair(struct rq *rq)
8964{
8965 struct task_struct *curr = rq->curr;
8966 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
8967 struct sched_entity *se = &curr->se;
8968
8969 /*
8970 * Are we the only task in the tree?
8971 */
8972 if (unlikely(rq->nr_running == 1))
8973 return;
8974
8975 clear_buddies(cfs_rq, se);
8976
8977 update_rq_clock(rq);
8978 /*
8979 * Update run-time statistics of the 'current'.
8980 */
8981 update_curr(cfs_rq);
8982 /*
8983 * Tell update_rq_clock() that we've just updated,
8984 * so we don't do microscopic update in schedule()
8985 * and double the fastpath cost.
8986 */
8987 rq_clock_skip_update(rq);
8988
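	/*
	 * Push the deadline back by one slice; under EEVDF this makes the
	 * yielding task less likely to be picked again right away.
	 */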
8989 se->deadline += calc_delta_fair(se->slice, se);
8990}
8991
8992static bool yield_to_task_fair(struct rq *rq, struct task_struct *p)
8993{
8994 struct sched_entity *se = &p->se;
8995
8996 /* throttled hierarchies are not runnable */
8997 if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
8998 return false;
8999
9000 /* Tell the scheduler that we'd really like se to run next. */
9001 set_next_buddy(se);
9002
9003 yield_task_fair(rq);
9004
9005 return true;
9006}
9007
9008#ifdef CONFIG_SMP
9009/**************************************************
9010 * Fair scheduling class load-balancing methods.
9011 *
9012 * BASICS
9013 *
9014 * The purpose of load-balancing is to achieve the same basic fairness the
9015 * per-CPU scheduler provides, namely provide a proportional amount of compute
9016 * time to each task. This is expressed in the following equation:
9017 *
9018 * W_i,n/P_i == W_j,n/P_j for all i,j (1)
9019 *
9020 * Where W_i,n is the n-th weight average for CPU i. The instantaneous weight
9021 * W_i,0 is defined as:
9022 *
9023 * W_i,0 = \Sum_j w_i,j (2)
9024 *
9025 * Where w_i,j is the weight of the j-th runnable task on CPU i. This weight
9026 * is derived from the nice value as per sched_prio_to_weight[].
9027 *
9028 * The weight average is an exponential decay average of the instantaneous
9029 * weight:
9030 *
9031 * W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3)
9032 *
9033 * C_i is the compute capacity of CPU i, typically it is the
9034 * fraction of 'recent' time available for SCHED_OTHER task execution. But it
9035 * can also include other factors [XXX].
9036 *
9037 * To achieve this balance we define a measure of imbalance which follows
9038 * directly from (1):
9039 *
9040 * imb_i,j = max{ avg(W/C), W_i/C_i } - min{ avg(W/C), W_j/C_j } (4)
9041 *
9042 * We then move tasks around to minimize the imbalance. In the continuous
9043 * function space it is obvious this converges, in the discrete case we get
9044 * a few fun cases generally called infeasible weight scenarios.
9045 *
9046 * [XXX expand on:
9047 * - infeasible weights;
9048 * - local vs global optima in the discrete case. ]
9049 *
9050 *
9051 * SCHED DOMAINS
9052 *
9053 * In order to solve the imbalance equation (4), and avoid the obvious O(n^2)
9054 * for all i,j solution, we create a tree of CPUs that follows the hardware
9055 * topology where each level pairs two lower groups (or better). This results
9056 * in O(log n) layers. Furthermore we reduce the number of CPUs going up the
9057 * tree to only the first of the previous level and we decrease the frequency
9058 * of load-balance at each level inversely proportional to the number of CPUs in
9059 * the groups.
9060 *
9061 * This yields:
9062 *
9063 * log_2 n 1 n
9064 * \Sum { --- * --- * 2^i } = O(n) (5)
9065 * i = 0 2^i 2^i
9066 * `- size of each group
9067 * | | `- number of CPUs doing load-balance
9068 * | `- freq
9069 * `- sum over all levels
9070 *
9071 * Coupled with a limit on how many tasks we can migrate every balance pass,
9072 * this makes (5) the runtime complexity of the balancer.
9073 *
9074 * An important property here is that each CPU is still (indirectly) connected
9075 * to every other CPU in at most O(log n) steps:
9076 *
9077 * The adjacency matrix of the resulting graph is given by:
9078 *
9079 * log_2 n
9080 * A_i,j = \Union (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1) (6)
9081 * k = 0
9082 *
9083 * And you'll find that:
9084 *
9085 * A^(log_2 n)_i,j != 0 for all i,j (7)
9086 *
9087 * Showing there's indeed a path between every CPU in at most O(log n) steps.
9088 * The task movement gives a factor of O(m), giving a convergence complexity
9089 * of:
9090 *
9091 * O(nm log n), n := nr_cpus, m := nr_tasks (8)
9092 *
9093 *
9094 * WORK CONSERVING
9095 *
9096 * In order to avoid CPUs going idle while there's still work to do, new idle
9097 * balancing is more aggressive and has the newly idle CPU iterate up the domain
9098 * tree itself instead of relying on other CPUs to bring it work.
9099 *
9100 * This adds some complexity to both (5) and (8) but it reduces the total idle
9101 * time.
9102 *
9103 * [XXX more?]
9104 *
9105 *
9106 * CGROUPS
9107 *
9108 * Cgroups make a horror show out of (2), instead of a simple sum we get:
9109 *
9110 * s_k,i
9111 * W_i,0 = \Sum_j \Prod_k w_k * ----- (9)
9112 * S_k
9113 *
9114 * Where
9115 *
9116 * s_k,i = \Sum_j w_i,j,k and S_k = \Sum_i s_k,i (10)
9117 *
9118 * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on CPU i.
9119 *
9120 * The big problem is S_k, it's a global sum needed to compute a local (W_i)
9121 * property.
9122 *
9123 * [XXX write more on how we solve this.. _after_ merging pjt's patches that
9124 * rewrite all of this once again.]
9125 */
9126
9127static unsigned long __read_mostly max_load_balance_interval = HZ/10;
9128
9129enum fbq_type { regular, remote, all };
9130
9131/*
9132 * 'group_type' describes the group of CPUs at the moment of load balancing.
9133 *
9134 * The enum is ordered by pulling priority, with the group with lowest priority
9135 * first so the group_type can simply be compared when selecting the busiest
9136 * group. See update_sd_pick_busiest().
9137 */
9138enum group_type {
9139 /* The group has spare capacity that can be used to run more tasks. */
9140 group_has_spare = 0,
9141 /*
9142 * The group is fully used and the tasks don't compete for more CPU
9143 * cycles. Nevertheless, some tasks might wait before running.
9144 */
9145 group_fully_busy,
9146 /*
9147 * One task doesn't fit with CPU's capacity and must be migrated to a
9148 * more powerful CPU.
9149 */
9150 group_misfit_task,
9151 /*
9152	 * Balance an SMT group that's fully busy. Can benefit from migrating
9153	 * a task from an SMT CPU with a busy sibling to another CPU on an idle core.
9154 */
9155 group_smt_balance,
9156 /*
9157 * SD_ASYM_PACKING only: One local CPU with higher capacity is available,
9158 * and the task should be migrated to it instead of running on the
9159 * current CPU.
9160 */
9161 group_asym_packing,
9162 /*
9163 * The tasks' affinity constraints previously prevented the scheduler
9164 * from balancing the load across the system.
9165 */
9166 group_imbalanced,
9167 /*
9168 * The CPU is overloaded and can't provide expected CPU cycles to all
9169 * tasks.
9170 */
9171 group_overloaded
9172};
9173
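/*
 * What quantity the load balancer is trying to move: weighted load,
 * utilization, a plain number of tasks, or a single misfit task.
 */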
9174enum migration_type {
9175 migrate_load = 0,
9176 migrate_util,
9177 migrate_task,
9178 migrate_misfit
9179};
9180
9181#define LBF_ALL_PINNED 0x01
9182#define LBF_NEED_BREAK 0x02
9183#define LBF_DST_PINNED 0x04
9184#define LBF_SOME_PINNED 0x08
9185#define LBF_ACTIVE_LB 0x10
9186
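/*
 * lb_env bundles the parameters and running state of one load-balancing
 * attempt: the source and destination runqueues, the measured imbalance,
 * iteration bookkeeping and the LBF_* status flags above.
 */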
9187struct lb_env {
9188 struct sched_domain *sd;
9189
9190 struct rq *src_rq;
9191 int src_cpu;
9192
9193 int dst_cpu;
9194 struct rq *dst_rq;
9195
9196 struct cpumask *dst_grpmask;
9197 int new_dst_cpu;
9198 enum cpu_idle_type idle;
9199 long imbalance;
9200 /* The set of CPUs under consideration for load-balancing */
9201 struct cpumask *cpus;
9202
9203 unsigned int flags;
9204
9205 unsigned int loop;
9206 unsigned int loop_break;
9207 unsigned int loop_max;
9208
9209 enum fbq_type fbq_type;
9210 enum migration_type migration_type;
9211 struct list_head tasks;
9212};
9213
9214/*
9215 * Is this task likely cache-hot:
9216 */
9217static int task_hot(struct task_struct *p, struct lb_env *env)
9218{
9219 s64 delta;
9220
9221 lockdep_assert_rq_held(env->src_rq);
9222
9223 if (p->sched_class != &fair_sched_class)
9224 return 0;
9225
9226 if (unlikely(task_has_idle_policy(p)))
9227 return 0;
9228
9229 /* SMT siblings share cache */
9230 if (env->sd->flags & SD_SHARE_CPUCAPACITY)
9231 return 0;
9232
9233 /*
9234 * Buddy candidates are cache hot:
9235 */
9236 if (sched_feat(CACHE_HOT_BUDDY) && env->dst_rq->nr_running &&
9237 (&p->se == cfs_rq_of(&p->se)->next))
9238 return 1;
9239
9240 if (sysctl_sched_migration_cost == -1)
9241 return 1;
9242
9243 /*
9244 * Don't migrate task if the task's cookie does not match
9245 * with the destination CPU's core cookie.
9246 */
9247 if (!sched_core_cookie_match(cpu_rq(env->dst_cpu), p))
9248 return 1;
9249
9250 if (sysctl_sched_migration_cost == 0)
9251 return 0;
9252
9253 delta = rq_clock_task(env->src_rq) - p->se.exec_start;
9254
9255 return delta < (s64)sysctl_sched_migration_cost;
9256}
9257
9258#ifdef CONFIG_NUMA_BALANCING
9259/*
9260 * Returns 1 if task migration degrades locality.
9261 * Returns 0 if task migration improves locality, i.e. migration is preferred.
9262 * Returns -1 if task migration is not affected by locality.
9263 */
9264static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
9265{
9266 struct numa_group *numa_group = rcu_dereference(p->numa_group);
9267 unsigned long src_weight, dst_weight;
9268 int src_nid, dst_nid, dist;
9269
9270 if (!static_branch_likely(&sched_numa_balancing))
9271 return -1;
9272
9273 if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
9274 return -1;
9275
9276 src_nid = cpu_to_node(env->src_cpu);
9277 dst_nid = cpu_to_node(env->dst_cpu);
9278
9279 if (src_nid == dst_nid)
9280 return -1;
9281
9282 /* Migrating away from the preferred node is always bad. */
9283 if (src_nid == p->numa_preferred_nid) {
9284 if (env->src_rq->nr_running > env->src_rq->nr_preferred_running)
9285 return 1;
9286 else
9287 return -1;
9288 }
9289
9290 /* Encourage migration to the preferred node. */
9291 if (dst_nid == p->numa_preferred_nid)
9292 return 0;
9293
9294 /* Leaving a core idle is often worse than degrading locality. */
9295 if (env->idle == CPU_IDLE)
9296 return -1;
9297
9298 dist = node_distance(src_nid, dst_nid);
9299 if (numa_group) {
9300 src_weight = group_weight(p, src_nid, dist);
9301 dst_weight = group_weight(p, dst_nid, dist);
9302 } else {
9303 src_weight = task_weight(p, src_nid, dist);
9304 dst_weight = task_weight(p, dst_nid, dist);
9305 }
9306
9307 return dst_weight < src_weight;
9308}
9309
9310#else
9311static inline int migrate_degrades_locality(struct task_struct *p,
9312 struct lb_env *env)
9313{
9314 return -1;
9315}
9316#endif
9317
9318/*
9319 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
9320 */
9321static
9322int can_migrate_task(struct task_struct *p, struct lb_env *env)
9323{
9324 int tsk_cache_hot;
9325
9326 lockdep_assert_rq_held(env->src_rq);
9327 if (p->sched_task_hot)
9328 p->sched_task_hot = 0;
9329
9330 /*
9331 * We do not migrate tasks that are:
9332 * 1) throttled_lb_pair, or
9333 * 2) cannot be migrated to this CPU due to cpus_ptr, or
9334 * 3) running (obviously), or
9335 * 4) are cache-hot on their current CPU.
9336 */
9337 if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
9338 return 0;
9339
9340 /* Disregard percpu kthreads; they are where they need to be. */
9341 if (kthread_is_per_cpu(p))
9342 return 0;
9343
9344 if (!cpumask_test_cpu(env->dst_cpu, p->cpus_ptr)) {
9345 int cpu;
9346
9347 schedstat_inc(p->stats.nr_failed_migrations_affine);
9348
9349 env->flags |= LBF_SOME_PINNED;
9350
9351 /*
9352 * Remember if this task can be migrated to any other CPU in
9353 * our sched_group. We may want to revisit it if we couldn't
9354 * meet load balance goals by pulling other tasks on src_cpu.
9355 *
9356 * Avoid computing new_dst_cpu
9357 * - for NEWLY_IDLE
9358 * - if we have already computed one in current iteration
9359 * - if it's an active balance
9360 */
9361 if (env->idle == CPU_NEWLY_IDLE ||
9362 env->flags & (LBF_DST_PINNED | LBF_ACTIVE_LB))
9363 return 0;
9364
9365		/* Prevent re-selecting dst_cpu via env's CPUs: */
9366 for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
9367 if (cpumask_test_cpu(cpu, p->cpus_ptr)) {
9368 env->flags |= LBF_DST_PINNED;
9369 env->new_dst_cpu = cpu;
9370 break;
9371 }
9372 }
9373
9374 return 0;
9375 }
9376
9377 /* Record that we found at least one task that could run on dst_cpu */
9378 env->flags &= ~LBF_ALL_PINNED;
9379
9380 if (task_on_cpu(env->src_rq, p)) {
9381 schedstat_inc(p->stats.nr_failed_migrations_running);
9382 return 0;
9383 }
9384
9385 /*
9386 * Aggressive migration if:
9387 * 1) active balance
9388 * 2) destination numa is preferred
9389 * 3) task is cache cold, or
9390 * 4) too many balance attempts have failed.
9391 */
9392 if (env->flags & LBF_ACTIVE_LB)
9393 return 1;
9394
9395 tsk_cache_hot = migrate_degrades_locality(p, env);
9396 if (tsk_cache_hot == -1)
9397 tsk_cache_hot = task_hot(p, env);
9398
9399 if (tsk_cache_hot <= 0 ||
9400 env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
9401 if (tsk_cache_hot == 1)
9402 p->sched_task_hot = 1;
9403 return 1;
9404 }
9405
9406 schedstat_inc(p->stats.nr_failed_migrations_hot);
9407 return 0;
9408}
9409
9410/*
9411 * detach_task() -- detach the task for the migration specified in env
9412 */
9413static void detach_task(struct task_struct *p, struct lb_env *env)
9414{
9415 lockdep_assert_rq_held(env->src_rq);
9416
9417 if (p->sched_task_hot) {
9418 p->sched_task_hot = 0;
9419 schedstat_inc(env->sd->lb_hot_gained[env->idle]);
9420 schedstat_inc(p->stats.nr_forced_migrations);
9421 }
9422
9423 deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK);
9424 set_task_cpu(p, env->dst_cpu);
9425}
9426
9427/*
9428 * detach_one_task() -- tries to dequeue exactly one task from env->src_rq, as
9429 * part of active balancing operations within "domain".
9430 *
9431 * Returns a task if successful and NULL otherwise.
9432 */
9433static struct task_struct *detach_one_task(struct lb_env *env)
9434{
9435 struct task_struct *p;
9436
9437 lockdep_assert_rq_held(env->src_rq);
9438
9439 list_for_each_entry_reverse(p,
9440 &env->src_rq->cfs_tasks, se.group_node) {
9441 if (!can_migrate_task(p, env))
9442 continue;
9443
9444 detach_task(p, env);
9445
9446 /*
9447 * Right now, this is only the second place where
9448		 * lb_gained[env->idle] is updated (the other is detach_tasks()),
9449 * so we can safely collect stats here rather than
9450 * inside detach_tasks().
9451 */
9452 schedstat_inc(env->sd->lb_gained[env->idle]);
9453 return p;
9454 }
9455 return NULL;
9456}
9457
9458/*
9459 * detach_tasks() -- tries to detach up to imbalance load/util/tasks from
9460 * busiest_rq, as part of a balancing operation within domain "sd".
9461 *
9462 * Returns number of detached tasks if successful and 0 otherwise.
9463 */
9464static int detach_tasks(struct lb_env *env)
9465{
9466 struct list_head *tasks = &env->src_rq->cfs_tasks;
9467 unsigned long util, load;
9468 struct task_struct *p;
9469 int detached = 0;
9470
9471 lockdep_assert_rq_held(env->src_rq);
9472
9473 /*
9474 * Source run queue has been emptied by another CPU, clear
9475 * LBF_ALL_PINNED flag as we will not test any task.
9476 */
9477 if (env->src_rq->nr_running <= 1) {
9478 env->flags &= ~LBF_ALL_PINNED;
9479 return 0;
9480 }
9481
9482 if (env->imbalance <= 0)
9483 return 0;
9484
9485 while (!list_empty(tasks)) {
9486 /*
9487		 * We don't want to steal all the tasks, otherwise we may be
9488		 * treated likewise, which could at worst lead to a livelock.
9489 */
9490 if (env->idle && env->src_rq->nr_running <= 1)
9491 break;
9492
9493 env->loop++;
9494 /* We've more or less seen every task there is, call it quits */
9495 if (env->loop > env->loop_max)
9496 break;
9497
9498 /* take a breather every nr_migrate tasks */
9499 if (env->loop > env->loop_break) {
9500 env->loop_break += SCHED_NR_MIGRATE_BREAK;
9501 env->flags |= LBF_NEED_BREAK;
9502 break;
9503 }
9504
9505 p = list_last_entry(tasks, struct task_struct, se.group_node);
9506
9507 if (!can_migrate_task(p, env))
9508 goto next;
9509
9510 switch (env->migration_type) {
9511 case migrate_load:
9512 /*
9513			 * Depending on the number of CPUs and tasks and the
9514 * cgroup hierarchy, task_h_load() can return a null
9515 * value. Make sure that env->imbalance decreases
9516 * otherwise detach_tasks() will stop only after
9517 * detaching up to loop_max tasks.
9518 */
9519 load = max_t(unsigned long, task_h_load(p), 1);
9520
9521 if (sched_feat(LB_MIN) &&
9522 load < 16 && !env->sd->nr_balance_failed)
9523 goto next;
9524
9525 /*
9526 * Make sure that we don't migrate too much load.
9527			 * Nevertheless, let's relax the constraint if the
9528 * scheduler fails to find a good waiting task to
9529 * migrate.
9530 */
9531 if (shr_bound(load, env->sd->nr_balance_failed) > env->imbalance)
9532 goto next;
9533
9534 env->imbalance -= load;
9535 break;
9536
9537 case migrate_util:
9538 util = task_util_est(p);
9539
9540 if (shr_bound(util, env->sd->nr_balance_failed) > env->imbalance)
9541 goto next;
9542
9543 env->imbalance -= util;
9544 break;
9545
9546 case migrate_task:
9547 env->imbalance--;
9548 break;
9549
9550 case migrate_misfit:
9551 /* This is not a misfit task */
9552 if (task_fits_cpu(p, env->src_cpu))
9553 goto next;
9554
9555 env->imbalance = 0;
9556 break;
9557 }
9558
9559 detach_task(p, env);
9560 list_add(&p->se.group_node, &env->tasks);
9561
9562 detached++;
9563
9564#ifdef CONFIG_PREEMPTION
9565 /*
9566 * NEWIDLE balancing is a source of latency, so preemptible
9567 * kernels will stop after the first task is detached to minimize
9568 * the critical section.
9569 */
9570 if (env->idle == CPU_NEWLY_IDLE)
9571 break;
9572#endif
9573
9574 /*
9575 * We only want to steal up to the prescribed amount of
9576 * load/util/tasks.
9577 */
9578 if (env->imbalance <= 0)
9579 break;
9580
9581 continue;
9582next:
9583 if (p->sched_task_hot)
9584 schedstat_inc(p->stats.nr_failed_migrations_hot);
9585
9586 list_move(&p->se.group_node, tasks);
9587 }
9588
9589 /*
9590 * Right now, this is one of only two places we collect this stat
9591 * so we can safely collect detach_one_task() stats here rather
9592 * than inside detach_one_task().
9593 */
9594 schedstat_add(env->sd->lb_gained[env->idle], detached);
9595
9596 return detached;
9597}
9598
9599/*
9600 * attach_task() -- attach the task detached by detach_task() to its new rq.
9601 */
9602static void attach_task(struct rq *rq, struct task_struct *p)
9603{
9604 lockdep_assert_rq_held(rq);
9605
9606 WARN_ON_ONCE(task_rq(p) != rq);
9607 activate_task(rq, p, ENQUEUE_NOCLOCK);
9608 wakeup_preempt(rq, p, 0);
9609}
9610
9611/*
9612 * attach_one_task() -- attaches the task returned from detach_one_task() to
9613 * its new rq.
9614 */
9615static void attach_one_task(struct rq *rq, struct task_struct *p)
9616{
9617 struct rq_flags rf;
9618
9619 rq_lock(rq, &rf);
9620 update_rq_clock(rq);
9621 attach_task(rq, p);
9622 rq_unlock(rq, &rf);
9623}
9624
9625/*
9626 * attach_tasks() -- attaches all tasks detached by detach_tasks() to their
9627 * new rq.
9628 */
9629static void attach_tasks(struct lb_env *env)
9630{
9631 struct list_head *tasks = &env->tasks;
9632 struct task_struct *p;
9633 struct rq_flags rf;
9634
9635 rq_lock(env->dst_rq, &rf);
9636 update_rq_clock(env->dst_rq);
9637
9638 while (!list_empty(tasks)) {
9639 p = list_first_entry(tasks, struct task_struct, se.group_node);
9640 list_del_init(&p->se.group_node);
9641
9642 attach_task(env->dst_rq, p);
9643 }
9644
9645 rq_unlock(env->dst_rq, &rf);
9646}
9647
9648#ifdef CONFIG_NO_HZ_COMMON
9649static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq)
9650{
9651 if (cfs_rq->avg.load_avg)
9652 return true;
9653
9654 if (cfs_rq->avg.util_avg)
9655 return true;
9656
9657 return false;
9658}
9659
9660static inline bool others_have_blocked(struct rq *rq)
9661{
9662 if (cpu_util_rt(rq))
9663 return true;
9664
9665 if (cpu_util_dl(rq))
9666 return true;
9667
9668 if (hw_load_avg(rq))
9669 return true;
9670
9671 if (cpu_util_irq(rq))
9672 return true;
9673
9674 return false;
9675}
9676
9677static inline void update_blocked_load_tick(struct rq *rq)
9678{
9679 WRITE_ONCE(rq->last_blocked_load_update_tick, jiffies);
9680}
9681
9682static inline void update_blocked_load_status(struct rq *rq, bool has_blocked)
9683{
9684 if (!has_blocked)
9685 rq->has_blocked_load = 0;
9686}
9687#else
9688static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) { return false; }
9689static inline bool others_have_blocked(struct rq *rq) { return false; }
9690static inline void update_blocked_load_tick(struct rq *rq) {}
9691static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) {}
9692#endif
9693
9694static bool __update_blocked_others(struct rq *rq, bool *done)
9695{
9696 bool updated;
9697
9698 /*
9699 * update_load_avg() can call cpufreq_update_util(). Make sure that RT,
9700 * DL and IRQ signals have been updated before updating CFS.
9701 */
9702 updated = update_other_load_avgs(rq);
9703
9704 if (others_have_blocked(rq))
9705 *done = false;
9706
9707 return updated;
9708}
9709
9710#ifdef CONFIG_FAIR_GROUP_SCHED
9711
9712static bool __update_blocked_fair(struct rq *rq, bool *done)
9713{
9714 struct cfs_rq *cfs_rq, *pos;
9715 bool decayed = false;
9716 int cpu = cpu_of(rq);
9717
9718 /*
9719 * Iterates the task_group tree in a bottom up fashion, see
9720 * list_add_leaf_cfs_rq() for details.
9721 */
9722 for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) {
9723 struct sched_entity *se;
9724
9725 if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq)) {
9726 update_tg_load_avg(cfs_rq);
9727
9728 if (cfs_rq->nr_running == 0)
9729 update_idle_cfs_rq_clock_pelt(cfs_rq);
9730
9731 if (cfs_rq == &rq->cfs)
9732 decayed = true;
9733 }
9734
9735 /* Propagate pending load changes to the parent, if any: */
9736 se = cfs_rq->tg->se[cpu];
9737 if (se && !skip_blocked_update(se))
9738 update_load_avg(cfs_rq_of(se), se, UPDATE_TG);
9739
9740 /*
9741 * There can be a lot of idle CPU cgroups. Don't let fully
9742 * decayed cfs_rqs linger on the list.
9743 */
9744 if (cfs_rq_is_decayed(cfs_rq))
9745 list_del_leaf_cfs_rq(cfs_rq);
9746
9747 /* Don't need periodic decay once load/util_avg are null */
9748 if (cfs_rq_has_blocked(cfs_rq))
9749 *done = false;
9750 }
9751
9752 return decayed;
9753}
9754
9755/*
9756 * Compute the hierarchical load factor for cfs_rq and all its ancestors.
9757 * This needs to be done in a top-down fashion because the load of a child
9758 * group is a fraction of its parent's load.
9759 */
9760static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
9761{
9762 struct rq *rq = rq_of(cfs_rq);
9763 struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
9764 unsigned long now = jiffies;
9765 unsigned long load;
9766
9767 if (cfs_rq->last_h_load_update == now)
9768 return;
9769
9770 WRITE_ONCE(cfs_rq->h_load_next, NULL);
9771 for_each_sched_entity(se) {
9772 cfs_rq = cfs_rq_of(se);
9773 WRITE_ONCE(cfs_rq->h_load_next, se);
9774 if (cfs_rq->last_h_load_update == now)
9775 break;
9776 }
9777
9778 if (!se) {
9779 cfs_rq->h_load = cfs_rq_load_avg(cfs_rq);
9780 cfs_rq->last_h_load_update = now;
9781 }
9782
9783 while ((se = READ_ONCE(cfs_rq->h_load_next)) != NULL) {
9784 load = cfs_rq->h_load;
9785 load = div64_ul(load * se->avg.load_avg,
9786 cfs_rq_load_avg(cfs_rq) + 1);
9787 cfs_rq = group_cfs_rq(se);
9788 cfs_rq->h_load = load;
9789 cfs_rq->last_h_load_update = now;
9790 }
9791}
9792
9793static unsigned long task_h_load(struct task_struct *p)
9794{
9795 struct cfs_rq *cfs_rq = task_cfs_rq(p);
9796
9797 update_cfs_rq_h_load(cfs_rq);
9798 return div64_ul(p->se.avg.load_avg * cfs_rq->h_load,
9799 cfs_rq_load_avg(cfs_rq) + 1);
9800}
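/*
 * Worked example (illustrative values only): a task with se.avg.load_avg of
 * 512 queued on a cfs_rq whose hierarchical load h_load is 1024 and whose
 * cfs_rq_load_avg() is 2047 contributes 512 * 1024 / (2047 + 1) = 256 to the
 * runqueue-level load seen by the load balancer.
 */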
9801#else
9802static bool __update_blocked_fair(struct rq *rq, bool *done)
9803{
9804 struct cfs_rq *cfs_rq = &rq->cfs;
9805 bool decayed;
9806
9807 decayed = update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq);
9808 if (cfs_rq_has_blocked(cfs_rq))
9809 *done = false;
9810
9811 return decayed;
9812}
9813
9814static unsigned long task_h_load(struct task_struct *p)
9815{
9816 return p->se.avg.load_avg;
9817}
9818#endif
9819
9820static void sched_balance_update_blocked_averages(int cpu)
9821{
9822 bool decayed = false, done = true;
9823 struct rq *rq = cpu_rq(cpu);
9824 struct rq_flags rf;
9825
9826 rq_lock_irqsave(rq, &rf);
9827 update_blocked_load_tick(rq);
9828 update_rq_clock(rq);
9829
9830 decayed |= __update_blocked_others(rq, &done);
9831 decayed |= __update_blocked_fair(rq, &done);
9832
9833 update_blocked_load_status(rq, !done);
9834 if (decayed)
9835 cpufreq_update_util(rq, 0);
9836 rq_unlock_irqrestore(rq, &rf);
9837}
9838
9839/********** Helpers for sched_balance_find_src_group ************************/
9840
9841/*
9842 * sg_lb_stats - stats of a sched_group required for load-balancing:
9843 */
9844struct sg_lb_stats {
9845 unsigned long avg_load; /* Avg load over the CPUs of the group */
9846 unsigned long group_load; /* Total load over the CPUs of the group */
9847 unsigned long group_capacity; /* Capacity over the CPUs of the group */
9848 unsigned long group_util; /* Total utilization over the CPUs of the group */
9849 unsigned long group_runnable; /* Total runnable time over the CPUs of the group */
9850 unsigned int sum_nr_running; /* Nr of all tasks running in the group */
9851 unsigned int sum_h_nr_running; /* Nr of CFS tasks running in the group */
9852 unsigned int idle_cpus; /* Nr of idle CPUs in the group */
9853 unsigned int group_weight;
9854 enum group_type group_type;
9855 unsigned int group_asym_packing; /* Tasks should be moved to preferred CPU */
9856 unsigned int group_smt_balance; /* Task on busy SMT be moved */
9857 unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */
9858#ifdef CONFIG_NUMA_BALANCING
9859 unsigned int nr_numa_running;
9860 unsigned int nr_preferred_running;
9861#endif
9862};
9863
9864/*
9865 * sd_lb_stats - stats of a sched_domain required for load-balancing:
9866 */
9867struct sd_lb_stats {
9868 struct sched_group *busiest; /* Busiest group in this sd */
9869 struct sched_group *local; /* Local group in this sd */
9870 unsigned long total_load; /* Total load of all groups in sd */
9871 unsigned long total_capacity; /* Total capacity of all groups in sd */
9872 unsigned long avg_load; /* Average load across all groups in sd */
9873 unsigned int prefer_sibling; /* Tasks should go to sibling first */
9874
9875 struct sg_lb_stats busiest_stat; /* Statistics of the busiest group */
9876 struct sg_lb_stats local_stat; /* Statistics of the local group */
9877};
9878
9879static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
9880{
9881 /*
9882 * Skimp on the clearing to avoid duplicate work. We can avoid clearing
9883 * local_stat because update_sg_lb_stats() does a full clear/assignment.
9884 * We must however set busiest_stat::group_type and
9885 * busiest_stat::idle_cpus to the worst busiest group because
9886 * update_sd_pick_busiest() reads these before assignment.
9887 */
9888 *sds = (struct sd_lb_stats){
9889 .busiest = NULL,
9890 .local = NULL,
9891 .total_load = 0UL,
9892 .total_capacity = 0UL,
9893 .busiest_stat = {
9894 .idle_cpus = UINT_MAX,
9895 .group_type = group_has_spare,
9896 },
9897 };
9898}
9899
9900static unsigned long scale_rt_capacity(int cpu)
9901{
9902 unsigned long max = get_actual_cpu_capacity(cpu);
9903 struct rq *rq = cpu_rq(cpu);
9904 unsigned long used, free;
9905 unsigned long irq;
9906
9907 irq = cpu_util_irq(rq);
9908
9909 if (unlikely(irq >= max))
9910 return 1;
9911
9912 /*
9913 * avg_rt.util_avg and avg_dl.util_avg track binary signals
9914 * (running and not running) with weights 0 and 1024 respectively.
9915 */
9916 used = cpu_util_rt(rq);
9917 used += cpu_util_dl(rq);
9918
9919 if (unlikely(used >= max))
9920 return 1;
9921
9922 free = max - used;
9923
9924 return scale_irq_capacity(free, irq, max);
9925}
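/*
 * Worked example (illustrative values only, assuming scale_irq_capacity()
 * discounts the IRQ fraction as free * (max - irq) / max): with an original
 * capacity of 1024, IRQ pressure of 102 and RT+DL utilization of 204, the
 * remaining CFS capacity is (1024 - 204) * (1024 - 102) / 1024 = 738.
 */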
9926
9927static void update_cpu_capacity(struct sched_domain *sd, int cpu)
9928{
9929 unsigned long capacity = scale_rt_capacity(cpu);
9930 struct sched_group *sdg = sd->groups;
9931
9932 if (!capacity)
9933 capacity = 1;
9934
9935 cpu_rq(cpu)->cpu_capacity = capacity;
9936 trace_sched_cpu_capacity_tp(cpu_rq(cpu));
9937
9938 sdg->sgc->capacity = capacity;
9939 sdg->sgc->min_capacity = capacity;
9940 sdg->sgc->max_capacity = capacity;
9941}
9942
9943void update_group_capacity(struct sched_domain *sd, int cpu)
9944{
9945 struct sched_domain *child = sd->child;
9946 struct sched_group *group, *sdg = sd->groups;
9947 unsigned long capacity, min_capacity, max_capacity;
9948 unsigned long interval;
9949
9950 interval = msecs_to_jiffies(sd->balance_interval);
9951 interval = clamp(interval, 1UL, max_load_balance_interval);
9952 sdg->sgc->next_update = jiffies + interval;
9953
9954 if (!child) {
9955 update_cpu_capacity(sd, cpu);
9956 return;
9957 }
9958
9959 capacity = 0;
9960 min_capacity = ULONG_MAX;
9961 max_capacity = 0;
9962
9963 if (child->flags & SD_OVERLAP) {
9964 /*
9965 * SD_OVERLAP domains cannot assume that child groups
9966 * span the current group.
9967 */
9968
9969 for_each_cpu(cpu, sched_group_span(sdg)) {
9970 unsigned long cpu_cap = capacity_of(cpu);
9971
9972 capacity += cpu_cap;
9973 min_capacity = min(cpu_cap, min_capacity);
9974 max_capacity = max(cpu_cap, max_capacity);
9975 }
9976 } else {
9977 /*
9978 * !SD_OVERLAP domains can assume that child groups
9979 * span the current group.
9980 */
9981
9982 group = child->groups;
9983 do {
9984 struct sched_group_capacity *sgc = group->sgc;
9985
9986 capacity += sgc->capacity;
9987 min_capacity = min(sgc->min_capacity, min_capacity);
9988 max_capacity = max(sgc->max_capacity, max_capacity);
9989 group = group->next;
9990 } while (group != child->groups);
9991 }
9992
9993 sdg->sgc->capacity = capacity;
9994 sdg->sgc->min_capacity = min_capacity;
9995 sdg->sgc->max_capacity = max_capacity;
9996}
9997
9998/*
9999 * Check whether the capacity of the rq has been noticeably reduced by side
10000 * activity. The imbalance_pct is used for the threshold.
10001 * Return true if the capacity is noticeably reduced.
10002 */
10003static inline int
10004check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
10005{
10006 return ((rq->cpu_capacity * sd->imbalance_pct) <
10007 (arch_scale_cpu_capacity(cpu_of(rq)) * 100));
10008}
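/*
 * Worked example (illustrative values only): with imbalance_pct of 117 and an
 * architectural capacity of 1024, the check fires once rq->cpu_capacity drops
 * below 1024 * 100 / 117 ~= 875, i.e. once roughly 15% or more of the CPU is
 * consumed by non-CFS activity.
 */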
10009
10010/* Check if the rq has a misfit task */
10011static inline bool check_misfit_status(struct rq *rq)
10012{
10013 return rq->misfit_task_load;
10014}
10015
10016/*
10017 * Group imbalance indicates (and tries to solve) the problem where balancing
10018 * groups is inadequate due to ->cpus_ptr constraints.
10019 *
10020 * Imagine a situation of two groups of 4 CPUs each and 4 tasks each with a
10021 * cpumask covering 1 CPU of the first group and 3 CPUs of the second group.
10022 * Something like:
10023 *
10024 * { 0 1 2 3 } { 4 5 6 7 }
10025 * * * * *
10026 *
10027 * If we were to balance group-wise we'd place two tasks in the first group and
10028 * two tasks in the second group. Clearly this is undesired as it will overload
10029 * cpu 3 and leave one of the CPUs in the second group unused.
10030 *
10031 * The current solution to this issue is detecting the skew in the first group
10032 * by noticing the lower domain failed to reach balance and had difficulty
10033 * moving tasks due to affinity constraints.
10034 *
10035 * When this is detected, the group becomes a candidate for busiest; see
10036 * update_sd_pick_busiest(). And calculate_imbalance() and
10037 * sched_balance_find_src_group() avoid some of the usual balance conditions to allow it
10038 * to create an effective group imbalance.
10039 *
10040 * This is a somewhat tricky proposition since the next run might not find the
10041 * group imbalance and decide the groups need to be balanced again. A most
10042 * subtle and fragile situation.
10043 */
10044
10045static inline int sg_imbalanced(struct sched_group *group)
10046{
10047 return group->sgc->imbalance;
10048}
10049
10050/*
10051 * group_has_capacity returns true if the group has spare capacity that could
10052 * be used by some tasks.
10053 * We consider that a group has spare capacity if the number of tasks is
10054 * smaller than the number of CPUs or if the utilization is lower than the
10055 * available capacity for CFS tasks.
10056 * For the latter, we use a threshold to stabilize the state, to take into
10057 * account the variance of the tasks' load and to return true only if the
10058 * available capacity is meaningful for the load balancer.
10059 * As an example, an available capacity of 1% can appear but it doesn't
10060 * provide any benefit to the load balancer.
10061 */
10062static inline bool
10063group_has_capacity(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
10064{
10065 if (sgs->sum_nr_running < sgs->group_weight)
10066 return true;
10067
10068 if ((sgs->group_capacity * imbalance_pct) <
10069 (sgs->group_runnable * 100))
10070 return false;
10071
10072 if ((sgs->group_capacity * 100) >
10073 (sgs->group_util * imbalance_pct))
10074 return true;
10075
10076 return false;
10077}
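/*
 * Worked example (illustrative values only): take imbalance_pct = 117 and a
 * 4-CPU group with group_capacity = 4096 running 4 tasks. With
 * group_runnable = 3000 and group_util = 3400 the checks above give
 * 4096 * 117 >= 3000 * 100 and 4096 * 100 > 3400 * 117, so the group is
 * reported as having spare capacity; with group_util = 3600 the last check
 * fails and it is not.
 */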
10078
10079/*
10080 * group_is_overloaded returns true if the group has more tasks than it can
10081 * handle.
10082 * group_is_overloaded is not equivalent to !group_has_capacity: a group
10083 * with exactly the right number of tasks has no spare capacity left but is
10084 * not overloaded, so both group_has_capacity and group_is_overloaded return
10085 * false.
10086 */
10087static inline bool
10088group_is_overloaded(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
10089{
10090 if (sgs->sum_nr_running <= sgs->group_weight)
10091 return false;
10092
10093 if ((sgs->group_capacity * 100) <
10094 (sgs->group_util * imbalance_pct))
10095 return true;
10096
10097 if ((sgs->group_capacity * imbalance_pct) <
10098 (sgs->group_runnable * 100))
10099 return true;
10100
10101 return false;
10102}
10103
10104static inline enum
10105group_type group_classify(unsigned int imbalance_pct,
10106 struct sched_group *group,
10107 struct sg_lb_stats *sgs)
10108{
10109 if (group_is_overloaded(imbalance_pct, sgs))
10110 return group_overloaded;
10111
10112 if (sg_imbalanced(group))
10113 return group_imbalanced;
10114
10115 if (sgs->group_asym_packing)
10116 return group_asym_packing;
10117
10118 if (sgs->group_smt_balance)
10119 return group_smt_balance;
10120
10121 if (sgs->group_misfit_task_load)
10122 return group_misfit_task;
10123
10124 if (!group_has_capacity(imbalance_pct, sgs))
10125 return group_fully_busy;
10126
10127 return group_has_spare;
10128}
10129
10130/**
10131 * sched_use_asym_prio - Check whether asym_packing priority must be used
10132 * @sd: The scheduling domain of the load balancing
10133 * @cpu: A CPU
10134 *
10135 * Always use CPU priority when balancing load between SMT siblings. When
10136 * balancing load between cores, it is not sufficient that @cpu is idle. Only
10137 * use CPU priority if the whole core is idle.
10138 *
10139 * Returns: True if the priority of @cpu must be followed. False otherwise.
10140 */
10141static bool sched_use_asym_prio(struct sched_domain *sd, int cpu)
10142{
10143 if (!(sd->flags & SD_ASYM_PACKING))
10144 return false;
10145
10146 if (!sched_smt_active())
10147 return true;
10148
10149 return sd->flags & SD_SHARE_CPUCAPACITY || is_core_idle(cpu);
10150}
10151
10152static inline bool sched_asym(struct sched_domain *sd, int dst_cpu, int src_cpu)
10153{
10154 /*
10155 * First check if @dst_cpu can do asym_packing load balance. Only do it
10156 * if it has higher priority than @src_cpu.
10157 */
10158 return sched_use_asym_prio(sd, dst_cpu) &&
10159 sched_asym_prefer(dst_cpu, src_cpu);
10160}
10161
10162/**
10163 * sched_group_asym - Check if the destination CPU can do asym_packing balance
10164 * @env: The load balancing environment
10165 * @sgs: Load-balancing statistics of the candidate busiest group
10166 * @group: The candidate busiest group
10167 *
10168 * @env::dst_cpu can do asym_packing if it has higher priority than the
10169 * preferred CPU of @group.
10170 *
10171 * Return: true if @env::dst_cpu can do asym_packing load balance. False
10172 * otherwise.
10173 */
10174static inline bool
10175sched_group_asym(struct lb_env *env, struct sg_lb_stats *sgs, struct sched_group *group)
10176{
10177 /*
10178 * CPU priorities do not make sense for SMT cores with more than one
10179 * busy sibling.
10180 */
10181 if ((group->flags & SD_SHARE_CPUCAPACITY) &&
10182 (sgs->group_weight - sgs->idle_cpus != 1))
10183 return false;
10184
10185 return sched_asym(env->sd, env->dst_cpu, group->asym_prefer_cpu);
10186}
10187
10188/* One group has more than one SMT CPU while the other group does not */
10189static inline bool smt_vs_nonsmt_groups(struct sched_group *sg1,
10190 struct sched_group *sg2)
10191{
10192 if (!sg1 || !sg2)
10193 return false;
10194
10195 return (sg1->flags & SD_SHARE_CPUCAPACITY) !=
10196 (sg2->flags & SD_SHARE_CPUCAPACITY);
10197}
10198
10199static inline bool smt_balance(struct lb_env *env, struct sg_lb_stats *sgs,
10200 struct sched_group *group)
10201{
10202 if (!env->idle)
10203 return false;
10204
10205 /*
10206 * For SMT source group, it is better to move a task
10207 * to a CPU that doesn't have multiple tasks sharing its CPU capacity.
10208 * Note that if a group has a single SMT, SD_SHARE_CPUCAPACITY
10209 * will not be on.
10210 */
10211 if (group->flags & SD_SHARE_CPUCAPACITY &&
10212 sgs->sum_h_nr_running > 1)
10213 return true;
10214
10215 return false;
10216}
10217
10218static inline long sibling_imbalance(struct lb_env *env,
10219 struct sd_lb_stats *sds,
10220 struct sg_lb_stats *busiest,
10221 struct sg_lb_stats *local)
10222{
10223 int ncores_busiest, ncores_local;
10224 long imbalance;
10225
10226 if (!env->idle || !busiest->sum_nr_running)
10227 return 0;
10228
10229 ncores_busiest = sds->busiest->cores;
10230 ncores_local = sds->local->cores;
10231
10232 if (ncores_busiest == ncores_local) {
10233 imbalance = busiest->sum_nr_running;
10234 lsub_positive(&imbalance, local->sum_nr_running);
10235 return imbalance;
10236 }
10237
10238	/* Balance such that the nr_running/ncores ratio is the same in both groups */
10239 imbalance = ncores_local * busiest->sum_nr_running;
10240 lsub_positive(&imbalance, ncores_busiest * local->sum_nr_running);
10241 /* Normalize imbalance and do rounding on normalization */
10242 imbalance = 2 * imbalance + ncores_local + ncores_busiest;
10243 imbalance /= ncores_local + ncores_busiest;
10244
10245 /* Take advantage of resource in an empty sched group */
10246 if (imbalance <= 1 && local->sum_nr_running == 0 &&
10247 busiest->sum_nr_running > 1)
10248 imbalance = 2;
10249
10250 return imbalance;
10251}
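/*
 * Worked example (illustrative values only, on a non-NUMA domain): with a
 * busiest group of 2 cores running 6 tasks and a local group of 4 cores
 * running 2 tasks, the raw imbalance is 4 * 6 - 2 * 2 = 20 and the normalized
 * value is (2 * 20 + 6) / 6 = 7. calculate_imbalance() later halves this, so
 * 3 tasks are moved, leaving about 1.5 tasks per core on the busiest group
 * and 1.25 on the local one.
 */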
10252
10253static inline bool
10254sched_reduced_capacity(struct rq *rq, struct sched_domain *sd)
10255{
10256 /*
10257 * When there is more than 1 task, the group_overloaded case already
10258 * takes care of cpu with reduced capacity
10259 */
10260 if (rq->cfs.h_nr_running != 1)
10261 return false;
10262
10263 return check_cpu_capacity(rq, sd);
10264}
10265
10266/**
10267 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
10268 * @env: The load balancing environment.
10269 * @sds: Load-balancing data with statistics of the local group.
10270 * @group: sched_group whose statistics are to be updated.
10271 * @sgs: variable to hold the statistics for this group.
10272 * @sg_overloaded: sched_group is overloaded
10273 * @sg_overutilized: sched_group is overutilized
10274 */
10275static inline void update_sg_lb_stats(struct lb_env *env,
10276 struct sd_lb_stats *sds,
10277 struct sched_group *group,
10278 struct sg_lb_stats *sgs,
10279 bool *sg_overloaded,
10280 bool *sg_overutilized)
10281{
10282 int i, nr_running, local_group;
10283
10284 memset(sgs, 0, sizeof(*sgs));
10285
10286 local_group = group == sds->local;
10287
10288 for_each_cpu_and(i, sched_group_span(group), env->cpus) {
10289 struct rq *rq = cpu_rq(i);
10290 unsigned long load = cpu_load(rq);
10291
10292 sgs->group_load += load;
10293 sgs->group_util += cpu_util_cfs(i);
10294 sgs->group_runnable += cpu_runnable(rq);
10295 sgs->sum_h_nr_running += rq->cfs.h_nr_running;
10296
10297 nr_running = rq->nr_running;
10298 sgs->sum_nr_running += nr_running;
10299
10300 if (nr_running > 1)
10301 *sg_overloaded = 1;
10302
10303 if (cpu_overutilized(i))
10304 *sg_overutilized = 1;
10305
10306#ifdef CONFIG_NUMA_BALANCING
10307 sgs->nr_numa_running += rq->nr_numa_running;
10308 sgs->nr_preferred_running += rq->nr_preferred_running;
10309#endif
10310 /*
10311 * No need to call idle_cpu() if nr_running is not 0
10312 */
10313 if (!nr_running && idle_cpu(i)) {
10314 sgs->idle_cpus++;
10315 /* Idle cpu can't have misfit task */
10316 continue;
10317 }
10318
10319 if (local_group)
10320 continue;
10321
10322 if (env->sd->flags & SD_ASYM_CPUCAPACITY) {
10323 /* Check for a misfit task on the cpu */
10324 if (sgs->group_misfit_task_load < rq->misfit_task_load) {
10325 sgs->group_misfit_task_load = rq->misfit_task_load;
10326 *sg_overloaded = 1;
10327 }
10328 } else if (env->idle && sched_reduced_capacity(rq, env->sd)) {
10329 /* Check for a task running on a CPU with reduced capacity */
10330 if (sgs->group_misfit_task_load < load)
10331 sgs->group_misfit_task_load = load;
10332 }
10333 }
10334
10335 sgs->group_capacity = group->sgc->capacity;
10336
10337 sgs->group_weight = group->group_weight;
10338
10339 /* Check if dst CPU is idle and preferred to this group */
10340 if (!local_group && env->idle && sgs->sum_h_nr_running &&
10341 sched_group_asym(env, sgs, group))
10342 sgs->group_asym_packing = 1;
10343
10344 /* Check for loaded SMT group to be balanced to dst CPU */
10345 if (!local_group && smt_balance(env, sgs, group))
10346 sgs->group_smt_balance = 1;
10347
10348 sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs);
10349
10350 /* Computing avg_load makes sense only when group is overloaded */
10351 if (sgs->group_type == group_overloaded)
10352 sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) /
10353 sgs->group_capacity;
10354}
10355
10356/**
10357 * update_sd_pick_busiest - return 1 on busiest group
10358 * @env: The load balancing environment.
10359 * @sds: sched_domain statistics
10360 * @sg: sched_group candidate to be checked for being the busiest
10361 * @sgs: sched_group statistics
10362 *
10363 * Determine if @sg is a busier group than the previously selected
10364 * busiest group.
10365 *
10366 * Return: %true if @sg is a busier group than the previously selected
10367 * busiest group. %false otherwise.
10368 */
10369static bool update_sd_pick_busiest(struct lb_env *env,
10370 struct sd_lb_stats *sds,
10371 struct sched_group *sg,
10372 struct sg_lb_stats *sgs)
10373{
10374 struct sg_lb_stats *busiest = &sds->busiest_stat;
10375
10376 /* Make sure that there is at least one task to pull */
10377 if (!sgs->sum_h_nr_running)
10378 return false;
10379
10380 /*
10381 * Don't try to pull misfit tasks we can't help.
10382 * We can use max_capacity here as reduction in capacity on some
10383 * CPUs in the group should either be possible to resolve
10384 * internally or be covered by avg_load imbalance (eventually).
10385 */
10386 if ((env->sd->flags & SD_ASYM_CPUCAPACITY) &&
10387 (sgs->group_type == group_misfit_task) &&
10388 (!capacity_greater(capacity_of(env->dst_cpu), sg->sgc->max_capacity) ||
10389 sds->local_stat.group_type != group_has_spare))
10390 return false;
10391
10392 if (sgs->group_type > busiest->group_type)
10393 return true;
10394
10395 if (sgs->group_type < busiest->group_type)
10396 return false;
10397
10398 /*
10399 * The candidate and the current busiest group are the same type of
10400	 * group. Let's check which one is the busiest according to the type.
10401 */
10402
10403 switch (sgs->group_type) {
10404 case group_overloaded:
10405 /* Select the overloaded group with highest avg_load. */
10406 return sgs->avg_load > busiest->avg_load;
10407
10408 case group_imbalanced:
10409 /*
10410 * Select the 1st imbalanced group as we don't have any way to
10411		 * choose one over another.
10412 */
10413 return false;
10414
10415 case group_asym_packing:
10416		/* Prefer to move work away from the lowest-priority CPU */
10417 return sched_asym_prefer(sds->busiest->asym_prefer_cpu, sg->asym_prefer_cpu);
10418
10419 case group_misfit_task:
10420 /*
10421 * If we have more than one misfit sg go with the biggest
10422 * misfit.
10423 */
10424 return sgs->group_misfit_task_load > busiest->group_misfit_task_load;
10425
10426 case group_smt_balance:
10427 /*
10428		 * If either SMT group has idle CPUs, use the has-spare
10429		 * comparison below; otherwise fall through to the fully-busy
10430		 * handling.
10430 */
10431 if (sgs->idle_cpus != 0 || busiest->idle_cpus != 0)
10432 goto has_spare;
10433
10434 fallthrough;
10435
10436 case group_fully_busy:
10437 /*
10438 * Select the fully busy group with highest avg_load. In
10439 * theory, there is no need to pull task from such kind of
10440 * group because tasks have all compute capacity that they need
10441 * but we can still improve the overall throughput by reducing
10442 * contention when accessing shared HW resources.
10443 *
10444 * XXX for now avg_load is not computed and always 0 so we
10445 * select the 1st one, except if @sg is composed of SMT
10446 * siblings.
10447 */
10448
10449 if (sgs->avg_load < busiest->avg_load)
10450 return false;
10451
10452 if (sgs->avg_load == busiest->avg_load) {
10453 /*
10454 * SMT sched groups need more help than non-SMT groups.
10455 * If @sg happens to also be SMT, either choice is good.
10456 */
10457 if (sds->busiest->flags & SD_SHARE_CPUCAPACITY)
10458 return false;
10459 }
10460
10461 break;
10462
10463 case group_has_spare:
10464 /*
10465 * Do not pick sg with SMT CPUs over sg with pure CPUs,
10466		 * as we do not want to pull a task off an SMT core with one task
10467 * and make the core idle.
10468 */
10469 if (smt_vs_nonsmt_groups(sds->busiest, sg)) {
10470 if (sg->flags & SD_SHARE_CPUCAPACITY && sgs->sum_h_nr_running <= 1)
10471 return false;
10472 else
10473 return true;
10474 }
10475has_spare:
10476
10477 /*
10478		 * Select the non-overloaded group with the lowest number of
10479		 * idle CPUs and the highest number of running tasks. We could
10480		 * also compare the spare capacity, which is more stable, but a
10481		 * group may end up with less spare capacity yet more idle
10482		 * CPUs, which means less opportunity to pull tasks.
10483 */
10484 if (sgs->idle_cpus > busiest->idle_cpus)
10485 return false;
10486 else if ((sgs->idle_cpus == busiest->idle_cpus) &&
10487 (sgs->sum_nr_running <= busiest->sum_nr_running))
10488 return false;
10489
10490 break;
10491 }
10492
10493 /*
10494 * Candidate sg has no more than one task per CPU and has higher
10495 * per-CPU capacity. Migrating tasks to less capable CPUs may harm
10496	 * throughput. Maximize throughput; power/energy consequences are not
10497 * considered.
10498 */
10499 if ((env->sd->flags & SD_ASYM_CPUCAPACITY) &&
10500 (sgs->group_type <= group_fully_busy) &&
10501 (capacity_greater(sg->sgc->min_capacity, capacity_of(env->dst_cpu))))
10502 return false;
10503
10504 return true;
10505}
10506
10507#ifdef CONFIG_NUMA_BALANCING
10508static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
10509{
10510 if (sgs->sum_h_nr_running > sgs->nr_numa_running)
10511 return regular;
10512 if (sgs->sum_h_nr_running > sgs->nr_preferred_running)
10513 return remote;
10514 return all;
10515}
10516
10517static inline enum fbq_type fbq_classify_rq(struct rq *rq)
10518{
10519 if (rq->nr_running > rq->nr_numa_running)
10520 return regular;
10521 if (rq->nr_running > rq->nr_preferred_running)
10522 return remote;
10523 return all;
10524}
10525#else
10526static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
10527{
10528 return all;
10529}
10530
10531static inline enum fbq_type fbq_classify_rq(struct rq *rq)
10532{
10533 return regular;
10534}
10535#endif /* CONFIG_NUMA_BALANCING */
10536
10537
10538struct sg_lb_stats;
10539
10540/*
10541 * task_running_on_cpu - return 1 if @p is running on @cpu.
10542 */
10543
10544static unsigned int task_running_on_cpu(int cpu, struct task_struct *p)
10545{
10546 /* Task has no contribution or is new */
10547 if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
10548 return 0;
10549
10550 if (task_on_rq_queued(p))
10551 return 1;
10552
10553 return 0;
10554}
10555
10556/**
10557 * idle_cpu_without - would a given CPU be idle without p ?
10558 * @cpu: the processor on which idleness is tested.
10559 * @p: task which should be ignored.
10560 *
10561 * Return: 1 if the CPU would be idle. 0 otherwise.
10562 */
10563static int idle_cpu_without(int cpu, struct task_struct *p)
10564{
10565 struct rq *rq = cpu_rq(cpu);
10566
10567 if (rq->curr != rq->idle && rq->curr != p)
10568 return 0;
10569
10570 /*
10571	 * rq->nr_running can't be used but an updated version without the
10572	 * impact of p on cpu must be used instead. The updated nr_running
10573	 * must be computed and tested before calling idle_cpu_without().
10574 */
10575
10576 if (rq->ttwu_pending)
10577 return 0;
10578
10579 return 1;
10580}
10581
10582/*
10583 * update_sg_wakeup_stats - Update sched_group's statistics for wakeup.
10584 * @sd: The sched_domain level to look for idlest group.
10585 * @group: sched_group whose statistics are to be updated.
10586 * @sgs: variable to hold the statistics for this group.
10587 * @p: The task for which we look for the idlest group/CPU.
10588 */
10589static inline void update_sg_wakeup_stats(struct sched_domain *sd,
10590 struct sched_group *group,
10591 struct sg_lb_stats *sgs,
10592 struct task_struct *p)
10593{
10594 int i, nr_running;
10595
10596 memset(sgs, 0, sizeof(*sgs));
10597
10598 /* Assume that task can't fit any CPU of the group */
10599 if (sd->flags & SD_ASYM_CPUCAPACITY)
10600 sgs->group_misfit_task_load = 1;
10601
10602 for_each_cpu(i, sched_group_span(group)) {
10603 struct rq *rq = cpu_rq(i);
10604 unsigned int local;
10605
10606 sgs->group_load += cpu_load_without(rq, p);
10607 sgs->group_util += cpu_util_without(i, p);
10608 sgs->group_runnable += cpu_runnable_without(rq, p);
10609 local = task_running_on_cpu(i, p);
10610 sgs->sum_h_nr_running += rq->cfs.h_nr_running - local;
10611
10612 nr_running = rq->nr_running - local;
10613 sgs->sum_nr_running += nr_running;
10614
10615 /*
10616 * No need to call idle_cpu_without() if nr_running is not 0
10617 */
10618 if (!nr_running && idle_cpu_without(i, p))
10619 sgs->idle_cpus++;
10620
10621 /* Check if task fits in the CPU */
10622 if (sd->flags & SD_ASYM_CPUCAPACITY &&
10623 sgs->group_misfit_task_load &&
10624 task_fits_cpu(p, i))
10625 sgs->group_misfit_task_load = 0;
10626
10627 }
10628
10629 sgs->group_capacity = group->sgc->capacity;
10630
10631 sgs->group_weight = group->group_weight;
10632
10633 sgs->group_type = group_classify(sd->imbalance_pct, group, sgs);
10634
10635 /*
10636 * Computing avg_load makes sense only when group is fully busy or
10637 * overloaded
10638 */
10639 if (sgs->group_type == group_fully_busy ||
10640 sgs->group_type == group_overloaded)
10641 sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) /
10642 sgs->group_capacity;
10643}
10644
10645static bool update_pick_idlest(struct sched_group *idlest,
10646 struct sg_lb_stats *idlest_sgs,
10647 struct sched_group *group,
10648 struct sg_lb_stats *sgs)
10649{
10650 if (sgs->group_type < idlest_sgs->group_type)
10651 return true;
10652
10653 if (sgs->group_type > idlest_sgs->group_type)
10654 return false;
10655
10656 /*
10657 * The candidate and the current idlest group are the same type of
10658	 * group. Let's check which one is the idlest according to the type.
10659 */
10660
10661 switch (sgs->group_type) {
10662 case group_overloaded:
10663 case group_fully_busy:
10664 /* Select the group with lowest avg_load. */
10665 if (idlest_sgs->avg_load <= sgs->avg_load)
10666 return false;
10667 break;
10668
10669 case group_imbalanced:
10670 case group_asym_packing:
10671 case group_smt_balance:
10672 /* Those types are not used in the slow wakeup path */
10673 return false;
10674
10675 case group_misfit_task:
10676 /* Select group with the highest max capacity */
10677 if (idlest->sgc->max_capacity >= group->sgc->max_capacity)
10678 return false;
10679 break;
10680
10681 case group_has_spare:
10682 /* Select group with most idle CPUs */
10683 if (idlest_sgs->idle_cpus > sgs->idle_cpus)
10684 return false;
10685
10686 /* Select group with lowest group_util */
10687 if (idlest_sgs->idle_cpus == sgs->idle_cpus &&
10688 idlest_sgs->group_util <= sgs->group_util)
10689 return false;
10690
10691 break;
10692 }
10693
10694 return true;
10695}
10696
10697/*
10698 * sched_balance_find_dst_group() finds and returns the least busy CPU group within the
10699 * domain.
10700 *
10701 * Assumes p is allowed on at least one CPU in sd.
10702 */
10703static struct sched_group *
10704sched_balance_find_dst_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
10705{
10706 struct sched_group *idlest = NULL, *local = NULL, *group = sd->groups;
10707 struct sg_lb_stats local_sgs, tmp_sgs;
10708 struct sg_lb_stats *sgs;
10709 unsigned long imbalance;
10710 struct sg_lb_stats idlest_sgs = {
10711 .avg_load = UINT_MAX,
10712 .group_type = group_overloaded,
10713 };
10714
10715 do {
10716 int local_group;
10717
10718 /* Skip over this group if it has no CPUs allowed */
10719 if (!cpumask_intersects(sched_group_span(group),
10720 p->cpus_ptr))
10721 continue;
10722
10723 /* Skip over this group if no cookie matched */
10724 if (!sched_group_cookie_match(cpu_rq(this_cpu), p, group))
10725 continue;
10726
10727 local_group = cpumask_test_cpu(this_cpu,
10728 sched_group_span(group));
10729
10730 if (local_group) {
10731 sgs = &local_sgs;
10732 local = group;
10733 } else {
10734 sgs = &tmp_sgs;
10735 }
10736
10737 update_sg_wakeup_stats(sd, group, sgs, p);
10738
10739 if (!local_group && update_pick_idlest(idlest, &idlest_sgs, group, sgs)) {
10740 idlest = group;
10741 idlest_sgs = *sgs;
10742 }
10743
10744 } while (group = group->next, group != sd->groups);
10745
10746
10747 /* There is no idlest group to push tasks to */
10748 if (!idlest)
10749 return NULL;
10750
10751 /* The local group has been skipped because of CPU affinity */
10752 if (!local)
10753 return idlest;
10754
10755 /*
10756 * If the local group is idler than the selected idlest group
10757 * don't try and push the task.
10758 */
10759 if (local_sgs.group_type < idlest_sgs.group_type)
10760 return NULL;
10761
10762 /*
10763 * If the local group is busier than the selected idlest group
10764 * try and push the task.
10765 */
10766 if (local_sgs.group_type > idlest_sgs.group_type)
10767 return idlest;
10768
10769 switch (local_sgs.group_type) {
10770 case group_overloaded:
10771 case group_fully_busy:
10772
10773 /* Calculate allowed imbalance based on load */
10774 imbalance = scale_load_down(NICE_0_LOAD) *
10775 (sd->imbalance_pct-100) / 100;
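		/*
		 * Illustrative value: scale_load_down(NICE_0_LOAD) is 1024
		 * with the default fixed-point scale, so an imbalance_pct of
		 * 117 allows a gap of 1024 * 17 / 100 = 174 in avg_load.
		 */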
10776
10777 /*
10778 * When comparing groups across NUMA domains, it's possible for
10779 * the local domain to be very lightly loaded relative to the
10780 * remote domains but "imbalance" skews the comparison making
10781 * remote CPUs look much more favourable. When considering
10782 * cross-domain, add imbalance to the load on the remote node
10783 * and consider staying local.
10784 */
10785
10786 if ((sd->flags & SD_NUMA) &&
10787 ((idlest_sgs.avg_load + imbalance) >= local_sgs.avg_load))
10788 return NULL;
10789
10790 /*
10791 * If the local group is less loaded than the selected
10792 * idlest group don't try and push any tasks.
10793 */
10794 if (idlest_sgs.avg_load >= (local_sgs.avg_load + imbalance))
10795 return NULL;
10796
10797 if (100 * local_sgs.avg_load <= sd->imbalance_pct * idlest_sgs.avg_load)
10798 return NULL;
10799 break;
10800
10801 case group_imbalanced:
10802 case group_asym_packing:
10803 case group_smt_balance:
10804		/* Those types are not used in the slow wakeup path */
10805 return NULL;
10806
10807 case group_misfit_task:
10808 /* Select group with the highest max capacity */
10809 if (local->sgc->max_capacity >= idlest->sgc->max_capacity)
10810 return NULL;
10811 break;
10812
10813 case group_has_spare:
10814#ifdef CONFIG_NUMA
10815 if (sd->flags & SD_NUMA) {
10816 int imb_numa_nr = sd->imb_numa_nr;
10817#ifdef CONFIG_NUMA_BALANCING
10818 int idlest_cpu;
10819 /*
10820 * If there is spare capacity at NUMA, try to select
10821 * the preferred node
10822 */
10823 if (cpu_to_node(this_cpu) == p->numa_preferred_nid)
10824 return NULL;
10825
10826 idlest_cpu = cpumask_first(sched_group_span(idlest));
10827 if (cpu_to_node(idlest_cpu) == p->numa_preferred_nid)
10828 return idlest;
10829#endif /* CONFIG_NUMA_BALANCING */
10830 /*
10831 * Otherwise, keep the task close to the wakeup source
10832 * and improve locality if the number of running tasks
10833 * would remain below threshold where an imbalance is
10834 * allowed while accounting for the possibility the
10835 * task is pinned to a subset of CPUs. If there is a
10836 * real need of migration, periodic load balance will
10837 * take care of it.
10838 */
10839 if (p->nr_cpus_allowed != NR_CPUS) {
10840 struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask);
10841
10842 cpumask_and(cpus, sched_group_span(local), p->cpus_ptr);
10843 imb_numa_nr = min(cpumask_weight(cpus), sd->imb_numa_nr);
10844 }
10845
10846 imbalance = abs(local_sgs.idle_cpus - idlest_sgs.idle_cpus);
10847 if (!adjust_numa_imbalance(imbalance,
10848 local_sgs.sum_nr_running + 1,
10849 imb_numa_nr)) {
10850 return NULL;
10851 }
10852 }
10853#endif /* CONFIG_NUMA */
10854
10855 /*
10856		 * Select the group with the highest number of idle CPUs. We
10857		 * could also compare the utilization, which is more stable,
10858		 * but a group may end up with less spare capacity yet more
10859		 * idle CPUs, which means more opportunity to run tasks.
10860 */
10861 if (local_sgs.idle_cpus >= idlest_sgs.idle_cpus)
10862 return NULL;
10863 break;
10864 }
10865
10866 return idlest;
10867}
10868
10869static void update_idle_cpu_scan(struct lb_env *env,
10870 unsigned long sum_util)
10871{
10872 struct sched_domain_shared *sd_share;
10873 int llc_weight, pct;
10874 u64 x, y, tmp;
10875 /*
10876 * Update the number of CPUs to scan in LLC domain, which could
10877 * be used as a hint in select_idle_cpu(). The update of sd_share
10878 * could be expensive because it is within a shared cache line.
10879 * So the write of this hint only occurs during periodic load
10880 * balancing, rather than CPU_NEWLY_IDLE, because the latter
10881 * can fire way more frequently than the former.
10882 */
10883 if (!sched_feat(SIS_UTIL) || env->idle == CPU_NEWLY_IDLE)
10884 return;
10885
10886 llc_weight = per_cpu(sd_llc_size, env->dst_cpu);
10887 if (env->sd->span_weight != llc_weight)
10888 return;
10889
10890 sd_share = rcu_dereference(per_cpu(sd_llc_shared, env->dst_cpu));
10891 if (!sd_share)
10892 return;
10893
10894 /*
10895	 * The number of CPUs to search drops as sum_util increases; when
10896	 * sum_util hits 85% or above, the scan stops.
10897	 * 85% is chosen as the threshold because it corresponds to the
10898	 * imbalance_pct (117) at which an LLC sched group is overloaded.
10899 *
10900 * let y = SCHED_CAPACITY_SCALE - p * x^2 [1]
10901 * and y'= y / SCHED_CAPACITY_SCALE
10902 *
10903 * x is the ratio of sum_util compared to the CPU capacity:
10904 * x = sum_util / (llc_weight * SCHED_CAPACITY_SCALE)
10905 * y' is the ratio of CPUs to be scanned in the LLC domain,
10906 * and the number of CPUs to scan is calculated by:
10907 *
10908 * nr_scan = llc_weight * y' [2]
10909 *
10910 * When x hits the threshold of overloaded, AKA, when
10911 * x = 100 / pct, y drops to 0. According to [1],
10912 * p should be SCHED_CAPACITY_SCALE * pct^2 / 10000
10913 *
10914 * Scale x by SCHED_CAPACITY_SCALE:
10915 * x' = sum_util / llc_weight; [3]
10916 *
10917 * and finally [1] becomes:
10918 * y = SCHED_CAPACITY_SCALE -
10919 * x'^2 * pct^2 / (10000 * SCHED_CAPACITY_SCALE) [4]
10920 *
10921 */
10922 /* equation [3] */
10923 x = sum_util;
10924 do_div(x, llc_weight);
10925
10926 /* equation [4] */
10927 pct = env->sd->imbalance_pct;
10928 tmp = x * x * pct * pct;
10929 do_div(tmp, 10000 * SCHED_CAPACITY_SCALE);
10930 tmp = min_t(long, tmp, SCHED_CAPACITY_SCALE);
10931 y = SCHED_CAPACITY_SCALE - tmp;
10932
10933 /* equation [2] */
10934 y *= llc_weight;
10935 do_div(y, SCHED_CAPACITY_SCALE);
10936 if ((int)y != sd_share->nr_idle_scan)
10937 WRITE_ONCE(sd_share->nr_idle_scan, (int)y);
10938}
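/*
 * Worked example (illustrative values only): on a 16-CPU LLC with
 * imbalance_pct = 117 and sum_util = 8192 (about 50% of 16 * 1024),
 * x' = 8192 / 16 = 512, the quadratic term is
 * 512 * 512 * 117 * 117 / (10000 * 1024) = 350, y = 1024 - 350 = 674 and
 * nr_scan = 674 * 16 / 1024 = 10 CPUs; as sum_util approaches ~85% of the
 * LLC capacity, nr_scan drops towards 0.
 */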
10939
10940/**
10941 * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
10942 * @env: The load balancing environment.
10943 * @sds: variable to hold the statistics for this sched_domain.
10944 */
10945
10946static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
10947{
10948 struct sched_group *sg = env->sd->groups;
10949 struct sg_lb_stats *local = &sds->local_stat;
10950 struct sg_lb_stats tmp_sgs;
10951 unsigned long sum_util = 0;
10952 bool sg_overloaded = 0, sg_overutilized = 0;
10953
10954 do {
10955 struct sg_lb_stats *sgs = &tmp_sgs;
10956 int local_group;
10957
10958 local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(sg));
10959 if (local_group) {
10960 sds->local = sg;
10961 sgs = local;
10962
10963 if (env->idle != CPU_NEWLY_IDLE ||
10964 time_after_eq(jiffies, sg->sgc->next_update))
10965 update_group_capacity(env->sd, env->dst_cpu);
10966 }
10967
10968 update_sg_lb_stats(env, sds, sg, sgs, &sg_overloaded, &sg_overutilized);
10969
10970 if (!local_group && update_sd_pick_busiest(env, sds, sg, sgs)) {
10971 sds->busiest = sg;
10972 sds->busiest_stat = *sgs;
10973 }
10974
10975 /* Now, start updating sd_lb_stats */
10976 sds->total_load += sgs->group_load;
10977 sds->total_capacity += sgs->group_capacity;
10978
10979 sum_util += sgs->group_util;
10980 sg = sg->next;
10981 } while (sg != env->sd->groups);
10982
10983 /*
10984	 * Indicate that the child domain of the busiest group prefers that tasks
10985 * go to a child's sibling domains first. NB the flags of a sched group
10986 * are those of the child domain.
10987 */
10988 if (sds->busiest)
10989 sds->prefer_sibling = !!(sds->busiest->flags & SD_PREFER_SIBLING);
10990
10991
10992 if (env->sd->flags & SD_NUMA)
10993 env->fbq_type = fbq_classify_group(&sds->busiest_stat);
10994
10995 if (!env->sd->parent) {
10996 /* update overload indicator if we are at root domain */
10997 set_rd_overloaded(env->dst_rq->rd, sg_overloaded);
10998
10999 /* Update over-utilization (tipping point, U >= 0) indicator */
11000 set_rd_overutilized(env->dst_rq->rd, sg_overutilized);
11001 } else if (sg_overutilized) {
11002 set_rd_overutilized(env->dst_rq->rd, sg_overutilized);
11003 }
11004
11005 update_idle_cpu_scan(env, sum_util);
11006}
11007
11008/**
11009 * calculate_imbalance - Calculate the amount of imbalance present within the
11010 * groups of a given sched_domain during load balance.
11011 * @env: load balance environment
11012 * @sds: statistics of the sched_domain whose imbalance is to be calculated.
11013 */
11014static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
11015{
11016 struct sg_lb_stats *local, *busiest;
11017
11018 local = &sds->local_stat;
11019 busiest = &sds->busiest_stat;
11020
11021 if (busiest->group_type == group_misfit_task) {
11022 if (env->sd->flags & SD_ASYM_CPUCAPACITY) {
11023 /* Set imbalance to allow misfit tasks to be balanced. */
11024 env->migration_type = migrate_misfit;
11025 env->imbalance = 1;
11026 } else {
11027 /*
11028 * Set load imbalance to allow moving task from cpu
11029 * with reduced capacity.
11030 */
11031 env->migration_type = migrate_load;
11032 env->imbalance = busiest->group_misfit_task_load;
11033 }
11034 return;
11035 }
11036
11037 if (busiest->group_type == group_asym_packing) {
11038 /*
11039 * In case of asym capacity, we will try to migrate all load to
11040 * the preferred CPU.
11041 */
11042 env->migration_type = migrate_task;
11043 env->imbalance = busiest->sum_h_nr_running;
11044 return;
11045 }
11046
11047 if (busiest->group_type == group_smt_balance) {
11048 /* Reduce number of tasks sharing CPU capacity */
11049 env->migration_type = migrate_task;
11050 env->imbalance = 1;
11051 return;
11052 }
11053
11054 if (busiest->group_type == group_imbalanced) {
11055 /*
11056 * In the group_imb case we cannot rely on group-wide averages
11057		 * to ensure CPU-load equilibrium; try to move any task to fix
11058 * the imbalance. The next load balance will take care of
11059 * balancing back the system.
11060 */
11061 env->migration_type = migrate_task;
11062 env->imbalance = 1;
11063 return;
11064 }
11065
11066 /*
11067 * Try to use spare capacity of local group without overloading it or
11068 * emptying busiest.
11069 */
11070 if (local->group_type == group_has_spare) {
11071 if ((busiest->group_type > group_fully_busy) &&
11072 !(env->sd->flags & SD_SHARE_LLC)) {
11073 /*
11074 * If busiest is overloaded, try to fill spare
11075 * capacity. This might end up creating spare capacity
11076 * in busiest or busiest still being overloaded but
11077 * there is no simple way to directly compute the
11078 * amount of load to migrate in order to balance the
11079 * system.
11080 */
11081 env->migration_type = migrate_util;
11082 env->imbalance = max(local->group_capacity, local->group_util) -
11083 local->group_util;
11084
11085 /*
11086 * In some cases, the group's utilization is max or even
11087 * higher than capacity because of migrations but the
11088 * local CPU is (newly) idle. There is at least one
11089 * waiting task in this overloaded busiest group. Let's
11090 * try to pull it.
11091 */
11092 if (env->idle && env->imbalance == 0) {
11093 env->migration_type = migrate_task;
11094 env->imbalance = 1;
11095 }
11096
11097 return;
11098 }
11099
11100 if (busiest->group_weight == 1 || sds->prefer_sibling) {
11101 /*
11102			 * When prefer_sibling is set, spread running tasks evenly
11103			 * across groups.
11104 */
11105 env->migration_type = migrate_task;
11106 env->imbalance = sibling_imbalance(env, sds, busiest, local);
11107 } else {
11108
11109 /*
11110 * If there is no overload, we just want to even the number of
11111 * idle CPUs.
11112 */
11113 env->migration_type = migrate_task;
11114 env->imbalance = max_t(long, 0,
11115 (local->idle_cpus - busiest->idle_cpus));
11116 }
11117
11118#ifdef CONFIG_NUMA
11119 /* Consider allowing a small imbalance between NUMA groups */
11120 if (env->sd->flags & SD_NUMA) {
11121 env->imbalance = adjust_numa_imbalance(env->imbalance,
11122 local->sum_nr_running + 1,
11123 env->sd->imb_numa_nr);
11124 }
11125#endif
11126
11127 /* Number of tasks to move to restore balance */
11128 env->imbalance >>= 1;
11129
11130 return;
11131 }
11132
11133 /*
11134 * Local is fully busy but has to take more load to relieve the
11135 * busiest group
11136 */
11137 if (local->group_type < group_overloaded) {
11138 /*
11139 * Local will become overloaded so the avg_load metrics are
11140 * finally needed.
11141 */
11142
11143 local->avg_load = (local->group_load * SCHED_CAPACITY_SCALE) /
11144 local->group_capacity;
11145
11146 /*
11147 * If the local group is more loaded than the selected
11148 * busiest group don't try to pull any tasks.
11149 */
11150 if (local->avg_load >= busiest->avg_load) {
11151 env->imbalance = 0;
11152 return;
11153 }
11154
11155 sds->avg_load = (sds->total_load * SCHED_CAPACITY_SCALE) /
11156 sds->total_capacity;
11157
11158 /*
11159 * If the local group is more loaded than the average system
11160 * load, don't try to pull any tasks.
11161 */
11162 if (local->avg_load >= sds->avg_load) {
11163 env->imbalance = 0;
11164 return;
11165 }
11166
11167 }
11168
11169 /*
11170	 * Both groups are or will become overloaded and we're trying to get all
11171 * the CPUs to the average_load, so we don't want to push ourselves
11172 * above the average load, nor do we wish to reduce the max loaded CPU
11173 * below the average load. At the same time, we also don't want to
11174 * reduce the group load below the group capacity. Thus we look for
11175 * the minimum possible imbalance.
11176 */
11177 env->migration_type = migrate_load;
11178 env->imbalance = min(
11179 (busiest->avg_load - sds->avg_load) * busiest->group_capacity,
11180 (sds->avg_load - local->avg_load) * local->group_capacity
11181 ) / SCHED_CAPACITY_SCALE;
11182}
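/*
 * Worked example (hypothetical numbers): for the final migrate_load case
 * above, the imbalance is the smaller of the two distances to the domain
 * average, scaled back to load units. With busiest->avg_load = 1300,
 * local->avg_load = 800, sds->avg_load = 1000 and both group capacities
 * at SCHED_CAPACITY_SCALE (1024):
 *
 *   min((1300 - 1000) * 1024, (1000 - 800) * 1024) / 1024 = 200
 *
 * so at most 200 units of load are moved: enough to pull busiest towards
 * the average without pushing local above it.
 */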
11183
11184/******* sched_balance_find_src_group() helpers end here *********************/
11185
11186/*
11187 * Decision matrix according to the local and busiest group type:
11188 *
11189 * busiest \ local has_spare fully_busy misfit asym imbalanced overloaded
11190 * has_spare nr_idle balanced N/A N/A balanced balanced
11191 * fully_busy nr_idle nr_idle N/A N/A balanced balanced
11192 * misfit_task force N/A N/A N/A N/A N/A
11193 * asym_packing force force N/A N/A force force
11194 * imbalanced force force N/A N/A force force
11195 * overloaded force force N/A N/A force avg_load
11196 *
11197 * N/A : Not Applicable because already filtered while updating
11198 * statistics.
11199 * balanced : The system is balanced for these 2 groups.
11200 * force : Calculate the imbalance as load migration is probably needed.
11201 * avg_load : Only if imbalance is significant enough.
11202 *  nr_idle   : dst_cpu is not busy and the number of idle CPUs differs
11203 *              significantly between the groups.
11204 */
11205
11206/**
11207 * sched_balance_find_src_group - Returns the busiest group within the sched_domain
11208 * if there is an imbalance.
11209 * @env: The load balancing environment.
11210 *
11211 * Also calculates the amount of runnable load which should be moved
11212 * to restore balance.
11213 *
11214 * Return: - The busiest group if imbalance exists.
11215 */
11216static struct sched_group *sched_balance_find_src_group(struct lb_env *env)
11217{
11218 struct sg_lb_stats *local, *busiest;
11219 struct sd_lb_stats sds;
11220
11221 init_sd_lb_stats(&sds);
11222
11223 /*
11224 * Compute the various statistics relevant for load balancing at
11225 * this level.
11226 */
11227 update_sd_lb_stats(env, &sds);
11228
11229 /* There is no busy sibling group to pull tasks from */
11230 if (!sds.busiest)
11231 goto out_balanced;
11232
11233 busiest = &sds.busiest_stat;
11234
11235 /* Misfit tasks should be dealt with regardless of the avg load */
11236 if (busiest->group_type == group_misfit_task)
11237 goto force_balance;
11238
11239 if (!is_rd_overutilized(env->dst_rq->rd) &&
11240 rcu_dereference(env->dst_rq->rd->pd))
11241 goto out_balanced;
11242
11243 /* ASYM feature bypasses nice load balance check */
11244 if (busiest->group_type == group_asym_packing)
11245 goto force_balance;
11246
11247 /*
11248 * If the busiest group is imbalanced the below checks don't
11249 * work because they assume all things are equal, which typically
11250 * isn't true due to cpus_ptr constraints and the like.
11251 */
11252 if (busiest->group_type == group_imbalanced)
11253 goto force_balance;
11254
11255 local = &sds.local_stat;
11256 /*
11257 * If the local group is busier than the selected busiest group
11258 * don't try and pull any tasks.
11259 */
11260 if (local->group_type > busiest->group_type)
11261 goto out_balanced;
11262
11263 /*
11264 * When groups are overloaded, use the avg_load to ensure fairness
11265 * between tasks.
11266 */
11267 if (local->group_type == group_overloaded) {
11268 /*
11269 * If the local group is more loaded than the selected
11270 * busiest group don't try to pull any tasks.
11271 */
11272 if (local->avg_load >= busiest->avg_load)
11273 goto out_balanced;
11274
11275 /* XXX broken for overlapping NUMA groups */
11276 sds.avg_load = (sds.total_load * SCHED_CAPACITY_SCALE) /
11277 sds.total_capacity;
11278
11279 /*
11280 * Don't pull any tasks if this group is already above the
11281 * domain average load.
11282 */
11283 if (local->avg_load >= sds.avg_load)
11284 goto out_balanced;
11285
11286 /*
11287 * If the busiest group is more loaded, use imbalance_pct to be
11288 * conservative.
11289 */
11290 if (100 * busiest->avg_load <=
11291 env->sd->imbalance_pct * local->avg_load)
11292 goto out_balanced;
11293 }
11294
11295 /*
11296 * Try to move all excess tasks to a sibling domain of the busiest
11297 * group's child domain.
11298 */
11299 if (sds.prefer_sibling && local->group_type == group_has_spare &&
11300 sibling_imbalance(env, &sds, busiest, local) > 1)
11301 goto force_balance;
11302
11303 if (busiest->group_type != group_overloaded) {
11304 if (!env->idle) {
11305 /*
11306 * If the busiest group is not overloaded (and as a
11307 * result the local one too) but this CPU is already
11308 * busy, let another idle CPU try to pull task.
11309 */
11310 goto out_balanced;
11311 }
11312
11313 if (busiest->group_type == group_smt_balance &&
11314 smt_vs_nonsmt_groups(sds.local, sds.busiest)) {
11315 /* Let non SMT CPU pull from SMT CPU sharing with sibling */
11316 goto force_balance;
11317 }
11318
11319 if (busiest->group_weight > 1 &&
11320 local->idle_cpus <= (busiest->idle_cpus + 1)) {
11321 /*
11322 * If the busiest group is not overloaded
11323 * and there is no imbalance between this and busiest
11324 * group wrt idle CPUs, it is balanced. The imbalance
11325			 * becomes significant only if the diff is greater than 1;
11326			 * otherwise we might end up just moving the imbalance
11327			 * to another group. Of course this applies only if
11328 * there is more than 1 CPU per group.
11329 */
11330 goto out_balanced;
11331 }
11332
11333 if (busiest->sum_h_nr_running == 1) {
11334 /*
11335 * busiest doesn't have any tasks waiting to run
11336 */
11337 goto out_balanced;
11338 }
11339 }
11340
11341force_balance:
11342 /* Looks like there is an imbalance. Compute it */
11343 calculate_imbalance(env, &sds);
11344 return env->imbalance ? sds.busiest : NULL;
11345
11346out_balanced:
11347 env->imbalance = 0;
11348 return NULL;
11349}
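/*
 * Illustrative note: the imbalance_pct check above,
 * 100 * busiest->avg_load <= imbalance_pct * local->avg_load, keeps the
 * overloaded case conservative. Assuming an imbalance_pct of 117 (a
 * common default, not guaranteed for every domain), tasks are pulled
 * only when busiest->avg_load exceeds local->avg_load by more than 17%.
 */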
11350
11351/*
11352 * sched_balance_find_src_rq - find the busiest runqueue among the CPUs in the group.
11353 */
11354static struct rq *sched_balance_find_src_rq(struct lb_env *env,
11355 struct sched_group *group)
11356{
11357 struct rq *busiest = NULL, *rq;
11358 unsigned long busiest_util = 0, busiest_load = 0, busiest_capacity = 1;
11359 unsigned int busiest_nr = 0;
11360 int i;
11361
11362 for_each_cpu_and(i, sched_group_span(group), env->cpus) {
11363 unsigned long capacity, load, util;
11364 unsigned int nr_running;
11365 enum fbq_type rt;
11366
11367 rq = cpu_rq(i);
11368 rt = fbq_classify_rq(rq);
11369
11370 /*
11371 * We classify groups/runqueues into three groups:
11372 * - regular: there are !numa tasks
11373 * - remote: there are numa tasks that run on the 'wrong' node
11374 * - all: there is no distinction
11375 *
11376 * In order to avoid migrating ideally placed numa tasks,
11377		 * ignore those when there are better options.
11378 *
11379 * If we ignore the actual busiest queue to migrate another
11380 * task, the next balance pass can still reduce the busiest
11381 * queue by moving tasks around inside the node.
11382 *
11383 * If we cannot move enough load due to this classification
11384 * the next pass will adjust the group classification and
11385 * allow migration of more tasks.
11386 *
11387 * Both cases only affect the total convergence complexity.
11388 */
11389 if (rt > env->fbq_type)
11390 continue;
11391
11392 nr_running = rq->cfs.h_nr_running;
11393 if (!nr_running)
11394 continue;
11395
11396 capacity = capacity_of(i);
11397
11398 /*
11399 * For ASYM_CPUCAPACITY domains, don't pick a CPU that could
11400 * eventually lead to active_balancing high->low capacity.
11401 * Higher per-CPU capacity is considered better than balancing
11402 * average load.
11403 */
11404 if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
11405 !capacity_greater(capacity_of(env->dst_cpu), capacity) &&
11406 nr_running == 1)
11407 continue;
11408
11409 /*
11410 * Make sure we only pull tasks from a CPU of lower priority
11411 * when balancing between SMT siblings.
11412 *
11413 * If balancing between cores, let lower priority CPUs help
11414 * SMT cores with more than one busy sibling.
11415 */
11416 if (sched_asym(env->sd, i, env->dst_cpu) && nr_running == 1)
11417 continue;
11418
11419 switch (env->migration_type) {
11420 case migrate_load:
11421 /*
11422 * When comparing with load imbalance, use cpu_load()
11423 * which is not scaled with the CPU capacity.
11424 */
11425 load = cpu_load(rq);
11426
11427 if (nr_running == 1 && load > env->imbalance &&
11428 !check_cpu_capacity(rq, env->sd))
11429 break;
11430
11431 /*
11432 * For the load comparisons with the other CPUs,
11433 * consider the cpu_load() scaled with the CPU
11434 * capacity, so that the load can be moved away
11435 * from the CPU that is potentially running at a
11436 * lower capacity.
11437 *
11438 * Thus we're looking for max(load_i / capacity_i),
11439 * crosswise multiplication to rid ourselves of the
11440 * division works out to:
11441 * load_i * capacity_j > load_j * capacity_i;
11442 * where j is our previous maximum.
11443 */
11444 if (load * busiest_capacity > busiest_load * capacity) {
11445 busiest_load = load;
11446 busiest_capacity = capacity;
11447 busiest = rq;
11448 }
11449 break;
11450
11451 case migrate_util:
11452 util = cpu_util_cfs_boost(i);
11453
11454 /*
11455 * Don't try to pull utilization from a CPU with one
11456			 * running task. Whatever its utilization, we will fail
11457			 * to detach the task.
11458 */
11459 if (nr_running <= 1)
11460 continue;
11461
11462 if (busiest_util < util) {
11463 busiest_util = util;
11464 busiest = rq;
11465 }
11466 break;
11467
11468 case migrate_task:
11469 if (busiest_nr < nr_running) {
11470 busiest_nr = nr_running;
11471 busiest = rq;
11472 }
11473 break;
11474
11475 case migrate_misfit:
11476 /*
11477 * For ASYM_CPUCAPACITY domains with misfit tasks we
11478 * simply seek the "biggest" misfit task.
11479 */
11480 if (rq->misfit_task_load > busiest_load) {
11481 busiest_load = rq->misfit_task_load;
11482 busiest = rq;
11483 }
11484
11485 break;
11486
11487 }
11488 }
11489
11490 return busiest;
11491}
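/*
 * Worked example (hypothetical numbers): the migrate_load comparison above
 * avoids a division by cross-multiplying. A CPU with load 300 and capacity
 * 512 (300/512 ~= 0.59) is busier than one with load 400 and capacity 1024
 * (400/1024 ~= 0.39) because 300 * 1024 > 400 * 512, so the half-capacity
 * CPU is selected as the busiest runqueue.
 */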
11492
11493/*
11494 * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
11495 * so long as it is large enough.
11496 */
11497#define MAX_PINNED_INTERVAL 512
11498
11499static inline bool
11500asym_active_balance(struct lb_env *env)
11501{
11502 /*
11503 * ASYM_PACKING needs to force migrate tasks from busy but lower
11504 * priority CPUs in order to pack all tasks in the highest priority
11505	 * CPUs. When done between cores, do it only if the whole core is
11506	 * idle.
11507 *
11508 * If @env::src_cpu is an SMT core with busy siblings, let
11509 * the lower priority @env::dst_cpu help it. Do not follow
11510 * CPU priority.
11511 */
11512 return env->idle && sched_use_asym_prio(env->sd, env->dst_cpu) &&
11513 (sched_asym_prefer(env->dst_cpu, env->src_cpu) ||
11514 !sched_use_asym_prio(env->sd, env->src_cpu));
11515}
11516
11517static inline bool
11518imbalanced_active_balance(struct lb_env *env)
11519{
11520 struct sched_domain *sd = env->sd;
11521
11522 /*
11523	 * The imbalanced case covers both pinned tasks preventing a fair
11524	 * distribution of the load on the system, and an uneven distribution of
11525	 * threads on a system with spare capacity.
11526 */
11527 if ((env->migration_type == migrate_task) &&
11528 (sd->nr_balance_failed > sd->cache_nice_tries+2))
11529 return 1;
11530
11531 return 0;
11532}
11533
11534static int need_active_balance(struct lb_env *env)
11535{
11536 struct sched_domain *sd = env->sd;
11537
11538 if (asym_active_balance(env))
11539 return 1;
11540
11541 if (imbalanced_active_balance(env))
11542 return 1;
11543
11544 /*
11545	 * The dst_cpu is idle and the src_cpu has only 1 CFS task.
11546 * It's worth migrating the task if the src_cpu's capacity is reduced
11547 * because of other sched_class or IRQs if more capacity stays
11548 * available on dst_cpu.
11549 */
11550 if (env->idle &&
11551 (env->src_rq->cfs.h_nr_running == 1)) {
11552 if ((check_cpu_capacity(env->src_rq, sd)) &&
11553 (capacity_of(env->src_cpu)*sd->imbalance_pct < capacity_of(env->dst_cpu)*100))
11554 return 1;
11555 }
11556
11557 if (env->migration_type == migrate_misfit)
11558 return 1;
11559
11560 return 0;
11561}
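/*
 * Illustrative note: the capacity test above compares
 * capacity_of(src_cpu) * imbalance_pct with capacity_of(dst_cpu) * 100.
 * Assuming an imbalance_pct of 117 (a common default, not guaranteed),
 * the lone task is actively migrated only when the source CPU has lost
 * enough capacity to other sched classes or IRQs that it retains less
 * than roughly 85% of the destination's capacity (100/117 ~= 0.85).
 */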
11562
11563static int active_load_balance_cpu_stop(void *data);
11564
11565static int should_we_balance(struct lb_env *env)
11566{
11567 struct cpumask *swb_cpus = this_cpu_cpumask_var_ptr(should_we_balance_tmpmask);
11568 struct sched_group *sg = env->sd->groups;
11569 int cpu, idle_smt = -1;
11570
11571 /*
11572 * Ensure the balancing environment is consistent; can happen
11573 * when the softirq triggers 'during' hotplug.
11574 */
11575 if (!cpumask_test_cpu(env->dst_cpu, env->cpus))
11576 return 0;
11577
11578 /*
11579 * In the newly idle case, we will allow all the CPUs
11580 * to do the newly idle load balance.
11581 *
11582 * However, we bail out if we already have tasks or a wakeup pending,
11583 * to optimize wakeup latency.
11584 */
11585 if (env->idle == CPU_NEWLY_IDLE) {
11586 if (env->dst_rq->nr_running > 0 || env->dst_rq->ttwu_pending)
11587 return 0;
11588 return 1;
11589 }
11590
11591 cpumask_copy(swb_cpus, group_balance_mask(sg));
11592 /* Try to find first idle CPU */
11593 for_each_cpu_and(cpu, swb_cpus, env->cpus) {
11594 if (!idle_cpu(cpu))
11595 continue;
11596
11597 /*
11598		 * Don't balance to an idle SMT CPU in a busy core right away
11599		 * when balancing cores, but remember the first idle SMT CPU for
11600		 * later consideration. Find a CPU on an idle core first.
11601 */
11602 if (!(env->sd->flags & SD_SHARE_CPUCAPACITY) && !is_core_idle(cpu)) {
11603 if (idle_smt == -1)
11604 idle_smt = cpu;
11605 /*
11606			 * If the core is not idle and an idle SMT sibling has
11607			 * already been found, there is no need to check the other
11608			 * SMT siblings for idleness:
11609 */
11610#ifdef CONFIG_SCHED_SMT
11611 cpumask_andnot(swb_cpus, swb_cpus, cpu_smt_mask(cpu));
11612#endif
11613 continue;
11614 }
11615
11616 /*
11617 * Are we the first idle core in a non-SMT domain or higher,
11618		 * or the first idle CPU in an SMT domain?
11619 */
11620 return cpu == env->dst_cpu;
11621 }
11622
11623 /* Are we the first idle CPU with busy siblings? */
11624 if (idle_smt != -1)
11625 return idle_smt == env->dst_cpu;
11626
11627 /* Are we the first CPU of this group ? */
11628 return group_balance_cpu(sg) == env->dst_cpu;
11629}
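/*
 * Illustrative note: aside from the CPU_NEWLY_IDLE fast path, the search
 * above designates a single balancing CPU per group: the first idle CPU
 * found on a fully idle core, failing that the first idle SMT CPU
 * remembered in idle_smt, failing that group_balance_cpu(). The function
 * returns true only when that chosen CPU is env->dst_cpu, so every other
 * CPU skips this balancing round.
 */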
11630
11631/*
11632 * Check this_cpu to ensure it is balanced within domain. Attempt to move
11633 * tasks if there is an imbalance.
11634 */
11635static int sched_balance_rq(int this_cpu, struct rq *this_rq,
11636 struct sched_domain *sd, enum cpu_idle_type idle,
11637 int *continue_balancing)
11638{
11639 int ld_moved, cur_ld_moved, active_balance = 0;
11640 struct sched_domain *sd_parent = sd->parent;
11641 struct sched_group *group;
11642 struct rq *busiest;
11643 struct rq_flags rf;
11644 struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);
11645 struct lb_env env = {
11646 .sd = sd,
11647 .dst_cpu = this_cpu,
11648 .dst_rq = this_rq,
11649 .dst_grpmask = group_balance_mask(sd->groups),
11650 .idle = idle,
11651 .loop_break = SCHED_NR_MIGRATE_BREAK,
11652 .cpus = cpus,
11653 .fbq_type = all,
11654 .tasks = LIST_HEAD_INIT(env.tasks),
11655 };
11656
11657 cpumask_and(cpus, sched_domain_span(sd), cpu_active_mask);
11658
11659 schedstat_inc(sd->lb_count[idle]);
11660
11661redo:
11662 if (!should_we_balance(&env)) {
11663 *continue_balancing = 0;
11664 goto out_balanced;
11665 }
11666
11667 group = sched_balance_find_src_group(&env);
11668 if (!group) {
11669 schedstat_inc(sd->lb_nobusyg[idle]);
11670 goto out_balanced;
11671 }
11672
11673 busiest = sched_balance_find_src_rq(&env, group);
11674 if (!busiest) {
11675 schedstat_inc(sd->lb_nobusyq[idle]);
11676 goto out_balanced;
11677 }
11678
11679 WARN_ON_ONCE(busiest == env.dst_rq);
11680
11681 schedstat_add(sd->lb_imbalance[idle], env.imbalance);
11682
11683 env.src_cpu = busiest->cpu;
11684 env.src_rq = busiest;
11685
11686 ld_moved = 0;
11687 /* Clear this flag as soon as we find a pullable task */
11688 env.flags |= LBF_ALL_PINNED;
11689 if (busiest->nr_running > 1) {
11690 /*
11691 * Attempt to move tasks. If sched_balance_find_src_group has found
11692 * an imbalance but busiest->nr_running <= 1, the group is
11693 * still unbalanced. ld_moved simply stays zero, so it is
11694 * correctly treated as an imbalance.
11695 */
11696 env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
11697
11698more_balance:
11699 rq_lock_irqsave(busiest, &rf);
11700 update_rq_clock(busiest);
11701
11702 /*
11703 * cur_ld_moved - load moved in current iteration
11704 * ld_moved - cumulative load moved across iterations
11705 */
11706 cur_ld_moved = detach_tasks(&env);
11707
11708 /*
11709 * We've detached some tasks from busiest_rq. Every
11710 * task is masked "TASK_ON_RQ_MIGRATING", so we can safely
11711		 * unlock busiest->lock, and we can be sure
11712 * that nobody can manipulate the tasks in parallel.
11713 * See task_rq_lock() family for the details.
11714 */
11715
11716 rq_unlock(busiest, &rf);
11717
11718 if (cur_ld_moved) {
11719 attach_tasks(&env);
11720 ld_moved += cur_ld_moved;
11721 }
11722
11723 local_irq_restore(rf.flags);
11724
11725 if (env.flags & LBF_NEED_BREAK) {
11726 env.flags &= ~LBF_NEED_BREAK;
11727 goto more_balance;
11728 }
11729
11730 /*
11731 * Revisit (affine) tasks on src_cpu that couldn't be moved to
11732 * us and move them to an alternate dst_cpu in our sched_group
11733 * where they can run. The upper limit on how many times we
11734 * iterate on same src_cpu is dependent on number of CPUs in our
11735 * sched_group.
11736 *
11737 * This changes load balance semantics a bit on who can move
11738 * load to a given_cpu. In addition to the given_cpu itself
11739		 * (or an ilb_cpu acting on its behalf where given_cpu is
11740 * nohz-idle), we now have balance_cpu in a position to move
11741 * load to given_cpu. In rare situations, this may cause
11742 * conflicts (balance_cpu and given_cpu/ilb_cpu deciding
11743 * _independently_ and at _same_ time to move some load to
11744 * given_cpu) causing excess load to be moved to given_cpu.
11745 * This however should not happen so much in practice and
11746 * moreover subsequent load balance cycles should correct the
11747 * excess load moved.
11748 */
11749 if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
11750
11751			/* Prevent re-selecting dst_cpu via env's CPUs */
11752 __cpumask_clear_cpu(env.dst_cpu, env.cpus);
11753
11754 env.dst_rq = cpu_rq(env.new_dst_cpu);
11755 env.dst_cpu = env.new_dst_cpu;
11756 env.flags &= ~LBF_DST_PINNED;
11757 env.loop = 0;
11758 env.loop_break = SCHED_NR_MIGRATE_BREAK;
11759
11760 /*
11761 * Go back to "more_balance" rather than "redo" since we
11762 * need to continue with same src_cpu.
11763 */
11764 goto more_balance;
11765 }
11766
11767 /*
11768 * We failed to reach balance because of affinity.
11769 */
11770 if (sd_parent) {
11771 int *group_imbalance = &sd_parent->groups->sgc->imbalance;
11772
11773 if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0)
11774 *group_imbalance = 1;
11775 }
11776
11777 /* All tasks on this runqueue were pinned by CPU affinity */
11778 if (unlikely(env.flags & LBF_ALL_PINNED)) {
11779 __cpumask_clear_cpu(cpu_of(busiest), cpus);
11780 /*
11781 * Attempting to continue load balancing at the current
11782 * sched_domain level only makes sense if there are
11783 * active CPUs remaining as possible busiest CPUs to
11784 * pull load from which are not contained within the
11785 * destination group that is receiving any migrated
11786 * load.
11787 */
11788 if (!cpumask_subset(cpus, env.dst_grpmask)) {
11789 env.loop = 0;
11790 env.loop_break = SCHED_NR_MIGRATE_BREAK;
11791 goto redo;
11792 }
11793 goto out_all_pinned;
11794 }
11795 }
11796
11797 if (!ld_moved) {
11798 schedstat_inc(sd->lb_failed[idle]);
11799 /*
11800 * Increment the failure counter only on periodic balance.
11801 * We do not want newidle balance, which can be very
11802 * frequent, pollute the failure counter causing
11803 * excessive cache_hot migrations and active balances.
11804 *
11805 * Similarly for migration_misfit which is not related to
11806 * load/util migration, don't pollute nr_balance_failed.
11807 */
11808 if (idle != CPU_NEWLY_IDLE &&
11809 env.migration_type != migrate_misfit)
11810 sd->nr_balance_failed++;
11811
11812 if (need_active_balance(&env)) {
11813 unsigned long flags;
11814
11815 raw_spin_rq_lock_irqsave(busiest, flags);
11816
11817 /*
11818 * Don't kick the active_load_balance_cpu_stop,
11819 * if the curr task on busiest CPU can't be
11820 * moved to this_cpu:
11821 */
11822 if (!cpumask_test_cpu(this_cpu, busiest->curr->cpus_ptr)) {
11823 raw_spin_rq_unlock_irqrestore(busiest, flags);
11824 goto out_one_pinned;
11825 }
11826
11827 /* Record that we found at least one task that could run on this_cpu */
11828 env.flags &= ~LBF_ALL_PINNED;
11829
11830 /*
11831 * ->active_balance synchronizes accesses to
11832 * ->active_balance_work. Once set, it's cleared
11833 * only after active load balance is finished.
11834 */
11835 if (!busiest->active_balance) {
11836 busiest->active_balance = 1;
11837 busiest->push_cpu = this_cpu;
11838 active_balance = 1;
11839 }
11840
11841 preempt_disable();
11842 raw_spin_rq_unlock_irqrestore(busiest, flags);
11843 if (active_balance) {
11844 stop_one_cpu_nowait(cpu_of(busiest),
11845 active_load_balance_cpu_stop, busiest,
11846 &busiest->active_balance_work);
11847 }
11848 preempt_enable();
11849 }
11850 } else {
11851 sd->nr_balance_failed = 0;
11852 }
11853
11854 if (likely(!active_balance) || need_active_balance(&env)) {
11855 /* We were unbalanced, so reset the balancing interval */
11856 sd->balance_interval = sd->min_interval;
11857 }
11858
11859 goto out;
11860
11861out_balanced:
11862 /*
11863 * We reach balance although we may have faced some affinity
11864 * constraints. Clear the imbalance flag only if other tasks got
11865 * a chance to move and fix the imbalance.
11866 */
11867 if (sd_parent && !(env.flags & LBF_ALL_PINNED)) {
11868 int *group_imbalance = &sd_parent->groups->sgc->imbalance;
11869
11870 if (*group_imbalance)
11871 *group_imbalance = 0;
11872 }
11873
11874out_all_pinned:
11875 /*
11876 * We reach balance because all tasks are pinned at this level so
11877	 * we can't migrate them. Leave the imbalance flag set so the parent level
11878 * can try to migrate them.
11879 */
11880 schedstat_inc(sd->lb_balanced[idle]);
11881
11882 sd->nr_balance_failed = 0;
11883
11884out_one_pinned:
11885 ld_moved = 0;
11886
11887 /*
11888 * sched_balance_newidle() disregards balance intervals, so we could
11889 * repeatedly reach this code, which would lead to balance_interval
11890 * skyrocketing in a short amount of time. Skip the balance_interval
11891 * increase logic to avoid that.
11892 *
11893	 * Similarly for misfit migration, which is not necessarily an
11894	 * indication of the system being busy that requires load balancing to
11895	 * back off to let things settle down.
11896 */
11897 if (env.idle == CPU_NEWLY_IDLE ||
11898 env.migration_type == migrate_misfit)
11899 goto out;
11900
11901 /* tune up the balancing interval */
11902 if ((env.flags & LBF_ALL_PINNED &&
11903 sd->balance_interval < MAX_PINNED_INTERVAL) ||
11904 sd->balance_interval < sd->max_interval)
11905 sd->balance_interval *= 2;
11906out:
11907 return ld_moved;
11908}
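/*
 * Illustrative note: when a periodic pass finds nothing to pull (the
 * out_balanced/out_all_pinned/out_one_pinned exits above), the balancing
 * interval backs off by doubling, bounded by sd->max_interval, or by
 * MAX_PINNED_INTERVAL when all tasks were pinned. Starting from e.g. an
 * 8 ms interval, repeated idle passes back off through 16, 32, 64 ms and
 * so on until the cap is reached.
 */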
11909
11910static inline unsigned long
11911get_sd_balance_interval(struct sched_domain *sd, int cpu_busy)
11912{
11913 unsigned long interval = sd->balance_interval;
11914
11915 if (cpu_busy)
11916 interval *= sd->busy_factor;
11917
11918 /* scale ms to jiffies */
11919 interval = msecs_to_jiffies(interval);
11920
11921 /*
11922 * Reduce likelihood of busy balancing at higher domains racing with
11923 * balancing at lower domains by preventing their balancing periods
11924 * from being multiples of each other.
11925 */
11926 if (cpu_busy)
11927 interval -= 1;
11928
11929 interval = clamp(interval, 1UL, max_load_balance_interval);
11930
11931 return interval;
11932}
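/*
 * Worked example (hypothetical numbers): for a busy CPU the interval is
 * stretched by sd->busy_factor and nudged off exact multiples of the idle
 * interval. Assuming balance_interval = 8 ms and a busy_factor of 16 (a
 * common default, not guaranteed), this yields 128 ms, converted to
 * jiffies, minus one jiffy, then clamped to [1, max_load_balance_interval].
 */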
11933
11934static inline void
11935update_next_balance(struct sched_domain *sd, unsigned long *next_balance)
11936{
11937 unsigned long interval, next;
11938
11939 /* used by idle balance, so cpu_busy = 0 */
11940 interval = get_sd_balance_interval(sd, 0);
11941 next = sd->last_balance + interval;
11942
11943 if (time_after(*next_balance, next))
11944 *next_balance = next;
11945}
11946
11947/*
11948 * active_load_balance_cpu_stop is run by the CPU stopper. It pushes
11949 * running tasks off the busiest CPU onto idle CPUs. It requires at
11950 * least 1 task to be running on each physical CPU where possible, and
11951 * avoids physical / logical imbalances.
11952 */
11953static int active_load_balance_cpu_stop(void *data)
11954{
11955 struct rq *busiest_rq = data;
11956 int busiest_cpu = cpu_of(busiest_rq);
11957 int target_cpu = busiest_rq->push_cpu;
11958 struct rq *target_rq = cpu_rq(target_cpu);
11959 struct sched_domain *sd;
11960 struct task_struct *p = NULL;
11961 struct rq_flags rf;
11962
11963 rq_lock_irq(busiest_rq, &rf);
11964 /*
11965 * Between queueing the stop-work and running it is a hole in which
11966 * CPUs can become inactive. We should not move tasks from or to
11967 * inactive CPUs.
11968 */
11969 if (!cpu_active(busiest_cpu) || !cpu_active(target_cpu))
11970 goto out_unlock;
11971
11972 /* Make sure the requested CPU hasn't gone down in the meantime: */
11973 if (unlikely(busiest_cpu != smp_processor_id() ||
11974 !busiest_rq->active_balance))
11975 goto out_unlock;
11976
11977 /* Is there any task to move? */
11978 if (busiest_rq->nr_running <= 1)
11979 goto out_unlock;
11980
11981 /*
11982 * This condition is "impossible", if it occurs
11983 * we need to fix it. Originally reported by
11984 * Bjorn Helgaas on a 128-CPU setup.
11985 */
11986 WARN_ON_ONCE(busiest_rq == target_rq);
11987
11988 /* Search for an sd spanning us and the target CPU. */
11989 rcu_read_lock();
11990 for_each_domain(target_cpu, sd) {
11991 if (cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
11992 break;
11993 }
11994
11995 if (likely(sd)) {
11996 struct lb_env env = {
11997 .sd = sd,
11998 .dst_cpu = target_cpu,
11999 .dst_rq = target_rq,
12000 .src_cpu = busiest_rq->cpu,
12001 .src_rq = busiest_rq,
12002 .idle = CPU_IDLE,
12003 .flags = LBF_ACTIVE_LB,
12004 };
12005
12006 schedstat_inc(sd->alb_count);
12007 update_rq_clock(busiest_rq);
12008
12009 p = detach_one_task(&env);
12010 if (p) {
12011 schedstat_inc(sd->alb_pushed);
12012 /* Active balancing done, reset the failure counter. */
12013 sd->nr_balance_failed = 0;
12014 } else {
12015 schedstat_inc(sd->alb_failed);
12016 }
12017 }
12018 rcu_read_unlock();
12019out_unlock:
12020 busiest_rq->active_balance = 0;
12021 rq_unlock(busiest_rq, &rf);
12022
12023 if (p)
12024 attach_one_task(target_rq, p);
12025
12026 local_irq_enable();
12027
12028 return 0;
12029}
12030
12031/*
12032 * This flag serializes load-balancing passes over large domains
12033 * (above the NODE topology level) - only one load-balancing instance
12034 * may run at a time, to reduce overhead on very large systems with
12035 * lots of CPUs and large NUMA distances.
12036 *
12037 * - Note that load-balancing passes triggered while another one
12038 * is executing are skipped and not re-tried.
12039 *
12040 * - Also note that this does not serialize sched_balance_domains()
12041 * execution, as non-SD_SERIALIZE domains will still be
12042 * load-balanced in parallel.
12043 */
12044static atomic_t sched_balance_running = ATOMIC_INIT(0);
12045
12046/*
12047 * Scale the max sched_balance_rq interval with the number of CPUs in the system.
12048 * This trades load-balance latency on larger machines for less cross talk.
12049 */
12050void update_max_interval(void)
12051{
12052 max_load_balance_interval = HZ*num_online_cpus()/10;
12053}
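/*
 * Worked example (hypothetical numbers): with HZ == 1000 and 8 online
 * CPUs, max_load_balance_interval becomes 1000 * 8 / 10 = 800 jiffies,
 * i.e. any domain's balance interval is capped at roughly 0.8 seconds on
 * that machine.
 */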
12054
12055static inline bool update_newidle_cost(struct sched_domain *sd, u64 cost)
12056{
12057 if (cost > sd->max_newidle_lb_cost) {
12058 /*
12059 * Track max cost of a domain to make sure to not delay the
12060 * next wakeup on the CPU.
12061 */
12062 sd->max_newidle_lb_cost = cost;
12063 sd->last_decay_max_lb_cost = jiffies;
12064 } else if (time_after(jiffies, sd->last_decay_max_lb_cost + HZ)) {
12065 /*
12066 * Decay the newidle max times by ~1% per second to ensure that
12067 * it is not outdated and the current max cost is actually
12068 * shorter.
12069 */
12070 sd->max_newidle_lb_cost = (sd->max_newidle_lb_cost * 253) / 256;
12071 sd->last_decay_max_lb_cost = jiffies;
12072
12073 return true;
12074 }
12075
12076 return false;
12077}
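/*
 * Illustrative note: the decay factor above is 253/256 ~= 0.988, i.e.
 * each second without a new maximum shaves a bit over 1% off the recorded
 * cost, so a stale max_newidle_lb_cost halves in roughly a minute instead
 * of lingering indefinitely.
 */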
12078
12079/*
12080 * It checks each scheduling domain to see if it is due to be balanced,
12081 * and initiates a balancing operation if so.
12082 *
12083 * Balancing parameters are set up in init_sched_domains.
12084 */
12085static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle)
12086{
12087 int continue_balancing = 1;
12088 int cpu = rq->cpu;
12089 int busy = idle != CPU_IDLE && !sched_idle_cpu(cpu);
12090 unsigned long interval;
12091 struct sched_domain *sd;
12092 /* Earliest time when we have to do rebalance again */
12093 unsigned long next_balance = jiffies + 60*HZ;
12094 int update_next_balance = 0;
12095 int need_serialize, need_decay = 0;
12096 u64 max_cost = 0;
12097
12098 rcu_read_lock();
12099 for_each_domain(cpu, sd) {
12100 /*
12101 * Decay the newidle max times here because this is a regular
12102 * visit to all the domains.
12103 */
12104 need_decay = update_newidle_cost(sd, 0);
12105 max_cost += sd->max_newidle_lb_cost;
12106
12107 /*
12108 * Stop the load balance at this level. There is another
12109 * CPU in our sched group which is doing load balancing more
12110 * actively.
12111 */
12112 if (!continue_balancing) {
12113 if (need_decay)
12114 continue;
12115 break;
12116 }
12117
12118 interval = get_sd_balance_interval(sd, busy);
12119
12120 need_serialize = sd->flags & SD_SERIALIZE;
12121 if (need_serialize) {
12122 if (atomic_cmpxchg_acquire(&sched_balance_running, 0, 1))
12123 goto out;
12124 }
12125
12126 if (time_after_eq(jiffies, sd->last_balance + interval)) {
12127 if (sched_balance_rq(cpu, rq, sd, idle, &continue_balancing)) {
12128 /*
12129 * The LBF_DST_PINNED logic could have changed
12130 * env->dst_cpu, so we can't know our idle
12131 * state even if we migrated tasks. Update it.
12132 */
12133 idle = idle_cpu(cpu);
12134 busy = !idle && !sched_idle_cpu(cpu);
12135 }
12136 sd->last_balance = jiffies;
12137 interval = get_sd_balance_interval(sd, busy);
12138 }
12139 if (need_serialize)
12140 atomic_set_release(&sched_balance_running, 0);
12141out:
12142 if (time_after(next_balance, sd->last_balance + interval)) {
12143 next_balance = sd->last_balance + interval;
12144 update_next_balance = 1;
12145 }
12146 }
12147 if (need_decay) {
12148 /*
12149 * Ensure the rq-wide value also decays but keep it at a
12150 * reasonable floor to avoid funnies with rq->avg_idle.
12151 */
12152 rq->max_idle_balance_cost =
12153 max((u64)sysctl_sched_migration_cost, max_cost);
12154 }
12155 rcu_read_unlock();
12156
12157 /*
12158 * next_balance will be updated only when there is a need.
12159 * When the cpu is attached to null domain for ex, it will not be
12160 * updated.
12161 */
12162 if (likely(update_next_balance))
12163 rq->next_balance = next_balance;
12164
12165}
12166
12167static inline int on_null_domain(struct rq *rq)
12168{
12169 return unlikely(!rcu_dereference_sched(rq->sd));
12170}
12171
12172#ifdef CONFIG_NO_HZ_COMMON
12173/*
12174 * NOHZ idle load balancing (ILB) details:
12175 *
12176 * - When one of the busy CPUs notices that there may be an idle rebalancing
12177 *   needed, it will kick the idle load balancer, which then does idle
12178 * load balancing for all the idle CPUs.
12179 *
12180 * - HK_TYPE_MISC CPUs are used for this task, because HK_TYPE_SCHED is not set
12181 * anywhere yet.
12182 */
12183static inline int find_new_ilb(void)
12184{
12185 const struct cpumask *hk_mask;
12186 int ilb_cpu;
12187
12188 hk_mask = housekeeping_cpumask(HK_TYPE_MISC);
12189
12190 for_each_cpu_and(ilb_cpu, nohz.idle_cpus_mask, hk_mask) {
12191
12192 if (ilb_cpu == smp_processor_id())
12193 continue;
12194
12195 if (idle_cpu(ilb_cpu))
12196 return ilb_cpu;
12197 }
12198
12199 return -1;
12200}
12201
12202/*
12203 * Kick a CPU to do the NOHZ balancing, if it is time for it, via a cross-CPU
12204 * SMP function call (IPI).
12205 *
12206 * We pick the first idle CPU in the HK_TYPE_MISC housekeeping set (if there is one).
12207 */
12208static void kick_ilb(unsigned int flags)
12209{
12210 int ilb_cpu;
12211
12212 /*
12213	 * Increase nohz.next_balance only if a full ilb is triggered but
12214 * not if we only update stats.
12215 */
12216 if (flags & NOHZ_BALANCE_KICK)
12217 nohz.next_balance = jiffies+1;
12218
12219 ilb_cpu = find_new_ilb();
12220 if (ilb_cpu < 0)
12221 return;
12222
12223 /*
12224 * Don't bother if no new NOHZ balance work items for ilb_cpu,
12225 * i.e. all bits in flags are already set in ilb_cpu.
12226 */
12227 if ((atomic_read(nohz_flags(ilb_cpu)) & flags) == flags)
12228 return;
12229
12230 /*
12231 * Access to rq::nohz_csd is serialized by NOHZ_KICK_MASK; he who sets
12232 * the first flag owns it; cleared by nohz_csd_func().
12233 */
12234 flags = atomic_fetch_or(flags, nohz_flags(ilb_cpu));
12235 if (flags & NOHZ_KICK_MASK)
12236 return;
12237
12238 /*
12239 * This way we generate an IPI on the target CPU which
12240 * is idle, and the softirq performing NOHZ idle load balancing
12241 * will be run before returning from the IPI.
12242 */
12243 smp_call_function_single_async(ilb_cpu, &cpu_rq(ilb_cpu)->nohz_csd);
12244}
12245
12246/*
12247 * Current decision point for kicking the idle load balancer in the presence
12248 * of idle CPUs in the system.
12249 */
12250static void nohz_balancer_kick(struct rq *rq)
12251{
12252 unsigned long now = jiffies;
12253 struct sched_domain_shared *sds;
12254 struct sched_domain *sd;
12255 int nr_busy, i, cpu = rq->cpu;
12256 unsigned int flags = 0;
12257
12258 if (unlikely(rq->idle_balance))
12259 return;
12260
12261 /*
12262	 * We may have recently been in ticked or tickless idle mode. At the first
12263 * busy tick after returning from idle, we will update the busy stats.
12264 */
12265 nohz_balance_exit_idle(rq);
12266
12267 /*
12268 * None are in tickless mode and hence no need for NOHZ idle load
12269 * balancing:
12270 */
12271 if (likely(!atomic_read(&nohz.nr_cpus)))
12272 return;
12273
12274 if (READ_ONCE(nohz.has_blocked) &&
12275 time_after(now, READ_ONCE(nohz.next_blocked)))
12276 flags = NOHZ_STATS_KICK;
12277
12278 if (time_before(now, nohz.next_balance))
12279 goto out;
12280
12281 if (rq->nr_running >= 2) {
12282 flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
12283 goto out;
12284 }
12285
12286 rcu_read_lock();
12287
12288 sd = rcu_dereference(rq->sd);
12289 if (sd) {
12290 /*
12291 * If there's a runnable CFS task and the current CPU has reduced
12292 * capacity, kick the ILB to see if there's a better CPU to run on:
12293 */
12294 if (rq->cfs.h_nr_running >= 1 && check_cpu_capacity(rq, sd)) {
12295 flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
12296 goto unlock;
12297 }
12298 }
12299
12300 sd = rcu_dereference(per_cpu(sd_asym_packing, cpu));
12301 if (sd) {
12302 /*
12303 * When ASYM_PACKING; see if there's a more preferred CPU
12304 * currently idle; in which case, kick the ILB to move tasks
12305 * around.
12306 *
12307 * When balancing between cores, all the SMT siblings of the
12308 * preferred CPU must be idle.
12309 */
12310 for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) {
12311 if (sched_asym(sd, i, cpu)) {
12312 flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
12313 goto unlock;
12314 }
12315 }
12316 }
12317
12318 sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, cpu));
12319 if (sd) {
12320 /*
12321 * When ASYM_CPUCAPACITY; see if there's a higher capacity CPU
12322 * to run the misfit task on.
12323 */
12324 if (check_misfit_status(rq)) {
12325 flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
12326 goto unlock;
12327 }
12328
12329 /*
12330 * For asymmetric systems, we do not want to nicely balance
12331 * cache use, instead we want to embrace asymmetry and only
12332 * ensure tasks have enough CPU capacity.
12333 *
12334 * Skip the LLC logic because it's not relevant in that case.
12335 */
12336 goto unlock;
12337 }
12338
12339 sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
12340 if (sds) {
12341 /*
12342 * If there is an imbalance between LLC domains (IOW we could
12343 * increase the overall cache utilization), we need a less-loaded LLC
12344 * domain to pull some load from. Likewise, we may need to spread
12345 * load within the current LLC domain (e.g. packed SMT cores but
12346 * other CPUs are idle). We can't really know from here how busy
12347 * the others are - so just get a NOHZ balance going if it looks
12348 * like this LLC domain has tasks we could move.
12349 */
12350 nr_busy = atomic_read(&sds->nr_busy_cpus);
12351 if (nr_busy > 1) {
12352 flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
12353 goto unlock;
12354 }
12355 }
12356unlock:
12357 rcu_read_unlock();
12358out:
12359 if (READ_ONCE(nohz.needs_update))
12360 flags |= NOHZ_NEXT_KICK;
12361
12362 if (flags)
12363 kick_ilb(flags);
12364}
12365
12366static void set_cpu_sd_state_busy(int cpu)
12367{
12368 struct sched_domain *sd;
12369
12370 rcu_read_lock();
12371 sd = rcu_dereference(per_cpu(sd_llc, cpu));
12372
12373 if (!sd || !sd->nohz_idle)
12374 goto unlock;
12375 sd->nohz_idle = 0;
12376
12377 atomic_inc(&sd->shared->nr_busy_cpus);
12378unlock:
12379 rcu_read_unlock();
12380}
12381
12382void nohz_balance_exit_idle(struct rq *rq)
12383{
12384 SCHED_WARN_ON(rq != this_rq());
12385
12386 if (likely(!rq->nohz_tick_stopped))
12387 return;
12388
12389 rq->nohz_tick_stopped = 0;
12390 cpumask_clear_cpu(rq->cpu, nohz.idle_cpus_mask);
12391 atomic_dec(&nohz.nr_cpus);
12392
12393 set_cpu_sd_state_busy(rq->cpu);
12394}
12395
12396static void set_cpu_sd_state_idle(int cpu)
12397{
12398 struct sched_domain *sd;
12399
12400 rcu_read_lock();
12401 sd = rcu_dereference(per_cpu(sd_llc, cpu));
12402
12403 if (!sd || sd->nohz_idle)
12404 goto unlock;
12405 sd->nohz_idle = 1;
12406
12407 atomic_dec(&sd->shared->nr_busy_cpus);
12408unlock:
12409 rcu_read_unlock();
12410}
12411
12412/*
12413 * This routine will record that the CPU is going idle with tick stopped.
12414 * This info will be used in performing idle load balancing in the future.
12415 */
12416void nohz_balance_enter_idle(int cpu)
12417{
12418 struct rq *rq = cpu_rq(cpu);
12419
12420 SCHED_WARN_ON(cpu != smp_processor_id());
12421
12422 /* If this CPU is going down, then nothing needs to be done: */
12423 if (!cpu_active(cpu))
12424 return;
12425
12426 /* Spare idle load balancing on CPUs that don't want to be disturbed: */
12427 if (!housekeeping_cpu(cpu, HK_TYPE_SCHED))
12428 return;
12429
12430 /*
12431	 * Can be set safely without rq->lock held.
12432	 * If a clear happens, it will have evaluated the last additions because
12433	 * rq->lock is held during the check and the clear.
12434 */
12435 rq->has_blocked_load = 1;
12436
12437 /*
12438 * The tick is still stopped but load could have been added in the
12439	 * meantime. We set the nohz.has_blocked flag to trigger a check of the
12440 * *_avg. The CPU is already part of nohz.idle_cpus_mask so the clear
12441 * of nohz.has_blocked can only happen after checking the new load
12442 */
12443 if (rq->nohz_tick_stopped)
12444 goto out;
12445
12446 /* If we're a completely isolated CPU, we don't play: */
12447 if (on_null_domain(rq))
12448 return;
12449
12450 rq->nohz_tick_stopped = 1;
12451
12452 cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
12453 atomic_inc(&nohz.nr_cpus);
12454
12455 /*
12456 * Ensures that if nohz_idle_balance() fails to observe our
12457 * @idle_cpus_mask store, it must observe the @has_blocked
12458 * and @needs_update stores.
12459 */
12460 smp_mb__after_atomic();
12461
12462 set_cpu_sd_state_idle(cpu);
12463
12464 WRITE_ONCE(nohz.needs_update, 1);
12465out:
12466 /*
12467	 * Each time a CPU enters idle, we assume that it has blocked load and
12468	 * enable the periodic update of the load of idle CPUs.
12469 */
12470 WRITE_ONCE(nohz.has_blocked, 1);
12471}
12472
12473static bool update_nohz_stats(struct rq *rq)
12474{
12475 unsigned int cpu = rq->cpu;
12476
12477 if (!rq->has_blocked_load)
12478 return false;
12479
12480 if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
12481 return false;
12482
12483 if (!time_after(jiffies, READ_ONCE(rq->last_blocked_load_update_tick)))
12484 return true;
12485
12486 sched_balance_update_blocked_averages(cpu);
12487
12488 return rq->has_blocked_load;
12489}
12490
12491/*
12492 * Internal function that runs load balance for all idle CPUs. The load balance
12493 * can be a simple update of blocked load or a complete load balance with
12494 * task movement, depending on the flags.
12495 */
12496static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags)
12497{
12498 /* Earliest time when we have to do rebalance again */
12499 unsigned long now = jiffies;
12500 unsigned long next_balance = now + 60*HZ;
12501 bool has_blocked_load = false;
12502 int update_next_balance = 0;
12503 int this_cpu = this_rq->cpu;
12504 int balance_cpu;
12505 struct rq *rq;
12506
12507 SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
12508
12509 /*
12510 * We assume there will be no idle load after this update and clear
12511	 * the has_blocked flag. If a CPU enters idle in the meantime, it will
12512	 * set the has_blocked flag and trigger another update of idle load.
12513	 * Because a CPU that becomes idle is added to idle_cpus_mask before
12514	 * setting the flag, we are sure to not clear the state and not
12515	 * check the load of an idle CPU.
12516 *
12517 * Same applies to idle_cpus_mask vs needs_update.
12518 */
12519 if (flags & NOHZ_STATS_KICK)
12520 WRITE_ONCE(nohz.has_blocked, 0);
12521 if (flags & NOHZ_NEXT_KICK)
12522 WRITE_ONCE(nohz.needs_update, 0);
12523
12524 /*
12525 * Ensures that if we miss the CPU, we must see the has_blocked
12526 * store from nohz_balance_enter_idle().
12527 */
12528 smp_mb();
12529
12530 /*
12531	 * Start with the next CPU after this_cpu so we will end with this_cpu and give
12532	 * other idle CPUs a chance to pull load.
12533 */
12534 for_each_cpu_wrap(balance_cpu, nohz.idle_cpus_mask, this_cpu+1) {
12535 if (!idle_cpu(balance_cpu))
12536 continue;
12537
12538 /*
12539 * If this CPU gets work to do, stop the load balancing
12540 * work being done for other CPUs. Next load
12541 * balancing owner will pick it up.
12542 */
12543 if (!idle_cpu(this_cpu) && need_resched()) {
12544 if (flags & NOHZ_STATS_KICK)
12545 has_blocked_load = true;
12546 if (flags & NOHZ_NEXT_KICK)
12547 WRITE_ONCE(nohz.needs_update, 1);
12548 goto abort;
12549 }
12550
12551 rq = cpu_rq(balance_cpu);
12552
12553 if (flags & NOHZ_STATS_KICK)
12554 has_blocked_load |= update_nohz_stats(rq);
12555
12556 /*
12557 * If time for next balance is due,
12558 * do the balance.
12559 */
12560 if (time_after_eq(jiffies, rq->next_balance)) {
12561 struct rq_flags rf;
12562
12563 rq_lock_irqsave(rq, &rf);
12564 update_rq_clock(rq);
12565 rq_unlock_irqrestore(rq, &rf);
12566
12567 if (flags & NOHZ_BALANCE_KICK)
12568 sched_balance_domains(rq, CPU_IDLE);
12569 }
12570
12571 if (time_after(next_balance, rq->next_balance)) {
12572 next_balance = rq->next_balance;
12573 update_next_balance = 1;
12574 }
12575 }
12576
12577 /*
12578 * next_balance will be updated only when there is a need.
12579 * When the CPU is attached to null domain for ex, it will not be
12580 * updated.
12581 */
12582 if (likely(update_next_balance))
12583 nohz.next_balance = next_balance;
12584
12585 if (flags & NOHZ_STATS_KICK)
12586 WRITE_ONCE(nohz.next_blocked,
12587 now + msecs_to_jiffies(LOAD_AVG_PERIOD));
12588
12589abort:
12590 /* There is still blocked load, enable periodic update */
12591 if (has_blocked_load)
12592 WRITE_ONCE(nohz.has_blocked, 1);
12593}
12594
12595/*
12596 * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
12597 * rebalancing for all the CPUs for whom scheduler ticks are stopped.
12598 */
12599static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
12600{
12601 unsigned int flags = this_rq->nohz_idle_balance;
12602
12603 if (!flags)
12604 return false;
12605
12606 this_rq->nohz_idle_balance = 0;
12607
12608 if (idle != CPU_IDLE)
12609 return false;
12610
12611 _nohz_idle_balance(this_rq, flags);
12612
12613 return true;
12614}
12615
12616/*
12617 * Check if we need to directly run the ILB for updating blocked load before
12618 * entering idle state. Here we run ILB directly without issuing IPIs.
12619 *
12620 * Note that when this function is called, the tick may not yet be stopped on
12621 * this CPU. nohz.idle_cpus_mask is updated only when the tick is stopped and
12622 * cleared on the next busy tick. In other words, nohz.idle_cpus_mask updates
12623 * don't align with CPUs entering/exiting idle, to avoid bottlenecks due to high idle
12624 * entry/exit rate (usec). So it is possible that _nohz_idle_balance() is
12625 * called from this function on (this) CPU that's not yet in the mask. That's
12626 * OK because the goal of nohz_run_idle_balance() is to run ILB only for
12627 * updating the blocked load of already idle CPUs without waking up one of
12628 * those idle CPUs and outside the preempt disable / IRQ off phase of the local
12629 * cpu about to enter idle, because it can take a long time.
12630 */
12631void nohz_run_idle_balance(int cpu)
12632{
12633 unsigned int flags;
12634
12635 flags = atomic_fetch_andnot(NOHZ_NEWILB_KICK, nohz_flags(cpu));
12636
12637 /*
12638	 * Update the blocked load only if no SCHED_SOFTIRQ is about to happen
12639	 * (i.e. NOHZ_STATS_KICK is set) that would do the same update.
12640 */
12641 if ((flags == NOHZ_NEWILB_KICK) && !need_resched())
12642 _nohz_idle_balance(cpu_rq(cpu), NOHZ_STATS_KICK);
12643}
12644
12645static void nohz_newidle_balance(struct rq *this_rq)
12646{
12647 int this_cpu = this_rq->cpu;
12648
12649 /*
12650 * This CPU doesn't want to be disturbed by scheduler
12651 * housekeeping
12652 */
12653 if (!housekeeping_cpu(this_cpu, HK_TYPE_SCHED))
12654 return;
12655
12656	/* Will wake up very soon. No time for doing anything else */
12657 if (this_rq->avg_idle < sysctl_sched_migration_cost)
12658 return;
12659
12660	/* Don't need to update blocked load of idle CPUs */
12661 if (!READ_ONCE(nohz.has_blocked) ||
12662 time_before(jiffies, READ_ONCE(nohz.next_blocked)))
12663 return;
12664
12665 /*
12666 * Set the need to trigger ILB in order to update blocked load
12667 * before entering idle state.
12668 */
12669 atomic_or(NOHZ_NEWILB_KICK, nohz_flags(this_cpu));
12670}
12671
12672#else /* !CONFIG_NO_HZ_COMMON */
12673static inline void nohz_balancer_kick(struct rq *rq) { }
12674
12675static inline bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
12676{
12677 return false;
12678}
12679
12680static inline void nohz_newidle_balance(struct rq *this_rq) { }
12681#endif /* CONFIG_NO_HZ_COMMON */
12682
12683/*
12684 * sched_balance_newidle is called by schedule() if this_cpu is about to become
12685 * idle. Attempts to pull tasks from other CPUs.
12686 *
12687 * Returns:
12688 * < 0 - we released the lock and there are !fair tasks present
12689 * 0 - failed, no new tasks
12690 * > 0 - success, new (fair) tasks present
12691 */
12692static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
12693{
12694 unsigned long next_balance = jiffies + HZ;
12695 int this_cpu = this_rq->cpu;
12696 int continue_balancing = 1;
12697 u64 t0, t1, curr_cost = 0;
12698 struct sched_domain *sd;
12699 int pulled_task = 0;
12700
12701 update_misfit_status(NULL, this_rq);
12702
12703 /*
12704 * There is a task waiting to run. No need to search for one.
12705 * Return 0; the task will be enqueued when switching to idle.
12706 */
12707 if (this_rq->ttwu_pending)
12708 return 0;
12709
12710 /*
12711 * We must set idle_stamp _before_ calling sched_balance_rq()
12712	 * for CPU_NEWLY_IDLE, such that we measure this duration
12713 * as idle time.
12714 */
12715 this_rq->idle_stamp = rq_clock(this_rq);
12716
12717 /*
12718 * Do not pull tasks towards !active CPUs...
12719 */
12720 if (!cpu_active(this_cpu))
12721 return 0;
12722
12723 /*
12724 * This is OK, because current is on_cpu, which avoids it being picked
12725	 * for load-balance; preemption/IRQs are still disabled, avoiding
12726	 * further scheduler activity on it; and we're being very careful to
12727 * re-start the picking loop.
12728 */
12729 rq_unpin_lock(this_rq, rf);
12730
12731 rcu_read_lock();
12732 sd = rcu_dereference_check_sched_domain(this_rq->sd);
12733
12734 if (!get_rd_overloaded(this_rq->rd) ||
12735 (sd && this_rq->avg_idle < sd->max_newidle_lb_cost)) {
12736
12737 if (sd)
12738 update_next_balance(sd, &next_balance);
12739 rcu_read_unlock();
12740
12741 goto out;
12742 }
12743 rcu_read_unlock();
12744
12745 raw_spin_rq_unlock(this_rq);
12746
12747 t0 = sched_clock_cpu(this_cpu);
12748 sched_balance_update_blocked_averages(this_cpu);
12749
12750 rcu_read_lock();
12751 for_each_domain(this_cpu, sd) {
12752 u64 domain_cost;
12753
12754 update_next_balance(sd, &next_balance);
12755
12756 if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost)
12757 break;
12758
12759 if (sd->flags & SD_BALANCE_NEWIDLE) {
12760
12761 pulled_task = sched_balance_rq(this_cpu, this_rq,
12762 sd, CPU_NEWLY_IDLE,
12763 &continue_balancing);
12764
12765 t1 = sched_clock_cpu(this_cpu);
12766 domain_cost = t1 - t0;
12767 update_newidle_cost(sd, domain_cost);
12768
12769 curr_cost += domain_cost;
12770 t0 = t1;
12771 }
12772
12773 /*
12774 * Stop searching for tasks to pull if there are
12775 * now runnable tasks on this rq.
12776 */
12777 if (pulled_task || !continue_balancing)
12778 break;
12779 }
12780 rcu_read_unlock();
12781
12782 raw_spin_rq_lock(this_rq);
12783
12784 if (curr_cost > this_rq->max_idle_balance_cost)
12785 this_rq->max_idle_balance_cost = curr_cost;
12786
12787 /*
12788	 * While browsing the domains, we released the rq lock; a task could
12789 * have been enqueued in the meantime. Since we're not going idle,
12790 * pretend we pulled a task.
12791 */
12792 if (this_rq->cfs.h_nr_running && !pulled_task)
12793 pulled_task = 1;
12794
12795 /* Is there a task of a high priority class? */
12796 if (this_rq->nr_running != this_rq->cfs.h_nr_running)
12797 pulled_task = -1;
12798
12799out:
12800 /* Move the next balance forward */
12801 if (time_after(this_rq->next_balance, next_balance))
12802 this_rq->next_balance = next_balance;
12803
12804 if (pulled_task)
12805 this_rq->idle_stamp = 0;
12806 else
12807 nohz_newidle_balance(this_rq);
12808
12809 rq_repin_lock(this_rq, rf);
12810
12811 return pulled_task;
12812}
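/*
 * Illustrative note: the early bailout and the per-domain break above act
 * as a cost/benefit cut-off. If the expected idle time (this_rq->avg_idle)
 * is shorter than the newidle balancing cost accumulated so far plus the
 * next domain's max_newidle_lb_cost, walking further domains would likely
 * cost more than the idle period it is trying to fill, so the search stops.
 */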
12813
12814/*
12815 * This softirq handler is triggered via SCHED_SOFTIRQ from two places:
12816 *
12817 * - directly from the local scheduler_tick() for periodic load balancing
12818 *
12819 * - indirectly from a remote scheduler_tick() for NOHZ idle balancing
12820 * through the SMP cross-call nohz_csd_func()
12821 */
12822static __latent_entropy void sched_balance_softirq(void)
12823{
12824 struct rq *this_rq = this_rq();
12825 enum cpu_idle_type idle = this_rq->idle_balance;
12826 /*
12827 * If this CPU has a pending NOHZ_BALANCE_KICK, then do the
12828 * balancing on behalf of the other idle CPUs whose ticks are
12829 * stopped. Do nohz_idle_balance *before* sched_balance_domains to
12830 * give the idle CPUs a chance to load balance. Else we may
12831 * load balance only within the local sched_domain hierarchy
12832 * and abort nohz_idle_balance altogether if we pull some load.
12833 */
12834 if (nohz_idle_balance(this_rq, idle))
12835 return;
12836
12837 /* normal load balance */
12838 sched_balance_update_blocked_averages(this_rq->cpu);
12839 sched_balance_domains(this_rq, idle);
12840}
12841
12842/*
12843 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
12844 */
12845void sched_balance_trigger(struct rq *rq)
12846{
12847 /*
12848 * Don't need to rebalance while attached to NULL domain or
12849 * runqueue CPU is not active
12850 */
12851 if (unlikely(on_null_domain(rq) || !cpu_active(cpu_of(rq))))
12852 return;
12853
12854 if (time_after_eq(jiffies, rq->next_balance))
12855 raise_softirq(SCHED_SOFTIRQ);
12856
12857 nohz_balancer_kick(rq);
12858}
12859
12860static void rq_online_fair(struct rq *rq)
12861{
12862 update_sysctl();
12863
12864 update_runtime_enabled(rq);
12865}
12866
12867static void rq_offline_fair(struct rq *rq)
12868{
12869 update_sysctl();
12870
12871 /* Ensure any throttled groups are reachable by pick_next_task */
12872 unthrottle_offline_cfs_rqs(rq);
12873
12874 /* Ensure that we remove rq contribution to group share: */
12875 clear_tg_offline_cfs_rqs(rq);
12876}
12877
12878#endif /* CONFIG_SMP */
12879
12880#ifdef CONFIG_SCHED_CORE
12881static inline bool
12882__entity_slice_used(struct sched_entity *se, int min_nr_tasks)
12883{
12884 u64 rtime = se->sum_exec_runtime - se->prev_sum_exec_runtime;
12885 u64 slice = se->slice;
12886
12887 return (rtime * min_nr_tasks > slice);
12888}
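/*
 * Illustrative note: the comparison above is rtime * min_nr_tasks > slice,
 * i.e. rtime > slice / min_nr_tasks without the division. With
 * MIN_NR_TASKS_DURING_FORCEIDLE == 2 it fires once the current task has
 * consumed more than half of its slice.
 */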
12889
12890#define MIN_NR_TASKS_DURING_FORCEIDLE 2
12891static inline void task_tick_core(struct rq *rq, struct task_struct *curr)
12892{
12893 if (!sched_core_enabled(rq))
12894 return;
12895
12896 /*
12897	 * If the runqueue has only one task which used up its slice and
12898	 * the sibling is forced idle, then trigger a reschedule to
12899	 * give the forced idle task a chance.
12900 *
12901 * sched_slice() considers only this active rq and it gets the
12902 * whole slice. But during force idle, we have siblings acting
12903 * like a single runqueue and hence we need to consider runnable
12904 * tasks on this CPU and the forced idle CPU. Ideally, we should
12905 * go through the forced idle rq, but that would be a perf hit.
12906 * We can assume that the forced idle CPU has at least
12907 * MIN_NR_TASKS_DURING_FORCEIDLE - 1 tasks and use that to check
12908 * if we need to give up the CPU.
12909 */
12910 if (rq->core->core_forceidle_count && rq->cfs.nr_running == 1 &&
12911 __entity_slice_used(&curr->se, MIN_NR_TASKS_DURING_FORCEIDLE))
12912 resched_curr(rq);
12913}
12914
12915/*
12916 * se_fi_update - Update the cfs_rq->min_vruntime_fi in a CFS hierarchy if needed.
12917 */
12918static void se_fi_update(const struct sched_entity *se, unsigned int fi_seq,
12919 bool forceidle)
12920{
12921 for_each_sched_entity(se) {
12922 struct cfs_rq *cfs_rq = cfs_rq_of(se);
12923
12924 if (forceidle) {
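			/*
			 * A level already stamped with this force-idle sequence
			 * was updated (together with its ancestors) by an
			 * earlier call, so there is nothing left to do.
			 */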
12925 if (cfs_rq->forceidle_seq == fi_seq)
12926 break;
12927 cfs_rq->forceidle_seq = fi_seq;
12928 }
12929
12930 cfs_rq->min_vruntime_fi = cfs_rq->min_vruntime;
12931 }
12932}
12933
12934void task_vruntime_update(struct rq *rq, struct task_struct *p, bool in_fi)
12935{
12936 struct sched_entity *se = &p->se;
12937
12938 if (p->sched_class != &fair_sched_class)
12939 return;
12940
12941 se_fi_update(se, rq->core->core_forceidle_seq, in_fi);
12942}
12943
12944bool cfs_prio_less(const struct task_struct *a, const struct task_struct *b,
12945 bool in_fi)
12946{
12947 struct rq *rq = task_rq(a);
12948 const struct sched_entity *sea = &a->se;
12949 const struct sched_entity *seb = &b->se;
12950 struct cfs_rq *cfs_rqa;
12951 struct cfs_rq *cfs_rqb;
12952 s64 delta;
12953
12954 SCHED_WARN_ON(task_rq(b)->core != rq->core);
12955
12956#ifdef CONFIG_FAIR_GROUP_SCHED
12957 /*
12958 * Find an se in the hierarchy for tasks a and b, such that the se's
12959 * are immediate siblings.
12960 */
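	/*
	 * The deeper entity is walked up first until both are at the same
	 * depth, then both are walked up together until their cfs_rq's belong
	 * to the same task_group.
	 */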
12961 while (sea->cfs_rq->tg != seb->cfs_rq->tg) {
12962 int sea_depth = sea->depth;
12963 int seb_depth = seb->depth;
12964
12965 if (sea_depth >= seb_depth)
12966 sea = parent_entity(sea);
12967 if (sea_depth <= seb_depth)
12968 seb = parent_entity(seb);
12969 }
12970
12971 se_fi_update(sea, rq->core->core_forceidle_seq, in_fi);
12972 se_fi_update(seb, rq->core->core_forceidle_seq, in_fi);
12973
12974 cfs_rqa = sea->cfs_rq;
12975 cfs_rqb = seb->cfs_rq;
12976#else
12977 cfs_rqa = &task_rq(a)->cfs;
12978 cfs_rqb = &task_rq(b)->cfs;
12979#endif
12980
12981 /*
12982 * Find delta after normalizing se's vruntime with its cfs_rq's
12983 * min_vruntime_fi, which would have been updated in prior calls
12984 * to se_fi_update().
12985 */
12986 delta = (s64)(sea->vruntime - seb->vruntime) +
12987 (s64)(cfs_rqb->min_vruntime_fi - cfs_rqa->min_vruntime_fi);
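	/*
	 * Equivalently: delta = (sea->vruntime - cfs_rqa->min_vruntime_fi) -
	 *		 (seb->vruntime - cfs_rqb->min_vruntime_fi),
	 * i.e. compare the normalized vruntimes; a positive delta means 'a'
	 * is the less deserving of the two.
	 */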
12988
12989 return delta > 0;
12990}
12991
12992static int task_is_throttled_fair(struct task_struct *p, int cpu)
12993{
12994 struct cfs_rq *cfs_rq;
12995
12996#ifdef CONFIG_FAIR_GROUP_SCHED
12997 cfs_rq = task_group(p)->cfs_rq[cpu];
12998#else
12999 cfs_rq = &cpu_rq(cpu)->cfs;
13000#endif
13001 return throttled_hierarchy(cfs_rq);
13002}
13003#else
13004static inline void task_tick_core(struct rq *rq, struct task_struct *curr) {}
13005#endif
13006
13007/*
13008 * scheduler tick hitting a task of our scheduling class.
13009 *
13010 * NOTE: This function can be called remotely by the tick offload that
13011 * goes along with full dynticks. Therefore no local assumption can be made
13012 * and everything must be accessed through the @rq and @curr passed in
13013 * parameters.
13014 */
13015static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
13016{
13017 struct cfs_rq *cfs_rq;
13018 struct sched_entity *se = &curr->se;
13019
13020 for_each_sched_entity(se) {
13021 cfs_rq = cfs_rq_of(se);
13022 entity_tick(cfs_rq, se, queued);
13023 }
13024
13025 if (static_branch_unlikely(&sched_numa_balancing))
13026 task_tick_numa(rq, curr);
13027
13028 update_misfit_status(curr, rq);
13029 check_update_overutilized_status(task_rq(curr));
13030
13031 task_tick_core(rq, curr);
13032}
13033
13034/*
13035 * called on fork with the child task as argument from the parent's context
13036 * - child not yet on the tasklist
13037 * - preemption disabled
13038 */
13039static void task_fork_fair(struct task_struct *p)
13040{
13041 set_task_max_allowed_capacity(p);
13042}
13043
13044/*
13045 * Priority of the task has changed. Check to see if we preempt
13046 * the current task.
13047 */
13048static void
13049prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
13050{
13051 if (!task_on_rq_queued(p))
13052 return;
13053
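	/*
	 * A lone fair task has nothing to preempt and nothing to be
	 * preempted by, so a priority change needs no reschedule.
	 */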
13054 if (rq->cfs.nr_running == 1)
13055 return;
13056
13057 /*
13058 * Reschedule if we are currently running on this runqueue and
13059 * our priority decreased, or if we are not currently running on
13060	 * this runqueue and our priority is higher than the current task's.
13061 */
13062 if (task_current_donor(rq, p)) {
13063 if (p->prio > oldprio)
13064 resched_curr(rq);
13065 } else
13066 wakeup_preempt(rq, p, 0);
13067}
13068
13069#ifdef CONFIG_FAIR_GROUP_SCHED
13070/*
13071 * Propagate the changes of the sched_entity across the tg tree to make them
13072 * visible to the root
13073 */
13074static void propagate_entity_cfs_rq(struct sched_entity *se)
13075{
13076 struct cfs_rq *cfs_rq = cfs_rq_of(se);
13077
13078 if (cfs_rq_throttled(cfs_rq))
13079 return;
13080
13081 if (!throttled_hierarchy(cfs_rq))
13082 list_add_leaf_cfs_rq(cfs_rq);
13083
13084 /* Start to propagate at parent */
13085 se = se->parent;
13086
13087 for_each_sched_entity(se) {
13088 cfs_rq = cfs_rq_of(se);
13089
13090 update_load_avg(cfs_rq, se, UPDATE_TG);
13091
13092 if (cfs_rq_throttled(cfs_rq))
13093 break;
13094
13095 if (!throttled_hierarchy(cfs_rq))
13096 list_add_leaf_cfs_rq(cfs_rq);
13097 }
13098}
13099#else
13100static void propagate_entity_cfs_rq(struct sched_entity *se) { }
13101#endif
13102
13103static void detach_entity_cfs_rq(struct sched_entity *se)
13104{
13105 struct cfs_rq *cfs_rq = cfs_rq_of(se);
13106
13107#ifdef CONFIG_SMP
13108 /*
13109 * In case the task sched_avg hasn't been attached:
13110 * - A forked task which hasn't been woken up by wake_up_new_task().
13111 * - A task which has been woken up by try_to_wake_up() but is
13112	 *   still waiting to actually be woken up by sched_ttwu_pending().
13113 */
13114 if (!se->avg.last_update_time)
13115 return;
13116#endif
13117
13118 /* Catch up with the cfs_rq and remove our load when we leave */
13119 update_load_avg(cfs_rq, se, 0);
13120 detach_entity_load_avg(cfs_rq, se);
13121 update_tg_load_avg(cfs_rq);
13122 propagate_entity_cfs_rq(se);
13123}
13124
13125static void attach_entity_cfs_rq(struct sched_entity *se)
13126{
13127 struct cfs_rq *cfs_rq = cfs_rq_of(se);
13128
13129 /* Synchronize entity with its cfs_rq */
13130 update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
13131 attach_entity_load_avg(cfs_rq, se);
13132 update_tg_load_avg(cfs_rq);
13133 propagate_entity_cfs_rq(se);
13134}
13135
13136static void detach_task_cfs_rq(struct task_struct *p)
13137{
13138 struct sched_entity *se = &p->se;
13139
13140 detach_entity_cfs_rq(se);
13141}
13142
13143static void attach_task_cfs_rq(struct task_struct *p)
13144{
13145 struct sched_entity *se = &p->se;
13146
13147 attach_entity_cfs_rq(se);
13148}
13149
13150static void switched_from_fair(struct rq *rq, struct task_struct *p)
13151{
13152 detach_task_cfs_rq(p);
13153}
13154
13155static void switched_to_fair(struct rq *rq, struct task_struct *p)
13156{
13157 SCHED_WARN_ON(p->se.sched_delayed);
13158
13159 attach_task_cfs_rq(p);
13160
13161 set_task_max_allowed_capacity(p);
13162
13163 if (task_on_rq_queued(p)) {
13164 /*
13165 * We were most likely switched from sched_rt, so
13166		 * kick off a reschedule if we are running, otherwise just see
13167		 * if we can still preempt the current task.
13168 */
13169 if (task_current_donor(rq, p))
13170 resched_curr(rq);
13171 else
13172 wakeup_preempt(rq, p, 0);
13173 }
13174}
13175
13176static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
13177{
13178 struct sched_entity *se = &p->se;
13179
13180#ifdef CONFIG_SMP
13181 if (task_on_rq_queued(p)) {
13182 /*
13183 * Move the next running task to the front of the list, so our
13184		 * cfs_tasks list becomes an MRU list.
13185 */
13186 list_move(&se->group_node, &rq->cfs_tasks);
13187 }
13188#endif
13189 if (!first)
13190 return;
13191
13192 SCHED_WARN_ON(se->sched_delayed);
13193
13194 if (hrtick_enabled_fair(rq))
13195 hrtick_start_fair(rq, p);
13196
13197 update_misfit_status(p, rq);
13198 sched_fair_update_stop_tick(rq, p);
13199}
13200
13201/*
13202 * Account for a task changing its policy or group.
13203 *
13204 * This routine is mostly called to set the cfs_rq->curr field when a task
13205 * migrates between groups/classes.
13206 */
13207static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
13208{
13209 struct sched_entity *se = &p->se;
13210
13211 for_each_sched_entity(se) {
13212 struct cfs_rq *cfs_rq = cfs_rq_of(se);
13213
13214 set_next_entity(cfs_rq, se);
13215 /* ensure bandwidth has been allocated on our new cfs_rq */
13216 account_cfs_rq_runtime(cfs_rq, 0);
13217 }
13218
13219 __set_next_task_fair(rq, p, first);
13220}
13221
13222void init_cfs_rq(struct cfs_rq *cfs_rq)
13223{
13224 cfs_rq->tasks_timeline = RB_ROOT_CACHED;
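	/*
	 * Note: min_vruntime starts just below the u64 wrap point, presumably
	 * so that vruntime overflow is exercised early and any broken signed
	 * comparisons show up quickly.
	 */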
13225 cfs_rq->min_vruntime = (u64)(-(1LL << 20));
13226#ifdef CONFIG_SMP
13227 raw_spin_lock_init(&cfs_rq->removed.lock);
13228#endif
13229}
13230
13231#ifdef CONFIG_FAIR_GROUP_SCHED
13232static void task_change_group_fair(struct task_struct *p)
13233{
13234 /*
13235	 * We can't detach or attach a forked task that hasn't yet
13236	 * been woken up by wake_up_new_task().
13237 */
13238 if (READ_ONCE(p->__state) == TASK_NEW)
13239 return;
13240
13241 detach_task_cfs_rq(p);
13242
13243#ifdef CONFIG_SMP
13244	/* Tell the se that its cfs_rq has changed -- i.e. it has migrated */
13245 p->se.avg.last_update_time = 0;
13246#endif
13247 set_task_rq(p, task_cpu(p));
13248 attach_task_cfs_rq(p);
13249}
13250
13251void free_fair_sched_group(struct task_group *tg)
13252{
13253 int i;
13254
13255 for_each_possible_cpu(i) {
13256 if (tg->cfs_rq)
13257 kfree(tg->cfs_rq[i]);
13258 if (tg->se)
13259 kfree(tg->se[i]);
13260 }
13261
13262 kfree(tg->cfs_rq);
13263 kfree(tg->se);
13264}
13265
13266int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
13267{
13268 struct sched_entity *se;
13269 struct cfs_rq *cfs_rq;
13270 int i;
13271
13272 tg->cfs_rq = kcalloc(nr_cpu_ids, sizeof(cfs_rq), GFP_KERNEL);
13273 if (!tg->cfs_rq)
13274 goto err;
13275 tg->se = kcalloc(nr_cpu_ids, sizeof(se), GFP_KERNEL);
13276 if (!tg->se)
13277 goto err;
13278
13279 tg->shares = NICE_0_LOAD;
13280
13281 init_cfs_bandwidth(tg_cfs_bandwidth(tg), tg_cfs_bandwidth(parent));
13282
13283 for_each_possible_cpu(i) {
13284 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
13285 GFP_KERNEL, cpu_to_node(i));
13286 if (!cfs_rq)
13287 goto err;
13288
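		/*
		 * The entity is allocated as a sched_entity_stats so the
		 * per-entity statistics sit directly behind the sched_entity
		 * (which is the first member of that wrapper struct).
		 */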
13289 se = kzalloc_node(sizeof(struct sched_entity_stats),
13290 GFP_KERNEL, cpu_to_node(i));
13291 if (!se)
13292 goto err_free_rq;
13293
13294 init_cfs_rq(cfs_rq);
13295 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
13296 init_entity_runnable_average(se);
13297 }
13298
13299 return 1;
13300
13301err_free_rq:
13302 kfree(cfs_rq);
13303err:
13304 return 0;
13305}
13306
13307void online_fair_sched_group(struct task_group *tg)
13308{
13309 struct sched_entity *se;
13310 struct rq_flags rf;
13311 struct rq *rq;
13312 int i;
13313
13314 for_each_possible_cpu(i) {
13315 rq = cpu_rq(i);
13316 se = tg->se[i];
13317 rq_lock_irq(rq, &rf);
13318 update_rq_clock(rq);
13319 attach_entity_cfs_rq(se);
13320 sync_throttle(tg, i);
13321 rq_unlock_irq(rq, &rf);
13322 }
13323}
13324
13325void unregister_fair_sched_group(struct task_group *tg)
13326{
13327 int cpu;
13328
13329 destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
13330
13331 for_each_possible_cpu(cpu) {
13332 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];
13333 struct sched_entity *se = tg->se[cpu];
13334 struct rq *rq = cpu_rq(cpu);
13335
13336 if (se) {
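			/*
			 * sched_delayed is re-checked under the rq lock; the
			 * first, lockless check only avoids taking the lock
			 * when there is clearly nothing to dequeue.
			 */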
13337 if (se->sched_delayed) {
13338 guard(rq_lock_irqsave)(rq);
13339 if (se->sched_delayed) {
13340 update_rq_clock(rq);
13341 dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
13342 }
13343 list_del_leaf_cfs_rq(cfs_rq);
13344 }
13345 remove_entity_load_avg(se);
13346 }
13347
13348 /*
13349		 * Only empty task groups can be destroyed, so we can speculatively
13350		 * check on_list without danger of it being re-added.
13351 */
13352 if (cfs_rq->on_list) {
13353 guard(rq_lock_irqsave)(rq);
13354 list_del_leaf_cfs_rq(cfs_rq);
13355 }
13356 }
13357}
13358
13359void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
13360 struct sched_entity *se, int cpu,
13361 struct sched_entity *parent)
13362{
13363 struct rq *rq = cpu_rq(cpu);
13364
13365 cfs_rq->tg = tg;
13366 cfs_rq->rq = rq;
13367 init_cfs_rq_runtime(cfs_rq);
13368
13369 tg->cfs_rq[cpu] = cfs_rq;
13370 tg->se[cpu] = se;
13371
13372 /* se could be NULL for root_task_group */
13373 if (!se)
13374 return;
13375
13376 if (!parent) {
13377 se->cfs_rq = &rq->cfs;
13378 se->depth = 0;
13379 } else {
13380 se->cfs_rq = parent->my_q;
13381 se->depth = parent->depth + 1;
13382 }
13383
13384 se->my_q = cfs_rq;
13385 /* guarantee group entities always have weight */
13386 update_load_set(&se->load, NICE_0_LOAD);
13387 se->parent = parent;
13388}
13389
13390static DEFINE_MUTEX(shares_mutex);
13391
13392static int __sched_group_set_shares(struct task_group *tg, unsigned long shares)
13393{
13394 int i;
13395
13396 lockdep_assert_held(&shares_mutex);
13397
13398 /*
13399 * We can't change the weight of the root cgroup.
13400 */
13401 if (!tg->se[0])
13402 return -EINVAL;
13403
13404 shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
13405
13406 if (tg->shares == shares)
13407 return 0;
13408
13409 tg->shares = shares;
13410 for_each_possible_cpu(i) {
13411 struct rq *rq = cpu_rq(i);
13412 struct sched_entity *se = tg->se[i];
13413 struct rq_flags rf;
13414
13415 /* Propagate contribution to hierarchy */
13416 rq_lock_irqsave(rq, &rf);
13417 update_rq_clock(rq);
13418 for_each_sched_entity(se) {
13419 update_load_avg(cfs_rq_of(se), se, UPDATE_TG);
13420 update_cfs_group(se);
13421 }
13422 rq_unlock_irqrestore(rq, &rf);
13423 }
13424
13425 return 0;
13426}
13427
13428int sched_group_set_shares(struct task_group *tg, unsigned long shares)
13429{
13430 int ret;
13431
13432 mutex_lock(&shares_mutex);
13433 if (tg_is_idle(tg))
13434 ret = -EINVAL;
13435 else
13436 ret = __sched_group_set_shares(tg, shares);
13437 mutex_unlock(&shares_mutex);
13438
13439 return ret;
13440}
13441
13442int sched_group_set_idle(struct task_group *tg, long idle)
13443{
13444 int i;
13445
13446 if (tg == &root_task_group)
13447 return -EINVAL;
13448
13449 if (idle < 0 || idle > 1)
13450 return -EINVAL;
13451
13452 mutex_lock(&shares_mutex);
13453
13454 if (tg->idle == idle) {
13455 mutex_unlock(&shares_mutex);
13456 return 0;
13457 }
13458
13459 tg->idle = idle;
13460
13461 for_each_possible_cpu(i) {
13462 struct rq *rq = cpu_rq(i);
13463 struct sched_entity *se = tg->se[i];
13464 struct cfs_rq *parent_cfs_rq, *grp_cfs_rq = tg->cfs_rq[i];
13465 bool was_idle = cfs_rq_is_idle(grp_cfs_rq);
13466 long idle_task_delta;
13467 struct rq_flags rf;
13468
13469 rq_lock_irqsave(rq, &rf);
13470
13471 grp_cfs_rq->idle = idle;
13472 if (WARN_ON_ONCE(was_idle == cfs_rq_is_idle(grp_cfs_rq)))
13473 goto next_cpu;
13474
13475 if (se->on_rq) {
13476 parent_cfs_rq = cfs_rq_of(se);
13477 if (cfs_rq_is_idle(grp_cfs_rq))
13478 parent_cfs_rq->idle_nr_running++;
13479 else
13480 parent_cfs_rq->idle_nr_running--;
13481 }
13482
13483 idle_task_delta = grp_cfs_rq->h_nr_running -
13484 grp_cfs_rq->idle_h_nr_running;
13485 if (!cfs_rq_is_idle(grp_cfs_rq))
13486 idle_task_delta *= -1;
13487
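		/*
		 * Flipping this group's idle state changes how its non-idle
		 * tasks are classified by every ancestor, so walk up the
		 * hierarchy applying the signed delta, stopping at the first
		 * ancestor that is itself idle (where they were already
		 * counted as idle).
		 */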
13488 for_each_sched_entity(se) {
13489 struct cfs_rq *cfs_rq = cfs_rq_of(se);
13490
13491 if (!se->on_rq)
13492 break;
13493
13494 cfs_rq->idle_h_nr_running += idle_task_delta;
13495
13496 /* Already accounted at parent level and above. */
13497 if (cfs_rq_is_idle(cfs_rq))
13498 break;
13499 }
13500
13501next_cpu:
13502 rq_unlock_irqrestore(rq, &rf);
13503 }
13504
13505 /* Idle groups have minimum weight. */
13506 if (tg_is_idle(tg))
13507 __sched_group_set_shares(tg, scale_load(WEIGHT_IDLEPRIO));
13508 else
13509 __sched_group_set_shares(tg, NICE_0_LOAD);
13510
13511 mutex_unlock(&shares_mutex);
13512 return 0;
13513}
13514
13515#endif /* CONFIG_FAIR_GROUP_SCHED */
13516
13517
13518static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
13519{
13520 struct sched_entity *se = &task->se;
13521 unsigned int rr_interval = 0;
13522
13523 /*
13524 * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
13525 * idle runqueue:
13526 */
13527 if (rq->cfs.load.weight)
13528 rr_interval = NS_TO_JIFFIES(se->slice);
13529
13530 return rr_interval;
13531}
13532
13533/*
13534 * All the scheduling class methods:
13535 */
13536DEFINE_SCHED_CLASS(fair) = {
13537
13538 .enqueue_task = enqueue_task_fair,
13539 .dequeue_task = dequeue_task_fair,
13540 .yield_task = yield_task_fair,
13541 .yield_to_task = yield_to_task_fair,
13542
13543 .wakeup_preempt = check_preempt_wakeup_fair,
13544
13545 .pick_task = pick_task_fair,
13546 .pick_next_task = __pick_next_task_fair,
13547 .put_prev_task = put_prev_task_fair,
13548 .set_next_task = set_next_task_fair,
13549
13550#ifdef CONFIG_SMP
13551 .balance = balance_fair,
13552 .select_task_rq = select_task_rq_fair,
13553 .migrate_task_rq = migrate_task_rq_fair,
13554
13555 .rq_online = rq_online_fair,
13556 .rq_offline = rq_offline_fair,
13557
13558 .task_dead = task_dead_fair,
13559 .set_cpus_allowed = set_cpus_allowed_fair,
13560#endif
13561
13562 .task_tick = task_tick_fair,
13563 .task_fork = task_fork_fair,
13564
13565 .reweight_task = reweight_task_fair,
13566 .prio_changed = prio_changed_fair,
13567 .switched_from = switched_from_fair,
13568 .switched_to = switched_to_fair,
13569
13570 .get_rr_interval = get_rr_interval_fair,
13571
13572 .update_curr = update_curr_fair,
13573
13574#ifdef CONFIG_FAIR_GROUP_SCHED
13575 .task_change_group = task_change_group_fair,
13576#endif
13577
13578#ifdef CONFIG_SCHED_CORE
13579 .task_is_throttled = task_is_throttled_fair,
13580#endif
13581
13582#ifdef CONFIG_UCLAMP_TASK
13583 .uclamp_enabled = 1,
13584#endif
13585};
13586
13587#ifdef CONFIG_SCHED_DEBUG
13588void print_cfs_stats(struct seq_file *m, int cpu)
13589{
13590 struct cfs_rq *cfs_rq, *pos;
13591
13592 rcu_read_lock();
13593 for_each_leaf_cfs_rq_safe(cpu_rq(cpu), cfs_rq, pos)
13594 print_cfs_rq(m, cpu, cfs_rq);
13595 rcu_read_unlock();
13596}
13597
13598#ifdef CONFIG_NUMA_BALANCING
13599void show_numa_stats(struct task_struct *p, struct seq_file *m)
13600{
13601 int node;
13602 unsigned long tsf = 0, tpf = 0, gsf = 0, gpf = 0;
13603 struct numa_group *ng;
13604
13605 rcu_read_lock();
13606 ng = rcu_dereference(p->numa_group);
13607 for_each_online_node(node) {
13608 if (p->numa_faults) {
13609 tsf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 0)];
13610 tpf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 1)];
13611 }
13612 if (ng) {
13613			gsf = ng->faults[task_faults_idx(NUMA_MEM, node, 0)];
13614 gpf = ng->faults[task_faults_idx(NUMA_MEM, node, 1)];
13615 }
13616 print_numa_stats(m, node, tsf, tpf, gsf, gpf);
13617 }
13618 rcu_read_unlock();
13619}
13620#endif /* CONFIG_NUMA_BALANCING */
13621#endif /* CONFIG_SCHED_DEBUG */
13622
13623__init void init_sched_fair_class(void)
13624{
13625#ifdef CONFIG_SMP
13626 int i;
13627
13628 for_each_possible_cpu(i) {
13629 zalloc_cpumask_var_node(&per_cpu(load_balance_mask, i), GFP_KERNEL, cpu_to_node(i));
13630 zalloc_cpumask_var_node(&per_cpu(select_rq_mask, i), GFP_KERNEL, cpu_to_node(i));
13631 zalloc_cpumask_var_node(&per_cpu(should_we_balance_tmpmask, i),
13632 GFP_KERNEL, cpu_to_node(i));
13633
13634#ifdef CONFIG_CFS_BANDWIDTH
13635 INIT_CSD(&cpu_rq(i)->cfsb_csd, __cfsb_csd_unthrottle, cpu_rq(i));
13636 INIT_LIST_HEAD(&cpu_rq(i)->cfsb_csd_list);
13637#endif
13638 }
13639
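	/*
	 * Register the periodic load-balancing entry point; the softirq is
	 * raised from sched_balance_trigger() on each scheduler tick when a
	 * rebalance is due.
	 */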
13640 open_softirq(SCHED_SOFTIRQ, sched_balance_softirq);
13641
13642#ifdef CONFIG_NO_HZ_COMMON
13643 nohz.next_balance = jiffies;
13644 nohz.next_blocked = jiffies;
13645 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
13646#endif
13647#endif /* CONFIG_SMP */
13649}