fair.c - kernel/sched/fair.c - Linux source code v4.17

Note: File does not exist in v3.1.
    1// SPDX-License-Identifier: GPL-2.0
    2/*
    3 * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
    4 *
    5 *  Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
    6 *
    7 *  Interactivity improvements by Mike Galbraith
    8 *  (C) 2007 Mike Galbraith <efault@gmx.de>
    9 *
   10 *  Various enhancements by Dmitry Adamushko.
   11 *  (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
   12 *
   13 *  Group scheduling enhancements by Srivatsa Vaddagiri
   14 *  Copyright IBM Corporation, 2007
   15 *  Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
   16 *
   17 *  Scaled math optimizations by Thomas Gleixner
   18 *  Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
   19 *
   20 *  Adaptive scheduling granularity, math enhancements by Peter Zijlstra
   21 *  Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
   22 */
   23#include "sched.h"
   24
   25#include <trace/events/sched.h>
   26
   27/*
   28 * Targeted preemption latency for CPU-bound tasks:
   29 *
   30 * NOTE: this latency value is not the same as the concept of
   31 * 'timeslice length' - timeslices in CFS are of variable length
   32 * and have no persistent notion like in traditional, time-slice
   33 * based scheduling concepts.
   34 *
   35 * (to see the precise effective timeslice length of your workload,
   36 *  run vmstat and monitor the context-switches (cs) field)
   37 *
   38 * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
   39 */
   40unsigned int sysctl_sched_latency			= 6000000ULL;
   41unsigned int normalized_sysctl_sched_latency		= 6000000ULL;
   42
   43/*
   44 * The initial- and re-scaling of tunables is configurable
   45 *
   46 * Options are:
   47 *
   48 *   SCHED_TUNABLESCALING_NONE - unscaled, always *1
   49 *   SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus)
   50 *   SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus
   51 *
   52 * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))
   53 */
   54enum sched_tunable_scaling sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG;
   55
   56/*
   57 * Minimal preemption granularity for CPU-bound tasks:
   58 *
   59 * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
   60 */
   61unsigned int sysctl_sched_min_granularity		= 750000ULL;
   62unsigned int normalized_sysctl_sched_min_granularity	= 750000ULL;
   63
   64/*
   65 * This value is kept at sysctl_sched_latency/sysctl_sched_min_granularity
   66 */
   67static unsigned int sched_nr_latency = 8;
   68
   69/*
   70 * After fork, child runs first. If set to 0 (default) then
   71 * parent will (try to) run first.
   72 */
   73unsigned int sysctl_sched_child_runs_first __read_mostly;
   74
   75/*
   76 * SCHED_OTHER wake-up granularity.
   77 *
   78 * This option delays the preemption effects of decoupled workloads
   79 * and reduces their over-scheduling. Synchronous workloads will still
   80 * have immediate wakeup/sleep latencies.
   81 *
   82 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
   83 */
   84unsigned int sysctl_sched_wakeup_granularity		= 1000000UL;
   85unsigned int normalized_sysctl_sched_wakeup_granularity	= 1000000UL;
   86
   87const_debug unsigned int sysctl_sched_migration_cost	= 500000UL;
   88
   89#ifdef CONFIG_SMP
   90/*
   91 * For asym packing, by default the lower numbered CPU has higher priority.
   92 */
   93int __weak arch_asym_cpu_priority(int cpu)
   94{
   95	return -cpu;
   96}
   97#endif
   98
   99#ifdef CONFIG_CFS_BANDWIDTH
  100/*
  101 * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
  102 * each time a cfs_rq requests quota.
  103 *
  104 * Note: in the case that the slice exceeds the runtime remaining (either due
  105 * to consumption or the quota being specified to be smaller than the slice)
  106 * we will always only issue the remaining available time.
  107 *
  108 * (default: 5 msec, units: microseconds)
  109 */
  110unsigned int sysctl_sched_cfs_bandwidth_slice		= 5000UL;
  111#endif
  112
  113/*
  114 * The margin used when comparing utilization with CPU capacity:
  115 * util * margin < capacity * 1024
  116 *
  117 * (default: ~20%)
  118 */
  119unsigned int capacity_margin				= 1280;
  120
  121static inline void update_load_add(struct load_weight *lw, unsigned long inc)
  122{
  123	lw->weight += inc;
  124	lw->inv_weight = 0;
  125}
  126
  127static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
  128{
  129	lw->weight -= dec;
  130	lw->inv_weight = 0;
  131}
  132
  133static inline void update_load_set(struct load_weight *lw, unsigned long w)
  134{
  135	lw->weight = w;
  136	lw->inv_weight = 0;
  137}
  138
  139/*
  140 * Increase the granularity value when there are more CPUs,
  141 * because with more CPUs the 'effective latency' as visible
  142 * to users decreases. But the relationship is not linear,
  143 * so pick a second-best guess by going with the log2 of the
  144 * number of CPUs.
  145 *
  146 * This idea comes from the SD scheduler of Con Kolivas:
  147 */
  148static unsigned int get_update_sysctl_factor(void)
  149{
  150	unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8);
  151	unsigned int factor;
  152
  153	switch (sysctl_sched_tunable_scaling) {
  154	case SCHED_TUNABLESCALING_NONE:
  155		factor = 1;
  156		break;
  157	case SCHED_TUNABLESCALING_LINEAR:
  158		factor = cpus;
  159		break;
  160	case SCHED_TUNABLESCALING_LOG:
  161	default:
  162		factor = 1 + ilog2(cpus);
  163		break;
  164	}
  165
  166	return factor;
  167}
  168
  169static void update_sysctl(void)
  170{
  171	unsigned int factor = get_update_sysctl_factor();
  172
  173#define SET_SYSCTL(name) \
  174	(sysctl_##name = (factor) * normalized_sysctl_##name)
  175	SET_SYSCTL(sched_min_granularity);
  176	SET_SYSCTL(sched_latency);
  177	SET_SYSCTL(sched_wakeup_granularity);
  178#undef SET_SYSCTL
  179}
  180
  181void sched_init_granularity(void)
  182{
  183	update_sysctl();
  184}
  185
  186#define WMULT_CONST	(~0U)
  187#define WMULT_SHIFT	32
  188
  189static void __update_inv_weight(struct load_weight *lw)
  190{
  191	unsigned long w;
  192
  193	if (likely(lw->inv_weight))
  194		return;
  195
  196	w = scale_load_down(lw->weight);
  197
  198	if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
  199		lw->inv_weight = 1;
  200	else if (unlikely(!w))
  201		lw->inv_weight = WMULT_CONST;
  202	else
  203		lw->inv_weight = WMULT_CONST / w;
  204}
  205
  206/*
  207 * delta_exec * weight / lw.weight
  208 *   OR
  209 * (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT
  210 *
  211 * Either weight := NICE_0_LOAD and lw \e sched_prio_to_wmult[], in which case
  212 * we're guaranteed shift stays positive because inv_weight is guaranteed to
  213 * fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22.
  214 *
  215 * Or, weight =< lw.weight (because lw.weight is the runqueue weight), thus
  216 * weight/lw.weight <= 1, and therefore our shift will also be positive.
  217 */
  218static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
  219{
  220	u64 fact = scale_load_down(weight);
  221	int shift = WMULT_SHIFT;
  222
  223	__update_inv_weight(lw);
  224
  225	if (unlikely(fact >> 32)) {
  226		while (fact >> 32) {
  227			fact >>= 1;
  228			shift--;
  229		}
  230	}
  231
  232	/* hint to use a 32x32->64 mul */
  233	fact = (u64)(u32)fact * lw->inv_weight;
  234
  235	while (fact >> 32) {
  236		fact >>= 1;
  237		shift--;
  238	}
  239
  240	return mul_u64_u32_shr(delta_exec, fact, shift);
  241}
  242
  243
  244const struct sched_class fair_sched_class;
  245
  246/**************************************************************
  247 * CFS operations on generic schedulable entities:
  248 */
  249
  250#ifdef CONFIG_FAIR_GROUP_SCHED
  251
  252/* cpu runqueue to which this cfs_rq is attached */
  253static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
  254{
  255	return cfs_rq->rq;
  256}
  257
  258/* An entity is a task if it doesn't "own" a runqueue */
  259#define entity_is_task(se)	(!se->my_q)
  260
  261static inline struct task_struct *task_of(struct sched_entity *se)
  262{
  263	SCHED_WARN_ON(!entity_is_task(se));
  264	return container_of(se, struct task_struct, se);
  265}
  266
  267/* Walk up scheduling entities hierarchy */
  268#define for_each_sched_entity(se) \
  269		for (; se; se = se->parent)
  270
  271static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
  272{
  273	return p->se.cfs_rq;
  274}
  275
  276/* runqueue on which this entity is (to be) queued */
  277static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
  278{
  279	return se->cfs_rq;
  280}
  281
  282/* runqueue "owned" by this group */
  283static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
  284{
  285	return grp->my_q;
  286}
  287
  288static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
  289{
  290	if (!cfs_rq->on_list) {
  291		struct rq *rq = rq_of(cfs_rq);
  292		int cpu = cpu_of(rq);
  293		/*
  294		 * Ensure we either appear before our parent (if already
  295		 * enqueued) or force our parent to appear after us when it is
  296		 * enqueued. The fact that we always enqueue bottom-up
  297		 * reduces this to two cases and a special case for the root
  298		 * cfs_rq. Furthermore, it also means that we will always reset
  299		 * tmp_alone_branch either when the branch is connected
  300		 * to a tree or when we reach the beg of the tree
  301		 */
  302		if (cfs_rq->tg->parent &&
  303		    cfs_rq->tg->parent->cfs_rq[cpu]->on_list) {
  304			/*
  305			 * If parent is already on the list, we add the child
  306			 * just before. Thanks to circular linked property of
  307			 * the list, this means to put the child at the tail
  308			 * of the list that starts by parent.
  309			 */
  310			list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
  311				&(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list));
  312			/*
  313			 * The branch is now connected to its tree so we can
  314			 * reset tmp_alone_branch to the beginning of the
  315			 * list.
  316			 */
  317			rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
  318		} else if (!cfs_rq->tg->parent) {
  319			/*
  320			 * cfs rq without parent should be put
  321			 * at the tail of the list.
  322			 */
  323			list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
  324				&rq->leaf_cfs_rq_list);
  325			/*
  326			 * We have reach the beg of a tree so we can reset
  327			 * tmp_alone_branch to the beginning of the list.
  328			 */
  329			rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
  330		} else {
  331			/*
  332			 * The parent has not already been added so we want to
  333			 * make sure that it will be put after us.
  334			 * tmp_alone_branch points to the beg of the branch
  335			 * where we will add parent.
  336			 */
  337			list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
  338				rq->tmp_alone_branch);
  339			/*
  340			 * update tmp_alone_branch to points to the new beg
  341			 * of the branch
  342			 */
  343			rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list;
  344		}
  345
  346		cfs_rq->on_list = 1;
  347	}
  348}
  349
  350static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
  351{
  352	if (cfs_rq->on_list) {
  353		list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
  354		cfs_rq->on_list = 0;
  355	}
  356}
  357
  358/* Iterate thr' all leaf cfs_rq's on a runqueue */
  359#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos)			\
  360	list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list,	\
  361				 leaf_cfs_rq_list)
  362
  363/* Do the two (enqueued) entities belong to the same group ? */
  364static inline struct cfs_rq *
  365is_same_group(struct sched_entity *se, struct sched_entity *pse)
  366{
  367	if (se->cfs_rq == pse->cfs_rq)
  368		return se->cfs_rq;
  369
  370	return NULL;
  371}
  372
  373static inline struct sched_entity *parent_entity(struct sched_entity *se)
  374{
  375	return se->parent;
  376}
  377
  378static void
  379find_matching_se(struct sched_entity **se, struct sched_entity **pse)
  380{
  381	int se_depth, pse_depth;
  382
  383	/*
  384	 * preemption test can be made between sibling entities who are in the
  385	 * same cfs_rq i.e who have a common parent. Walk up the hierarchy of
  386	 * both tasks until we find their ancestors who are siblings of common
  387	 * parent.
  388	 */
  389
  390	/* First walk up until both entities are at same depth */
  391	se_depth = (*se)->depth;
  392	pse_depth = (*pse)->depth;
  393
  394	while (se_depth > pse_depth) {
  395		se_depth--;
  396		*se = parent_entity(*se);
  397	}
  398
  399	while (pse_depth > se_depth) {
  400		pse_depth--;
  401		*pse = parent_entity(*pse);
  402	}
  403
  404	while (!is_same_group(*se, *pse)) {
  405		*se = parent_entity(*se);
  406		*pse = parent_entity(*pse);
  407	}
  408}
  409
  410#else	/* !CONFIG_FAIR_GROUP_SCHED */
  411
  412static inline struct task_struct *task_of(struct sched_entity *se)
  413{
  414	return container_of(se, struct task_struct, se);
  415}
  416
  417static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
  418{
  419	return container_of(cfs_rq, struct rq, cfs);
  420}
  421
  422#define entity_is_task(se)	1
  423
  424#define for_each_sched_entity(se) \
  425		for (; se; se = NULL)
  426
  427static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
  428{
  429	return &task_rq(p)->cfs;
  430}
  431
  432static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
  433{
  434	struct task_struct *p = task_of(se);
  435	struct rq *rq = task_rq(p);
  436
  437	return &rq->cfs;
  438}
  439
  440/* runqueue "owned" by this group */
  441static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
  442{
  443	return NULL;
  444}
  445
  446static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
  447{
  448}
  449
  450static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
  451{
  452}
  453
  454#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos)	\
  455		for (cfs_rq = &rq->cfs, pos = NULL; cfs_rq; cfs_rq = pos)
  456
  457static inline struct sched_entity *parent_entity(struct sched_entity *se)
  458{
  459	return NULL;
  460}
  461
  462static inline void
  463find_matching_se(struct sched_entity **se, struct sched_entity **pse)
  464{
  465}
  466
  467#endif	/* CONFIG_FAIR_GROUP_SCHED */
  468
  469static __always_inline
  470void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
  471
  472/**************************************************************
  473 * Scheduling class tree data structure manipulation methods:
  474 */
  475
  476static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
  477{
  478	s64 delta = (s64)(vruntime - max_vruntime);
  479	if (delta > 0)
  480		max_vruntime = vruntime;
  481
  482	return max_vruntime;
  483}
  484
  485static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
  486{
  487	s64 delta = (s64)(vruntime - min_vruntime);
  488	if (delta < 0)
  489		min_vruntime = vruntime;
  490
  491	return min_vruntime;
  492}
  493
  494static inline int entity_before(struct sched_entity *a,
  495				struct sched_entity *b)
  496{
  497	return (s64)(a->vruntime - b->vruntime) < 0;
  498}
  499
  500static void update_min_vruntime(struct cfs_rq *cfs_rq)
  501{
  502	struct sched_entity *curr = cfs_rq->curr;
  503	struct rb_node *leftmost = rb_first_cached(&cfs_rq->tasks_timeline);
  504
  505	u64 vruntime = cfs_rq->min_vruntime;
  506
  507	if (curr) {
  508		if (curr->on_rq)
  509			vruntime = curr->vruntime;
  510		else
  511			curr = NULL;
  512	}
  513
  514	if (leftmost) { /* non-empty tree */
  515		struct sched_entity *se;
  516		se = rb_entry(leftmost, struct sched_entity, run_node);
  517
  518		if (!curr)
  519			vruntime = se->vruntime;
  520		else
  521			vruntime = min_vruntime(vruntime, se->vruntime);
  522	}
  523
  524	/* ensure we never gain time by being placed backwards. */
  525	cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
  526#ifndef CONFIG_64BIT
  527	smp_wmb();
  528	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
  529#endif
  530}
  531
  532/*
  533 * Enqueue an entity into the rb-tree:
  534 */
  535static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
  536{
  537	struct rb_node **link = &cfs_rq->tasks_timeline.rb_root.rb_node;
  538	struct rb_node *parent = NULL;
  539	struct sched_entity *entry;
  540	bool leftmost = true;
  541
  542	/*
  543	 * Find the right place in the rbtree:
  544	 */
  545	while (*link) {
  546		parent = *link;
  547		entry = rb_entry(parent, struct sched_entity, run_node);
  548		/*
  549		 * We dont care about collisions. Nodes with
  550		 * the same key stay together.
  551		 */
  552		if (entity_before(se, entry)) {
  553			link = &parent->rb_left;
  554		} else {
  555			link = &parent->rb_right;
  556			leftmost = false;
  557		}
  558	}
  559
  560	rb_link_node(&se->run_node, parent, link);
  561	rb_insert_color_cached(&se->run_node,
  562			       &cfs_rq->tasks_timeline, leftmost);
  563}
  564
  565static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
  566{
  567	rb_erase_cached(&se->run_node, &cfs_rq->tasks_timeline);
  568}
  569
  570struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
  571{
  572	struct rb_node *left = rb_first_cached(&cfs_rq->tasks_timeline);
  573
  574	if (!left)
  575		return NULL;
  576
  577	return rb_entry(left, struct sched_entity, run_node);
  578}
  579
  580static struct sched_entity *__pick_next_entity(struct sched_entity *se)
  581{
  582	struct rb_node *next = rb_next(&se->run_node);
  583
  584	if (!next)
  585		return NULL;
  586
  587	return rb_entry(next, struct sched_entity, run_node);
  588}
  589
  590#ifdef CONFIG_SCHED_DEBUG
  591struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
  592{
  593	struct rb_node *last = rb_last(&cfs_rq->tasks_timeline.rb_root);
  594
  595	if (!last)
  596		return NULL;
  597
  598	return rb_entry(last, struct sched_entity, run_node);
  599}
  600
  601/**************************************************************
  602 * Scheduling class statistics methods:
  603 */
  604
  605int sched_proc_update_handler(struct ctl_table *table, int write,
  606		void __user *buffer, size_t *lenp,
  607		loff_t *ppos)
  608{
  609	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
  610	unsigned int factor = get_update_sysctl_factor();
  611
  612	if (ret || !write)
  613		return ret;
  614
  615	sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
  616					sysctl_sched_min_granularity);
  617
  618#define WRT_SYSCTL(name) \
  619	(normalized_sysctl_##name = sysctl_##name / (factor))
  620	WRT_SYSCTL(sched_min_granularity);
  621	WRT_SYSCTL(sched_latency);
  622	WRT_SYSCTL(sched_wakeup_granularity);
  623#undef WRT_SYSCTL
  624
  625	return 0;
  626}
  627#endif
  628
  629/*
  630 * delta /= w
  631 */
  632static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
  633{
  634	if (unlikely(se->load.weight != NICE_0_LOAD))
  635		delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
  636
  637	return delta;
  638}
  639
  640/*
  641 * The idea is to set a period in which each task runs once.
  642 *
  643 * When there are too many tasks (sched_nr_latency) we have to stretch
  644 * this period because otherwise the slices get too small.
  645 *
  646 * p = (nr <= nl) ? l : l*nr/nl
  647 */
  648static u64 __sched_period(unsigned long nr_running)
  649{
  650	if (unlikely(nr_running > sched_nr_latency))
  651		return nr_running * sysctl_sched_min_granularity;
  652	else
  653		return sysctl_sched_latency;
  654}
  655
  656/*
  657 * We calculate the wall-time slice from the period by taking a part
  658 * proportional to the weight.
  659 *
  660 * s = p*P[w/rw]
  661 */
  662static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
  663{
  664	u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);
  665
  666	for_each_sched_entity(se) {
  667		struct load_weight *load;
  668		struct load_weight lw;
  669
  670		cfs_rq = cfs_rq_of(se);
  671		load = &cfs_rq->load;
  672
  673		if (unlikely(!se->on_rq)) {
  674			lw = cfs_rq->load;
  675
  676			update_load_add(&lw, se->load.weight);
  677			load = &lw;
  678		}
  679		slice = __calc_delta(slice, se->load.weight, load);
  680	}
  681	return slice;
  682}
  683
  684/*
  685 * We calculate the vruntime slice of a to-be-inserted task.
  686 *
  687 * vs = s/w
  688 */
  689static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
  690{
  691	return calc_delta_fair(sched_slice(cfs_rq, se), se);
  692}
  693
  694#ifdef CONFIG_SMP
  695
  696#include "sched-pelt.h"
  697
  698static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
  699static unsigned long task_h_load(struct task_struct *p);
  700
  701/* Give new sched_entity start runnable values to heavy its load in infant time */
  702void init_entity_runnable_average(struct sched_entity *se)
  703{
  704	struct sched_avg *sa = &se->avg;
  705
  706	memset(sa, 0, sizeof(*sa));
  707
  708	/*
  709	 * Tasks are intialized with full load to be seen as heavy tasks until
  710	 * they get a chance to stabilize to their real load level.
  711	 * Group entities are intialized with zero load to reflect the fact that
  712	 * nothing has been attached to the task group yet.
  713	 */
  714	if (entity_is_task(se))
  715		sa->runnable_load_avg = sa->load_avg = scale_load_down(se->load.weight);
  716
  717	se->runnable_weight = se->load.weight;
  718
  719	/* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
  720}
  721
  722static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
  723static void attach_entity_cfs_rq(struct sched_entity *se);
  724
  725/*
  726 * With new tasks being created, their initial util_avgs are extrapolated
  727 * based on the cfs_rq's current util_avg:
  728 *
  729 *   util_avg = cfs_rq->util_avg / (cfs_rq->load_avg + 1) * se.load.weight
  730 *
  731 * However, in many cases, the above util_avg does not give a desired
  732 * value. Moreover, the sum of the util_avgs may be divergent, such
  733 * as when the series is a harmonic series.
  734 *
  735 * To solve this problem, we also cap the util_avg of successive tasks to
  736 * only 1/2 of the left utilization budget:
  737 *
  738 *   util_avg_cap = (1024 - cfs_rq->avg.util_avg) / 2^n
  739 *
  740 * where n denotes the nth task.
  741 *
  742 * For example, a simplest series from the beginning would be like:
  743 *
  744 *  task  util_avg: 512, 256, 128,  64,  32,   16,    8, ...
  745 * cfs_rq util_avg: 512, 768, 896, 960, 992, 1008, 1016, ...
  746 *
  747 * Finally, that extrapolated util_avg is clamped to the cap (util_avg_cap)
  748 * if util_avg > util_avg_cap.
  749 */
  750void post_init_entity_util_avg(struct sched_entity *se)
  751{
  752	struct cfs_rq *cfs_rq = cfs_rq_of(se);
  753	struct sched_avg *sa = &se->avg;
  754	long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2;
  755
  756	if (cap > 0) {
  757		if (cfs_rq->avg.util_avg != 0) {
  758			sa->util_avg  = cfs_rq->avg.util_avg * se->load.weight;
  759			sa->util_avg /= (cfs_rq->avg.load_avg + 1);
  760
  761			if (sa->util_avg > cap)
  762				sa->util_avg = cap;
  763		} else {
  764			sa->util_avg = cap;
  765		}
  766	}
  767
  768	if (entity_is_task(se)) {
  769		struct task_struct *p = task_of(se);
  770		if (p->sched_class != &fair_sched_class) {
  771			/*
  772			 * For !fair tasks do:
  773			 *
  774			update_cfs_rq_load_avg(now, cfs_rq);
  775			attach_entity_load_avg(cfs_rq, se, 0);
  776			switched_from_fair(rq, p);
  777			 *
  778			 * such that the next switched_to_fair() has the
  779			 * expected state.
  780			 */
  781			se->avg.last_update_time = cfs_rq_clock_task(cfs_rq);
  782			return;
  783		}
  784	}
  785
  786	attach_entity_cfs_rq(se);
  787}
  788
  789#else /* !CONFIG_SMP */
  790void init_entity_runnable_average(struct sched_entity *se)
  791{
  792}
  793void post_init_entity_util_avg(struct sched_entity *se)
  794{
  795}
  796static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
  797{
  798}
  799#endif /* CONFIG_SMP */
  800
  801/*
  802 * Update the current task's runtime statistics.
  803 */
  804static void update_curr(struct cfs_rq *cfs_rq)
  805{
  806	struct sched_entity *curr = cfs_rq->curr;
  807	u64 now = rq_clock_task(rq_of(cfs_rq));
  808	u64 delta_exec;
  809
  810	if (unlikely(!curr))
  811		return;
  812
  813	delta_exec = now - curr->exec_start;
  814	if (unlikely((s64)delta_exec <= 0))
  815		return;
  816
  817	curr->exec_start = now;
  818
  819	schedstat_set(curr->statistics.exec_max,
  820		      max(delta_exec, curr->statistics.exec_max));
  821
  822	curr->sum_exec_runtime += delta_exec;
  823	schedstat_add(cfs_rq->exec_clock, delta_exec);
  824
  825	curr->vruntime += calc_delta_fair(delta_exec, curr);
  826	update_min_vruntime(cfs_rq);
  827
  828	if (entity_is_task(curr)) {
  829		struct task_struct *curtask = task_of(curr);
  830
  831		trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
  832		cgroup_account_cputime(curtask, delta_exec);
  833		account_group_exec_runtime(curtask, delta_exec);
  834	}
  835
  836	account_cfs_rq_runtime(cfs_rq, delta_exec);
  837}
  838
  839static void update_curr_fair(struct rq *rq)
  840{
  841	update_curr(cfs_rq_of(&rq->curr->se));
  842}
  843
  844static inline void
  845update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
  846{
  847	u64 wait_start, prev_wait_start;
  848
  849	if (!schedstat_enabled())
  850		return;
  851
  852	wait_start = rq_clock(rq_of(cfs_rq));
  853	prev_wait_start = schedstat_val(se->statistics.wait_start);
  854
  855	if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) &&
  856	    likely(wait_start > prev_wait_start))
  857		wait_start -= prev_wait_start;
  858
  859	__schedstat_set(se->statistics.wait_start, wait_start);
  860}
  861
  862static inline void
  863update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
  864{
  865	struct task_struct *p;
  866	u64 delta;
  867
  868	if (!schedstat_enabled())
  869		return;
  870
  871	delta = rq_clock(rq_of(cfs_rq)) - schedstat_val(se->statistics.wait_start);
  872
  873	if (entity_is_task(se)) {
  874		p = task_of(se);
  875		if (task_on_rq_migrating(p)) {
  876			/*
  877			 * Preserve migrating task's wait time so wait_start
  878			 * time stamp can be adjusted to accumulate wait time
  879			 * prior to migration.
  880			 */
  881			__schedstat_set(se->statistics.wait_start, delta);
  882			return;
  883		}
  884		trace_sched_stat_wait(p, delta);
  885	}
  886
  887	__schedstat_set(se->statistics.wait_max,
  888		      max(schedstat_val(se->statistics.wait_max), delta));
  889	__schedstat_inc(se->statistics.wait_count);
  890	__schedstat_add(se->statistics.wait_sum, delta);
  891	__schedstat_set(se->statistics.wait_start, 0);
  892}
  893
  894static inline void
  895update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
  896{
  897	struct task_struct *tsk = NULL;
  898	u64 sleep_start, block_start;
  899
  900	if (!schedstat_enabled())
  901		return;
  902
  903	sleep_start = schedstat_val(se->statistics.sleep_start);
  904	block_start = schedstat_val(se->statistics.block_start);
  905
  906	if (entity_is_task(se))
  907		tsk = task_of(se);
  908
  909	if (sleep_start) {
  910		u64 delta = rq_clock(rq_of(cfs_rq)) - sleep_start;
  911
  912		if ((s64)delta < 0)
  913			delta = 0;
  914
  915		if (unlikely(delta > schedstat_val(se->statistics.sleep_max)))
  916			__schedstat_set(se->statistics.sleep_max, delta);
  917
  918		__schedstat_set(se->statistics.sleep_start, 0);
  919		__schedstat_add(se->statistics.sum_sleep_runtime, delta);
  920
  921		if (tsk) {
  922			account_scheduler_latency(tsk, delta >> 10, 1);
  923			trace_sched_stat_sleep(tsk, delta);
  924		}
  925	}
  926	if (block_start) {
  927		u64 delta = rq_clock(rq_of(cfs_rq)) - block_start;
  928
  929		if ((s64)delta < 0)
  930			delta = 0;
  931
  932		if (unlikely(delta > schedstat_val(se->statistics.block_max)))
  933			__schedstat_set(se->statistics.block_max, delta);
  934
  935		__schedstat_set(se->statistics.block_start, 0);
  936		__schedstat_add(se->statistics.sum_sleep_runtime, delta);
  937
  938		if (tsk) {
  939			if (tsk->in_iowait) {
  940				__schedstat_add(se->statistics.iowait_sum, delta);
  941				__schedstat_inc(se->statistics.iowait_count);
  942				trace_sched_stat_iowait(tsk, delta);
  943			}
  944
  945			trace_sched_stat_blocked(tsk, delta);
  946
  947			/*
  948			 * Blocking time is in units of nanosecs, so shift by
  949			 * 20 to get a milliseconds-range estimation of the
  950			 * amount of time that the task spent sleeping:
  951			 */
  952			if (unlikely(prof_on == SLEEP_PROFILING)) {
  953				profile_hits(SLEEP_PROFILING,
  954						(void *)get_wchan(tsk),
  955						delta >> 20);
  956			}
  957			account_scheduler_latency(tsk, delta >> 10, 0);
  958		}
  959	}
  960}
  961
  962/*
  963 * Task is being enqueued - update stats:
  964 */
  965static inline void
  966update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
  967{
  968	if (!schedstat_enabled())
  969		return;
  970
  971	/*
  972	 * Are we enqueueing a waiting task? (for current tasks
  973	 * a dequeue/enqueue event is a NOP)
  974	 */
  975	if (se != cfs_rq->curr)
  976		update_stats_wait_start(cfs_rq, se);
  977
  978	if (flags & ENQUEUE_WAKEUP)
  979		update_stats_enqueue_sleeper(cfs_rq, se);
  980}
  981
  982static inline void
  983update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
  984{
  985
  986	if (!schedstat_enabled())
  987		return;
  988
  989	/*
  990	 * Mark the end of the wait period if dequeueing a
  991	 * waiting task:
  992	 */
  993	if (se != cfs_rq->curr)
  994		update_stats_wait_end(cfs_rq, se);
  995
  996	if ((flags & DEQUEUE_SLEEP) && entity_is_task(se)) {
  997		struct task_struct *tsk = task_of(se);
  998
  999		if (tsk->state & TASK_INTERRUPTIBLE)
 1000			__schedstat_set(se->statistics.sleep_start,
 1001				      rq_clock(rq_of(cfs_rq)));
 1002		if (tsk->state & TASK_UNINTERRUPTIBLE)
 1003			__schedstat_set(se->statistics.block_start,
 1004				      rq_clock(rq_of(cfs_rq)));
 1005	}
 1006}
 1007
 1008/*
 1009 * We are picking a new current task - update its stats:
 1010 */
 1011static inline void
 1012update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
 1013{
 1014	/*
 1015	 * We are starting a new run period:
 1016	 */
 1017	se->exec_start = rq_clock_task(rq_of(cfs_rq));
 1018}
 1019
 1020/**************************************************
 1021 * Scheduling class queueing methods:
 1022 */
 1023
 1024#ifdef CONFIG_NUMA_BALANCING
 1025/*
 1026 * Approximate time to scan a full NUMA task in ms. The task scan period is
 1027 * calculated based on the tasks virtual memory size and
 1028 * numa_balancing_scan_size.
 1029 */
 1030unsigned int sysctl_numa_balancing_scan_period_min = 1000;
 1031unsigned int sysctl_numa_balancing_scan_period_max = 60000;
 1032
 1033/* Portion of address space to scan in MB */
 1034unsigned int sysctl_numa_balancing_scan_size = 256;
 1035
 1036/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
 1037unsigned int sysctl_numa_balancing_scan_delay = 1000;
 1038
 1039struct numa_group {
 1040	atomic_t refcount;
 1041
 1042	spinlock_t lock; /* nr_tasks, tasks */
 1043	int nr_tasks;
 1044	pid_t gid;
 1045	int active_nodes;
 1046
 1047	struct rcu_head rcu;
 1048	unsigned long total_faults;
 1049	unsigned long max_faults_cpu;
 1050	/*
 1051	 * Faults_cpu is used to decide whether memory should move
 1052	 * towards the CPU. As a consequence, these stats are weighted
 1053	 * more by CPU use than by memory faults.
 1054	 */
 1055	unsigned long *faults_cpu;
 1056	unsigned long faults[0];
 1057};
 1058
 1059static inline unsigned long group_faults_priv(struct numa_group *ng);
 1060static inline unsigned long group_faults_shared(struct numa_group *ng);
 1061
 1062static unsigned int task_nr_scan_windows(struct task_struct *p)
 1063{
 1064	unsigned long rss = 0;
 1065	unsigned long nr_scan_pages;
 1066
 1067	/*
 1068	 * Calculations based on RSS as non-present and empty pages are skipped
 1069	 * by the PTE scanner and NUMA hinting faults should be trapped based
 1070	 * on resident pages
 1071	 */
 1072	nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT);
 1073	rss = get_mm_rss(p->mm);
 1074	if (!rss)
 1075		rss = nr_scan_pages;
 1076
 1077	rss = round_up(rss, nr_scan_pages);
 1078	return rss / nr_scan_pages;
 1079}
 1080
 1081/* For sanitys sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */
 1082#define MAX_SCAN_WINDOW 2560
 1083
 1084static unsigned int task_scan_min(struct task_struct *p)
 1085{
 1086	unsigned int scan_size = READ_ONCE(sysctl_numa_balancing_scan_size);
 1087	unsigned int scan, floor;
 1088	unsigned int windows = 1;
 1089
 1090	if (scan_size < MAX_SCAN_WINDOW)
 1091		windows = MAX_SCAN_WINDOW / scan_size;
 1092	floor = 1000 / windows;
 1093
 1094	scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);
 1095	return max_t(unsigned int, floor, scan);
 1096}
 1097
 1098static unsigned int task_scan_start(struct task_struct *p)
 1099{
 1100	unsigned long smin = task_scan_min(p);
 1101	unsigned long period = smin;
 1102
 1103	/* Scale the maximum scan period with the amount of shared memory. */
 1104	if (p->numa_group) {
 1105		struct numa_group *ng = p->numa_group;
 1106		unsigned long shared = group_faults_shared(ng);
 1107		unsigned long private = group_faults_priv(ng);
 1108
 1109		period *= atomic_read(&ng->refcount);
 1110		period *= shared + 1;
 1111		period /= private + shared + 1;
 1112	}
 1113
 1114	return max(smin, period);
 1115}
 1116
 1117static unsigned int task_scan_max(struct task_struct *p)
 1118{
 1119	unsigned long smin = task_scan_min(p);
 1120	unsigned long smax;
 1121
 1122	/* Watch for min being lower than max due to floor calculations */
 1123	smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
 1124
 1125	/* Scale the maximum scan period with the amount of shared memory. */
 1126	if (p->numa_group) {
 1127		struct numa_group *ng = p->numa_group;
 1128		unsigned long shared = group_faults_shared(ng);
 1129		unsigned long private = group_faults_priv(ng);
 1130		unsigned long period = smax;
 1131
 1132		period *= atomic_read(&ng->refcount);
 1133		period *= shared + 1;
 1134		period /= private + shared + 1;
 1135
 1136		smax = max(smax, period);
 1137	}
 1138
 1139	return max(smin, smax);
 1140}
 1141
 1142static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
 1143{
 1144	rq->nr_numa_running += (p->numa_preferred_nid != -1);
 1145	rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
 1146}
 1147
 1148static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
 1149{
 1150	rq->nr_numa_running -= (p->numa_preferred_nid != -1);
 1151	rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
 1152}
 1153
 1154/* Shared or private faults. */
 1155#define NR_NUMA_HINT_FAULT_TYPES 2
 1156
 1157/* Memory and CPU locality */
 1158#define NR_NUMA_HINT_FAULT_STATS (NR_NUMA_HINT_FAULT_TYPES * 2)
 1159
 1160/* Averaged statistics, and temporary buffers. */
 1161#define NR_NUMA_HINT_FAULT_BUCKETS (NR_NUMA_HINT_FAULT_STATS * 2)
 1162
 1163pid_t task_numa_group_id(struct task_struct *p)
 1164{
 1165	return p->numa_group ? p->numa_group->gid : 0;
 1166}
 1167
 1168/*
 1169 * The averaged statistics, shared & private, memory & CPU,
 1170 * occupy the first half of the array. The second half of the
 1171 * array is for current counters, which are averaged into the
 1172 * first set by task_numa_placement.
 1173 */
 1174static inline int task_faults_idx(enum numa_faults_stats s, int nid, int priv)
 1175{
 1176	return NR_NUMA_HINT_FAULT_TYPES * (s * nr_node_ids + nid) + priv;
 1177}
 1178
 1179static inline unsigned long task_faults(struct task_struct *p, int nid)
 1180{
 1181	if (!p->numa_faults)
 1182		return 0;
 1183
 1184	return p->numa_faults[task_faults_idx(NUMA_MEM, nid, 0)] +
 1185		p->numa_faults[task_faults_idx(NUMA_MEM, nid, 1)];
 1186}
 1187
 1188static inline unsigned long group_faults(struct task_struct *p, int nid)
 1189{
 1190	if (!p->numa_group)
 1191		return 0;
 1192
 1193	return p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 0)] +
 1194		p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 1)];
 1195}
 1196
 1197static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
 1198{
 1199	return group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 0)] +
 1200		group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)];
 1201}
 1202
 1203static inline unsigned long group_faults_priv(struct numa_group *ng)
 1204{
 1205	unsigned long faults = 0;
 1206	int node;
 1207
 1208	for_each_online_node(node) {
 1209		faults += ng->faults[task_faults_idx(NUMA_MEM, node, 1)];
 1210	}
 1211
 1212	return faults;
 1213}
 1214
 1215static inline unsigned long group_faults_shared(struct numa_group *ng)
 1216{
 1217	unsigned long faults = 0;
 1218	int node;
 1219
 1220	for_each_online_node(node) {
 1221		faults += ng->faults[task_faults_idx(NUMA_MEM, node, 0)];
 1222	}
 1223
 1224	return faults;
 1225}
 1226
 1227/*
 1228 * A node triggering more than 1/3 as many NUMA faults as the maximum is
 1229 * considered part of a numa group's pseudo-interleaving set. Migrations
 1230 * between these nodes are slowed down, to allow things to settle down.
 1231 */
 1232#define ACTIVE_NODE_FRACTION 3
 1233
 1234static bool numa_is_active_node(int nid, struct numa_group *ng)
 1235{
 1236	return group_faults_cpu(ng, nid) * ACTIVE_NODE_FRACTION > ng->max_faults_cpu;
 1237}
 1238
 1239/* Handle placement on systems where not all nodes are directly connected. */
 1240static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
 1241					int maxdist, bool task)
 1242{
 1243	unsigned long score = 0;
 1244	int node;
 1245
 1246	/*
 1247	 * All nodes are directly connected, and the same distance
 1248	 * from each other. No need for fancy placement algorithms.
 1249	 */
 1250	if (sched_numa_topology_type == NUMA_DIRECT)
 1251		return 0;
 1252
 1253	/*
 1254	 * This code is called for each node, introducing N^2 complexity,
 1255	 * which should be ok given the number of nodes rarely exceeds 8.
 1256	 */
 1257	for_each_online_node(node) {
 1258		unsigned long faults;
 1259		int dist = node_distance(nid, node);
 1260
 1261		/*
 1262		 * The furthest away nodes in the system are not interesting
 1263		 * for placement; nid was already counted.
 1264		 */
 1265		if (dist == sched_max_numa_distance || node == nid)
 1266			continue;
 1267
 1268		/*
 1269		 * On systems with a backplane NUMA topology, compare groups
 1270		 * of nodes, and move tasks towards the group with the most
 1271		 * memory accesses. When comparing two nodes at distance
 1272		 * "hoplimit", only nodes closer by than "hoplimit" are part
 1273		 * of each group. Skip other nodes.
 1274		 */
 1275		if (sched_numa_topology_type == NUMA_BACKPLANE &&
 1276					dist > maxdist)
 1277			continue;
 1278
 1279		/* Add up the faults from nearby nodes. */
 1280		if (task)
 1281			faults = task_faults(p, node);
 1282		else
 1283			faults = group_faults(p, node);
 1284
 1285		/*
 1286		 * On systems with a glueless mesh NUMA topology, there are
 1287		 * no fixed "groups of nodes". Instead, nodes that are not
 1288		 * directly connected bounce traffic through intermediate
 1289		 * nodes; a numa_group can occupy any set of nodes.
 1290		 * The further away a node is, the less the faults count.
 1291		 * This seems to result in good task placement.
 1292		 */
 1293		if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
 1294			faults *= (sched_max_numa_distance - dist);
 1295			faults /= (sched_max_numa_distance - LOCAL_DISTANCE);
 1296		}
 1297
 1298		score += faults;
 1299	}
 1300
 1301	return score;
 1302}
 1303
 1304/*
 1305 * These return the fraction of accesses done by a particular task, or
 1306 * task group, on a particular numa node.  The group weight is given a
 1307 * larger multiplier, in order to group tasks together that are almost
 1308 * evenly spread out between numa nodes.
 1309 */
 1310static inline unsigned long task_weight(struct task_struct *p, int nid,
 1311					int dist)
 1312{
 1313	unsigned long faults, total_faults;
 1314
 1315	if (!p->numa_faults)
 1316		return 0;
 1317
 1318	total_faults = p->total_numa_faults;
 1319
 1320	if (!total_faults)
 1321		return 0;
 1322
 1323	faults = task_faults(p, nid);
 1324	faults += score_nearby_nodes(p, nid, dist, true);
 1325
 1326	return 1000 * faults / total_faults;
 1327}
 1328
 1329static inline unsigned long group_weight(struct task_struct *p, int nid,
 1330					 int dist)
 1331{
 1332	unsigned long faults, total_faults;
 1333
 1334	if (!p->numa_group)
 1335		return 0;
 1336
 1337	total_faults = p->numa_group->total_faults;
 1338
 1339	if (!total_faults)
 1340		return 0;
 1341
 1342	faults = group_faults(p, nid);
 1343	faults += score_nearby_nodes(p, nid, dist, false);
 1344
 1345	return 1000 * faults / total_faults;
 1346}
 1347
 1348bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
 1349				int src_nid, int dst_cpu)
 1350{
 1351	struct numa_group *ng = p->numa_group;
 1352	int dst_nid = cpu_to_node(dst_cpu);
 1353	int last_cpupid, this_cpupid;
 1354
 1355	this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
 1356
 1357	/*
 1358	 * Multi-stage node selection is used in conjunction with a periodic
 1359	 * migration fault to build a temporal task<->page relation. By using
 1360	 * a two-stage filter we remove short/unlikely relations.
 1361	 *
 1362	 * Using P(p) ~ n_p / n_t as per frequentist probability, we can equate
 1363	 * a task's usage of a particular page (n_p) per total usage of this
 1364	 * page (n_t) (in a given time-span) to a probability.
 1365	 *
 1366	 * Our periodic faults will sample this probability and getting the
 1367	 * same result twice in a row, given these samples are fully
 1368	 * independent, is then given by P(n)^2, provided our sample period
 1369	 * is sufficiently short compared to the usage pattern.
 1370	 *
 1371	 * This quadric squishes small probabilities, making it less likely we
 1372	 * act on an unlikely task<->page relation.
 1373	 */
 1374	last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
 1375	if (!cpupid_pid_unset(last_cpupid) &&
 1376				cpupid_to_nid(last_cpupid) != dst_nid)
 1377		return false;
 1378
 1379	/* Always allow migrate on private faults */
 1380	if (cpupid_match_pid(p, last_cpupid))
 1381		return true;
 1382
 1383	/* A shared fault, but p->numa_group has not been set up yet. */
 1384	if (!ng)
 1385		return true;
 1386
 1387	/*
 1388	 * Destination node is much more heavily used than the source
 1389	 * node? Allow migration.
 1390	 */
 1391	if (group_faults_cpu(ng, dst_nid) > group_faults_cpu(ng, src_nid) *
 1392					ACTIVE_NODE_FRACTION)
 1393		return true;
 1394
 1395	/*
 1396	 * Distribute memory according to CPU & memory use on each node,
 1397	 * with 3/4 hysteresis to avoid unnecessary memory migrations:
 1398	 *
 1399	 * faults_cpu(dst)   3   faults_cpu(src)
 1400	 * --------------- * - > ---------------
 1401	 * faults_mem(dst)   4   faults_mem(src)
 1402	 */
 1403	return group_faults_cpu(ng, dst_nid) * group_faults(p, src_nid) * 3 >
 1404	       group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
 1405}
 1406
 1407static unsigned long weighted_cpuload(struct rq *rq);
 1408static unsigned long source_load(int cpu, int type);
 1409static unsigned long target_load(int cpu, int type);
 1410static unsigned long capacity_of(int cpu);
 1411
 1412/* Cached statistics for all CPUs within a node */
 1413struct numa_stats {
 1414	unsigned long nr_running;
 1415	unsigned long load;
 1416
 1417	/* Total compute capacity of CPUs on a node */
 1418	unsigned long compute_capacity;
 1419
 1420	/* Approximate capacity in terms of runnable tasks on a node */
 1421	unsigned long task_capacity;
 1422	int has_free_capacity;
 1423};
 1424
 1425/*
 1426 * XXX borrowed from update_sg_lb_stats
 1427 */
 1428static void update_numa_stats(struct numa_stats *ns, int nid)
 1429{
 1430	int smt, cpu, cpus = 0;
 1431	unsigned long capacity;
 1432
 1433	memset(ns, 0, sizeof(*ns));
 1434	for_each_cpu(cpu, cpumask_of_node(nid)) {
 1435		struct rq *rq = cpu_rq(cpu);
 1436
 1437		ns->nr_running += rq->nr_running;
 1438		ns->load += weighted_cpuload(rq);
 1439		ns->compute_capacity += capacity_of(cpu);
 1440
 1441		cpus++;
 1442	}
 1443
 1444	/*
 1445	 * If we raced with hotplug and there are no CPUs left in our mask
 1446	 * the @ns structure is NULL'ed and task_numa_compare() will
 1447	 * not find this node attractive.
 1448	 *
 1449	 * We'll either bail at !has_free_capacity, or we'll detect a huge
 1450	 * imbalance and bail there.
 1451	 */
 1452	if (!cpus)
 1453		return;
 1454
 1455	/* smt := ceil(cpus / capacity), assumes: 1 < smt_power < 2 */
 1456	smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, ns->compute_capacity);
 1457	capacity = cpus / smt; /* cores */
 1458
 1459	ns->task_capacity = min_t(unsigned, capacity,
 1460		DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE));
 1461	ns->has_free_capacity = (ns->nr_running < ns->task_capacity);
 1462}
 1463
 1464struct task_numa_env {
 1465	struct task_struct *p;
 1466
 1467	int src_cpu, src_nid;
 1468	int dst_cpu, dst_nid;
 1469
 1470	struct numa_stats src_stats, dst_stats;
 1471
 1472	int imbalance_pct;
 1473	int dist;
 1474
 1475	struct task_struct *best_task;
 1476	long best_imp;
 1477	int best_cpu;
 1478};
 1479
 1480static void task_numa_assign(struct task_numa_env *env,
 1481			     struct task_struct *p, long imp)
 1482{
 1483	if (env->best_task)
 1484		put_task_struct(env->best_task);
 1485	if (p)
 1486		get_task_struct(p);
 1487
 1488	env->best_task = p;
 1489	env->best_imp = imp;
 1490	env->best_cpu = env->dst_cpu;
 1491}
 1492
 1493static bool load_too_imbalanced(long src_load, long dst_load,
 1494				struct task_numa_env *env)
 1495{
 1496	long imb, old_imb;
 1497	long orig_src_load, orig_dst_load;
 1498	long src_capacity, dst_capacity;
 1499
 1500	/*
 1501	 * The load is corrected for the CPU capacity available on each node.
 1502	 *
 1503	 * src_load        dst_load
 1504	 * ------------ vs ---------
 1505	 * src_capacity    dst_capacity
 1506	 */
 1507	src_capacity = env->src_stats.compute_capacity;
 1508	dst_capacity = env->dst_stats.compute_capacity;
 1509
 1510	/* We care about the slope of the imbalance, not the direction. */
 1511	if (dst_load < src_load)
 1512		swap(dst_load, src_load);
 1513
 1514	/* Is the difference below the threshold? */
 1515	imb = dst_load * src_capacity * 100 -
 1516	      src_load * dst_capacity * env->imbalance_pct;
 1517	if (imb <= 0)
 1518		return false;
 1519
 1520	/*
 1521	 * The imbalance is above the allowed threshold.
 1522	 * Compare it with the old imbalance.
 1523	 */
 1524	orig_src_load = env->src_stats.load;
 1525	orig_dst_load = env->dst_stats.load;
 1526
 1527	if (orig_dst_load < orig_src_load)
 1528		swap(orig_dst_load, orig_src_load);
 1529
 1530	old_imb = orig_dst_load * src_capacity * 100 -
 1531		  orig_src_load * dst_capacity * env->imbalance_pct;
 1532
 1533	/* Would this change make things worse? */
 1534	return (imb > old_imb);
 1535}
 1536
 1537/*
 1538 * This checks if the overall compute and NUMA accesses of the system would
 1539 * be improved if the source tasks was migrated to the target dst_cpu taking
 1540 * into account that it might be best if task running on the dst_cpu should
 1541 * be exchanged with the source task
 1542 */
 1543static void task_numa_compare(struct task_numa_env *env,
 1544			      long taskimp, long groupimp)
 1545{
 1546	struct rq *src_rq = cpu_rq(env->src_cpu);
 1547	struct rq *dst_rq = cpu_rq(env->dst_cpu);
 1548	struct task_struct *cur;
 1549	long src_load, dst_load;
 1550	long load;
 1551	long imp = env->p->numa_group ? groupimp : taskimp;
 1552	long moveimp = imp;
 1553	int dist = env->dist;
 1554
 1555	rcu_read_lock();
 1556	cur = task_rcu_dereference(&dst_rq->curr);
 1557	if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur)))
 1558		cur = NULL;
 1559
 1560	/*
 1561	 * Because we have preemption enabled we can get migrated around and
 1562	 * end try selecting ourselves (current == env->p) as a swap candidate.
 1563	 */
 1564	if (cur == env->p)
 1565		goto unlock;
 1566
 1567	/*
 1568	 * "imp" is the fault differential for the source task between the
 1569	 * source and destination node. Calculate the total differential for
 1570	 * the source task and potential destination task. The more negative
 1571	 * the value is, the more rmeote accesses that would be expected to
 1572	 * be incurred if the tasks were swapped.
 1573	 */
 1574	if (cur) {
 1575		/* Skip this swap candidate if cannot move to the source CPU: */
 1576		if (!cpumask_test_cpu(env->src_cpu, &cur->cpus_allowed))
 1577			goto unlock;
 1578
 1579		/*
 1580		 * If dst and source tasks are in the same NUMA group, or not
 1581		 * in any group then look only at task weights.
 1582		 */
 1583		if (cur->numa_group == env->p->numa_group) {
 1584			imp = taskimp + task_weight(cur, env->src_nid, dist) -
 1585			      task_weight(cur, env->dst_nid, dist);
 1586			/*
 1587			 * Add some hysteresis to prevent swapping the
 1588			 * tasks within a group over tiny differences.
 1589			 */
 1590			if (cur->numa_group)
 1591				imp -= imp/16;
 1592		} else {
 1593			/*
 1594			 * Compare the group weights. If a task is all by
 1595			 * itself (not part of a group), use the task weight
 1596			 * instead.
 1597			 */
 1598			if (cur->numa_group)
 1599				imp += group_weight(cur, env->src_nid, dist) -
 1600				       group_weight(cur, env->dst_nid, dist);
 1601			else
 1602				imp += task_weight(cur, env->src_nid, dist) -
 1603				       task_weight(cur, env->dst_nid, dist);
 1604		}
 1605	}
 1606
 1607	if (imp <= env->best_imp && moveimp <= env->best_imp)
 1608		goto unlock;
 1609
 1610	if (!cur) {
 1611		/* Is there capacity at our destination? */
 1612		if (env->src_stats.nr_running <= env->src_stats.task_capacity &&
 1613		    !env->dst_stats.has_free_capacity)
 1614			goto unlock;
 1615
 1616		goto balance;
 1617	}
 1618
 1619	/* Balance doesn't matter much if we're running a task per CPU: */
 1620	if (imp > env->best_imp && src_rq->nr_running == 1 &&
 1621			dst_rq->nr_running == 1)
 1622		goto assign;
 1623
 1624	/*
 1625	 * In the overloaded case, try and keep the load balanced.
 1626	 */
 1627balance:
 1628	load = task_h_load(env->p);
 1629	dst_load = env->dst_stats.load + load;
 1630	src_load = env->src_stats.load - load;
 1631
 1632	if (moveimp > imp && moveimp > env->best_imp) {
 1633		/*
 1634		 * If the improvement from just moving env->p direction is
 1635		 * better than swapping tasks around, check if a move is
 1636		 * possible. Store a slightly smaller score than moveimp,
 1637		 * so an actually idle CPU will win.
 1638		 */
 1639		if (!load_too_imbalanced(src_load, dst_load, env)) {
 1640			imp = moveimp - 1;
 1641			cur = NULL;
 1642			goto assign;
 1643		}
 1644	}
 1645
 1646	if (imp <= env->best_imp)
 1647		goto unlock;
 1648
 1649	if (cur) {
 1650		load = task_h_load(cur);
 1651		dst_load -= load;
 1652		src_load += load;
 1653	}
 1654
 1655	if (load_too_imbalanced(src_load, dst_load, env))
 1656		goto unlock;
 1657
 1658	/*
 1659	 * One idle CPU per node is evaluated for a task numa move.
 1660	 * Call select_idle_sibling to maybe find a better one.
 1661	 */
 1662	if (!cur) {
 1663		/*
 1664		 * select_idle_siblings() uses an per-CPU cpumask that
 1665		 * can be used from IRQ context.
 1666		 */
 1667		local_irq_disable();
 1668		env->dst_cpu = select_idle_sibling(env->p, env->src_cpu,
 1669						   env->dst_cpu);
 1670		local_irq_enable();
 1671	}
 1672
 1673assign:
 1674	task_numa_assign(env, cur, imp);
 1675unlock:
 1676	rcu_read_unlock();
 1677}
 1678
 1679static void task_numa_find_cpu(struct task_numa_env *env,
 1680				long taskimp, long groupimp)
 1681{
 1682	int cpu;
 1683
 1684	for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
 1685		/* Skip this CPU if the source task cannot migrate */
 1686		if (!cpumask_test_cpu(cpu, &env->p->cpus_allowed))
 1687			continue;
 1688
 1689		env->dst_cpu = cpu;
 1690		task_numa_compare(env, taskimp, groupimp);
 1691	}
 1692}
 1693
 1694/* Only move tasks to a NUMA node less busy than the current node. */
 1695static bool numa_has_capacity(struct task_numa_env *env)
 1696{
 1697	struct numa_stats *src = &env->src_stats;
 1698	struct numa_stats *dst = &env->dst_stats;
 1699
 1700	if (src->has_free_capacity && !dst->has_free_capacity)
 1701		return false;
 1702
 1703	/*
 1704	 * Only consider a task move if the source has a higher load
 1705	 * than the destination, corrected for CPU capacity on each node.
 1706	 *
 1707	 *      src->load                dst->load
 1708	 * --------------------- vs ---------------------
 1709	 * src->compute_capacity    dst->compute_capacity
 1710	 */
 1711	if (src->load * dst->compute_capacity * env->imbalance_pct >
 1712
 1713	    dst->load * src->compute_capacity * 100)
 1714		return true;
 1715
 1716	return false;
 1717}
 1718
 1719static int task_numa_migrate(struct task_struct *p)
 1720{
 1721	struct task_numa_env env = {
 1722		.p = p,
 1723
 1724		.src_cpu = task_cpu(p),
 1725		.src_nid = task_node(p),
 1726
 1727		.imbalance_pct = 112,
 1728
 1729		.best_task = NULL,
 1730		.best_imp = 0,
 1731		.best_cpu = -1,
 1732	};
 1733	struct sched_domain *sd;
 1734	unsigned long taskweight, groupweight;
 1735	int nid, ret, dist;
 1736	long taskimp, groupimp;
 1737
 1738	/*
 1739	 * Pick the lowest SD_NUMA domain, as that would have the smallest
 1740	 * imbalance and would be the first to start moving tasks about.
 1741	 *
 1742	 * And we want to avoid any moving of tasks about, as that would create
 1743	 * random movement of tasks -- counter the numa conditions we're trying
 1744	 * to satisfy here.
 1745	 */
 1746	rcu_read_lock();
 1747	sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
 1748	if (sd)
 1749		env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
 1750	rcu_read_unlock();
 1751
 1752	/*
 1753	 * Cpusets can break the scheduler domain tree into smaller
 1754	 * balance domains, some of which do not cross NUMA boundaries.
 1755	 * Tasks that are "trapped" in such domains cannot be migrated
 1756	 * elsewhere, so there is no point in (re)trying.
 1757	 */
 1758	if (unlikely(!sd)) {
 1759		p->numa_preferred_nid = task_node(p);
 1760		return -EINVAL;
 1761	}
 1762
 1763	env.dst_nid = p->numa_preferred_nid;
 1764	dist = env.dist = node_distance(env.src_nid, env.dst_nid);
 1765	taskweight = task_weight(p, env.src_nid, dist);
 1766	groupweight = group_weight(p, env.src_nid, dist);
 1767	update_numa_stats(&env.src_stats, env.src_nid);
 1768	taskimp = task_weight(p, env.dst_nid, dist) - taskweight;
 1769	groupimp = group_weight(p, env.dst_nid, dist) - groupweight;
 1770	update_numa_stats(&env.dst_stats, env.dst_nid);
 1771
 1772	/* Try to find a spot on the preferred nid. */
 1773	if (numa_has_capacity(&env))
 1774		task_numa_find_cpu(&env, taskimp, groupimp);
 1775
 1776	/*
 1777	 * Look at other nodes in these cases:
 1778	 * - there is no space available on the preferred_nid
 1779	 * - the task is part of a numa_group that is interleaved across
 1780	 *   multiple NUMA nodes; in order to better consolidate the group,
 1781	 *   we need to check other locations.
 1782	 */
 1783	if (env.best_cpu == -1 || (p->numa_group && p->numa_group->active_nodes > 1)) {
 1784		for_each_online_node(nid) {
 1785			if (nid == env.src_nid || nid == p->numa_preferred_nid)
 1786				continue;
 1787
 1788			dist = node_distance(env.src_nid, env.dst_nid);
 1789			if (sched_numa_topology_type == NUMA_BACKPLANE &&
 1790						dist != env.dist) {
 1791				taskweight = task_weight(p, env.src_nid, dist);
 1792				groupweight = group_weight(p, env.src_nid, dist);
 1793			}
 1794
 1795			/* Only consider nodes where both task and groups benefit */
 1796			taskimp = task_weight(p, nid, dist) - taskweight;
 1797			groupimp = group_weight(p, nid, dist) - groupweight;
 1798			if (taskimp < 0 && groupimp < 0)
 1799				continue;
 1800
 1801			env.dist = dist;
 1802			env.dst_nid = nid;
 1803			update_numa_stats(&env.dst_stats, env.dst_nid);
 1804			if (numa_has_capacity(&env))
 1805				task_numa_find_cpu(&env, taskimp, groupimp);
 1806		}
 1807	}
 1808
 1809	/*
 1810	 * If the task is part of a workload that spans multiple NUMA nodes,
 1811	 * and is migrating into one of the workload's active nodes, remember
 1812	 * this node as the task's preferred numa node, so the workload can
 1813	 * settle down.
 1814	 * A task that migrated to a second choice node will be better off
 1815	 * trying for a better one later. Do not set the preferred node here.
 1816	 */
 1817	if (p->numa_group) {
 1818		struct numa_group *ng = p->numa_group;
 1819
 1820		if (env.best_cpu == -1)
 1821			nid = env.src_nid;
 1822		else
 1823			nid = env.dst_nid;
 1824
 1825		if (ng->active_nodes > 1 && numa_is_active_node(env.dst_nid, ng))
 1826			sched_setnuma(p, env.dst_nid);
 1827	}
 1828
 1829	/* No better CPU than the current one was found. */
 1830	if (env.best_cpu == -1)
 1831		return -EAGAIN;
 1832
 1833	/*
 1834	 * Reset the scan period if the task is being rescheduled on an
 1835	 * alternative node to recheck if the tasks is now properly placed.
 1836	 */
 1837	p->numa_scan_period = task_scan_start(p);
 1838
 1839	if (env.best_task == NULL) {
 1840		ret = migrate_task_to(p, env.best_cpu);
 1841		if (ret != 0)
 1842			trace_sched_stick_numa(p, env.src_cpu, env.best_cpu);
 1843		return ret;
 1844	}
 1845
 1846	ret = migrate_swap(p, env.best_task);
 1847	if (ret != 0)
 1848		trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task));
 1849	put_task_struct(env.best_task);
 1850	return ret;
 1851}
 1852
 1853/* Attempt to migrate a task to a CPU on the preferred node. */
 1854static void numa_migrate_preferred(struct task_struct *p)
 1855{
 1856	unsigned long interval = HZ;
 1857
 1858	/* This task has no NUMA fault statistics yet */
 1859	if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults))
 1860		return;
 1861
 1862	/* Periodically retry migrating the task to the preferred node */
 1863	interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16);
 1864	p->numa_migrate_retry = jiffies + interval;
 1865
 1866	/* Success if task is already running on preferred CPU */
 1867	if (task_node(p) == p->numa_preferred_nid)
 1868		return;
 1869
 1870	/* Otherwise, try migrate to a CPU on the preferred node */
 1871	task_numa_migrate(p);
 1872}
 1873
 1874/*
 1875 * Find out how many nodes on the workload is actively running on. Do this by
 1876 * tracking the nodes from which NUMA hinting faults are triggered. This can
 1877 * be different from the set of nodes where the workload's memory is currently
 1878 * located.
 1879 */
 1880static void numa_group_count_active_nodes(struct numa_group *numa_group)
 1881{
 1882	unsigned long faults, max_faults = 0;
 1883	int nid, active_nodes = 0;
 1884
 1885	for_each_online_node(nid) {
 1886		faults = group_faults_cpu(numa_group, nid);
 1887		if (faults > max_faults)
 1888			max_faults = faults;
 1889	}
 1890
 1891	for_each_online_node(nid) {
 1892		faults = group_faults_cpu(numa_group, nid);
 1893		if (faults * ACTIVE_NODE_FRACTION > max_faults)
 1894			active_nodes++;
 1895	}
 1896
 1897	numa_group->max_faults_cpu = max_faults;
 1898	numa_group->active_nodes = active_nodes;
 1899}
 1900
 1901/*
 1902 * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS
 1903 * increments. The more local the fault statistics are, the higher the scan
 1904 * period will be for the next scan window. If local/(local+remote) ratio is
 1905 * below NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS)
 1906 * the scan period will decrease. Aim for 70% local accesses.
 1907 */
 1908#define NUMA_PERIOD_SLOTS 10
 1909#define NUMA_PERIOD_THRESHOLD 7
 1910
 1911/*
 1912 * Increase the scan period (slow down scanning) if the majority of
 1913 * our memory is already on our local node, or if the majority of
 1914 * the page accesses are shared with other processes.
 1915 * Otherwise, decrease the scan period.
 1916 */
 1917static void update_task_scan_period(struct task_struct *p,
 1918			unsigned long shared, unsigned long private)
 1919{
 1920	unsigned int period_slot;
 1921	int lr_ratio, ps_ratio;
 1922	int diff;
 1923
 1924	unsigned long remote = p->numa_faults_locality[0];
 1925	unsigned long local = p->numa_faults_locality[1];
 1926
 1927	/*
 1928	 * If there were no record hinting faults then either the task is
 1929	 * completely idle or all activity is areas that are not of interest
 1930	 * to automatic numa balancing. Related to that, if there were failed
 1931	 * migration then it implies we are migrating too quickly or the local
 1932	 * node is overloaded. In either case, scan slower
 1933	 */
 1934	if (local + shared == 0 || p->numa_faults_locality[2]) {
 1935		p->numa_scan_period = min(p->numa_scan_period_max,
 1936			p->numa_scan_period << 1);
 1937
 1938		p->mm->numa_next_scan = jiffies +
 1939			msecs_to_jiffies(p->numa_scan_period);
 1940
 1941		return;
 1942	}
 1943
 1944	/*
 1945	 * Prepare to scale scan period relative to the current period.
 1946	 *	 == NUMA_PERIOD_THRESHOLD scan period stays the same
 1947	 *       <  NUMA_PERIOD_THRESHOLD scan period decreases (scan faster)
 1948	 *	 >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower)
 1949	 */
 1950	period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS);
 1951	lr_ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
 1952	ps_ratio = (private * NUMA_PERIOD_SLOTS) / (private + shared);
 1953
 1954	if (ps_ratio >= NUMA_PERIOD_THRESHOLD) {
 1955		/*
 1956		 * Most memory accesses are local. There is no need to
 1957		 * do fast NUMA scanning, since memory is already local.
 1958		 */
 1959		int slot = ps_ratio - NUMA_PERIOD_THRESHOLD;
 1960		if (!slot)
 1961			slot = 1;
 1962		diff = slot * period_slot;
 1963	} else if (lr_ratio >= NUMA_PERIOD_THRESHOLD) {
 1964		/*
 1965		 * Most memory accesses are shared with other tasks.
 1966		 * There is no point in continuing fast NUMA scanning,
 1967		 * since other tasks may just move the memory elsewhere.
 1968		 */
 1969		int slot = lr_ratio - NUMA_PERIOD_THRESHOLD;
 1970		if (!slot)
 1971			slot = 1;
 1972		diff = slot * period_slot;
 1973	} else {
 1974		/*
 1975		 * Private memory faults exceed (SLOTS-THRESHOLD)/SLOTS,
 1976		 * yet they are not on the local NUMA node. Speed up
 1977		 * NUMA scanning to get the memory moved over.
 1978		 */
 1979		int ratio = max(lr_ratio, ps_ratio);
 1980		diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
 1981	}
 1982
 1983	p->numa_scan_period = clamp(p->numa_scan_period + diff,
 1984			task_scan_min(p), task_scan_max(p));
 1985	memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
 1986}
 1987
 1988/*
 1989 * Get the fraction of time the task has been running since the last
 1990 * NUMA placement cycle. The scheduler keeps similar statistics, but
 1991 * decays those on a 32ms period, which is orders of magnitude off
 1992 * from the dozens-of-seconds NUMA balancing period. Use the scheduler
 1993 * stats only if the task is so new there are no NUMA statistics yet.
 1994 */
 1995static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
 1996{
 1997	u64 runtime, delta, now;
 1998	/* Use the start of this time slice to avoid calculations. */
 1999	now = p->se.exec_start;
 2000	runtime = p->se.sum_exec_runtime;
 2001
 2002	if (p->last_task_numa_placement) {
 2003		delta = runtime - p->last_sum_exec_runtime;
 2004		*period = now - p->last_task_numa_placement;
 2005	} else {
 2006		delta = p->se.avg.load_sum;
 2007		*period = LOAD_AVG_MAX;
 2008	}
 2009
 2010	p->last_sum_exec_runtime = runtime;
 2011	p->last_task_numa_placement = now;
 2012
 2013	return delta;
 2014}
 2015
 2016/*
 2017 * Determine the preferred nid for a task in a numa_group. This needs to
 2018 * be done in a way that produces consistent results with group_weight,
 2019 * otherwise workloads might not converge.
 2020 */
 2021static int preferred_group_nid(struct task_struct *p, int nid)
 2022{
 2023	nodemask_t nodes;
 2024	int dist;
 2025
 2026	/* Direct connections between all NUMA nodes. */
 2027	if (sched_numa_topology_type == NUMA_DIRECT)
 2028		return nid;
 2029
 2030	/*
 2031	 * On a system with glueless mesh NUMA topology, group_weight
 2032	 * scores nodes according to the number of NUMA hinting faults on
 2033	 * both the node itself, and on nearby nodes.
 2034	 */
 2035	if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
 2036		unsigned long score, max_score = 0;
 2037		int node, max_node = nid;
 2038
 2039		dist = sched_max_numa_distance;
 2040
 2041		for_each_online_node(node) {
 2042			score = group_weight(p, node, dist);
 2043			if (score > max_score) {
 2044				max_score = score;
 2045				max_node = node;
 2046			}
 2047		}
 2048		return max_node;
 2049	}
 2050
 2051	/*
 2052	 * Finding the preferred nid in a system with NUMA backplane
 2053	 * interconnect topology is more involved. The goal is to locate
 2054	 * tasks from numa_groups near each other in the system, and
 2055	 * untangle workloads from different sides of the system. This requires
 2056	 * searching down the hierarchy of node groups, recursively searching
 2057	 * inside the highest scoring group of nodes. The nodemask tricks
 2058	 * keep the complexity of the search down.
 2059	 */
 2060	nodes = node_online_map;
 2061	for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) {
 2062		unsigned long max_faults = 0;
 2063		nodemask_t max_group = NODE_MASK_NONE;
 2064		int a, b;
 2065
 2066		/* Are there nodes at this distance from each other? */
 2067		if (!find_numa_distance(dist))
 2068			continue;
 2069
 2070		for_each_node_mask(a, nodes) {
 2071			unsigned long faults = 0;
 2072			nodemask_t this_group;
 2073			nodes_clear(this_group);
 2074
 2075			/* Sum group's NUMA faults; includes a==b case. */
 2076			for_each_node_mask(b, nodes) {
 2077				if (node_distance(a, b) < dist) {
 2078					faults += group_faults(p, b);
 2079					node_set(b, this_group);
 2080					node_clear(b, nodes);
 2081				}
 2082			}
 2083
 2084			/* Remember the top group. */
 2085			if (faults > max_faults) {
 2086				max_faults = faults;
 2087				max_group = this_group;
 2088				/*
 2089				 * subtle: at the smallest distance there is
 2090				 * just one node left in each "group", the
 2091				 * winner is the preferred nid.
 2092				 */
 2093				nid = a;
 2094			}
 2095		}
 2096		/* Next round, evaluate the nodes within max_group. */
 2097		if (!max_faults)
 2098			break;
 2099		nodes = max_group;
 2100	}
 2101	return nid;
 2102}
 2103
 2104static void task_numa_placement(struct task_struct *p)
 2105{
 2106	int seq, nid, max_nid = -1, max_group_nid = -1;
 2107	unsigned long max_faults = 0, max_group_faults = 0;
 2108	unsigned long fault_types[2] = { 0, 0 };
 2109	unsigned long total_faults;
 2110	u64 runtime, period;
 2111	spinlock_t *group_lock = NULL;
 2112
 2113	/*
 2114	 * The p->mm->numa_scan_seq field gets updated without
 2115	 * exclusive access. Use READ_ONCE() here to ensure
 2116	 * that the field is read in a single access:
 2117	 */
 2118	seq = READ_ONCE(p->mm->numa_scan_seq);
 2119	if (p->numa_scan_seq == seq)
 2120		return;
 2121	p->numa_scan_seq = seq;
 2122	p->numa_scan_period_max = task_scan_max(p);
 2123
 2124	total_faults = p->numa_faults_locality[0] +
 2125		       p->numa_faults_locality[1];
 2126	runtime = numa_get_avg_runtime(p, &period);
 2127
 2128	/* If the task is part of a group prevent parallel updates to group stats */
 2129	if (p->numa_group) {
 2130		group_lock = &p->numa_group->lock;
 2131		spin_lock_irq(group_lock);
 2132	}
 2133
 2134	/* Find the node with the highest number of faults */
 2135	for_each_online_node(nid) {
 2136		/* Keep track of the offsets in numa_faults array */
 2137		int mem_idx, membuf_idx, cpu_idx, cpubuf_idx;
 2138		unsigned long faults = 0, group_faults = 0;
 2139		int priv;
 2140
 2141		for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) {
 2142			long diff, f_diff, f_weight;
 2143
 2144			mem_idx = task_faults_idx(NUMA_MEM, nid, priv);
 2145			membuf_idx = task_faults_idx(NUMA_MEMBUF, nid, priv);
 2146			cpu_idx = task_faults_idx(NUMA_CPU, nid, priv);
 2147			cpubuf_idx = task_faults_idx(NUMA_CPUBUF, nid, priv);
 2148
 2149			/* Decay existing window, copy faults since last scan */
 2150			diff = p->numa_faults[membuf_idx] - p->numa_faults[mem_idx] / 2;
 2151			fault_types[priv] += p->numa_faults[membuf_idx];
 2152			p->numa_faults[membuf_idx] = 0;
 2153
 2154			/*
 2155			 * Normalize the faults_from, so all tasks in a group
 2156			 * count according to CPU use, instead of by the raw
 2157			 * number of faults. Tasks with little runtime have
 2158			 * little over-all impact on throughput, and thus their
 2159			 * faults are less important.
 2160			 */
 2161			f_weight = div64_u64(runtime << 16, period + 1);
 2162			f_weight = (f_weight * p->numa_faults[cpubuf_idx]) /
 2163				   (total_faults + 1);
 2164			f_diff = f_weight - p->numa_faults[cpu_idx] / 2;
 2165			p->numa_faults[cpubuf_idx] = 0;
 2166
 2167			p->numa_faults[mem_idx] += diff;
 2168			p->numa_faults[cpu_idx] += f_diff;
 2169			faults += p->numa_faults[mem_idx];
 2170			p->total_numa_faults += diff;
 2171			if (p->numa_group) {
 2172				/*
 2173				 * safe because we can only change our own group
 2174				 *
 2175				 * mem_idx represents the offset for a given
 2176				 * nid and priv in a specific region because it
 2177				 * is at the beginning of the numa_faults array.
 2178				 */
 2179				p->numa_group->faults[mem_idx] += diff;
 2180				p->numa_group->faults_cpu[mem_idx] += f_diff;
 2181				p->numa_group->total_faults += diff;
 2182				group_faults += p->numa_group->faults[mem_idx];
 2183			}
 2184		}
 2185
 2186		if (faults > max_faults) {
 2187			max_faults = faults;
 2188			max_nid = nid;
 2189		}
 2190
 2191		if (group_faults > max_group_faults) {
 2192			max_group_faults = group_faults;
 2193			max_group_nid = nid;
 2194		}
 2195	}
 2196
 2197	update_task_scan_period(p, fault_types[0], fault_types[1]);
 2198
 2199	if (p->numa_group) {
 2200		numa_group_count_active_nodes(p->numa_group);
 2201		spin_unlock_irq(group_lock);
 2202		max_nid = preferred_group_nid(p, max_group_nid);
 2203	}
 2204
 2205	if (max_faults) {
 2206		/* Set the new preferred node */
 2207		if (max_nid != p->numa_preferred_nid)
 2208			sched_setnuma(p, max_nid);
 2209
 2210		if (task_node(p) != p->numa_preferred_nid)
 2211			numa_migrate_preferred(p);
 2212	}
 2213}
 2214
 2215static inline int get_numa_group(struct numa_group *grp)
 2216{
 2217	return atomic_inc_not_zero(&grp->refcount);
 2218}
 2219
 2220static inline void put_numa_group(struct numa_group *grp)
 2221{
 2222	if (atomic_dec_and_test(&grp->refcount))
 2223		kfree_rcu(grp, rcu);
 2224}
 2225
 2226static void task_numa_group(struct task_struct *p, int cpupid, int flags,
 2227			int *priv)
 2228{
 2229	struct numa_group *grp, *my_grp;
 2230	struct task_struct *tsk;
 2231	bool join = false;
 2232	int cpu = cpupid_to_cpu(cpupid);
 2233	int i;
 2234
 2235	if (unlikely(!p->numa_group)) {
 2236		unsigned int size = sizeof(struct numa_group) +
 2237				    4*nr_node_ids*sizeof(unsigned long);
 2238
 2239		grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
 2240		if (!grp)
 2241			return;
 2242
 2243		atomic_set(&grp->refcount, 1);
 2244		grp->active_nodes = 1;
 2245		grp->max_faults_cpu = 0;
 2246		spin_lock_init(&grp->lock);
 2247		grp->gid = p->pid;
 2248		/* Second half of the array tracks nids where faults happen */
 2249		grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES *
 2250						nr_node_ids;
 2251
 2252		for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
 2253			grp->faults[i] = p->numa_faults[i];
 2254
 2255		grp->total_faults = p->total_numa_faults;
 2256
 2257		grp->nr_tasks++;
 2258		rcu_assign_pointer(p->numa_group, grp);
 2259	}
 2260
 2261	rcu_read_lock();
 2262	tsk = READ_ONCE(cpu_rq(cpu)->curr);
 2263
 2264	if (!cpupid_match_pid(tsk, cpupid))
 2265		goto no_join;
 2266
 2267	grp = rcu_dereference(tsk->numa_group);
 2268	if (!grp)
 2269		goto no_join;
 2270
 2271	my_grp = p->numa_group;
 2272	if (grp == my_grp)
 2273		goto no_join;
 2274
 2275	/*
 2276	 * Only join the other group if its bigger; if we're the bigger group,
 2277	 * the other task will join us.
 2278	 */
 2279	if (my_grp->nr_tasks > grp->nr_tasks)
 2280		goto no_join;
 2281
 2282	/*
 2283	 * Tie-break on the grp address.
 2284	 */
 2285	if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp)
 2286		goto no_join;
 2287
 2288	/* Always join threads in the same process. */
 2289	if (tsk->mm == current->mm)
 2290		join = true;
 2291
 2292	/* Simple filter to avoid false positives due to PID collisions */
 2293	if (flags & TNF_SHARED)
 2294		join = true;
 2295
 2296	/* Update priv based on whether false sharing was detected */
 2297	*priv = !join;
 2298
 2299	if (join && !get_numa_group(grp))
 2300		goto no_join;
 2301
 2302	rcu_read_unlock();
 2303
 2304	if (!join)
 2305		return;
 2306
 2307	BUG_ON(irqs_disabled());
 2308	double_lock_irq(&my_grp->lock, &grp->lock);
 2309
 2310	for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {
 2311		my_grp->faults[i] -= p->numa_faults[i];
 2312		grp->faults[i] += p->numa_faults[i];
 2313	}
 2314	my_grp->total_faults -= p->total_numa_faults;
 2315	grp->total_faults += p->total_numa_faults;
 2316
 2317	my_grp->nr_tasks--;
 2318	grp->nr_tasks++;
 2319
 2320	spin_unlock(&my_grp->lock);
 2321	spin_unlock_irq(&grp->lock);
 2322
 2323	rcu_assign_pointer(p->numa_group, grp);
 2324
 2325	put_numa_group(my_grp);
 2326	return;
 2327
 2328no_join:
 2329	rcu_read_unlock();
 2330	return;
 2331}
 2332
 2333void task_numa_free(struct task_struct *p)
 2334{
 2335	struct numa_group *grp = p->numa_group;
 2336	void *numa_faults = p->numa_faults;
 2337	unsigned long flags;
 2338	int i;
 2339
 2340	if (grp) {
 2341		spin_lock_irqsave(&grp->lock, flags);
 2342		for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
 2343			grp->faults[i] -= p->numa_faults[i];
 2344		grp->total_faults -= p->total_numa_faults;
 2345
 2346		grp->nr_tasks--;
 2347		spin_unlock_irqrestore(&grp->lock, flags);
 2348		RCU_INIT_POINTER(p->numa_group, NULL);
 2349		put_numa_group(grp);
 2350	}
 2351
 2352	p->numa_faults = NULL;
 2353	kfree(numa_faults);
 2354}
 2355
 2356/*
 2357 * Got a PROT_NONE fault for a page on @node.
 2358 */
 2359void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
 2360{
 2361	struct task_struct *p = current;
 2362	bool migrated = flags & TNF_MIGRATED;
 2363	int cpu_node = task_node(current);
 2364	int local = !!(flags & TNF_FAULT_LOCAL);
 2365	struct numa_group *ng;
 2366	int priv;
 2367
 2368	if (!static_branch_likely(&sched_numa_balancing))
 2369		return;
 2370
 2371	/* for example, ksmd faulting in a user's mm */
 2372	if (!p->mm)
 2373		return;
 2374
 2375	/* Allocate buffer to track faults on a per-node basis */
 2376	if (unlikely(!p->numa_faults)) {
 2377		int size = sizeof(*p->numa_faults) *
 2378			   NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids;
 2379
 2380		p->numa_faults = kzalloc(size, GFP_KERNEL|__GFP_NOWARN);
 2381		if (!p->numa_faults)
 2382			return;
 2383
 2384		p->total_numa_faults = 0;
 2385		memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
 2386	}
 2387
 2388	/*
 2389	 * First accesses are treated as private, otherwise consider accesses
 2390	 * to be private if the accessing pid has not changed
 2391	 */
 2392	if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) {
 2393		priv = 1;
 2394	} else {
 2395		priv = cpupid_match_pid(p, last_cpupid);
 2396		if (!priv && !(flags & TNF_NO_GROUP))
 2397			task_numa_group(p, last_cpupid, flags, &priv);
 2398	}
 2399
 2400	/*
 2401	 * If a workload spans multiple NUMA nodes, a shared fault that
 2402	 * occurs wholly within the set of nodes that the workload is
 2403	 * actively using should be counted as local. This allows the
 2404	 * scan rate to slow down when a workload has settled down.
 2405	 */
 2406	ng = p->numa_group;
 2407	if (!priv && !local && ng && ng->active_nodes > 1 &&
 2408				numa_is_active_node(cpu_node, ng) &&
 2409				numa_is_active_node(mem_node, ng))
 2410		local = 1;
 2411
 2412	task_numa_placement(p);
 2413
 2414	/*
 2415	 * Retry task to preferred node migration periodically, in case it
 2416	 * case it previously failed, or the scheduler moved us.
 2417	 */
 2418	if (time_after(jiffies, p->numa_migrate_retry))
 2419		numa_migrate_preferred(p);
 2420
 2421	if (migrated)
 2422		p->numa_pages_migrated += pages;
 2423	if (flags & TNF_MIGRATE_FAIL)
 2424		p->numa_faults_locality[2] += pages;
 2425
 2426	p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages;
 2427	p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages;
 2428	p->numa_faults_locality[local] += pages;
 2429}
 2430
 2431static void reset_ptenuma_scan(struct task_struct *p)
 2432{
 2433	/*
 2434	 * We only did a read acquisition of the mmap sem, so
 2435	 * p->mm->numa_scan_seq is written to without exclusive access
 2436	 * and the update is not guaranteed to be atomic. That's not
 2437	 * much of an issue though, since this is just used for
 2438	 * statistical sampling. Use READ_ONCE/WRITE_ONCE, which are not
 2439	 * expensive, to avoid any form of compiler optimizations:
 2440	 */
 2441	WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1);
 2442	p->mm->numa_scan_offset = 0;
 2443}
 2444
 2445/*
 2446 * The expensive part of numa migration is done from task_work context.
 2447 * Triggered from task_tick_numa().
 2448 */
 2449void task_numa_work(struct callback_head *work)
 2450{
 2451	unsigned long migrate, next_scan, now = jiffies;
 2452	struct task_struct *p = current;
 2453	struct mm_struct *mm = p->mm;
 2454	u64 runtime = p->se.sum_exec_runtime;
 2455	struct vm_area_struct *vma;
 2456	unsigned long start, end;
 2457	unsigned long nr_pte_updates = 0;
 2458	long pages, virtpages;
 2459
 2460	SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work));
 2461
 2462	work->next = work; /* protect against double add */
 2463	/*
 2464	 * Who cares about NUMA placement when they're dying.
 2465	 *
 2466	 * NOTE: make sure not to dereference p->mm before this check,
 2467	 * exit_task_work() happens _after_ exit_mm() so we could be called
 2468	 * without p->mm even though we still had it when we enqueued this
 2469	 * work.
 2470	 */
 2471	if (p->flags & PF_EXITING)
 2472		return;
 2473
 2474	if (!mm->numa_next_scan) {
 2475		mm->numa_next_scan = now +
 2476			msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
 2477	}
 2478
 2479	/*
 2480	 * Enforce maximal scan/migration frequency..
 2481	 */
 2482	migrate = mm->numa_next_scan;
 2483	if (time_before(now, migrate))
 2484		return;
 2485
 2486	if (p->numa_scan_period == 0) {
 2487		p->numa_scan_period_max = task_scan_max(p);
 2488		p->numa_scan_period = task_scan_start(p);
 2489	}
 2490
 2491	next_scan = now + msecs_to_jiffies(p->numa_scan_period);
 2492	if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
 2493		return;
 2494
 2495	/*
 2496	 * Delay this task enough that another task of this mm will likely win
 2497	 * the next time around.
 2498	 */
 2499	p->node_stamp += 2 * TICK_NSEC;
 2500
 2501	start = mm->numa_scan_offset;
 2502	pages = sysctl_numa_balancing_scan_size;
 2503	pages <<= 20 - PAGE_SHIFT; /* MB in pages */
 2504	virtpages = pages * 8;	   /* Scan up to this much virtual space */
 2505	if (!pages)
 2506		return;
 2507
 2508
 2509	if (!down_read_trylock(&mm->mmap_sem))
 2510		return;
 2511	vma = find_vma(mm, start);
 2512	if (!vma) {
 2513		reset_ptenuma_scan(p);
 2514		start = 0;
 2515		vma = mm->mmap;
 2516	}
 2517	for (; vma; vma = vma->vm_next) {
 2518		if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
 2519			is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) {
 2520			continue;
 2521		}
 2522
 2523		/*
 2524		 * Shared library pages mapped by multiple processes are not
 2525		 * migrated as it is expected they are cache replicated. Avoid
 2526		 * hinting faults in read-only file-backed mappings or the vdso
 2527		 * as migrating the pages will be of marginal benefit.
 2528		 */
 2529		if (!vma->vm_mm ||
 2530		    (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
 2531			continue;
 2532
 2533		/*
 2534		 * Skip inaccessible VMAs to avoid any confusion between
 2535		 * PROT_NONE and NUMA hinting ptes
 2536		 */
 2537		if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
 2538			continue;
 2539
 2540		do {
 2541			start = max(start, vma->vm_start);
 2542			end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
 2543			end = min(end, vma->vm_end);
 2544			nr_pte_updates = change_prot_numa(vma, start, end);
 2545
 2546			/*
 2547			 * Try to scan sysctl_numa_balancing_size worth of
 2548			 * hpages that have at least one present PTE that
 2549			 * is not already pte-numa. If the VMA contains
 2550			 * areas that are unused or already full of prot_numa
 2551			 * PTEs, scan up to virtpages, to skip through those
 2552			 * areas faster.
 2553			 */
 2554			if (nr_pte_updates)
 2555				pages -= (end - start) >> PAGE_SHIFT;
 2556			virtpages -= (end - start) >> PAGE_SHIFT;
 2557
 2558			start = end;
 2559			if (pages <= 0 || virtpages <= 0)
 2560				goto out;
 2561
 2562			cond_resched();
 2563		} while (end != vma->vm_end);
 2564	}
 2565
 2566out:
 2567	/*
 2568	 * It is possible to reach the end of the VMA list but the last few
 2569	 * VMAs are not guaranteed to the vma_migratable. If they are not, we
 2570	 * would find the !migratable VMA on the next scan but not reset the
 2571	 * scanner to the start so check it now.
 2572	 */
 2573	if (vma)
 2574		mm->numa_scan_offset = start;
 2575	else
 2576		reset_ptenuma_scan(p);
 2577	up_read(&mm->mmap_sem);
 2578
 2579	/*
 2580	 * Make sure tasks use at least 32x as much time to run other code
 2581	 * than they used here, to limit NUMA PTE scanning overhead to 3% max.
 2582	 * Usually update_task_scan_period slows down scanning enough; on an
 2583	 * overloaded system we need to limit overhead on a per task basis.
 2584	 */
 2585	if (unlikely(p->se.sum_exec_runtime != runtime)) {
 2586		u64 diff = p->se.sum_exec_runtime - runtime;
 2587		p->node_stamp += 32 * diff;
 2588	}
 2589}
 2590
 2591/*
 2592 * Drive the periodic memory faults..
 2593 */
 2594void task_tick_numa(struct rq *rq, struct task_struct *curr)
 2595{
 2596	struct callback_head *work = &curr->numa_work;
 2597	u64 period, now;
 2598
 2599	/*
 2600	 * We don't care about NUMA placement if we don't have memory.
 2601	 */
 2602	if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)
 2603		return;
 2604
 2605	/*
 2606	 * Using runtime rather than walltime has the dual advantage that
 2607	 * we (mostly) drive the selection from busy threads and that the
 2608	 * task needs to have done some actual work before we bother with
 2609	 * NUMA placement.
 2610	 */
 2611	now = curr->se.sum_exec_runtime;
 2612	period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
 2613
 2614	if (now > curr->node_stamp + period) {
 2615		if (!curr->node_stamp)
 2616			curr->numa_scan_period = task_scan_start(curr);
 2617		curr->node_stamp += period;
 2618
 2619		if (!time_before(jiffies, curr->mm->numa_next_scan)) {
 2620			init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
 2621			task_work_add(curr, work, true);
 2622		}
 2623	}
 2624}
 2625
 2626#else
 2627static void task_tick_numa(struct rq *rq, struct task_struct *curr)
 2628{
 2629}
 2630
 2631static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p)
 2632{
 2633}
 2634
 2635static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
 2636{
 2637}
 2638
 2639#endif /* CONFIG_NUMA_BALANCING */
 2640
 2641static void
 2642account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 2643{
 2644	update_load_add(&cfs_rq->load, se->load.weight);
 2645	if (!parent_entity(se))
 2646		update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
 2647#ifdef CONFIG_SMP
 2648	if (entity_is_task(se)) {
 2649		struct rq *rq = rq_of(cfs_rq);
 2650
 2651		account_numa_enqueue(rq, task_of(se));
 2652		list_add(&se->group_node, &rq->cfs_tasks);
 2653	}
 2654#endif
 2655	cfs_rq->nr_running++;
 2656}
 2657
 2658static void
 2659account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 2660{
 2661	update_load_sub(&cfs_rq->load, se->load.weight);
 2662	if (!parent_entity(se))
 2663		update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
 2664#ifdef CONFIG_SMP
 2665	if (entity_is_task(se)) {
 2666		account_numa_dequeue(rq_of(cfs_rq), task_of(se));
 2667		list_del_init(&se->group_node);
 2668	}
 2669#endif
 2670	cfs_rq->nr_running--;
 2671}
 2672
 2673/*
 2674 * Signed add and clamp on underflow.
 2675 *
 2676 * Explicitly do a load-store to ensure the intermediate value never hits
 2677 * memory. This allows lockless observations without ever seeing the negative
 2678 * values.
 2679 */
 2680#define add_positive(_ptr, _val) do {                           \
 2681	typeof(_ptr) ptr = (_ptr);                              \
 2682	typeof(_val) val = (_val);                              \
 2683	typeof(*ptr) res, var = READ_ONCE(*ptr);                \
 2684								\
 2685	res = var + val;                                        \
 2686								\
 2687	if (val < 0 && res > var)                               \
 2688		res = 0;                                        \
 2689								\
 2690	WRITE_ONCE(*ptr, res);                                  \
 2691} while (0)
 2692
 2693/*
 2694 * Unsigned subtract and clamp on underflow.
 2695 *
 2696 * Explicitly do a load-store to ensure the intermediate value never hits
 2697 * memory. This allows lockless observations without ever seeing the negative
 2698 * values.
 2699 */
 2700#define sub_positive(_ptr, _val) do {				\
 2701	typeof(_ptr) ptr = (_ptr);				\
 2702	typeof(*ptr) val = (_val);				\
 2703	typeof(*ptr) res, var = READ_ONCE(*ptr);		\
 2704	res = var - val;					\
 2705	if (res > var)						\
 2706		res = 0;					\
 2707	WRITE_ONCE(*ptr, res);					\
 2708} while (0)
 2709
 2710#ifdef CONFIG_SMP
 2711/*
 2712 * XXX we want to get rid of these helpers and use the full load resolution.
 2713 */
 2714static inline long se_weight(struct sched_entity *se)
 2715{
 2716	return scale_load_down(se->load.weight);
 2717}
 2718
 2719static inline long se_runnable(struct sched_entity *se)
 2720{
 2721	return scale_load_down(se->runnable_weight);
 2722}
 2723
 2724static inline void
 2725enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
 2726{
 2727	cfs_rq->runnable_weight += se->runnable_weight;
 2728
 2729	cfs_rq->avg.runnable_load_avg += se->avg.runnable_load_avg;
 2730	cfs_rq->avg.runnable_load_sum += se_runnable(se) * se->avg.runnable_load_sum;
 2731}
 2732
 2733static inline void
 2734dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
 2735{
 2736	cfs_rq->runnable_weight -= se->runnable_weight;
 2737
 2738	sub_positive(&cfs_rq->avg.runnable_load_avg, se->avg.runnable_load_avg);
 2739	sub_positive(&cfs_rq->avg.runnable_load_sum,
 2740		     se_runnable(se) * se->avg.runnable_load_sum);
 2741}
 2742
 2743static inline void
 2744enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
 2745{
 2746	cfs_rq->avg.load_avg += se->avg.load_avg;
 2747	cfs_rq->avg.load_sum += se_weight(se) * se->avg.load_sum;
 2748}
 2749
 2750static inline void
 2751dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
 2752{
 2753	sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
 2754	sub_positive(&cfs_rq->avg.load_sum, se_weight(se) * se->avg.load_sum);
 2755}
 2756#else
 2757static inline void
 2758enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
 2759static inline void
 2760dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
 2761static inline void
 2762enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
 2763static inline void
 2764dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
 2765#endif
 2766
 2767static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
 2768			    unsigned long weight, unsigned long runnable)
 2769{
 2770	if (se->on_rq) {
 2771		/* commit outstanding execution time */
 2772		if (cfs_rq->curr == se)
 2773			update_curr(cfs_rq);
 2774		account_entity_dequeue(cfs_rq, se);
 2775		dequeue_runnable_load_avg(cfs_rq, se);
 2776	}
 2777	dequeue_load_avg(cfs_rq, se);
 2778
 2779	se->runnable_weight = runnable;
 2780	update_load_set(&se->load, weight);
 2781
 2782#ifdef CONFIG_SMP
 2783	do {
 2784		u32 divider = LOAD_AVG_MAX - 1024 + se->avg.period_contrib;
 2785
 2786		se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider);
 2787		se->avg.runnable_load_avg =
 2788			div_u64(se_runnable(se) * se->avg.runnable_load_sum, divider);
 2789	} while (0);
 2790#endif
 2791
 2792	enqueue_load_avg(cfs_rq, se);
 2793	if (se->on_rq) {
 2794		account_entity_enqueue(cfs_rq, se);
 2795		enqueue_runnable_load_avg(cfs_rq, se);
 2796	}
 2797}
 2798
 2799void reweight_task(struct task_struct *p, int prio)
 2800{
 2801	struct sched_entity *se = &p->se;
 2802	struct cfs_rq *cfs_rq = cfs_rq_of(se);
 2803	struct load_weight *load = &se->load;
 2804	unsigned long weight = scale_load(sched_prio_to_weight[prio]);
 2805
 2806	reweight_entity(cfs_rq, se, weight, weight);
 2807	load->inv_weight = sched_prio_to_wmult[prio];
 2808}
 2809
 2810#ifdef CONFIG_FAIR_GROUP_SCHED
 2811#ifdef CONFIG_SMP
 2812/*
 2813 * All this does is approximate the hierarchical proportion which includes that
 2814 * global sum we all love to hate.
 2815 *
 2816 * That is, the weight of a group entity, is the proportional share of the
 2817 * group weight based on the group runqueue weights. That is:
 2818 *
 2819 *                     tg->weight * grq->load.weight
 2820 *   ge->load.weight = -----------------------------               (1)
 2821 *			  \Sum grq->load.weight
 2822 *
 2823 * Now, because computing that sum is prohibitively expensive to compute (been
 2824 * there, done that) we approximate it with this average stuff. The average
 2825 * moves slower and therefore the approximation is cheaper and more stable.
 2826 *
 2827 * So instead of the above, we substitute:
 2828 *
 2829 *   grq->load.weight -> grq->avg.load_avg                         (2)
 2830 *
 2831 * which yields the following:
 2832 *
 2833 *                     tg->weight * grq->avg.load_avg
 2834 *   ge->load.weight = ------------------------------              (3)
 2835 *				tg->load_avg
 2836 *
 2837 * Where: tg->load_avg ~= \Sum grq->avg.load_avg
 2838 *
 2839 * That is shares_avg, and it is right (given the approximation (2)).
 2840 *
 2841 * The problem with it is that because the average is slow -- it was designed
 2842 * to be exactly that of course -- this leads to transients in boundary
 2843 * conditions. In specific, the case where the group was idle and we start the
 2844 * one task. It takes time for our CPU's grq->avg.load_avg to build up,
 2845 * yielding bad latency etc..
 2846 *
 2847 * Now, in that special case (1) reduces to:
 2848 *
 2849 *                     tg->weight * grq->load.weight
 2850 *   ge->load.weight = ----------------------------- = tg->weight   (4)
 2851 *			    grp->load.weight
 2852 *
 2853 * That is, the sum collapses because all other CPUs are idle; the UP scenario.
 2854 *
 2855 * So what we do is modify our approximation (3) to approach (4) in the (near)
 2856 * UP case, like:
 2857 *
 2858 *   ge->load.weight =
 2859 *
 2860 *              tg->weight * grq->load.weight
 2861 *     ---------------------------------------------------         (5)
 2862 *     tg->load_avg - grq->avg.load_avg + grq->load.weight
 2863 *
 2864 * But because grq->load.weight can drop to 0, resulting in a divide by zero,
 2865 * we need to use grq->avg.load_avg as its lower bound, which then gives:
 2866 *
 2867 *
 2868 *                     tg->weight * grq->load.weight
 2869 *   ge->load.weight = -----------------------------		   (6)
 2870 *				tg_load_avg'
 2871 *
 2872 * Where:
 2873 *
 2874 *   tg_load_avg' = tg->load_avg - grq->avg.load_avg +
 2875 *                  max(grq->load.weight, grq->avg.load_avg)
 2876 *
 2877 * And that is shares_weight and is icky. In the (near) UP case it approaches
 2878 * (4) while in the normal case it approaches (3). It consistently
 2879 * overestimates the ge->load.weight and therefore:
 2880 *
 2881 *   \Sum ge->load.weight >= tg->weight
 2882 *
 2883 * hence icky!
 2884 */
 2885static long calc_group_shares(struct cfs_rq *cfs_rq)
 2886{
 2887	long tg_weight, tg_shares, load, shares;
 2888	struct task_group *tg = cfs_rq->tg;
 2889
 2890	tg_shares = READ_ONCE(tg->shares);
 2891
 2892	load = max(scale_load_down(cfs_rq->load.weight), cfs_rq->avg.load_avg);
 2893
 2894	tg_weight = atomic_long_read(&tg->load_avg);
 2895
 2896	/* Ensure tg_weight >= load */
 2897	tg_weight -= cfs_rq->tg_load_avg_contrib;
 2898	tg_weight += load;
 2899
 2900	shares = (tg_shares * load);
 2901	if (tg_weight)
 2902		shares /= tg_weight;
 2903
 2904	/*
 2905	 * MIN_SHARES has to be unscaled here to support per-CPU partitioning
 2906	 * of a group with small tg->shares value. It is a floor value which is
 2907	 * assigned as a minimum load.weight to the sched_entity representing
 2908	 * the group on a CPU.
 2909	 *
 2910	 * E.g. on 64-bit for a group with tg->shares of scale_load(15)=15*1024
 2911	 * on an 8-core system with 8 tasks each runnable on one CPU shares has
 2912	 * to be 15*1024*1/8=1920 instead of scale_load(MIN_SHARES)=2*1024. In
 2913	 * case no task is runnable on a CPU MIN_SHARES=2 should be returned
 2914	 * instead of 0.
 2915	 */
 2916	return clamp_t(long, shares, MIN_SHARES, tg_shares);
 2917}
 2918
 2919/*
 2920 * This calculates the effective runnable weight for a group entity based on
 2921 * the group entity weight calculated above.
 2922 *
 2923 * Because of the above approximation (2), our group entity weight is
 2924 * an load_avg based ratio (3). This means that it includes blocked load and
 2925 * does not represent the runnable weight.
 2926 *
 2927 * Approximate the group entity's runnable weight per ratio from the group
 2928 * runqueue:
 2929 *
 2930 *					     grq->avg.runnable_load_avg
 2931 *   ge->runnable_weight = ge->load.weight * -------------------------- (7)
 2932 *						 grq->avg.load_avg
 2933 *
 2934 * However, analogous to above, since the avg numbers are slow, this leads to
 2935 * transients in the from-idle case. Instead we use:
 2936 *
 2937 *   ge->runnable_weight = ge->load.weight *
 2938 *
 2939 *		max(grq->avg.runnable_load_avg, grq->runnable_weight)
 2940 *		-----------------------------------------------------	(8)
 2941 *		      max(grq->avg.load_avg, grq->load.weight)
 2942 *
 2943 * Where these max() serve both to use the 'instant' values to fix the slow
 2944 * from-idle and avoid the /0 on to-idle, similar to (6).
 2945 */
 2946static long calc_group_runnable(struct cfs_rq *cfs_rq, long shares)
 2947{
 2948	long runnable, load_avg;
 2949
 2950	load_avg = max(cfs_rq->avg.load_avg,
 2951		       scale_load_down(cfs_rq->load.weight));
 2952
 2953	runnable = max(cfs_rq->avg.runnable_load_avg,
 2954		       scale_load_down(cfs_rq->runnable_weight));
 2955
 2956	runnable *= shares;
 2957	if (load_avg)
 2958		runnable /= load_avg;
 2959
 2960	return clamp_t(long, runnable, MIN_SHARES, shares);
 2961}
 2962#endif /* CONFIG_SMP */
 2963
 2964static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
 2965
 2966/*
 2967 * Recomputes the group entity based on the current state of its group
 2968 * runqueue.
 2969 */
 2970static void update_cfs_group(struct sched_entity *se)
 2971{
 2972	struct cfs_rq *gcfs_rq = group_cfs_rq(se);
 2973	long shares, runnable;
 2974
 2975	if (!gcfs_rq)
 2976		return;
 2977
 2978	if (throttled_hierarchy(gcfs_rq))
 2979		return;
 2980
 2981#ifndef CONFIG_SMP
 2982	runnable = shares = READ_ONCE(gcfs_rq->tg->shares);
 2983
 2984	if (likely(se->load.weight == shares))
 2985		return;
 2986#else
 2987	shares   = calc_group_shares(gcfs_rq);
 2988	runnable = calc_group_runnable(gcfs_rq, shares);
 2989#endif
 2990
 2991	reweight_entity(cfs_rq_of(se), se, shares, runnable);
 2992}
 2993
 2994#else /* CONFIG_FAIR_GROUP_SCHED */
 2995static inline void update_cfs_group(struct sched_entity *se)
 2996{
 2997}
 2998#endif /* CONFIG_FAIR_GROUP_SCHED */
 2999
 3000static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags)
 3001{
 3002	struct rq *rq = rq_of(cfs_rq);
 3003
 3004	if (&rq->cfs == cfs_rq || (flags & SCHED_CPUFREQ_MIGRATION)) {
 3005		/*
 3006		 * There are a few boundary cases this might miss but it should
 3007		 * get called often enough that that should (hopefully) not be
 3008		 * a real problem.
 3009		 *
 3010		 * It will not get called when we go idle, because the idle
 3011		 * thread is a different class (!fair), nor will the utilization
 3012		 * number include things like RT tasks.
 3013		 *
 3014		 * As is, the util number is not freq-invariant (we'd have to
 3015		 * implement arch_scale_freq_capacity() for that).
 3016		 *
 3017		 * See cpu_util().
 3018		 */
 3019		cpufreq_update_util(rq, flags);
 3020	}
 3021}
 3022
 3023#ifdef CONFIG_SMP
 3024/*
 3025 * Approximate:
 3026 *   val * y^n,    where y^32 ~= 0.5 (~1 scheduling period)
 3027 */
 3028static u64 decay_load(u64 val, u64 n)
 3029{
 3030	unsigned int local_n;
 3031
 3032	if (unlikely(n > LOAD_AVG_PERIOD * 63))
 3033		return 0;
 3034
 3035	/* after bounds checking we can collapse to 32-bit */
 3036	local_n = n;
 3037
 3038	/*
 3039	 * As y^PERIOD = 1/2, we can combine
 3040	 *    y^n = 1/2^(n/PERIOD) * y^(n%PERIOD)
 3041	 * With a look-up table which covers y^n (n<PERIOD)
 3042	 *
 3043	 * To achieve constant time decay_load.
 3044	 */
 3045	if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
 3046		val >>= local_n / LOAD_AVG_PERIOD;
 3047		local_n %= LOAD_AVG_PERIOD;
 3048	}
 3049
 3050	val = mul_u64_u32_shr(val, runnable_avg_yN_inv[local_n], 32);
 3051	return val;
 3052}
 3053
 3054static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3)
 3055{
 3056	u32 c1, c2, c3 = d3; /* y^0 == 1 */
 3057
 3058	/*
 3059	 * c1 = d1 y^p
 3060	 */
 3061	c1 = decay_load((u64)d1, periods);
 3062
 3063	/*
 3064	 *            p-1
 3065	 * c2 = 1024 \Sum y^n
 3066	 *            n=1
 3067	 *
 3068	 *              inf        inf
 3069	 *    = 1024 ( \Sum y^n - \Sum y^n - y^0 )
 3070	 *              n=0        n=p
 3071	 */
 3072	c2 = LOAD_AVG_MAX - decay_load(LOAD_AVG_MAX, periods) - 1024;
 3073
 3074	return c1 + c2 + c3;
 3075}
 3076
 3077/*
 3078 * Accumulate the three separate parts of the sum; d1 the remainder
 3079 * of the last (incomplete) period, d2 the span of full periods and d3
 3080 * the remainder of the (incomplete) current period.
 3081 *
 3082 *           d1          d2           d3
 3083 *           ^           ^            ^
 3084 *           |           |            |
 3085 *         |<->|<----------------->|<--->|
 3086 * ... |---x---|------| ... |------|-----x (now)
 3087 *
 3088 *                           p-1
 3089 * u' = (u + d1) y^p + 1024 \Sum y^n + d3 y^0
 3090 *                           n=1
 3091 *
 3092 *    = u y^p +					(Step 1)
 3093 *
 3094 *                     p-1
 3095 *      d1 y^p + 1024 \Sum y^n + d3 y^0		(Step 2)
 3096 *                     n=1
 3097 */
 3098static __always_inline u32
 3099accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
 3100	       unsigned long load, unsigned long runnable, int running)
 3101{
 3102	unsigned long scale_freq, scale_cpu;
 3103	u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */
 3104	u64 periods;
 3105
 3106	scale_freq = arch_scale_freq_capacity(cpu);
 3107	scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
 3108
 3109	delta += sa->period_contrib;
 3110	periods = delta / 1024; /* A period is 1024us (~1ms) */
 3111
 3112	/*
 3113	 * Step 1: decay old *_sum if we crossed period boundaries.
 3114	 */
 3115	if (periods) {
 3116		sa->load_sum = decay_load(sa->load_sum, periods);
 3117		sa->runnable_load_sum =
 3118			decay_load(sa->runnable_load_sum, periods);
 3119		sa->util_sum = decay_load((u64)(sa->util_sum), periods);
 3120
 3121		/*
 3122		 * Step 2
 3123		 */
 3124		delta %= 1024;
 3125		contrib = __accumulate_pelt_segments(periods,
 3126				1024 - sa->period_contrib, delta);
 3127	}
 3128	sa->period_contrib = delta;
 3129
 3130	contrib = cap_scale(contrib, scale_freq);
 3131	if (load)
 3132		sa->load_sum += load * contrib;
 3133	if (runnable)
 3134		sa->runnable_load_sum += runnable * contrib;
 3135	if (running)
 3136		sa->util_sum += contrib * scale_cpu;
 3137
 3138	return periods;
 3139}
 3140
 3141/*
 3142 * We can represent the historical contribution to runnable average as the
 3143 * coefficients of a geometric series.  To do this we sub-divide our runnable
 3144 * history into segments of approximately 1ms (1024us); label the segment that
 3145 * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
 3146 *
 3147 * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
 3148 *      p0            p1           p2
 3149 *     (now)       (~1ms ago)  (~2ms ago)
 3150 *
 3151 * Let u_i denote the fraction of p_i that the entity was runnable.
 3152 *
 3153 * We then designate the fractions u_i as our co-efficients, yielding the
 3154 * following representation of historical load:
 3155 *   u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
 3156 *
 3157 * We choose y based on the with of a reasonably scheduling period, fixing:
 3158 *   y^32 = 0.5
 3159 *
 3160 * This means that the contribution to load ~32ms ago (u_32) will be weighted
 3161 * approximately half as much as the contribution to load within the last ms
 3162 * (u_0).
 3163 *
 3164 * When a period "rolls over" and we have new u_0`, multiplying the previous
 3165 * sum again by y is sufficient to update:
 3166 *   load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
 3167 *            = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
 3168 */
 3169static __always_inline int
 3170___update_load_sum(u64 now, int cpu, struct sched_avg *sa,
 3171		  unsigned long load, unsigned long runnable, int running)
 3172{
 3173	u64 delta;
 3174
 3175	delta = now - sa->last_update_time;
 3176	/*
 3177	 * This should only happen when time goes backwards, which it
 3178	 * unfortunately does during sched clock init when we swap over to TSC.
 3179	 */
 3180	if ((s64)delta < 0) {
 3181		sa->last_update_time = now;
 3182		return 0;
 3183	}
 3184
 3185	/*
 3186	 * Use 1024ns as the unit of measurement since it's a reasonable
 3187	 * approximation of 1us and fast to compute.
 3188	 */
 3189	delta >>= 10;
 3190	if (!delta)
 3191		return 0;
 3192
 3193	sa->last_update_time += delta << 10;
 3194
 3195	/*
 3196	 * running is a subset of runnable (weight) so running can't be set if
 3197	 * runnable is clear. But there are some corner cases where the current
 3198	 * se has been already dequeued but cfs_rq->curr still points to it.
 3199	 * This means that weight will be 0 but not running for a sched_entity
 3200	 * but also for a cfs_rq if the latter becomes idle. As an example,
 3201	 * this happens during idle_balance() which calls
 3202	 * update_blocked_averages()
 3203	 */
 3204	if (!load)
 3205		runnable = running = 0;
 3206
 3207	/*
 3208	 * Now we know we crossed measurement unit boundaries. The *_avg
 3209	 * accrues by two steps:
 3210	 *
 3211	 * Step 1: accumulate *_sum since last_update_time. If we haven't
 3212	 * crossed period boundaries, finish.
 3213	 */
 3214	if (!accumulate_sum(delta, cpu, sa, load, runnable, running))
 3215		return 0;
 3216
 3217	return 1;
 3218}
 3219
 3220static __always_inline void
 3221___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runnable)
 3222{
 3223	u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib;
 3224
 3225	/*
 3226	 * Step 2: update *_avg.
 3227	 */
 3228	sa->load_avg = div_u64(load * sa->load_sum, divider);
 3229	sa->runnable_load_avg =	div_u64(runnable * sa->runnable_load_sum, divider);
 3230	sa->util_avg = sa->util_sum / divider;
 3231}
 3232
 3233/*
 3234 * When a task is dequeued, its estimated utilization should not be update if
 3235 * its util_avg has not been updated at least once.
 3236 * This flag is used to synchronize util_avg updates with util_est updates.
 3237 * We map this information into the LSB bit of the utilization saved at
 3238 * dequeue time (i.e. util_est.dequeued).
 3239 */
 3240#define UTIL_AVG_UNCHANGED 0x1
 3241
 3242static inline void cfs_se_util_change(struct sched_avg *avg)
 3243{
 3244	unsigned int enqueued;
 3245
 3246	if (!sched_feat(UTIL_EST))
 3247		return;
 3248
 3249	/* Avoid store if the flag has been already set */
 3250	enqueued = avg->util_est.enqueued;
 3251	if (!(enqueued & UTIL_AVG_UNCHANGED))
 3252		return;
 3253
 3254	/* Reset flag to report util_avg has been updated */
 3255	enqueued &= ~UTIL_AVG_UNCHANGED;
 3256	WRITE_ONCE(avg->util_est.enqueued, enqueued);
 3257}
 3258
 3259/*
 3260 * sched_entity:
 3261 *
 3262 *   task:
 3263 *     se_runnable() == se_weight()
 3264 *
 3265 *   group: [ see update_cfs_group() ]
 3266 *     se_weight()   = tg->weight * grq->load_avg / tg->load_avg
 3267 *     se_runnable() = se_weight(se) * grq->runnable_load_avg / grq->load_avg
 3268 *
 3269 *   load_sum := runnable_sum
 3270 *   load_avg = se_weight(se) * runnable_avg
 3271 *
 3272 *   runnable_load_sum := runnable_sum
 3273 *   runnable_load_avg = se_runnable(se) * runnable_avg
 3274 *
 3275 * XXX collapse load_sum and runnable_load_sum
 3276 *
 3277 * cfq_rs:
 3278 *
 3279 *   load_sum = \Sum se_weight(se) * se->avg.load_sum
 3280 *   load_avg = \Sum se->avg.load_avg
 3281 *
 3282 *   runnable_load_sum = \Sum se_runnable(se) * se->avg.runnable_load_sum
 3283 *   runnable_load_avg = \Sum se->avg.runable_load_avg
 3284 */
 3285
 3286static int
 3287__update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se)
 3288{
 3289	if (entity_is_task(se))
 3290		se->runnable_weight = se->load.weight;
 3291
 3292	if (___update_load_sum(now, cpu, &se->avg, 0, 0, 0)) {
 3293		___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
 3294		return 1;
 3295	}
 3296
 3297	return 0;
 3298}
 3299
 3300static int
 3301__update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se)
 3302{
 3303	if (entity_is_task(se))
 3304		se->runnable_weight = se->load.weight;
 3305
 3306	if (___update_load_sum(now, cpu, &se->avg, !!se->on_rq, !!se->on_rq,
 3307				cfs_rq->curr == se)) {
 3308
 3309		___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
 3310		cfs_se_util_change(&se->avg);
 3311		return 1;
 3312	}
 3313
 3314	return 0;
 3315}
 3316
 3317static int
 3318__update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq)
 3319{
 3320	if (___update_load_sum(now, cpu, &cfs_rq->avg,
 3321				scale_load_down(cfs_rq->load.weight),
 3322				scale_load_down(cfs_rq->runnable_weight),
 3323				cfs_rq->curr != NULL)) {
 3324
 3325		___update_load_avg(&cfs_rq->avg, 1, 1);
 3326		return 1;
 3327	}
 3328
 3329	return 0;
 3330}
 3331
 3332#ifdef CONFIG_FAIR_GROUP_SCHED
 3333/**
 3334 * update_tg_load_avg - update the tg's load avg
 3335 * @cfs_rq: the cfs_rq whose avg changed
 3336 * @force: update regardless of how small the difference
 3337 *
 3338 * This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load.
 3339 * However, because tg->load_avg is a global value there are performance
 3340 * considerations.
 3341 *
 3342 * In order to avoid having to look at the other cfs_rq's, we use a
 3343 * differential update where we store the last value we propagated. This in
 3344 * turn allows skipping updates if the differential is 'small'.
 3345 *
 3346 * Updating tg's load_avg is necessary before update_cfs_share().
 3347 */
 3348static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
 3349{
 3350	long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
 3351
 3352	/*
 3353	 * No need to update load_avg for root_task_group as it is not used.
 3354	 */
 3355	if (cfs_rq->tg == &root_task_group)
 3356		return;
 3357
 3358	if (force || abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
 3359		atomic_long_add(delta, &cfs_rq->tg->load_avg);
 3360		cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
 3361	}
 3362}
 3363
 3364/*
 3365 * Called within set_task_rq() right before setting a task's CPU. The
 3366 * caller only guarantees p->pi_lock is held; no other assumptions,
 3367 * including the state of rq->lock, should be made.
 3368 */
 3369void set_task_rq_fair(struct sched_entity *se,
 3370		      struct cfs_rq *prev, struct cfs_rq *next)
 3371{
 3372	u64 p_last_update_time;
 3373	u64 n_last_update_time;
 3374
 3375	if (!sched_feat(ATTACH_AGE_LOAD))
 3376		return;
 3377
 3378	/*
 3379	 * We are supposed to update the task to "current" time, then its up to
 3380	 * date and ready to go to new CPU/cfs_rq. But we have difficulty in
 3381	 * getting what current time is, so simply throw away the out-of-date
 3382	 * time. This will result in the wakee task is less decayed, but giving
 3383	 * the wakee more load sounds not bad.
 3384	 */
 3385	if (!(se->avg.last_update_time && prev))
 3386		return;
 3387
 3388#ifndef CONFIG_64BIT
 3389	{
 3390		u64 p_last_update_time_copy;
 3391		u64 n_last_update_time_copy;
 3392
 3393		do {
 3394			p_last_update_time_copy = prev->load_last_update_time_copy;
 3395			n_last_update_time_copy = next->load_last_update_time_copy;
 3396
 3397			smp_rmb();
 3398
 3399			p_last_update_time = prev->avg.last_update_time;
 3400			n_last_update_time = next->avg.last_update_time;
 3401
 3402		} while (p_last_update_time != p_last_update_time_copy ||
 3403			 n_last_update_time != n_last_update_time_copy);
 3404	}
 3405#else
 3406	p_last_update_time = prev->avg.last_update_time;
 3407	n_last_update_time = next->avg.last_update_time;
 3408#endif
 3409	__update_load_avg_blocked_se(p_last_update_time, cpu_of(rq_of(prev)), se);
 3410	se->avg.last_update_time = n_last_update_time;
 3411}
 3412
 3413
 3414/*
 3415 * When on migration a sched_entity joins/leaves the PELT hierarchy, we need to
 3416 * propagate its contribution. The key to this propagation is the invariant
 3417 * that for each group:
 3418 *
 3419 *   ge->avg == grq->avg						(1)
 3420 *
 3421 * _IFF_ we look at the pure running and runnable sums. Because they
 3422 * represent the very same entity, just at different points in the hierarchy.
 3423 *
 3424 * Per the above update_tg_cfs_util() is trivial and simply copies the running
 3425 * sum over (but still wrong, because the group entity and group rq do not have
 3426 * their PELT windows aligned).
 3427 *
 3428 * However, update_tg_cfs_runnable() is more complex. So we have:
 3429 *
 3430 *   ge->avg.load_avg = ge->load.weight * ge->avg.runnable_avg		(2)
 3431 *
 3432 * And since, like util, the runnable part should be directly transferable,
 3433 * the following would _appear_ to be the straight forward approach:
 3434 *
 3435 *   grq->avg.load_avg = grq->load.weight * grq->avg.runnable_avg	(3)
 3436 *
 3437 * And per (1) we have:
 3438 *
 3439 *   ge->avg.runnable_avg == grq->avg.runnable_avg
 3440 *
 3441 * Which gives:
 3442 *
 3443 *                      ge->load.weight * grq->avg.load_avg
 3444 *   ge->avg.load_avg = -----------------------------------		(4)
 3445 *                               grq->load.weight
 3446 *
 3447 * Except that is wrong!
 3448 *
 3449 * Because while for entities historical weight is not important and we
 3450 * really only care about our future and therefore can consider a pure
 3451 * runnable sum, runqueues can NOT do this.
 3452 *
 3453 * We specifically want runqueues to have a load_avg that includes
 3454 * historical weights. Those represent the blocked load, the load we expect
 3455 * to (shortly) return to us. This only works by keeping the weights as
 3456 * integral part of the sum. We therefore cannot decompose as per (3).
 3457 *
 3458 * Another reason this doesn't work is that runnable isn't a 0-sum entity.
 3459 * Imagine a rq with 2 tasks that each are runnable 2/3 of the time. Then the
 3460 * rq itself is runnable anywhere between 2/3 and 1 depending on how the
 3461 * runnable section of these tasks overlap (or not). If they were to perfectly
 3462 * align the rq as a whole would be runnable 2/3 of the time. If however we
 3463 * always have at least 1 runnable task, the rq as a whole is always runnable.
 3464 *
 3465 * So we'll have to approximate.. :/
 3466 *
 3467 * Given the constraint:
 3468 *
 3469 *   ge->avg.running_sum <= ge->avg.runnable_sum <= LOAD_AVG_MAX
 3470 *
 3471 * We can construct a rule that adds runnable to a rq by assuming minimal
 3472 * overlap.
 3473 *
 3474 * On removal, we'll assume each task is equally runnable; which yields:
 3475 *
 3476 *   grq->avg.runnable_sum = grq->avg.load_sum / grq->load.weight
 3477 *
 3478 * XXX: only do this for the part of runnable > running ?
 3479 *
 3480 */
 3481
 3482static inline void
 3483update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
 3484{
 3485	long delta = gcfs_rq->avg.util_avg - se->avg.util_avg;
 3486
 3487	/* Nothing to update */
 3488	if (!delta)
 3489		return;
 3490
 3491	/*
 3492	 * The relation between sum and avg is:
 3493	 *
 3494	 *   LOAD_AVG_MAX - 1024 + sa->period_contrib
 3495	 *
 3496	 * however, the PELT windows are not aligned between grq and gse.
 3497	 */
 3498
 3499	/* Set new sched_entity's utilization */
 3500	se->avg.util_avg = gcfs_rq->avg.util_avg;
 3501	se->avg.util_sum = se->avg.util_avg * LOAD_AVG_MAX;
 3502
 3503	/* Update parent cfs_rq utilization */
 3504	add_positive(&cfs_rq->avg.util_avg, delta);
 3505	cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX;
 3506}
 3507
 3508static inline void
 3509update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
 3510{
 3511	long delta_avg, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum;
 3512	unsigned long runnable_load_avg, load_avg;
 3513	u64 runnable_load_sum, load_sum = 0;
 3514	s64 delta_sum;
 3515
 3516	if (!runnable_sum)
 3517		return;
 3518
 3519	gcfs_rq->prop_runnable_sum = 0;
 3520
 3521	if (runnable_sum >= 0) {
 3522		/*
 3523		 * Add runnable; clip at LOAD_AVG_MAX. Reflects that until
 3524		 * the CPU is saturated running == runnable.
 3525		 */
 3526		runnable_sum += se->avg.load_sum;
 3527		runnable_sum = min(runnable_sum, (long)LOAD_AVG_MAX);
 3528	} else {
 3529		/*
 3530		 * Estimate the new unweighted runnable_sum of the gcfs_rq by
 3531		 * assuming all tasks are equally runnable.
 3532		 */
 3533		if (scale_load_down(gcfs_rq->load.weight)) {
 3534			load_sum = div_s64(gcfs_rq->avg.load_sum,
 3535				scale_load_down(gcfs_rq->load.weight));
 3536		}
 3537
 3538		/* But make sure to not inflate se's runnable */
 3539		runnable_sum = min(se->avg.load_sum, load_sum);
 3540	}
 3541
 3542	/*
 3543	 * runnable_sum can't be lower than running_sum
 3544	 * As running sum is scale with CPU capacity wehreas the runnable sum
 3545	 * is not we rescale running_sum 1st
 3546	 */
 3547	running_sum = se->avg.util_sum /
 3548		arch_scale_cpu_capacity(NULL, cpu_of(rq_of(cfs_rq)));
 3549	runnable_sum = max(runnable_sum, running_sum);
 3550
 3551	load_sum = (s64)se_weight(se) * runnable_sum;
 3552	load_avg = div_s64(load_sum, LOAD_AVG_MAX);
 3553
 3554	delta_sum = load_sum - (s64)se_weight(se) * se->avg.load_sum;
 3555	delta_avg = load_avg - se->avg.load_avg;
 3556
 3557	se->avg.load_sum = runnable_sum;
 3558	se->avg.load_avg = load_avg;
 3559	add_positive(&cfs_rq->avg.load_avg, delta_avg);
 3560	add_positive(&cfs_rq->avg.load_sum, delta_sum);
 3561
 3562	runnable_load_sum = (s64)se_runnable(se) * runnable_sum;
 3563	runnable_load_avg = div_s64(runnable_load_sum, LOAD_AVG_MAX);
 3564	delta_sum = runnable_load_sum - se_weight(se) * se->avg.runnable_load_sum;
 3565	delta_avg = runnable_load_avg - se->avg.runnable_load_avg;
 3566
 3567	se->avg.runnable_load_sum = runnable_sum;
 3568	se->avg.runnable_load_avg = runnable_load_avg;
 3569
 3570	if (se->on_rq) {
 3571		add_positive(&cfs_rq->avg.runnable_load_avg, delta_avg);
 3572		add_positive(&cfs_rq->avg.runnable_load_sum, delta_sum);
 3573	}
 3574}
 3575
 3576static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum)
 3577{
 3578	cfs_rq->propagate = 1;
 3579	cfs_rq->prop_runnable_sum += runnable_sum;
 3580}
 3581
 3582/* Update task and its cfs_rq load average */
 3583static inline int propagate_entity_load_avg(struct sched_entity *se)
 3584{
 3585	struct cfs_rq *cfs_rq, *gcfs_rq;
 3586
 3587	if (entity_is_task(se))
 3588		return 0;
 3589
 3590	gcfs_rq = group_cfs_rq(se);
 3591	if (!gcfs_rq->propagate)
 3592		return 0;
 3593
 3594	gcfs_rq->propagate = 0;
 3595
 3596	cfs_rq = cfs_rq_of(se);
 3597
 3598	add_tg_cfs_propagate(cfs_rq, gcfs_rq->prop_runnable_sum);
 3599
 3600	update_tg_cfs_util(cfs_rq, se, gcfs_rq);
 3601	update_tg_cfs_runnable(cfs_rq, se, gcfs_rq);
 3602
 3603	return 1;
 3604}
 3605
 3606/*
 3607 * Check if we need to update the load and the utilization of a blocked
 3608 * group_entity:
 3609 */
 3610static inline bool skip_blocked_update(struct sched_entity *se)
 3611{
 3612	struct cfs_rq *gcfs_rq = group_cfs_rq(se);
 3613
 3614	/*
 3615	 * If sched_entity still have not zero load or utilization, we have to
 3616	 * decay it:
 3617	 */
 3618	if (se->avg.load_avg || se->avg.util_avg)
 3619		return false;
 3620
 3621	/*
 3622	 * If there is a pending propagation, we have to update the load and
 3623	 * the utilization of the sched_entity:
 3624	 */
 3625	if (gcfs_rq->propagate)
 3626		return false;
 3627
 3628	/*
 3629	 * Otherwise, the load and the utilization of the sched_entity is
 3630	 * already zero and there is no pending propagation, so it will be a
 3631	 * waste of time to try to decay it:
 3632	 */
 3633	return true;
 3634}
 3635
 3636#else /* CONFIG_FAIR_GROUP_SCHED */
 3637
 3638static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
 3639
 3640static inline int propagate_entity_load_avg(struct sched_entity *se)
 3641{
 3642	return 0;
 3643}
 3644
 3645static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) {}
 3646
 3647#endif /* CONFIG_FAIR_GROUP_SCHED */
 3648
 3649/**
 3650 * update_cfs_rq_load_avg - update the cfs_rq's load/util averages
 3651 * @now: current time, as per cfs_rq_clock_task()
 3652 * @cfs_rq: cfs_rq to update
 3653 *
 3654 * The cfs_rq avg is the direct sum of all its entities (blocked and runnable)
 3655 * avg. The immediate corollary is that all (fair) tasks must be attached, see
 3656 * post_init_entity_util_avg().
 3657 *
 3658 * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example.
 3659 *
 3660 * Returns true if the load decayed or we removed load.
 3661 *
 3662 * Since both these conditions indicate a changed cfs_rq->avg.load we should
 3663 * call update_tg_load_avg() when this function returns true.
 3664 */
 3665static inline int
 3666update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
 3667{
 3668	unsigned long removed_load = 0, removed_util = 0, removed_runnable_sum = 0;
 3669	struct sched_avg *sa = &cfs_rq->avg;
 3670	int decayed = 0;
 3671
 3672	if (cfs_rq->removed.nr) {
 3673		unsigned long r;
 3674		u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib;
 3675
 3676		raw_spin_lock(&cfs_rq->removed.lock);
 3677		swap(cfs_rq->removed.util_avg, removed_util);
 3678		swap(cfs_rq->removed.load_avg, removed_load);
 3679		swap(cfs_rq->removed.runnable_sum, removed_runnable_sum);
 3680		cfs_rq->removed.nr = 0;
 3681		raw_spin_unlock(&cfs_rq->removed.lock);
 3682
 3683		r = removed_load;
 3684		sub_positive(&sa->load_avg, r);
 3685		sub_positive(&sa->load_sum, r * divider);
 3686
 3687		r = removed_util;
 3688		sub_positive(&sa->util_avg, r);
 3689		sub_positive(&sa->util_sum, r * divider);
 3690
 3691		add_tg_cfs_propagate(cfs_rq, -(long)removed_runnable_sum);
 3692
 3693		decayed = 1;
 3694	}
 3695
 3696	decayed |= __update_load_avg_cfs_rq(now, cpu_of(rq_of(cfs_rq)), cfs_rq);
 3697
 3698#ifndef CONFIG_64BIT
 3699	smp_wmb();
 3700	cfs_rq->load_last_update_time_copy = sa->last_update_time;
 3701#endif
 3702
 3703	if (decayed)
 3704		cfs_rq_util_change(cfs_rq, 0);
 3705
 3706	return decayed;
 3707}
 3708
 3709/**
 3710 * attach_entity_load_avg - attach this entity to its cfs_rq load avg
 3711 * @cfs_rq: cfs_rq to attach to
 3712 * @se: sched_entity to attach
 3713 *
 3714 * Must call update_cfs_rq_load_avg() before this, since we rely on
 3715 * cfs_rq->avg.last_update_time being current.
 3716 */
 3717static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 3718{
 3719	u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib;
 3720
 3721	/*
 3722	 * When we attach the @se to the @cfs_rq, we must align the decay
 3723	 * window because without that, really weird and wonderful things can
 3724	 * happen.
 3725	 *
 3726	 * XXX illustrate
 3727	 */
 3728	se->avg.last_update_time = cfs_rq->avg.last_update_time;
 3729	se->avg.period_contrib = cfs_rq->avg.period_contrib;
 3730
 3731	/*
 3732	 * Hell(o) Nasty stuff.. we need to recompute _sum based on the new
 3733	 * period_contrib. This isn't strictly correct, but since we're
 3734	 * entirely outside of the PELT hierarchy, nobody cares if we truncate
 3735	 * _sum a little.
 3736	 */
 3737	se->avg.util_sum = se->avg.util_avg * divider;
 3738
 3739	se->avg.load_sum = divider;
 3740	if (se_weight(se)) {
 3741		se->avg.load_sum =
 3742			div_u64(se->avg.load_avg * se->avg.load_sum, se_weight(se));
 3743	}
 3744
 3745	se->avg.runnable_load_sum = se->avg.load_sum;
 3746
 3747	enqueue_load_avg(cfs_rq, se);
 3748	cfs_rq->avg.util_avg += se->avg.util_avg;
 3749	cfs_rq->avg.util_sum += se->avg.util_sum;
 3750
 3751	add_tg_cfs_propagate(cfs_rq, se->avg.load_sum);
 3752
 3753	cfs_rq_util_change(cfs_rq, flags);
 3754}
 3755
 3756/**
 3757 * detach_entity_load_avg - detach this entity from its cfs_rq load avg
 3758 * @cfs_rq: cfs_rq to detach from
 3759 * @se: sched_entity to detach
 3760 *
 3761 * Must call update_cfs_rq_load_avg() before this, since we rely on
 3762 * cfs_rq->avg.last_update_time being current.
 3763 */
 3764static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
 3765{
 3766	dequeue_load_avg(cfs_rq, se);
 3767	sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
 3768	sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
 3769
 3770	add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum);
 3771
 3772	cfs_rq_util_change(cfs_rq, 0);
 3773}
 3774
 3775/*
 3776 * Optional action to be done while updating the load average
 3777 */
 3778#define UPDATE_TG	0x1
 3779#define SKIP_AGE_LOAD	0x2
 3780#define DO_ATTACH	0x4
 3781
 3782/* Update task and its cfs_rq load average */
 3783static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 3784{
 3785	u64 now = cfs_rq_clock_task(cfs_rq);
 3786	struct rq *rq = rq_of(cfs_rq);
 3787	int cpu = cpu_of(rq);
 3788	int decayed;
 3789
 3790	/*
 3791	 * Track task load average for carrying it to new CPU after migrated, and
 3792	 * track group sched_entity load average for task_h_load calc in migration
 3793	 */
 3794	if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
 3795		__update_load_avg_se(now, cpu, cfs_rq, se);
 3796
 3797	decayed  = update_cfs_rq_load_avg(now, cfs_rq);
 3798	decayed |= propagate_entity_load_avg(se);
 3799
 3800	if (!se->avg.last_update_time && (flags & DO_ATTACH)) {
 3801
 3802		/*
 3803		 * DO_ATTACH means we're here from enqueue_entity().
 3804		 * !last_update_time means we've passed through
 3805		 * migrate_task_rq_fair() indicating we migrated.
 3806		 *
 3807		 * IOW we're enqueueing a task on a new CPU.
 3808		 */
 3809		attach_entity_load_avg(cfs_rq, se, SCHED_CPUFREQ_MIGRATION);
 3810		update_tg_load_avg(cfs_rq, 0);
 3811
 3812	} else if (decayed && (flags & UPDATE_TG))
 3813		update_tg_load_avg(cfs_rq, 0);
 3814}
 3815
 3816#ifndef CONFIG_64BIT
 3817static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
 3818{
 3819	u64 last_update_time_copy;
 3820	u64 last_update_time;
 3821
 3822	do {
 3823		last_update_time_copy = cfs_rq->load_last_update_time_copy;
 3824		smp_rmb();
 3825		last_update_time = cfs_rq->avg.last_update_time;
 3826	} while (last_update_time != last_update_time_copy);
 3827
 3828	return last_update_time;
 3829}
 3830#else
 3831static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
 3832{
 3833	return cfs_rq->avg.last_update_time;
 3834}
 3835#endif
 3836
 3837/*
 3838 * Synchronize entity load avg of dequeued entity without locking
 3839 * the previous rq.
 3840 */
 3841void sync_entity_load_avg(struct sched_entity *se)
 3842{
 3843	struct cfs_rq *cfs_rq = cfs_rq_of(se);
 3844	u64 last_update_time;
 3845
 3846	last_update_time = cfs_rq_last_update_time(cfs_rq);
 3847	__update_load_avg_blocked_se(last_update_time, cpu_of(rq_of(cfs_rq)), se);
 3848}
 3849
 3850/*
 3851 * Task first catches up with cfs_rq, and then subtract
 3852 * itself from the cfs_rq (task must be off the queue now).
 3853 */
 3854void remove_entity_load_avg(struct sched_entity *se)
 3855{
 3856	struct cfs_rq *cfs_rq = cfs_rq_of(se);
 3857	unsigned long flags;
 3858
 3859	/*
 3860	 * tasks cannot exit without having gone through wake_up_new_task() ->
 3861	 * post_init_entity_util_avg() which will have added things to the
 3862	 * cfs_rq, so we can remove unconditionally.
 3863	 *
 3864	 * Similarly for groups, they will have passed through
 3865	 * post_init_entity_util_avg() before unregister_sched_fair_group()
 3866	 * calls this.
 3867	 */
 3868
 3869	sync_entity_load_avg(se);
 3870
 3871	raw_spin_lock_irqsave(&cfs_rq->removed.lock, flags);
 3872	++cfs_rq->removed.nr;
 3873	cfs_rq->removed.util_avg	+= se->avg.util_avg;
 3874	cfs_rq->removed.load_avg	+= se->avg.load_avg;
 3875	cfs_rq->removed.runnable_sum	+= se->avg.load_sum; /* == runnable_sum */
 3876	raw_spin_unlock_irqrestore(&cfs_rq->removed.lock, flags);
 3877}
 3878
 3879static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq)
 3880{
 3881	return cfs_rq->avg.runnable_load_avg;
 3882}
 3883
 3884static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
 3885{
 3886	return cfs_rq->avg.load_avg;
 3887}
 3888
 3889static int idle_balance(struct rq *this_rq, struct rq_flags *rf);
 3890
 3891static inline unsigned long task_util(struct task_struct *p)
 3892{
 3893	return READ_ONCE(p->se.avg.util_avg);
 3894}
 3895
 3896static inline unsigned long _task_util_est(struct task_struct *p)
 3897{
 3898	struct util_est ue = READ_ONCE(p->se.avg.util_est);
 3899
 3900	return max(ue.ewma, ue.enqueued);
 3901}
 3902
 3903static inline unsigned long task_util_est(struct task_struct *p)
 3904{
 3905	return max(task_util(p), _task_util_est(p));
 3906}
 3907
 3908static inline void util_est_enqueue(struct cfs_rq *cfs_rq,
 3909				    struct task_struct *p)
 3910{
 3911	unsigned int enqueued;
 3912
 3913	if (!sched_feat(UTIL_EST))
 3914		return;
 3915
 3916	/* Update root cfs_rq's estimated utilization */
 3917	enqueued  = cfs_rq->avg.util_est.enqueued;
 3918	enqueued += (_task_util_est(p) | UTIL_AVG_UNCHANGED);
 3919	WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);
 3920}
 3921
 3922/*
 3923 * Check if a (signed) value is within a specified (unsigned) margin,
 3924 * based on the observation that:
 3925 *
 3926 *     abs(x) < y := (unsigned)(x + y - 1) < (2 * y - 1)
 3927 *
 3928 * NOTE: this only works when value + maring < INT_MAX.
 3929 */
 3930static inline bool within_margin(int value, int margin)
 3931{
 3932	return ((unsigned int)(value + margin - 1) < (2 * margin - 1));
 3933}
 3934
 3935static void
 3936util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
 3937{
 3938	long last_ewma_diff;
 3939	struct util_est ue;
 3940
 3941	if (!sched_feat(UTIL_EST))
 3942		return;
 3943
 3944	/*
 3945	 * Update root cfs_rq's estimated utilization
 3946	 *
 3947	 * If *p is the last task then the root cfs_rq's estimated utilization
 3948	 * of a CPU is 0 by definition.
 3949	 */
 3950	ue.enqueued = 0;
 3951	if (cfs_rq->nr_running) {
 3952		ue.enqueued  = cfs_rq->avg.util_est.enqueued;
 3953		ue.enqueued -= min_t(unsigned int, ue.enqueued,
 3954				     (_task_util_est(p) | UTIL_AVG_UNCHANGED));
 3955	}
 3956	WRITE_ONCE(cfs_rq->avg.util_est.enqueued, ue.enqueued);
 3957
 3958	/*
 3959	 * Skip update of task's estimated utilization when the task has not
 3960	 * yet completed an activation, e.g. being migrated.
 3961	 */
 3962	if (!task_sleep)
 3963		return;
 3964
 3965	/*
 3966	 * If the PELT values haven't changed since enqueue time,
 3967	 * skip the util_est update.
 3968	 */
 3969	ue = p->se.avg.util_est;
 3970	if (ue.enqueued & UTIL_AVG_UNCHANGED)
 3971		return;
 3972
 3973	/*
 3974	 * Skip update of task's estimated utilization when its EWMA is
 3975	 * already ~1% close to its last activation value.
 3976	 */
 3977	ue.enqueued = (task_util(p) | UTIL_AVG_UNCHANGED);
 3978	last_ewma_diff = ue.enqueued - ue.ewma;
 3979	if (within_margin(last_ewma_diff, (SCHED_CAPACITY_SCALE / 100)))
 3980		return;
 3981
 3982	/*
 3983	 * Update Task's estimated utilization
 3984	 *
 3985	 * When *p completes an activation we can consolidate another sample
 3986	 * of the task size. This is done by storing the current PELT value
 3987	 * as ue.enqueued and by using this value to update the Exponential
 3988	 * Weighted Moving Average (EWMA):
 3989	 *
 3990	 *  ewma(t) = w *  task_util(p) + (1-w) * ewma(t-1)
 3991	 *          = w *  task_util(p) +         ewma(t-1)  - w * ewma(t-1)
 3992	 *          = w * (task_util(p) -         ewma(t-1)) +     ewma(t-1)
 3993	 *          = w * (      last_ewma_diff            ) +     ewma(t-1)
 3994	 *          = w * (last_ewma_diff  +  ewma(t-1) / w)
 3995	 *
 3996	 * Where 'w' is the weight of new samples, which is configured to be
 3997	 * 0.25, thus making w=1/4 ( >>= UTIL_EST_WEIGHT_SHIFT)
 3998	 */
 3999	ue.ewma <<= UTIL_EST_WEIGHT_SHIFT;
 4000	ue.ewma  += last_ewma_diff;
 4001	ue.ewma >>= UTIL_EST_WEIGHT_SHIFT;
 4002	WRITE_ONCE(p->se.avg.util_est, ue);
 4003}
 4004
 4005#else /* CONFIG_SMP */
 4006
 4007static inline int
 4008update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
 4009{
 4010	return 0;
 4011}
 4012
 4013#define UPDATE_TG	0x0
 4014#define SKIP_AGE_LOAD	0x0
 4015#define DO_ATTACH	0x0
 4016
 4017static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int not_used1)
 4018{
 4019	cfs_rq_util_change(cfs_rq, 0);
 4020}
 4021
 4022static inline void remove_entity_load_avg(struct sched_entity *se) {}
 4023
 4024static inline void
 4025attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) {}
 4026static inline void
 4027detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
 4028
 4029static inline int idle_balance(struct rq *rq, struct rq_flags *rf)
 4030{
 4031	return 0;
 4032}
 4033
 4034static inline void
 4035util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {}
 4036
 4037static inline void
 4038util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p,
 4039		 bool task_sleep) {}
 4040
 4041#endif /* CONFIG_SMP */
 4042
 4043static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
 4044{
 4045#ifdef CONFIG_SCHED_DEBUG
 4046	s64 d = se->vruntime - cfs_rq->min_vruntime;
 4047
 4048	if (d < 0)
 4049		d = -d;
 4050
 4051	if (d > 3*sysctl_sched_latency)
 4052		schedstat_inc(cfs_rq->nr_spread_over);
 4053#endif
 4054}
 4055
 4056static void
 4057place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
 4058{
 4059	u64 vruntime = cfs_rq->min_vruntime;
 4060
 4061	/*
 4062	 * The 'current' period is already promised to the current tasks,
 4063	 * however the extra weight of the new task will slow them down a
 4064	 * little, place the new task so that it fits in the slot that
 4065	 * stays open at the end.
 4066	 */
 4067	if (initial && sched_feat(START_DEBIT))
 4068		vruntime += sched_vslice(cfs_rq, se);
 4069
 4070	/* sleeps up to a single latency don't count. */
 4071	if (!initial) {
 4072		unsigned long thresh = sysctl_sched_latency;
 4073
 4074		/*
 4075		 * Halve their sleep time's effect, to allow
 4076		 * for a gentler effect of sleepers:
 4077		 */
 4078		if (sched_feat(GENTLE_FAIR_SLEEPERS))
 4079			thresh >>= 1;
 4080
 4081		vruntime -= thresh;
 4082	}
 4083
 4084	/* ensure we never gain time by being placed backwards. */
 4085	se->vruntime = max_vruntime(se->vruntime, vruntime);
 4086}
 4087
 4088static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
 4089
 4090static inline void check_schedstat_required(void)
 4091{
 4092#ifdef CONFIG_SCHEDSTATS
 4093	if (schedstat_enabled())
 4094		return;
 4095
 4096	/* Force schedstat enabled if a dependent tracepoint is active */
 4097	if (trace_sched_stat_wait_enabled()    ||
 4098			trace_sched_stat_sleep_enabled()   ||
 4099			trace_sched_stat_iowait_enabled()  ||
 4100			trace_sched_stat_blocked_enabled() ||
 4101			trace_sched_stat_runtime_enabled())  {
 4102		printk_deferred_once("Scheduler tracepoints stat_sleep, stat_iowait, "
 4103			     "stat_blocked and stat_runtime require the "
 4104			     "kernel parameter schedstats=enable or "
 4105			     "kernel.sched_schedstats=1\n");
 4106	}
 4107#endif
 4108}
 4109
 4110
 4111/*
 4112 * MIGRATION
 4113 *
 4114 *	dequeue
 4115 *	  update_curr()
 4116 *	    update_min_vruntime()
 4117 *	  vruntime -= min_vruntime
 4118 *
 4119 *	enqueue
 4120 *	  update_curr()
 4121 *	    update_min_vruntime()
 4122 *	  vruntime += min_vruntime
 4123 *
 4124 * this way the vruntime transition between RQs is done when both
 4125 * min_vruntime are up-to-date.
 4126 *
 4127 * WAKEUP (remote)
 4128 *
 4129 *	->migrate_task_rq_fair() (p->state == TASK_WAKING)
 4130 *	  vruntime -= min_vruntime
 4131 *
 4132 *	enqueue
 4133 *	  update_curr()
 4134 *	    update_min_vruntime()
 4135 *	  vruntime += min_vruntime
 4136 *
 4137 * this way we don't have the most up-to-date min_vruntime on the originating
 4138 * CPU and an up-to-date min_vruntime on the destination CPU.
 4139 */
 4140
 4141static void
 4142enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 4143{
 4144	bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATED);
 4145	bool curr = cfs_rq->curr == se;
 4146
 4147	/*
 4148	 * If we're the current task, we must renormalise before calling
 4149	 * update_curr().
 4150	 */
 4151	if (renorm && curr)
 4152		se->vruntime += cfs_rq->min_vruntime;
 4153
 4154	update_curr(cfs_rq);
 4155
 4156	/*
 4157	 * Otherwise, renormalise after, such that we're placed at the current
 4158	 * moment in time, instead of some random moment in the past. Being
 4159	 * placed in the past could significantly boost this task to the
 4160	 * fairness detriment of existing tasks.
 4161	 */
 4162	if (renorm && !curr)
 4163		se->vruntime += cfs_rq->min_vruntime;
 4164
 4165	/*
 4166	 * When enqueuing a sched_entity, we must:
 4167	 *   - Update loads to have both entity and cfs_rq synced with now.
 4168	 *   - Add its load to cfs_rq->runnable_avg
 4169	 *   - For group_entity, update its weight to reflect the new share of
 4170	 *     its group cfs_rq
 4171	 *   - Add its new weight to cfs_rq->load.weight
 4172	 */
 4173	update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH);
 4174	update_cfs_group(se);
 4175	enqueue_runnable_load_avg(cfs_rq, se);
 4176	account_entity_enqueue(cfs_rq, se);
 4177
 4178	if (flags & ENQUEUE_WAKEUP)
 4179		place_entity(cfs_rq, se, 0);
 4180
 4181	check_schedstat_required();
 4182	update_stats_enqueue(cfs_rq, se, flags);
 4183	check_spread(cfs_rq, se);
 4184	if (!curr)
 4185		__enqueue_entity(cfs_rq, se);
 4186	se->on_rq = 1;
 4187
 4188	if (cfs_rq->nr_running == 1) {
 4189		list_add_leaf_cfs_rq(cfs_rq);
 4190		check_enqueue_throttle(cfs_rq);
 4191	}
 4192}
 4193
 4194static void __clear_buddies_last(struct sched_entity *se)
 4195{
 4196	for_each_sched_entity(se) {
 4197		struct cfs_rq *cfs_rq = cfs_rq_of(se);
 4198		if (cfs_rq->last != se)
 4199			break;
 4200
 4201		cfs_rq->last = NULL;
 4202	}
 4203}
 4204
 4205static void __clear_buddies_next(struct sched_entity *se)
 4206{
 4207	for_each_sched_entity(se) {
 4208		struct cfs_rq *cfs_rq = cfs_rq_of(se);
 4209		if (cfs_rq->next != se)
 4210			break;
 4211
 4212		cfs_rq->next = NULL;
 4213	}
 4214}
 4215
 4216static void __clear_buddies_skip(struct sched_entity *se)
 4217{
 4218	for_each_sched_entity(se) {
 4219		struct cfs_rq *cfs_rq = cfs_rq_of(se);
 4220		if (cfs_rq->skip != se)
 4221			break;
 4222
 4223		cfs_rq->skip = NULL;
 4224	}
 4225}
 4226
 4227static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
 4228{
 4229	if (cfs_rq->last == se)
 4230		__clear_buddies_last(se);
 4231
 4232	if (cfs_rq->next == se)
 4233		__clear_buddies_next(se);
 4234
 4235	if (cfs_rq->skip == se)
 4236		__clear_buddies_skip(se);
 4237}
 4238
 4239static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
 4240
 4241static void
 4242dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 4243{
 4244	/*
 4245	 * Update run-time statistics of the 'current'.
 4246	 */
 4247	update_curr(cfs_rq);
 4248
 4249	/*
 4250	 * When dequeuing a sched_entity, we must:
 4251	 *   - Update loads to have both entity and cfs_rq synced with now.
 4252	 *   - Substract its load from the cfs_rq->runnable_avg.
 4253	 *   - Substract its previous weight from cfs_rq->load.weight.
 4254	 *   - For group entity, update its weight to reflect the new share
 4255	 *     of its group cfs_rq.
 4256	 */
 4257	update_load_avg(cfs_rq, se, UPDATE_TG);
 4258	dequeue_runnable_load_avg(cfs_rq, se);
 4259
 4260	update_stats_dequeue(cfs_rq, se, flags);
 4261
 4262	clear_buddies(cfs_rq, se);
 4263
 4264	if (se != cfs_rq->curr)
 4265		__dequeue_entity(cfs_rq, se);
 4266	se->on_rq = 0;
 4267	account_entity_dequeue(cfs_rq, se);
 4268
 4269	/*
 4270	 * Normalize after update_curr(); which will also have moved
 4271	 * min_vruntime if @se is the one holding it back. But before doing
 4272	 * update_min_vruntime() again, which will discount @se's position and
 4273	 * can move min_vruntime forward still more.
 4274	 */
 4275	if (!(flags & DEQUEUE_SLEEP))
 4276		se->vruntime -= cfs_rq->min_vruntime;
 4277
 4278	/* return excess runtime on last dequeue */
 4279	return_cfs_rq_runtime(cfs_rq);
 4280
 4281	update_cfs_group(se);
 4282
 4283	/*
 4284	 * Now advance min_vruntime if @se was the entity holding it back,
 4285	 * except when: DEQUEUE_SAVE && !DEQUEUE_MOVE, in this case we'll be
 4286	 * put back on, and if we advance min_vruntime, we'll be placed back
 4287	 * further than we started -- ie. we'll be penalized.
 4288	 */
 4289	if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE)
 4290		update_min_vruntime(cfs_rq);
 4291}
 4292
 4293/*
 4294 * Preempt the current task with a newly woken task if needed:
 4295 */
 4296static void
 4297check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
 4298{
 4299	unsigned long ideal_runtime, delta_exec;
 4300	struct sched_entity *se;
 4301	s64 delta;
 4302
 4303	ideal_runtime = sched_slice(cfs_rq, curr);
 4304	delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
 4305	if (delta_exec > ideal_runtime) {
 4306		resched_curr(rq_of(cfs_rq));
 4307		/*
 4308		 * The current task ran long enough, ensure it doesn't get
 4309		 * re-elected due to buddy favours.
 4310		 */
 4311		clear_buddies(cfs_rq, curr);
 4312		return;
 4313	}
 4314
 4315	/*
 4316	 * Ensure that a task that missed wakeup preemption by a
 4317	 * narrow margin doesn't have to wait for a full slice.
 4318	 * This also mitigates buddy induced latencies under load.
 4319	 */
 4320	if (delta_exec < sysctl_sched_min_granularity)
 4321		return;
 4322
 4323	se = __pick_first_entity(cfs_rq);
 4324	delta = curr->vruntime - se->vruntime;
 4325
 4326	if (delta < 0)
 4327		return;
 4328
 4329	if (delta > ideal_runtime)
 4330		resched_curr(rq_of(cfs_rq));
 4331}
 4332
 4333static void
 4334set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 4335{
 4336	/* 'current' is not kept within the tree. */
 4337	if (se->on_rq) {
 4338		/*
 4339		 * Any task has to be enqueued before it get to execute on
 4340		 * a CPU. So account for the time it spent waiting on the
 4341		 * runqueue.
 4342		 */
 4343		update_stats_wait_end(cfs_rq, se);
 4344		__dequeue_entity(cfs_rq, se);
 4345		update_load_avg(cfs_rq, se, UPDATE_TG);
 4346	}
 4347
 4348	update_stats_curr_start(cfs_rq, se);
 4349	cfs_rq->curr = se;
 4350
 4351	/*
 4352	 * Track our maximum slice length, if the CPU's load is at
 4353	 * least twice that of our own weight (i.e. dont track it
 4354	 * when there are only lesser-weight tasks around):
 4355	 */
 4356	if (schedstat_enabled() && rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
 4357		schedstat_set(se->statistics.slice_max,
 4358			max((u64)schedstat_val(se->statistics.slice_max),
 4359			    se->sum_exec_runtime - se->prev_sum_exec_runtime));
 4360	}
 4361
 4362	se->prev_sum_exec_runtime = se->sum_exec_runtime;
 4363}
 4364
 4365static int
 4366wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
 4367
 4368/*
 4369 * Pick the next process, keeping these things in mind, in this order:
 4370 * 1) keep things fair between processes/task groups
 4371 * 2) pick the "next" process, since someone really wants that to run
 4372 * 3) pick the "last" process, for cache locality
 4373 * 4) do not run the "skip" process, if something else is available
 4374 */
 4375static struct sched_entity *
 4376pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
 4377{
 4378	struct sched_entity *left = __pick_first_entity(cfs_rq);
 4379	struct sched_entity *se;
 4380
 4381	/*
 4382	 * If curr is set we have to see if its left of the leftmost entity
 4383	 * still in the tree, provided there was anything in the tree at all.
 4384	 */
 4385	if (!left || (curr && entity_before(curr, left)))
 4386		left = curr;
 4387
 4388	se = left; /* ideally we run the leftmost entity */
 4389
 4390	/*
 4391	 * Avoid running the skip buddy, if running something else can
 4392	 * be done without getting too unfair.
 4393	 */
 4394	if (cfs_rq->skip == se) {
 4395		struct sched_entity *second;
 4396
 4397		if (se == curr) {
 4398			second = __pick_first_entity(cfs_rq);
 4399		} else {
 4400			second = __pick_next_entity(se);
 4401			if (!second || (curr && entity_before(curr, second)))
 4402				second = curr;
 4403		}
 4404
 4405		if (second && wakeup_preempt_entity(second, left) < 1)
 4406			se = second;
 4407	}
 4408
 4409	/*
 4410	 * Prefer last buddy, try to return the CPU to a preempted task.
 4411	 */
 4412	if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
 4413		se = cfs_rq->last;
 4414
 4415	/*
 4416	 * Someone really wants this to run. If it's not unfair, run it.
 4417	 */
 4418	if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
 4419		se = cfs_rq->next;
 4420
 4421	clear_buddies(cfs_rq, se);
 4422
 4423	return se;
 4424}
 4425
 4426static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
 4427
 4428static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
 4429{
 4430	/*
 4431	 * If still on the runqueue then deactivate_task()
 4432	 * was not called and update_curr() has to be done:
 4433	 */
 4434	if (prev->on_rq)
 4435		update_curr(cfs_rq);
 4436
 4437	/* throttle cfs_rqs exceeding runtime */
 4438	check_cfs_rq_runtime(cfs_rq);
 4439
 4440	check_spread(cfs_rq, prev);
 4441
 4442	if (prev->on_rq) {
 4443		update_stats_wait_start(cfs_rq, prev);
 4444		/* Put 'current' back into the tree. */
 4445		__enqueue_entity(cfs_rq, prev);
 4446		/* in !on_rq case, update occurred at dequeue */
 4447		update_load_avg(cfs_rq, prev, 0);
 4448	}
 4449	cfs_rq->curr = NULL;
 4450}
 4451
 4452static void
 4453entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
 4454{
 4455	/*
 4456	 * Update run-time statistics of the 'current'.
 4457	 */
 4458	update_curr(cfs_rq);
 4459
 4460	/*
 4461	 * Ensure that runnable average is periodically updated.
 4462	 */
 4463	update_load_avg(cfs_rq, curr, UPDATE_TG);
 4464	update_cfs_group(curr);
 4465
 4466#ifdef CONFIG_SCHED_HRTICK
 4467	/*
 4468	 * queued ticks are scheduled to match the slice, so don't bother
 4469	 * validating it and just reschedule.
 4470	 */
 4471	if (queued) {
 4472		resched_curr(rq_of(cfs_rq));
 4473		return;
 4474	}
 4475	/*
 4476	 * don't let the period tick interfere with the hrtick preemption
 4477	 */
 4478	if (!sched_feat(DOUBLE_TICK) &&
 4479			hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
 4480		return;
 4481#endif
 4482
 4483	if (cfs_rq->nr_running > 1)
 4484		check_preempt_tick(cfs_rq, curr);
 4485}
 4486
 4487
 4488/**************************************************
 4489 * CFS bandwidth control machinery
 4490 */
 4491
 4492#ifdef CONFIG_CFS_BANDWIDTH
 4493
 4494#ifdef HAVE_JUMP_LABEL
 4495static struct static_key __cfs_bandwidth_used;
 4496
 4497static inline bool cfs_bandwidth_used(void)
 4498{
 4499	return static_key_false(&__cfs_bandwidth_used);
 4500}
 4501
 4502void cfs_bandwidth_usage_inc(void)
 4503{
 4504	static_key_slow_inc_cpuslocked(&__cfs_bandwidth_used);
 4505}
 4506
 4507void cfs_bandwidth_usage_dec(void)
 4508{
 4509	static_key_slow_dec_cpuslocked(&__cfs_bandwidth_used);
 4510}
 4511#else /* HAVE_JUMP_LABEL */
 4512static bool cfs_bandwidth_used(void)
 4513{
 4514	return true;
 4515}
 4516
 4517void cfs_bandwidth_usage_inc(void) {}
 4518void cfs_bandwidth_usage_dec(void) {}
 4519#endif /* HAVE_JUMP_LABEL */
 4520
 4521/*
 4522 * default period for cfs group bandwidth.
 4523 * default: 0.1s, units: nanoseconds
 4524 */
 4525static inline u64 default_cfs_period(void)
 4526{
 4527	return 100000000ULL;
 4528}
 4529
 4530static inline u64 sched_cfs_bandwidth_slice(void)
 4531{
 4532	return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
 4533}
 4534
 4535/*
 4536 * Replenish runtime according to assigned quota and update expiration time.
 4537 * We use sched_clock_cpu directly instead of rq->clock to avoid adding
 4538 * additional synchronization around rq->lock.
 4539 *
 4540 * requires cfs_b->lock
 4541 */
 4542void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
 4543{
 4544	u64 now;
 4545
 4546	if (cfs_b->quota == RUNTIME_INF)
 4547		return;
 4548
 4549	now = sched_clock_cpu(smp_processor_id());
 4550	cfs_b->runtime = cfs_b->quota;
 4551	cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
 4552}
 4553
 4554static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
 4555{
 4556	return &tg->cfs_bandwidth;
 4557}
 4558
 4559/* rq->task_clock normalized against any time this cfs_rq has spent throttled */
 4560static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
 4561{
 4562	if (unlikely(cfs_rq->throttle_count))
 4563		return cfs_rq->throttled_clock_task - cfs_rq->throttled_clock_task_time;
 4564
 4565	return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
 4566}
 4567
 4568/* returns 0 on failure to allocate runtime */
 4569static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 4570{
 4571	struct task_group *tg = cfs_rq->tg;
 4572	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
 4573	u64 amount = 0, min_amount, expires;
 4574
 4575	/* note: this is a positive sum as runtime_remaining <= 0 */
 4576	min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
 4577
 4578	raw_spin_lock(&cfs_b->lock);
 4579	if (cfs_b->quota == RUNTIME_INF)
 4580		amount = min_amount;
 4581	else {
 4582		start_cfs_bandwidth(cfs_b);
 4583
 4584		if (cfs_b->runtime > 0) {
 4585			amount = min(cfs_b->runtime, min_amount);
 4586			cfs_b->runtime -= amount;
 4587			cfs_b->idle = 0;
 4588		}
 4589	}
 4590	expires = cfs_b->runtime_expires;
 4591	raw_spin_unlock(&cfs_b->lock);
 4592
 4593	cfs_rq->runtime_remaining += amount;
 4594	/*
 4595	 * we may have advanced our local expiration to account for allowed
 4596	 * spread between our sched_clock and the one on which runtime was
 4597	 * issued.
 4598	 */
 4599	if ((s64)(expires - cfs_rq->runtime_expires) > 0)
 4600		cfs_rq->runtime_expires = expires;
 4601
 4602	return cfs_rq->runtime_remaining > 0;
 4603}
 4604
 4605/*
 4606 * Note: This depends on the synchronization provided by sched_clock and the
 4607 * fact that rq->clock snapshots this value.
 4608 */
 4609static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 4610{
 4611	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
 4612
 4613	/* if the deadline is ahead of our clock, nothing to do */
 4614	if (likely((s64)(rq_clock(rq_of(cfs_rq)) - cfs_rq->runtime_expires) < 0))
 4615		return;
 4616
 4617	if (cfs_rq->runtime_remaining < 0)
 4618		return;
 4619
 4620	/*
 4621	 * If the local deadline has passed we have to consider the
 4622	 * possibility that our sched_clock is 'fast' and the global deadline
 4623	 * has not truly expired.
 4624	 *
 4625	 * Fortunately we can check determine whether this the case by checking
 4626	 * whether the global deadline has advanced. It is valid to compare
 4627	 * cfs_b->runtime_expires without any locks since we only care about
 4628	 * exact equality, so a partial write will still work.
 4629	 */
 4630
 4631	if (cfs_rq->runtime_expires != cfs_b->runtime_expires) {
 4632		/* extend local deadline, drift is bounded above by 2 ticks */
 4633		cfs_rq->runtime_expires += TICK_NSEC;
 4634	} else {
 4635		/* global deadline is ahead, expiration has passed */
 4636		cfs_rq->runtime_remaining = 0;
 4637	}
 4638}
 4639
 4640static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
 4641{
 4642	/* dock delta_exec before expiring quota (as it could span periods) */
 4643	cfs_rq->runtime_remaining -= delta_exec;
 4644	expire_cfs_rq_runtime(cfs_rq);
 4645
 4646	if (likely(cfs_rq->runtime_remaining > 0))
 4647		return;
 4648
 4649	/*
 4650	 * if we're unable to extend our runtime we resched so that the active
 4651	 * hierarchy can be throttled
 4652	 */
 4653	if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
 4654		resched_curr(rq_of(cfs_rq));
 4655}
 4656
 4657static __always_inline
 4658void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
 4659{
 4660	if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
 4661		return;
 4662
 4663	__account_cfs_rq_runtime(cfs_rq, delta_exec);
 4664}
 4665
 4666static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
 4667{
 4668	return cfs_bandwidth_used() && cfs_rq->throttled;
 4669}
 4670
 4671/* check whether cfs_rq, or any parent, is throttled */
 4672static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
 4673{
 4674	return cfs_bandwidth_used() && cfs_rq->throttle_count;
 4675}
 4676
 4677/*
 4678 * Ensure that neither of the group entities corresponding to src_cpu or
 4679 * dest_cpu are members of a throttled hierarchy when performing group
 4680 * load-balance operations.
 4681 */
 4682static inline int throttled_lb_pair(struct task_group *tg,
 4683				    int src_cpu, int dest_cpu)
 4684{
 4685	struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
 4686
 4687	src_cfs_rq = tg->cfs_rq[src_cpu];
 4688	dest_cfs_rq = tg->cfs_rq[dest_cpu];
 4689
 4690	return throttled_hierarchy(src_cfs_rq) ||
 4691	       throttled_hierarchy(dest_cfs_rq);
 4692}
 4693
 4694/* updated child weight may affect parent so we have to do this bottom up */
 4695static int tg_unthrottle_up(struct task_group *tg, void *data)
 4696{
 4697	struct rq *rq = data;
 4698	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
 4699
 4700	cfs_rq->throttle_count--;
 4701	if (!cfs_rq->throttle_count) {
 4702		/* adjust cfs_rq_clock_task() */
 4703		cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
 4704					     cfs_rq->throttled_clock_task;
 4705	}
 4706
 4707	return 0;
 4708}
 4709
 4710static int tg_throttle_down(struct task_group *tg, void *data)
 4711{
 4712	struct rq *rq = data;
 4713	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
 4714
 4715	/* group is entering throttled state, stop time */
 4716	if (!cfs_rq->throttle_count)
 4717		cfs_rq->throttled_clock_task = rq_clock_task(rq);
 4718	cfs_rq->throttle_count++;
 4719
 4720	return 0;
 4721}
 4722
 4723static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
 4724{
 4725	struct rq *rq = rq_of(cfs_rq);
 4726	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
 4727	struct sched_entity *se;
 4728	long task_delta, dequeue = 1;
 4729	bool empty;
 4730
 4731	se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
 4732
 4733	/* freeze hierarchy runnable averages while throttled */
 4734	rcu_read_lock();
 4735	walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
 4736	rcu_read_unlock();
 4737
 4738	task_delta = cfs_rq->h_nr_running;
 4739	for_each_sched_entity(se) {
 4740		struct cfs_rq *qcfs_rq = cfs_rq_of(se);
 4741		/* throttled entity or throttle-on-deactivate */
 4742		if (!se->on_rq)
 4743			break;
 4744
 4745		if (dequeue)
 4746			dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
 4747		qcfs_rq->h_nr_running -= task_delta;
 4748
 4749		if (qcfs_rq->load.weight)
 4750			dequeue = 0;
 4751	}
 4752
 4753	if (!se)
 4754		sub_nr_running(rq, task_delta);
 4755
 4756	cfs_rq->throttled = 1;
 4757	cfs_rq->throttled_clock = rq_clock(rq);
 4758	raw_spin_lock(&cfs_b->lock);
 4759	empty = list_empty(&cfs_b->throttled_cfs_rq);
 4760
 4761	/*
 4762	 * Add to the _head_ of the list, so that an already-started
 4763	 * distribute_cfs_runtime will not see us
 4764	 */
 4765	list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
 4766
 4767	/*
 4768	 * If we're the first throttled task, make sure the bandwidth
 4769	 * timer is running.
 4770	 */
 4771	if (empty)
 4772		start_cfs_bandwidth(cfs_b);
 4773
 4774	raw_spin_unlock(&cfs_b->lock);
 4775}
 4776
 4777void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
 4778{
 4779	struct rq *rq = rq_of(cfs_rq);
 4780	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
 4781	struct sched_entity *se;
 4782	int enqueue = 1;
 4783	long task_delta;
 4784
 4785	se = cfs_rq->tg->se[cpu_of(rq)];
 4786
 4787	cfs_rq->throttled = 0;
 4788
 4789	update_rq_clock(rq);
 4790
 4791	raw_spin_lock(&cfs_b->lock);
 4792	cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;
 4793	list_del_rcu(&cfs_rq->throttled_list);
 4794	raw_spin_unlock(&cfs_b->lock);
 4795
 4796	/* update hierarchical throttle state */
 4797	walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
 4798
 4799	if (!cfs_rq->load.weight)
 4800		return;
 4801
 4802	task_delta = cfs_rq->h_nr_running;
 4803	for_each_sched_entity(se) {
 4804		if (se->on_rq)
 4805			enqueue = 0;
 4806
 4807		cfs_rq = cfs_rq_of(se);
 4808		if (enqueue)
 4809			enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
 4810		cfs_rq->h_nr_running += task_delta;
 4811
 4812		if (cfs_rq_throttled(cfs_rq))
 4813			break;
 4814	}
 4815
 4816	if (!se)
 4817		add_nr_running(rq, task_delta);
 4818
 4819	/* Determine whether we need to wake up potentially idle CPU: */
 4820	if (rq->curr == rq->idle && rq->cfs.nr_running)
 4821		resched_curr(rq);
 4822}
 4823
 4824static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
 4825		u64 remaining, u64 expires)
 4826{
 4827	struct cfs_rq *cfs_rq;
 4828	u64 runtime;
 4829	u64 starting_runtime = remaining;
 4830
 4831	rcu_read_lock();
 4832	list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
 4833				throttled_list) {
 4834		struct rq *rq = rq_of(cfs_rq);
 4835		struct rq_flags rf;
 4836
 4837		rq_lock(rq, &rf);
 4838		if (!cfs_rq_throttled(cfs_rq))
 4839			goto next;
 4840
 4841		runtime = -cfs_rq->runtime_remaining + 1;
 4842		if (runtime > remaining)
 4843			runtime = remaining;
 4844		remaining -= runtime;
 4845
 4846		cfs_rq->runtime_remaining += runtime;
 4847		cfs_rq->runtime_expires = expires;
 4848
 4849		/* we check whether we're throttled above */
 4850		if (cfs_rq->runtime_remaining > 0)
 4851			unthrottle_cfs_rq(cfs_rq);
 4852
 4853next:
 4854		rq_unlock(rq, &rf);
 4855
 4856		if (!remaining)
 4857			break;
 4858	}
 4859	rcu_read_unlock();
 4860
 4861	return starting_runtime - remaining;
 4862}
 4863
 4864/*
 4865 * Responsible for refilling a task_group's bandwidth and unthrottling its
 4866 * cfs_rqs as appropriate. If there has been no activity within the last
 4867 * period the timer is deactivated until scheduling resumes; cfs_b->idle is
 4868 * used to track this state.
 4869 */
 4870static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
 4871{
 4872	u64 runtime, runtime_expires;
 4873	int throttled;
 4874
 4875	/* no need to continue the timer with no bandwidth constraint */
 4876	if (cfs_b->quota == RUNTIME_INF)
 4877		goto out_deactivate;
 4878
 4879	throttled = !list_empty(&cfs_b->throttled_cfs_rq);
 4880	cfs_b->nr_periods += overrun;
 4881
 4882	/*
 4883	 * idle depends on !throttled (for the case of a large deficit), and if
 4884	 * we're going inactive then everything else can be deferred
 4885	 */
 4886	if (cfs_b->idle && !throttled)
 4887		goto out_deactivate;
 4888
 4889	__refill_cfs_bandwidth_runtime(cfs_b);
 4890
 4891	if (!throttled) {
 4892		/* mark as potentially idle for the upcoming period */
 4893		cfs_b->idle = 1;
 4894		return 0;
 4895	}
 4896
 4897	/* account preceding periods in which throttling occurred */
 4898	cfs_b->nr_throttled += overrun;
 4899
 4900	runtime_expires = cfs_b->runtime_expires;
 4901
 4902	/*
 4903	 * This check is repeated as we are holding onto the new bandwidth while
 4904	 * we unthrottle. This can potentially race with an unthrottled group
 4905	 * trying to acquire new bandwidth from the global pool. This can result
 4906	 * in us over-using our runtime if it is all used during this loop, but
 4907	 * only by limited amounts in that extreme case.
 4908	 */
 4909	while (throttled && cfs_b->runtime > 0) {
 4910		runtime = cfs_b->runtime;
 4911		raw_spin_unlock(&cfs_b->lock);
 4912		/* we can't nest cfs_b->lock while distributing bandwidth */
 4913		runtime = distribute_cfs_runtime(cfs_b, runtime,
 4914						 runtime_expires);
 4915		raw_spin_lock(&cfs_b->lock);
 4916
 4917		throttled = !list_empty(&cfs_b->throttled_cfs_rq);
 4918
 4919		cfs_b->runtime -= min(runtime, cfs_b->runtime);
 4920	}
 4921
 4922	/*
 4923	 * While we are ensured activity in the period following an
 4924	 * unthrottle, this also covers the case in which the new bandwidth is
 4925	 * insufficient to cover the existing bandwidth deficit.  (Forcing the
 4926	 * timer to remain active while there are any throttled entities.)
 4927	 */
 4928	cfs_b->idle = 0;
 4929
 4930	return 0;
 4931
 4932out_deactivate:
 4933	return 1;
 4934}
 4935
 4936/* a cfs_rq won't donate quota below this amount */
 4937static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
 4938/* minimum remaining period time to redistribute slack quota */
 4939static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
 4940/* how long we wait to gather additional slack before distributing */
 4941static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
 4942
 4943/*
 4944 * Are we near the end of the current quota period?
 4945 *
 4946 * Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the
 4947 * hrtimer base being cleared by hrtimer_start. In the case of
 4948 * migrate_hrtimers, base is never cleared, so we are fine.
 4949 */
 4950static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
 4951{
 4952	struct hrtimer *refresh_timer = &cfs_b->period_timer;
 4953	u64 remaining;
 4954
 4955	/* if the call-back is running a quota refresh is already occurring */
 4956	if (hrtimer_callback_running(refresh_timer))
 4957		return 1;
 4958
 4959	/* is a quota refresh about to occur? */
 4960	remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
 4961	if (remaining < min_expire)
 4962		return 1;
 4963
 4964	return 0;
 4965}
 4966
 4967static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
 4968{
 4969	u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
 4970
 4971	/* if there's a quota refresh soon don't bother with slack */
 4972	if (runtime_refresh_within(cfs_b, min_left))
 4973		return;
 4974
 4975	hrtimer_start(&cfs_b->slack_timer,
 4976			ns_to_ktime(cfs_bandwidth_slack_period),
 4977			HRTIMER_MODE_REL);
 4978}
 4979
 4980/* we know any runtime found here is valid as update_curr() precedes return */
 4981static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 4982{
 4983	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
 4984	s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
 4985
 4986	if (slack_runtime <= 0)
 4987		return;
 4988
 4989	raw_spin_lock(&cfs_b->lock);
 4990	if (cfs_b->quota != RUNTIME_INF &&
 4991	    cfs_rq->runtime_expires == cfs_b->runtime_expires) {
 4992		cfs_b->runtime += slack_runtime;
 4993
 4994		/* we are under rq->lock, defer unthrottling using a timer */
 4995		if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
 4996		    !list_empty(&cfs_b->throttled_cfs_rq))
 4997			start_cfs_slack_bandwidth(cfs_b);
 4998	}
 4999	raw_spin_unlock(&cfs_b->lock);
 5000
 5001	/* even if it's not valid for return we don't want to try again */
 5002	cfs_rq->runtime_remaining -= slack_runtime;
 5003}
 5004
 5005static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 5006{
 5007	if (!cfs_bandwidth_used())
 5008		return;
 5009
 5010	if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
 5011		return;
 5012
 5013	__return_cfs_rq_runtime(cfs_rq);
 5014}
 5015
 5016/*
 5017 * This is done with a timer (instead of inline with bandwidth return) since
 5018 * it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.
 5019 */
 5020static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
 5021{
 5022	u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
 5023	u64 expires;
 5024
 5025	/* confirm we're still not at a refresh boundary */
 5026	raw_spin_lock(&cfs_b->lock);
 5027	if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
 5028		raw_spin_unlock(&cfs_b->lock);
 5029		return;
 5030	}
 5031
 5032	if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice)
 5033		runtime = cfs_b->runtime;
 5034
 5035	expires = cfs_b->runtime_expires;
 5036	raw_spin_unlock(&cfs_b->lock);
 5037
 5038	if (!runtime)
 5039		return;
 5040
 5041	runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
 5042
 5043	raw_spin_lock(&cfs_b->lock);
 5044	if (expires == cfs_b->runtime_expires)
 5045		cfs_b->runtime -= min(runtime, cfs_b->runtime);
 5046	raw_spin_unlock(&cfs_b->lock);
 5047}
 5048
 5049/*
 5050 * When a group wakes up we want to make sure that its quota is not already
 5051 * expired/exceeded, otherwise it may be allowed to steal additional ticks of
 5052 * runtime as update_curr() throttling can not not trigger until it's on-rq.
 5053 */
 5054static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
 5055{
 5056	if (!cfs_bandwidth_used())
 5057		return;
 5058
 5059	/* an active group must be handled by the update_curr()->put() path */
 5060	if (!cfs_rq->runtime_enabled || cfs_rq->curr)
 5061		return;
 5062
 5063	/* ensure the group is not already throttled */
 5064	if (cfs_rq_throttled(cfs_rq))
 5065		return;
 5066
 5067	/* update runtime allocation */
 5068	account_cfs_rq_runtime(cfs_rq, 0);
 5069	if (cfs_rq->runtime_remaining <= 0)
 5070		throttle_cfs_rq(cfs_rq);
 5071}
 5072
 5073static void sync_throttle(struct task_group *tg, int cpu)
 5074{
 5075	struct cfs_rq *pcfs_rq, *cfs_rq;
 5076
 5077	if (!cfs_bandwidth_used())
 5078		return;
 5079
 5080	if (!tg->parent)
 5081		return;
 5082
 5083	cfs_rq = tg->cfs_rq[cpu];
 5084	pcfs_rq = tg->parent->cfs_rq[cpu];
 5085
 5086	cfs_rq->throttle_count = pcfs_rq->throttle_count;
 5087	cfs_rq->throttled_clock_task = rq_clock_task(cpu_rq(cpu));
 5088}
 5089
 5090/* conditionally throttle active cfs_rq's from put_prev_entity() */
 5091static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 5092{
 5093	if (!cfs_bandwidth_used())
 5094		return false;
 5095
 5096	if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
 5097		return false;
 5098
 5099	/*
 5100	 * it's possible for a throttled entity to be forced into a running
 5101	 * state (e.g. set_curr_task), in this case we're finished.
 5102	 */
 5103	if (cfs_rq_throttled(cfs_rq))
 5104		return true;
 5105
 5106	throttle_cfs_rq(cfs_rq);
 5107	return true;
 5108}
 5109
 5110static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
 5111{
 5112	struct cfs_bandwidth *cfs_b =
 5113		container_of(timer, struct cfs_bandwidth, slack_timer);
 5114
 5115	do_sched_cfs_slack_timer(cfs_b);
 5116
 5117	return HRTIMER_NORESTART;
 5118}
 5119
 5120static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
 5121{
 5122	struct cfs_bandwidth *cfs_b =
 5123		container_of(timer, struct cfs_bandwidth, period_timer);
 5124	int overrun;
 5125	int idle = 0;
 5126
 5127	raw_spin_lock(&cfs_b->lock);
 5128	for (;;) {
 5129		overrun = hrtimer_forward_now(timer, cfs_b->period);
 5130		if (!overrun)
 5131			break;
 5132
 5133		idle = do_sched_cfs_period_timer(cfs_b, overrun);
 5134	}
 5135	if (idle)
 5136		cfs_b->period_active = 0;
 5137	raw_spin_unlock(&cfs_b->lock);
 5138
 5139	return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
 5140}
 5141
 5142void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
 5143{
 5144	raw_spin_lock_init(&cfs_b->lock);
 5145	cfs_b->runtime = 0;
 5146	cfs_b->quota = RUNTIME_INF;
 5147	cfs_b->period = ns_to_ktime(default_cfs_period());
 5148
 5149	INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
 5150	hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
 5151	cfs_b->period_timer.function = sched_cfs_period_timer;
 5152	hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 5153	cfs_b->slack_timer.function = sched_cfs_slack_timer;
 5154}
 5155
 5156static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 5157{
 5158	cfs_rq->runtime_enabled = 0;
 5159	INIT_LIST_HEAD(&cfs_rq->throttled_list);
 5160}
 5161
 5162void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
 5163{
 5164	lockdep_assert_held(&cfs_b->lock);
 5165
 5166	if (!cfs_b->period_active) {
 5167		cfs_b->period_active = 1;
 5168		hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);
 5169		hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED);
 5170	}
 5171}
 5172
 5173static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
 5174{
 5175	/* init_cfs_bandwidth() was not called */
 5176	if (!cfs_b->throttled_cfs_rq.next)
 5177		return;
 5178
 5179	hrtimer_cancel(&cfs_b->period_timer);
 5180	hrtimer_cancel(&cfs_b->slack_timer);
 5181}
 5182
 5183/*
 5184 * Both these CPU hotplug callbacks race against unregister_fair_sched_group()
 5185 *
 5186 * The race is harmless, since modifying bandwidth settings of unhooked group
 5187 * bits doesn't do much.
 5188 */
 5189
 5190/* cpu online calback */
 5191static void __maybe_unused update_runtime_enabled(struct rq *rq)
 5192{
 5193	struct task_group *tg;
 5194
 5195	lockdep_assert_held(&rq->lock);
 5196
 5197	rcu_read_lock();
 5198	list_for_each_entry_rcu(tg, &task_groups, list) {
 5199		struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
 5200		struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
 5201
 5202		raw_spin_lock(&cfs_b->lock);
 5203		cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF;
 5204		raw_spin_unlock(&cfs_b->lock);
 5205	}
 5206	rcu_read_unlock();
 5207}
 5208
 5209/* cpu offline callback */
 5210static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
 5211{
 5212	struct task_group *tg;
 5213
 5214	lockdep_assert_held(&rq->lock);
 5215
 5216	rcu_read_lock();
 5217	list_for_each_entry_rcu(tg, &task_groups, list) {
 5218		struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
 5219
 5220		if (!cfs_rq->runtime_enabled)
 5221			continue;
 5222
 5223		/*
 5224		 * clock_task is not advancing so we just need to make sure
 5225		 * there's some valid quota amount
 5226		 */
 5227		cfs_rq->runtime_remaining = 1;
 5228		/*
 5229		 * Offline rq is schedulable till CPU is completely disabled
 5230		 * in take_cpu_down(), so we prevent new cfs throttling here.
 5231		 */
 5232		cfs_rq->runtime_enabled = 0;
 5233
 5234		if (cfs_rq_throttled(cfs_rq))
 5235			unthrottle_cfs_rq(cfs_rq);
 5236	}
 5237	rcu_read_unlock();
 5238}
 5239
 5240#else /* CONFIG_CFS_BANDWIDTH */
 5241static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
 5242{
 5243	return rq_clock_task(rq_of(cfs_rq));
 5244}
 5245
 5246static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
 5247static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
 5248static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
 5249static inline void sync_throttle(struct task_group *tg, int cpu) {}
 5250static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
 5251
 5252static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
 5253{
 5254	return 0;
 5255}
 5256
 5257static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
 5258{
 5259	return 0;
 5260}
 5261
 5262static inline int throttled_lb_pair(struct task_group *tg,
 5263				    int src_cpu, int dest_cpu)
 5264{
 5265	return 0;
 5266}
 5267
 5268void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
 5269
 5270#ifdef CONFIG_FAIR_GROUP_SCHED
 5271static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
 5272#endif
 5273
 5274static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
 5275{
 5276	return NULL;
 5277}
 5278static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
 5279static inline void update_runtime_enabled(struct rq *rq) {}
 5280static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
 5281
 5282#endif /* CONFIG_CFS_BANDWIDTH */
 5283
 5284/**************************************************
 5285 * CFS operations on tasks:
 5286 */
 5287
 5288#ifdef CONFIG_SCHED_HRTICK
 5289static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
 5290{
 5291	struct sched_entity *se = &p->se;
 5292	struct cfs_rq *cfs_rq = cfs_rq_of(se);
 5293
 5294	SCHED_WARN_ON(task_rq(p) != rq);
 5295
 5296	if (rq->cfs.h_nr_running > 1) {
 5297		u64 slice = sched_slice(cfs_rq, se);
 5298		u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
 5299		s64 delta = slice - ran;
 5300
 5301		if (delta < 0) {
 5302			if (rq->curr == p)
 5303				resched_curr(rq);
 5304			return;
 5305		}
 5306		hrtick_start(rq, delta);
 5307	}
 5308}
 5309
 5310/*
 5311 * called from enqueue/dequeue and updates the hrtick when the
 5312 * current task is from our class and nr_running is low enough
 5313 * to matter.
 5314 */
 5315static void hrtick_update(struct rq *rq)
 5316{
 5317	struct task_struct *curr = rq->curr;
 5318
 5319	if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class)
 5320		return;
 5321
 5322	if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
 5323		hrtick_start_fair(rq, curr);
 5324}
 5325#else /* !CONFIG_SCHED_HRTICK */
 5326static inline void
 5327hrtick_start_fair(struct rq *rq, struct task_struct *p)
 5328{
 5329}
 5330
 5331static inline void hrtick_update(struct rq *rq)
 5332{
 5333}
 5334#endif
 5335
 5336/*
 5337 * The enqueue_task method is called before nr_running is
 5338 * increased. Here we update the fair scheduling stats and
 5339 * then put the task into the rbtree:
 5340 */
 5341static void
 5342enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 5343{
 5344	struct cfs_rq *cfs_rq;
 5345	struct sched_entity *se = &p->se;
 5346
 5347	/*
 5348	 * If in_iowait is set, the code below may not trigger any cpufreq
 5349	 * utilization updates, so do it here explicitly with the IOWAIT flag
 5350	 * passed.
 5351	 */
 5352	if (p->in_iowait)
 5353		cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT);
 5354
 5355	for_each_sched_entity(se) {
 5356		if (se->on_rq)
 5357			break;
 5358		cfs_rq = cfs_rq_of(se);
 5359		enqueue_entity(cfs_rq, se, flags);
 5360
 5361		/*
 5362		 * end evaluation on encountering a throttled cfs_rq
 5363		 *
 5364		 * note: in the case of encountering a throttled cfs_rq we will
 5365		 * post the final h_nr_running increment below.
 5366		 */
 5367		if (cfs_rq_throttled(cfs_rq))
 5368			break;
 5369		cfs_rq->h_nr_running++;
 5370
 5371		flags = ENQUEUE_WAKEUP;
 5372	}
 5373
 5374	for_each_sched_entity(se) {
 5375		cfs_rq = cfs_rq_of(se);
 5376		cfs_rq->h_nr_running++;
 5377
 5378		if (cfs_rq_throttled(cfs_rq))
 5379			break;
 5380
 5381		update_load_avg(cfs_rq, se, UPDATE_TG);
 5382		update_cfs_group(se);
 5383	}
 5384
 5385	if (!se)
 5386		add_nr_running(rq, 1);
 5387
 5388	util_est_enqueue(&rq->cfs, p);
 5389	hrtick_update(rq);
 5390}
 5391
 5392static void set_next_buddy(struct sched_entity *se);
 5393
 5394/*
 5395 * The dequeue_task method is called before nr_running is
 5396 * decreased. We remove the task from the rbtree and
 5397 * update the fair scheduling stats:
 5398 */
 5399static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 5400{
 5401	struct cfs_rq *cfs_rq;
 5402	struct sched_entity *se = &p->se;
 5403	int task_sleep = flags & DEQUEUE_SLEEP;
 5404
 5405	for_each_sched_entity(se) {
 5406		cfs_rq = cfs_rq_of(se);
 5407		dequeue_entity(cfs_rq, se, flags);
 5408
 5409		/*
 5410		 * end evaluation on encountering a throttled cfs_rq
 5411		 *
 5412		 * note: in the case of encountering a throttled cfs_rq we will
 5413		 * post the final h_nr_running decrement below.
 5414		*/
 5415		if (cfs_rq_throttled(cfs_rq))
 5416			break;
 5417		cfs_rq->h_nr_running--;
 5418
 5419		/* Don't dequeue parent if it has other entities besides us */
 5420		if (cfs_rq->load.weight) {
 5421			/* Avoid re-evaluating load for this entity: */
 5422			se = parent_entity(se);
 5423			/*
 5424			 * Bias pick_next to pick a task from this cfs_rq, as
 5425			 * p is sleeping when it is within its sched_slice.
 5426			 */
 5427			if (task_sleep && se && !throttled_hierarchy(cfs_rq))
 5428				set_next_buddy(se);
 5429			break;
 5430		}
 5431		flags |= DEQUEUE_SLEEP;
 5432	}
 5433
 5434	for_each_sched_entity(se) {
 5435		cfs_rq = cfs_rq_of(se);
 5436		cfs_rq->h_nr_running--;
 5437
 5438		if (cfs_rq_throttled(cfs_rq))
 5439			break;
 5440
 5441		update_load_avg(cfs_rq, se, UPDATE_TG);
 5442		update_cfs_group(se);
 5443	}
 5444
 5445	if (!se)
 5446		sub_nr_running(rq, 1);
 5447
 5448	util_est_dequeue(&rq->cfs, p, task_sleep);
 5449	hrtick_update(rq);
 5450}
 5451
 5452#ifdef CONFIG_SMP
 5453
 5454/* Working cpumask for: load_balance, load_balance_newidle. */
 5455DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
 5456DEFINE_PER_CPU(cpumask_var_t, select_idle_mask);
 5457
 5458#ifdef CONFIG_NO_HZ_COMMON
 5459/*
 5460 * per rq 'load' arrray crap; XXX kill this.
 5461 */
 5462
 5463/*
 5464 * The exact cpuload calculated at every tick would be:
 5465 *
 5466 *   load' = (1 - 1/2^i) * load + (1/2^i) * cur_load
 5467 *
 5468 * If a CPU misses updates for n ticks (as it was idle) and update gets
 5469 * called on the n+1-th tick when CPU may be busy, then we have:
 5470 *
 5471 *   load_n   = (1 - 1/2^i)^n * load_0
 5472 *   load_n+1 = (1 - 1/2^i)   * load_n + (1/2^i) * cur_load
 5473 *
 5474 * decay_load_missed() below does efficient calculation of
 5475 *
 5476 *   load' = (1 - 1/2^i)^n * load
 5477 *
 5478 * Because x^(n+m) := x^n * x^m we can decompose any x^n in power-of-2 factors.
 5479 * This allows us to precompute the above in said factors, thereby allowing the
 5480 * reduction of an arbitrary n in O(log_2 n) steps. (See also
 5481 * fixed_power_int())
 5482 *
 5483 * The calculation is approximated on a 128 point scale.
 5484 */
 5485#define DEGRADE_SHIFT		7
 5486
 5487static const u8 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
 5488static const u8 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
 5489	{   0,   0,  0,  0,  0,  0, 0, 0 },
 5490	{  64,  32,  8,  0,  0,  0, 0, 0 },
 5491	{  96,  72, 40, 12,  1,  0, 0, 0 },
 5492	{ 112,  98, 75, 43, 15,  1, 0, 0 },
 5493	{ 120, 112, 98, 76, 45, 16, 2, 0 }
 5494};
 5495
 5496/*
 5497 * Update cpu_load for any missed ticks, due to tickless idle. The backlog
 5498 * would be when CPU is idle and so we just decay the old load without
 5499 * adding any new load.
 5500 */
 5501static unsigned long
 5502decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
 5503{
 5504	int j = 0;
 5505
 5506	if (!missed_updates)
 5507		return load;
 5508
 5509	if (missed_updates >= degrade_zero_ticks[idx])
 5510		return 0;
 5511
 5512	if (idx == 1)
 5513		return load >> missed_updates;
 5514
 5515	while (missed_updates) {
 5516		if (missed_updates % 2)
 5517			load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
 5518
 5519		missed_updates >>= 1;
 5520		j++;
 5521	}
 5522	return load;
 5523}
 5524
 5525static struct {
 5526	cpumask_var_t idle_cpus_mask;
 5527	atomic_t nr_cpus;
 5528	int has_blocked;		/* Idle CPUS has blocked load */
 5529	unsigned long next_balance;     /* in jiffy units */
 5530	unsigned long next_blocked;	/* Next update of blocked load in jiffies */
 5531} nohz ____cacheline_aligned;
 5532
 5533#endif /* CONFIG_NO_HZ_COMMON */
 5534
 5535/**
 5536 * __cpu_load_update - update the rq->cpu_load[] statistics
 5537 * @this_rq: The rq to update statistics for
 5538 * @this_load: The current load
 5539 * @pending_updates: The number of missed updates
 5540 *
 5541 * Update rq->cpu_load[] statistics. This function is usually called every
 5542 * scheduler tick (TICK_NSEC).
 5543 *
 5544 * This function computes a decaying average:
 5545 *
 5546 *   load[i]' = (1 - 1/2^i) * load[i] + (1/2^i) * load
 5547 *
 5548 * Because of NOHZ it might not get called on every tick which gives need for
 5549 * the @pending_updates argument.
 5550 *
 5551 *   load[i]_n = (1 - 1/2^i) * load[i]_n-1 + (1/2^i) * load_n-1
 5552 *             = A * load[i]_n-1 + B ; A := (1 - 1/2^i), B := (1/2^i) * load
 5553 *             = A * (A * load[i]_n-2 + B) + B
 5554 *             = A * (A * (A * load[i]_n-3 + B) + B) + B
 5555 *             = A^3 * load[i]_n-3 + (A^2 + A + 1) * B
 5556 *             = A^n * load[i]_0 + (A^(n-1) + A^(n-2) + ... + 1) * B
 5557 *             = A^n * load[i]_0 + ((1 - A^n) / (1 - A)) * B
 5558 *             = (1 - 1/2^i)^n * (load[i]_0 - load) + load
 5559 *
 5560 * In the above we've assumed load_n := load, which is true for NOHZ_FULL as
 5561 * any change in load would have resulted in the tick being turned back on.
 5562 *
 5563 * For regular NOHZ, this reduces to:
 5564 *
 5565 *   load[i]_n = (1 - 1/2^i)^n * load[i]_0
 5566 *
 5567 * see decay_load_misses(). For NOHZ_FULL we get to subtract and add the extra
 5568 * term.
 5569 */
 5570static void cpu_load_update(struct rq *this_rq, unsigned long this_load,
 5571			    unsigned long pending_updates)
 5572{
 5573	unsigned long __maybe_unused tickless_load = this_rq->cpu_load[0];
 5574	int i, scale;
 5575
 5576	this_rq->nr_load_updates++;
 5577
 5578	/* Update our load: */
 5579	this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
 5580	for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
 5581		unsigned long old_load, new_load;
 5582
 5583		/* scale is effectively 1 << i now, and >> i divides by scale */
 5584
 5585		old_load = this_rq->cpu_load[i];
 5586#ifdef CONFIG_NO_HZ_COMMON
 5587		old_load = decay_load_missed(old_load, pending_updates - 1, i);
 5588		if (tickless_load) {
 5589			old_load -= decay_load_missed(tickless_load, pending_updates - 1, i);
 5590			/*
 5591			 * old_load can never be a negative value because a
 5592			 * decayed tickless_load cannot be greater than the
 5593			 * original tickless_load.
 5594			 */
 5595			old_load += tickless_load;
 5596		}
 5597#endif
 5598		new_load = this_load;
 5599		/*
 5600		 * Round up the averaging division if load is increasing. This
 5601		 * prevents us from getting stuck on 9 if the load is 10, for
 5602		 * example.
 5603		 */
 5604		if (new_load > old_load)
 5605			new_load += scale - 1;
 5606
 5607		this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
 5608	}
 5609
 5610	sched_avg_update(this_rq);
 5611}
 5612
 5613/* Used instead of source_load when we know the type == 0 */
 5614static unsigned long weighted_cpuload(struct rq *rq)
 5615{
 5616	return cfs_rq_runnable_load_avg(&rq->cfs);
 5617}
 5618
 5619#ifdef CONFIG_NO_HZ_COMMON
 5620/*
 5621 * There is no sane way to deal with nohz on smp when using jiffies because the
 5622 * CPU doing the jiffies update might drift wrt the CPU doing the jiffy reading
 5623 * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
 5624 *
 5625 * Therefore we need to avoid the delta approach from the regular tick when
 5626 * possible since that would seriously skew the load calculation. This is why we
 5627 * use cpu_load_update_periodic() for CPUs out of nohz. However we'll rely on
 5628 * jiffies deltas for updates happening while in nohz mode (idle ticks, idle
 5629 * loop exit, nohz_idle_balance, nohz full exit...)
 5630 *
 5631 * This means we might still be one tick off for nohz periods.
 5632 */
 5633
 5634static void cpu_load_update_nohz(struct rq *this_rq,
 5635				 unsigned long curr_jiffies,
 5636				 unsigned long load)
 5637{
 5638	unsigned long pending_updates;
 5639
 5640	pending_updates = curr_jiffies - this_rq->last_load_update_tick;
 5641	if (pending_updates) {
 5642		this_rq->last_load_update_tick = curr_jiffies;
 5643		/*
 5644		 * In the regular NOHZ case, we were idle, this means load 0.
 5645		 * In the NOHZ_FULL case, we were non-idle, we should consider
 5646		 * its weighted load.
 5647		 */
 5648		cpu_load_update(this_rq, load, pending_updates);
 5649	}
 5650}
 5651
 5652/*
 5653 * Called from nohz_idle_balance() to update the load ratings before doing the
 5654 * idle balance.
 5655 */
 5656static void cpu_load_update_idle(struct rq *this_rq)
 5657{
 5658	/*
 5659	 * bail if there's load or we're actually up-to-date.
 5660	 */
 5661	if (weighted_cpuload(this_rq))
 5662		return;
 5663
 5664	cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), 0);
 5665}
 5666
 5667/*
 5668 * Record CPU load on nohz entry so we know the tickless load to account
 5669 * on nohz exit. cpu_load[0] happens then to be updated more frequently
 5670 * than other cpu_load[idx] but it should be fine as cpu_load readers
 5671 * shouldn't rely into synchronized cpu_load[*] updates.
 5672 */
 5673void cpu_load_update_nohz_start(void)
 5674{
 5675	struct rq *this_rq = this_rq();
 5676
 5677	/*
 5678	 * This is all lockless but should be fine. If weighted_cpuload changes
 5679	 * concurrently we'll exit nohz. And cpu_load write can race with
 5680	 * cpu_load_update_idle() but both updater would be writing the same.
 5681	 */
 5682	this_rq->cpu_load[0] = weighted_cpuload(this_rq);
 5683}
 5684
 5685/*
 5686 * Account the tickless load in the end of a nohz frame.
 5687 */
 5688void cpu_load_update_nohz_stop(void)
 5689{
 5690	unsigned long curr_jiffies = READ_ONCE(jiffies);
 5691	struct rq *this_rq = this_rq();
 5692	unsigned long load;
 5693	struct rq_flags rf;
 5694
 5695	if (curr_jiffies == this_rq->last_load_update_tick)
 5696		return;
 5697
 5698	load = weighted_cpuload(this_rq);
 5699	rq_lock(this_rq, &rf);
 5700	update_rq_clock(this_rq);
 5701	cpu_load_update_nohz(this_rq, curr_jiffies, load);
 5702	rq_unlock(this_rq, &rf);
 5703}
 5704#else /* !CONFIG_NO_HZ_COMMON */
 5705static inline void cpu_load_update_nohz(struct rq *this_rq,
 5706					unsigned long curr_jiffies,
 5707					unsigned long load) { }
 5708#endif /* CONFIG_NO_HZ_COMMON */
 5709
 5710static void cpu_load_update_periodic(struct rq *this_rq, unsigned long load)
 5711{
 5712#ifdef CONFIG_NO_HZ_COMMON
 5713	/* See the mess around cpu_load_update_nohz(). */
 5714	this_rq->last_load_update_tick = READ_ONCE(jiffies);
 5715#endif
 5716	cpu_load_update(this_rq, load, 1);
 5717}
 5718
 5719/*
 5720 * Called from scheduler_tick()
 5721 */
 5722void cpu_load_update_active(struct rq *this_rq)
 5723{
 5724	unsigned long load = weighted_cpuload(this_rq);
 5725
 5726	if (tick_nohz_tick_stopped())
 5727		cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), load);
 5728	else
 5729		cpu_load_update_periodic(this_rq, load);
 5730}
 5731
 5732/*
 5733 * Return a low guess at the load of a migration-source CPU weighted
 5734 * according to the scheduling class and "nice" value.
 5735 *
 5736 * We want to under-estimate the load of migration sources, to
 5737 * balance conservatively.
 5738 */
 5739static unsigned long source_load(int cpu, int type)
 5740{
 5741	struct rq *rq = cpu_rq(cpu);
 5742	unsigned long total = weighted_cpuload(rq);
 5743
 5744	if (type == 0 || !sched_feat(LB_BIAS))
 5745		return total;
 5746
 5747	return min(rq->cpu_load[type-1], total);
 5748}
 5749
 5750/*
 5751 * Return a high guess at the load of a migration-target CPU weighted
 5752 * according to the scheduling class and "nice" value.
 5753 */
 5754static unsigned long target_load(int cpu, int type)
 5755{
 5756	struct rq *rq = cpu_rq(cpu);
 5757	unsigned long total = weighted_cpuload(rq);
 5758
 5759	if (type == 0 || !sched_feat(LB_BIAS))
 5760		return total;
 5761
 5762	return max(rq->cpu_load[type-1], total);
 5763}
 5764
 5765static unsigned long capacity_of(int cpu)
 5766{
 5767	return cpu_rq(cpu)->cpu_capacity;
 5768}
 5769
 5770static unsigned long capacity_orig_of(int cpu)
 5771{
 5772	return cpu_rq(cpu)->cpu_capacity_orig;
 5773}
 5774
 5775static unsigned long cpu_avg_load_per_task(int cpu)
 5776{
 5777	struct rq *rq = cpu_rq(cpu);
 5778	unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running);
 5779	unsigned long load_avg = weighted_cpuload(rq);
 5780
 5781	if (nr_running)
 5782		return load_avg / nr_running;
 5783
 5784	return 0;
 5785}
 5786
 5787static void record_wakee(struct task_struct *p)
 5788{
 5789	/*
 5790	 * Only decay a single time; tasks that have less then 1 wakeup per
 5791	 * jiffy will not have built up many flips.
 5792	 */
 5793	if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) {
 5794		current->wakee_flips >>= 1;
 5795		current->wakee_flip_decay_ts = jiffies;
 5796	}
 5797
 5798	if (current->last_wakee != p) {
 5799		current->last_wakee = p;
 5800		current->wakee_flips++;
 5801	}
 5802}
 5803
 5804/*
 5805 * Detect M:N waker/wakee relationships via a switching-frequency heuristic.
 5806 *
 5807 * A waker of many should wake a different task than the one last awakened
 5808 * at a frequency roughly N times higher than one of its wakees.
 5809 *
 5810 * In order to determine whether we should let the load spread vs consolidating
 5811 * to shared cache, we look for a minimum 'flip' frequency of llc_size in one
 5812 * partner, and a factor of lls_size higher frequency in the other.
 5813 *
 5814 * With both conditions met, we can be relatively sure that the relationship is
 5815 * non-monogamous, with partner count exceeding socket size.
 5816 *
 5817 * Waker/wakee being client/server, worker/dispatcher, interrupt source or
 5818 * whatever is irrelevant, spread criteria is apparent partner count exceeds
 5819 * socket size.
 5820 */
 5821static int wake_wide(struct task_struct *p)
 5822{
 5823	unsigned int master = current->wakee_flips;
 5824	unsigned int slave = p->wakee_flips;
 5825	int factor = this_cpu_read(sd_llc_size);
 5826
 5827	if (master < slave)
 5828		swap(master, slave);
 5829	if (slave < factor || master < slave * factor)
 5830		return 0;
 5831	return 1;
 5832}
 5833
 5834/*
 5835 * The purpose of wake_affine() is to quickly determine on which CPU we can run
 5836 * soonest. For the purpose of speed we only consider the waking and previous
 5837 * CPU.
 5838 *
 5839 * wake_affine_idle() - only considers 'now', it check if the waking CPU is
 5840 *			cache-affine and is (or	will be) idle.
 5841 *
 5842 * wake_affine_weight() - considers the weight to reflect the average
 5843 *			  scheduling latency of the CPUs. This seems to work
 5844 *			  for the overloaded case.
 5845 */
 5846static int
 5847wake_affine_idle(int this_cpu, int prev_cpu, int sync)
 5848{
 5849	/*
 5850	 * If this_cpu is idle, it implies the wakeup is from interrupt
 5851	 * context. Only allow the move if cache is shared. Otherwise an
 5852	 * interrupt intensive workload could force all tasks onto one
 5853	 * node depending on the IO topology or IRQ affinity settings.
 5854	 *
 5855	 * If the prev_cpu is idle and cache affine then avoid a migration.
 5856	 * There is no guarantee that the cache hot data from an interrupt
 5857	 * is more important than cache hot data on the prev_cpu and from
 5858	 * a cpufreq perspective, it's better to have higher utilisation
 5859	 * on one CPU.
 5860	 */
 5861	if (idle_cpu(this_cpu) && cpus_share_cache(this_cpu, prev_cpu))
 5862		return idle_cpu(prev_cpu) ? prev_cpu : this_cpu;
 5863
 5864	if (sync && cpu_rq(this_cpu)->nr_running == 1)
 5865		return this_cpu;
 5866
 5867	return nr_cpumask_bits;
 5868}
 5869
 5870static int
 5871wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
 5872		   int this_cpu, int prev_cpu, int sync)
 5873{
 5874	s64 this_eff_load, prev_eff_load;
 5875	unsigned long task_load;
 5876
 5877	this_eff_load = target_load(this_cpu, sd->wake_idx);
 5878
 5879	if (sync) {
 5880		unsigned long current_load = task_h_load(current);
 5881
 5882		if (current_load > this_eff_load)
 5883			return this_cpu;
 5884
 5885		this_eff_load -= current_load;
 5886	}
 5887
 5888	task_load = task_h_load(p);
 5889
 5890	this_eff_load += task_load;
 5891	if (sched_feat(WA_BIAS))
 5892		this_eff_load *= 100;
 5893	this_eff_load *= capacity_of(prev_cpu);
 5894
 5895	prev_eff_load = source_load(prev_cpu, sd->wake_idx);
 5896	prev_eff_load -= task_load;
 5897	if (sched_feat(WA_BIAS))
 5898		prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2;
 5899	prev_eff_load *= capacity_of(this_cpu);
 5900
 5901	/*
 5902	 * If sync, adjust the weight of prev_eff_load such that if
 5903	 * prev_eff == this_eff that select_idle_sibling() will consider
 5904	 * stacking the wakee on top of the waker if no other CPU is
 5905	 * idle.
 5906	 */
 5907	if (sync)
 5908		prev_eff_load += 1;
 5909
 5910	return this_eff_load < prev_eff_load ? this_cpu : nr_cpumask_bits;
 5911}
 5912
 5913static int wake_affine(struct sched_domain *sd, struct task_struct *p,
 5914		       int this_cpu, int prev_cpu, int sync)
 5915{
 5916	int target = nr_cpumask_bits;
 5917
 5918	if (sched_feat(WA_IDLE))
 5919		target = wake_affine_idle(this_cpu, prev_cpu, sync);
 5920
 5921	if (sched_feat(WA_WEIGHT) && target == nr_cpumask_bits)
 5922		target = wake_affine_weight(sd, p, this_cpu, prev_cpu, sync);
 5923
 5924	schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);
 5925	if (target == nr_cpumask_bits)
 5926		return prev_cpu;
 5927
 5928	schedstat_inc(sd->ttwu_move_affine);
 5929	schedstat_inc(p->se.statistics.nr_wakeups_affine);
 5930	return target;
 5931}
 5932
 5933static unsigned long cpu_util_wake(int cpu, struct task_struct *p);
 5934
 5935static unsigned long capacity_spare_wake(int cpu, struct task_struct *p)
 5936{
 5937	return max_t(long, capacity_of(cpu) - cpu_util_wake(cpu, p), 0);
 5938}
 5939
 5940/*
 5941 * find_idlest_group finds and returns the least busy CPU group within the
 5942 * domain.
 5943 *
 5944 * Assumes p is allowed on at least one CPU in sd.
 5945 */
 5946static struct sched_group *
 5947find_idlest_group(struct sched_domain *sd, struct task_struct *p,
 5948		  int this_cpu, int sd_flag)
 5949{
 5950	struct sched_group *idlest = NULL, *group = sd->groups;
 5951	struct sched_group *most_spare_sg = NULL;
 5952	unsigned long min_runnable_load = ULONG_MAX;
 5953	unsigned long this_runnable_load = ULONG_MAX;
 5954	unsigned long min_avg_load = ULONG_MAX, this_avg_load = ULONG_MAX;
 5955	unsigned long most_spare = 0, this_spare = 0;
 5956	int load_idx = sd->forkexec_idx;
 5957	int imbalance_scale = 100 + (sd->imbalance_pct-100)/2;
 5958	unsigned long imbalance = scale_load_down(NICE_0_LOAD) *
 5959				(sd->imbalance_pct-100) / 100;
 5960
 5961	if (sd_flag & SD_BALANCE_WAKE)
 5962		load_idx = sd->wake_idx;
 5963
 5964	do {
 5965		unsigned long load, avg_load, runnable_load;
 5966		unsigned long spare_cap, max_spare_cap;
 5967		int local_group;
 5968		int i;
 5969
 5970		/* Skip over this group if it has no CPUs allowed */
 5971		if (!cpumask_intersects(sched_group_span(group),
 5972					&p->cpus_allowed))
 5973			continue;
 5974
 5975		local_group = cpumask_test_cpu(this_cpu,
 5976					       sched_group_span(group));
 5977
 5978		/*
 5979		 * Tally up the load of all CPUs in the group and find
 5980		 * the group containing the CPU with most spare capacity.
 5981		 */
 5982		avg_load = 0;
 5983		runnable_load = 0;
 5984		max_spare_cap = 0;
 5985
 5986		for_each_cpu(i, sched_group_span(group)) {
 5987			/* Bias balancing toward CPUs of our domain */
 5988			if (local_group)
 5989				load = source_load(i, load_idx);
 5990			else
 5991				load = target_load(i, load_idx);
 5992
 5993			runnable_load += load;
 5994
 5995			avg_load += cfs_rq_load_avg(&cpu_rq(i)->cfs);
 5996
 5997			spare_cap = capacity_spare_wake(i, p);
 5998
 5999			if (spare_cap > max_spare_cap)
 6000				max_spare_cap = spare_cap;
 6001		}
 6002
 6003		/* Adjust by relative CPU capacity of the group */
 6004		avg_load = (avg_load * SCHED_CAPACITY_SCALE) /
 6005					group->sgc->capacity;
 6006		runnable_load = (runnable_load * SCHED_CAPACITY_SCALE) /
 6007					group->sgc->capacity;
 6008
 6009		if (local_group) {
 6010			this_runnable_load = runnable_load;
 6011			this_avg_load = avg_load;
 6012			this_spare = max_spare_cap;
 6013		} else {
 6014			if (min_runnable_load > (runnable_load + imbalance)) {
 6015				/*
 6016				 * The runnable load is significantly smaller
 6017				 * so we can pick this new CPU:
 6018				 */
 6019				min_runnable_load = runnable_load;
 6020				min_avg_load = avg_load;
 6021				idlest = group;
 6022			} else if ((runnable_load < (min_runnable_load + imbalance)) &&
 6023				   (100*min_avg_load > imbalance_scale*avg_load)) {
 6024				/*
 6025				 * The runnable loads are close so take the
 6026				 * blocked load into account through avg_load:
 6027				 */
 6028				min_avg_load = avg_load;
 6029				idlest = group;
 6030			}
 6031
 6032			if (most_spare < max_spare_cap) {
 6033				most_spare = max_spare_cap;
 6034				most_spare_sg = group;
 6035			}
 6036		}
 6037	} while (group = group->next, group != sd->groups);
 6038
 6039	/*
 6040	 * The cross-over point between using spare capacity or least load
 6041	 * is too conservative for high utilization tasks on partially
 6042	 * utilized systems if we require spare_capacity > task_util(p),
 6043	 * so we allow for some task stuffing by using
 6044	 * spare_capacity > task_util(p)/2.
 6045	 *
 6046	 * Spare capacity can't be used for fork because the utilization has
 6047	 * not been set yet, we must first select a rq to compute the initial
 6048	 * utilization.
 6049	 */
 6050	if (sd_flag & SD_BALANCE_FORK)
 6051		goto skip_spare;
 6052
 6053	if (this_spare > task_util(p) / 2 &&
 6054	    imbalance_scale*this_spare > 100*most_spare)
 6055		return NULL;
 6056
 6057	if (most_spare > task_util(p) / 2)
 6058		return most_spare_sg;
 6059
 6060skip_spare:
 6061	if (!idlest)
 6062		return NULL;
 6063
 6064	/*
 6065	 * When comparing groups across NUMA domains, it's possible for the
 6066	 * local domain to be very lightly loaded relative to the remote
 6067	 * domains but "imbalance" skews the comparison making remote CPUs
 6068	 * look much more favourable. When considering cross-domain, add
 6069	 * imbalance to the runnable load on the remote node and consider
 6070	 * staying local.
 6071	 */
 6072	if ((sd->flags & SD_NUMA) &&
 6073	    min_runnable_load + imbalance >= this_runnable_load)
 6074		return NULL;
 6075
 6076	if (min_runnable_load > (this_runnable_load + imbalance))
 6077		return NULL;
 6078
 6079	if ((this_runnable_load < (min_runnable_load + imbalance)) &&
 6080	     (100*this_avg_load < imbalance_scale*min_avg_load))
 6081		return NULL;
 6082
 6083	return idlest;
 6084}
 6085
 6086/*
 6087 * find_idlest_group_cpu - find the idlest CPU among the CPUs in the group.
 6088 */
 6089static int
 6090find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
 6091{
 6092	unsigned long load, min_load = ULONG_MAX;
 6093	unsigned int min_exit_latency = UINT_MAX;
 6094	u64 latest_idle_timestamp = 0;
 6095	int least_loaded_cpu = this_cpu;
 6096	int shallowest_idle_cpu = -1;
 6097	int i;
 6098
 6099	/* Check if we have any choice: */
 6100	if (group->group_weight == 1)
 6101		return cpumask_first(sched_group_span(group));
 6102
 6103	/* Traverse only the allowed CPUs */
 6104	for_each_cpu_and(i, sched_group_span(group), &p->cpus_allowed) {
 6105		if (idle_cpu(i)) {
 6106			struct rq *rq = cpu_rq(i);
 6107			struct cpuidle_state *idle = idle_get_state(rq);
 6108			if (idle && idle->exit_latency < min_exit_latency) {
 6109				/*
 6110				 * We give priority to a CPU whose idle state
 6111				 * has the smallest exit latency irrespective
 6112				 * of any idle timestamp.
 6113				 */
 6114				min_exit_latency = idle->exit_latency;
 6115				latest_idle_timestamp = rq->idle_stamp;
 6116				shallowest_idle_cpu = i;
 6117			} else if ((!idle || idle->exit_latency == min_exit_latency) &&
 6118				   rq->idle_stamp > latest_idle_timestamp) {
 6119				/*
 6120				 * If equal or no active idle state, then
 6121				 * the most recently idled CPU might have
 6122				 * a warmer cache.
 6123				 */
 6124				latest_idle_timestamp = rq->idle_stamp;
 6125				shallowest_idle_cpu = i;
 6126			}
 6127		} else if (shallowest_idle_cpu == -1) {
 6128			load = weighted_cpuload(cpu_rq(i));
 6129			if (load < min_load) {
 6130				min_load = load;
 6131				least_loaded_cpu = i;
 6132			}
 6133		}
 6134	}
 6135
 6136	return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
 6137}
 6138
 6139static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p,
 6140				  int cpu, int prev_cpu, int sd_flag)
 6141{
 6142	int new_cpu = cpu;
 6143
 6144	if (!cpumask_intersects(sched_domain_span(sd), &p->cpus_allowed))
 6145		return prev_cpu;
 6146
 6147	while (sd) {
 6148		struct sched_group *group;
 6149		struct sched_domain *tmp;
 6150		int weight;
 6151
 6152		if (!(sd->flags & sd_flag)) {
 6153			sd = sd->child;
 6154			continue;
 6155		}
 6156
 6157		group = find_idlest_group(sd, p, cpu, sd_flag);
 6158		if (!group) {
 6159			sd = sd->child;
 6160			continue;
 6161		}
 6162
 6163		new_cpu = find_idlest_group_cpu(group, p, cpu);
 6164		if (new_cpu == cpu) {
 6165			/* Now try balancing at a lower domain level of 'cpu': */
 6166			sd = sd->child;
 6167			continue;
 6168		}
 6169
 6170		/* Now try balancing at a lower domain level of 'new_cpu': */
 6171		cpu = new_cpu;
 6172		weight = sd->span_weight;
 6173		sd = NULL;
 6174		for_each_domain(cpu, tmp) {
 6175			if (weight <= tmp->span_weight)
 6176				break;
 6177			if (tmp->flags & sd_flag)
 6178				sd = tmp;
 6179		}
 6180	}
 6181
 6182	return new_cpu;
 6183}
 6184
 6185#ifdef CONFIG_SCHED_SMT
 6186
 6187static inline void set_idle_cores(int cpu, int val)
 6188{
 6189	struct sched_domain_shared *sds;
 6190
 6191	sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
 6192	if (sds)
 6193		WRITE_ONCE(sds->has_idle_cores, val);
 6194}
 6195
 6196static inline bool test_idle_cores(int cpu, bool def)
 6197{
 6198	struct sched_domain_shared *sds;
 6199
 6200	sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
 6201	if (sds)
 6202		return READ_ONCE(sds->has_idle_cores);
 6203
 6204	return def;
 6205}
 6206
 6207/*
 6208 * Scans the local SMT mask to see if the entire core is idle, and records this
 6209 * information in sd_llc_shared->has_idle_cores.
 6210 *
 6211 * Since SMT siblings share all cache levels, inspecting this limited remote
 6212 * state should be fairly cheap.
 6213 */
 6214void __update_idle_core(struct rq *rq)
 6215{
 6216	int core = cpu_of(rq);
 6217	int cpu;
 6218
 6219	rcu_read_lock();
 6220	if (test_idle_cores(core, true))
 6221		goto unlock;
 6222
 6223	for_each_cpu(cpu, cpu_smt_mask(core)) {
 6224		if (cpu == core)
 6225			continue;
 6226
 6227		if (!idle_cpu(cpu))
 6228			goto unlock;
 6229	}
 6230
 6231	set_idle_cores(core, 1);
 6232unlock:
 6233	rcu_read_unlock();
 6234}
 6235
 6236/*
 6237 * Scan the entire LLC domain for idle cores; this dynamically switches off if
 6238 * there are no idle cores left in the system; tracked through
 6239 * sd_llc->shared->has_idle_cores and enabled through update_idle_core() above.
 6240 */
 6241static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
 6242{
 6243	struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
 6244	int core, cpu;
 6245
 6246	if (!static_branch_likely(&sched_smt_present))
 6247		return -1;
 6248
 6249	if (!test_idle_cores(target, false))
 6250		return -1;
 6251
 6252	cpumask_and(cpus, sched_domain_span(sd), &p->cpus_allowed);
 6253
 6254	for_each_cpu_wrap(core, cpus, target) {
 6255		bool idle = true;
 6256
 6257		for_each_cpu(cpu, cpu_smt_mask(core)) {
 6258			cpumask_clear_cpu(cpu, cpus);
 6259			if (!idle_cpu(cpu))
 6260				idle = false;
 6261		}
 6262
 6263		if (idle)
 6264			return core;
 6265	}
 6266
 6267	/*
 6268	 * Failed to find an idle core; stop looking for one.
 6269	 */
 6270	set_idle_cores(target, 0);
 6271
 6272	return -1;
 6273}
 6274
 6275/*
 6276 * Scan the local SMT mask for idle CPUs.
 6277 */
 6278static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
 6279{
 6280	int cpu;
 6281
 6282	if (!static_branch_likely(&sched_smt_present))
 6283		return -1;
 6284
 6285	for_each_cpu(cpu, cpu_smt_mask(target)) {
 6286		if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
 6287			continue;
 6288		if (idle_cpu(cpu))
 6289			return cpu;
 6290	}
 6291
 6292	return -1;
 6293}
 6294
 6295#else /* CONFIG_SCHED_SMT */
 6296
 6297static inline int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
 6298{
 6299	return -1;
 6300}
 6301
 6302static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
 6303{
 6304	return -1;
 6305}
 6306
 6307#endif /* CONFIG_SCHED_SMT */
 6308
 6309/*
 6310 * Scan the LLC domain for idle CPUs; this is dynamically regulated by
 6311 * comparing the average scan cost (tracked in sd->avg_scan_cost) against the
 6312 * average idle time for this rq (as found in rq->avg_idle).
 6313 */
 6314static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target)
 6315{
 6316	struct sched_domain *this_sd;
 6317	u64 avg_cost, avg_idle;
 6318	u64 time, cost;
 6319	s64 delta;
 6320	int cpu, nr = INT_MAX;
 6321
 6322	this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
 6323	if (!this_sd)
 6324		return -1;
 6325
 6326	/*
 6327	 * Due to large variance we need a large fuzz factor; hackbench in
 6328	 * particularly is sensitive here.
 6329	 */
 6330	avg_idle = this_rq()->avg_idle / 512;
 6331	avg_cost = this_sd->avg_scan_cost + 1;
 6332
 6333	if (sched_feat(SIS_AVG_CPU) && avg_idle < avg_cost)
 6334		return -1;
 6335
 6336	if (sched_feat(SIS_PROP)) {
 6337		u64 span_avg = sd->span_weight * avg_idle;
 6338		if (span_avg > 4*avg_cost)
 6339			nr = div_u64(span_avg, avg_cost);
 6340		else
 6341			nr = 4;
 6342	}
 6343
 6344	time = local_clock();
 6345
 6346	for_each_cpu_wrap(cpu, sched_domain_span(sd), target) {
 6347		if (!--nr)
 6348			return -1;
 6349		if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
 6350			continue;
 6351		if (idle_cpu(cpu))
 6352			break;
 6353	}
 6354
 6355	time = local_clock() - time;
 6356	cost = this_sd->avg_scan_cost;
 6357	delta = (s64)(time - cost) / 8;
 6358	this_sd->avg_scan_cost += delta;
 6359
 6360	return cpu;
 6361}
 6362
 6363/*
 6364 * Try and locate an idle core/thread in the LLC cache domain.
 6365 */
 6366static int select_idle_sibling(struct task_struct *p, int prev, int target)
 6367{
 6368	struct sched_domain *sd;
 6369	int i, recent_used_cpu;
 6370
 6371	if (idle_cpu(target))
 6372		return target;
 6373
 6374	/*
 6375	 * If the previous CPU is cache affine and idle, don't be stupid:
 6376	 */
 6377	if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev))
 6378		return prev;
 6379
 6380	/* Check a recently used CPU as a potential idle candidate: */
 6381	recent_used_cpu = p->recent_used_cpu;
 6382	if (recent_used_cpu != prev &&
 6383	    recent_used_cpu != target &&
 6384	    cpus_share_cache(recent_used_cpu, target) &&
 6385	    idle_cpu(recent_used_cpu) &&
 6386	    cpumask_test_cpu(p->recent_used_cpu, &p->cpus_allowed)) {
 6387		/*
 6388		 * Replace recent_used_cpu with prev as it is a potential
 6389		 * candidate for the next wake:
 6390		 */
 6391		p->recent_used_cpu = prev;
 6392		return recent_used_cpu;
 6393	}
 6394
 6395	sd = rcu_dereference(per_cpu(sd_llc, target));
 6396	if (!sd)
 6397		return target;
 6398
 6399	i = select_idle_core(p, sd, target);
 6400	if ((unsigned)i < nr_cpumask_bits)
 6401		return i;
 6402
 6403	i = select_idle_cpu(p, sd, target);
 6404	if ((unsigned)i < nr_cpumask_bits)
 6405		return i;
 6406
 6407	i = select_idle_smt(p, sd, target);
 6408	if ((unsigned)i < nr_cpumask_bits)
 6409		return i;
 6410
 6411	return target;
 6412}
 6413
 6414/**
 6415 * Amount of capacity of a CPU that is (estimated to be) used by CFS tasks
 6416 * @cpu: the CPU to get the utilization of
 6417 *
 6418 * The unit of the return value must be the one of capacity so we can compare
 6419 * the utilization with the capacity of the CPU that is available for CFS task
 6420 * (ie cpu_capacity).
 6421 *
 6422 * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the
 6423 * recent utilization of currently non-runnable tasks on a CPU. It represents
 6424 * the amount of utilization of a CPU in the range [0..capacity_orig] where
 6425 * capacity_orig is the cpu_capacity available at the highest frequency
 6426 * (arch_scale_freq_capacity()).
 6427 * The utilization of a CPU converges towards a sum equal to or less than the
 6428 * current capacity (capacity_curr <= capacity_orig) of the CPU because it is
 6429 * the running time on this CPU scaled by capacity_curr.
 6430 *
 6431 * The estimated utilization of a CPU is defined to be the maximum between its
 6432 * cfs_rq.avg.util_avg and the sum of the estimated utilization of the tasks
 6433 * currently RUNNABLE on that CPU.
 6434 * This allows to properly represent the expected utilization of a CPU which
 6435 * has just got a big task running since a long sleep period. At the same time
 6436 * however it preserves the benefits of the "blocked utilization" in
 6437 * describing the potential for other tasks waking up on the same CPU.
 6438 *
 6439 * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even
 6440 * higher than capacity_orig because of unfortunate rounding in
 6441 * cfs.avg.util_avg or just after migrating tasks and new task wakeups until
 6442 * the average stabilizes with the new running time. We need to check that the
 6443 * utilization stays within the range of [0..capacity_orig] and cap it if
 6444 * necessary. Without utilization capping, a group could be seen as overloaded
 6445 * (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of
 6446 * available capacity. We allow utilization to overshoot capacity_curr (but not
 6447 * capacity_orig) as it useful for predicting the capacity required after task
 6448 * migrations (scheduler-driven DVFS).
 6449 *
 6450 * Return: the (estimated) utilization for the specified CPU
 6451 */
 6452static inline unsigned long cpu_util(int cpu)
 6453{
 6454	struct cfs_rq *cfs_rq;
 6455	unsigned int util;
 6456
 6457	cfs_rq = &cpu_rq(cpu)->cfs;
 6458	util = READ_ONCE(cfs_rq->avg.util_avg);
 6459
 6460	if (sched_feat(UTIL_EST))
 6461		util = max(util, READ_ONCE(cfs_rq->avg.util_est.enqueued));
 6462
 6463	return min_t(unsigned long, util, capacity_orig_of(cpu));
 6464}
 6465
 6466/*
 6467 * cpu_util_wake: Compute CPU utilization with any contributions from
 6468 * the waking task p removed.
 6469 */
 6470static unsigned long cpu_util_wake(int cpu, struct task_struct *p)
 6471{
 6472	struct cfs_rq *cfs_rq;
 6473	unsigned int util;
 6474
 6475	/* Task has no contribution or is new */
 6476	if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
 6477		return cpu_util(cpu);
 6478
 6479	cfs_rq = &cpu_rq(cpu)->cfs;
 6480	util = READ_ONCE(cfs_rq->avg.util_avg);
 6481
 6482	/* Discount task's blocked util from CPU's util */
 6483	util -= min_t(unsigned int, util, task_util(p));
 6484
 6485	/*
 6486	 * Covered cases:
 6487	 *
 6488	 * a) if *p is the only task sleeping on this CPU, then:
 6489	 *      cpu_util (== task_util) > util_est (== 0)
 6490	 *    and thus we return:
 6491	 *      cpu_util_wake = (cpu_util - task_util) = 0
 6492	 *
 6493	 * b) if other tasks are SLEEPING on this CPU, which is now exiting
 6494	 *    IDLE, then:
 6495	 *      cpu_util >= task_util
 6496	 *      cpu_util > util_est (== 0)
 6497	 *    and thus we discount *p's blocked utilization to return:
 6498	 *      cpu_util_wake = (cpu_util - task_util) >= 0
 6499	 *
 6500	 * c) if other tasks are RUNNABLE on that CPU and
 6501	 *      util_est > cpu_util
 6502	 *    then we use util_est since it returns a more restrictive
 6503	 *    estimation of the spare capacity on that CPU, by just
 6504	 *    considering the expected utilization of tasks already
 6505	 *    runnable on that CPU.
 6506	 *
 6507	 * Cases a) and b) are covered by the above code, while case c) is
 6508	 * covered by the following code when estimated utilization is
 6509	 * enabled.
 6510	 */
 6511	if (sched_feat(UTIL_EST))
 6512		util = max(util, READ_ONCE(cfs_rq->avg.util_est.enqueued));
 6513
 6514	/*
 6515	 * Utilization (estimated) can exceed the CPU capacity, thus let's
 6516	 * clamp to the maximum CPU capacity to ensure consistency with
 6517	 * the cpu_util call.
 6518	 */
 6519	return min_t(unsigned long, util, capacity_orig_of(cpu));
 6520}
 6521
 6522/*
 6523 * Disable WAKE_AFFINE in the case where task @p doesn't fit in the
 6524 * capacity of either the waking CPU @cpu or the previous CPU @prev_cpu.
 6525 *
 6526 * In that case WAKE_AFFINE doesn't make sense and we'll let
 6527 * BALANCE_WAKE sort things out.
 6528 */
 6529static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
 6530{
 6531	long min_cap, max_cap;
 6532
 6533	min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu));
 6534	max_cap = cpu_rq(cpu)->rd->max_cpu_capacity;
 6535
 6536	/* Minimum capacity is close to max, no need to abort wake_affine */
 6537	if (max_cap - min_cap < max_cap >> 3)
 6538		return 0;
 6539
 6540	/* Bring task utilization in sync with prev_cpu */
 6541	sync_entity_load_avg(&p->se);
 6542
 6543	return min_cap * 1024 < task_util(p) * capacity_margin;
 6544}
 6545
 6546/*
 6547 * select_task_rq_fair: Select target runqueue for the waking task in domains
 6548 * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
 6549 * SD_BALANCE_FORK, or SD_BALANCE_EXEC.
 6550 *
 6551 * Balances load by selecting the idlest CPU in the idlest group, or under
 6552 * certain conditions an idle sibling CPU if the domain has SD_WAKE_AFFINE set.
 6553 *
 6554 * Returns the target CPU number.
 6555 *
 6556 * preempt must be disabled.
 6557 */
 6558static int
 6559select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
 6560{
 6561	struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
 6562	int cpu = smp_processor_id();
 6563	int new_cpu = prev_cpu;
 6564	int want_affine = 0;
 6565	int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING);
 6566
 6567	if (sd_flag & SD_BALANCE_WAKE) {
 6568		record_wakee(p);
 6569		want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu)
 6570			      && cpumask_test_cpu(cpu, &p->cpus_allowed);
 6571	}
 6572
 6573	rcu_read_lock();
 6574	for_each_domain(cpu, tmp) {
 6575		if (!(tmp->flags & SD_LOAD_BALANCE))
 6576			break;
 6577
 6578		/*
 6579		 * If both 'cpu' and 'prev_cpu' are part of this domain,
 6580		 * cpu is a valid SD_WAKE_AFFINE target.
 6581		 */
 6582		if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
 6583		    cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
 6584			affine_sd = tmp;
 6585			break;
 6586		}
 6587
 6588		if (tmp->flags & sd_flag)
 6589			sd = tmp;
 6590		else if (!want_affine)
 6591			break;
 6592	}
 6593
 6594	if (affine_sd) {
 6595		sd = NULL; /* Prefer wake_affine over balance flags */
 6596		if (cpu == prev_cpu)
 6597			goto pick_cpu;
 6598
 6599		new_cpu = wake_affine(affine_sd, p, cpu, prev_cpu, sync);
 6600	}
 6601
 6602	if (sd && !(sd_flag & SD_BALANCE_FORK)) {
 6603		/*
 6604		 * We're going to need the task's util for capacity_spare_wake
 6605		 * in find_idlest_group. Sync it up to prev_cpu's
 6606		 * last_update_time.
 6607		 */
 6608		sync_entity_load_avg(&p->se);
 6609	}
 6610
 6611	if (!sd) {
 6612pick_cpu:
 6613		if (sd_flag & SD_BALANCE_WAKE) { /* XXX always ? */
 6614			new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
 6615
 6616			if (want_affine)
 6617				current->recent_used_cpu = cpu;
 6618		}
 6619	} else {
 6620		new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
 6621	}
 6622	rcu_read_unlock();
 6623
 6624	return new_cpu;
 6625}
 6626
 6627static void detach_entity_cfs_rq(struct sched_entity *se);
 6628
 6629/*
 6630 * Called immediately before a task is migrated to a new CPU; task_cpu(p) and
 6631 * cfs_rq_of(p) references at time of call are still valid and identify the
 6632 * previous CPU. The caller guarantees p->pi_lock or task_rq(p)->lock is held.
 6633 */
 6634static void migrate_task_rq_fair(struct task_struct *p)
 6635{
 6636	/*
 6637	 * As blocked tasks retain absolute vruntime the migration needs to
 6638	 * deal with this by subtracting the old and adding the new
 6639	 * min_vruntime -- the latter is done by enqueue_entity() when placing
 6640	 * the task on the new runqueue.
 6641	 */
 6642	if (p->state == TASK_WAKING) {
 6643		struct sched_entity *se = &p->se;
 6644		struct cfs_rq *cfs_rq = cfs_rq_of(se);
 6645		u64 min_vruntime;
 6646
 6647#ifndef CONFIG_64BIT
 6648		u64 min_vruntime_copy;
 6649
 6650		do {
 6651			min_vruntime_copy = cfs_rq->min_vruntime_copy;
 6652			smp_rmb();
 6653			min_vruntime = cfs_rq->min_vruntime;
 6654		} while (min_vruntime != min_vruntime_copy);
 6655#else
 6656		min_vruntime = cfs_rq->min_vruntime;
 6657#endif
 6658
 6659		se->vruntime -= min_vruntime;
 6660	}
 6661
 6662	if (p->on_rq == TASK_ON_RQ_MIGRATING) {
 6663		/*
 6664		 * In case of TASK_ON_RQ_MIGRATING we in fact hold the 'old'
 6665		 * rq->lock and can modify state directly.
 6666		 */
 6667		lockdep_assert_held(&task_rq(p)->lock);
 6668		detach_entity_cfs_rq(&p->se);
 6669
 6670	} else {
 6671		/*
 6672		 * We are supposed to update the task to "current" time, then
 6673		 * its up to date and ready to go to new CPU/cfs_rq. But we
 6674		 * have difficulty in getting what current time is, so simply
 6675		 * throw away the out-of-date time. This will result in the
 6676		 * wakee task is less decayed, but giving the wakee more load
 6677		 * sounds not bad.
 6678		 */
 6679		remove_entity_load_avg(&p->se);
 6680	}
 6681
 6682	/* Tell new CPU we are migrated */
 6683	p->se.avg.last_update_time = 0;
 6684
 6685	/* We have migrated, no longer consider this task hot */
 6686	p->se.exec_start = 0;
 6687}
 6688
 6689static void task_dead_fair(struct task_struct *p)
 6690{
 6691	remove_entity_load_avg(&p->se);
 6692}
 6693#endif /* CONFIG_SMP */
 6694
 6695static unsigned long wakeup_gran(struct sched_entity *se)
 6696{
 6697	unsigned long gran = sysctl_sched_wakeup_granularity;
 6698
 6699	/*
 6700	 * Since its curr running now, convert the gran from real-time
 6701	 * to virtual-time in his units.
 6702	 *
 6703	 * By using 'se' instead of 'curr' we penalize light tasks, so
 6704	 * they get preempted easier. That is, if 'se' < 'curr' then
 6705	 * the resulting gran will be larger, therefore penalizing the
 6706	 * lighter, if otoh 'se' > 'curr' then the resulting gran will
 6707	 * be smaller, again penalizing the lighter task.
 6708	 *
 6709	 * This is especially important for buddies when the leftmost
 6710	 * task is higher priority than the buddy.
 6711	 */
 6712	return calc_delta_fair(gran, se);
 6713}
 6714
 6715/*
 6716 * Should 'se' preempt 'curr'.
 6717 *
 6718 *             |s1
 6719 *        |s2
 6720 *   |s3
 6721 *         g
 6722 *      |<--->|c
 6723 *
 6724 *  w(c, s1) = -1
 6725 *  w(c, s2) =  0
 6726 *  w(c, s3) =  1
 6727 *
 6728 */
 6729static int
 6730wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
 6731{
 6732	s64 gran, vdiff = curr->vruntime - se->vruntime;
 6733
 6734	if (vdiff <= 0)
 6735		return -1;
 6736
 6737	gran = wakeup_gran(se);
 6738	if (vdiff > gran)
 6739		return 1;
 6740
 6741	return 0;
 6742}
 6743
 6744static void set_last_buddy(struct sched_entity *se)
 6745{
 6746	if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
 6747		return;
 6748
 6749	for_each_sched_entity(se) {
 6750		if (SCHED_WARN_ON(!se->on_rq))
 6751			return;
 6752		cfs_rq_of(se)->last = se;
 6753	}
 6754}
 6755
 6756static void set_next_buddy(struct sched_entity *se)
 6757{
 6758	if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
 6759		return;
 6760
 6761	for_each_sched_entity(se) {
 6762		if (SCHED_WARN_ON(!se->on_rq))
 6763			return;
 6764		cfs_rq_of(se)->next = se;
 6765	}
 6766}
 6767
 6768static void set_skip_buddy(struct sched_entity *se)
 6769{
 6770	for_each_sched_entity(se)
 6771		cfs_rq_of(se)->skip = se;
 6772}
 6773
 6774/*
 6775 * Preempt the current task with a newly woken task if needed:
 6776 */
 6777static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
 6778{
 6779	struct task_struct *curr = rq->curr;
 6780	struct sched_entity *se = &curr->se, *pse = &p->se;
 6781	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
 6782	int scale = cfs_rq->nr_running >= sched_nr_latency;
 6783	int next_buddy_marked = 0;
 6784
 6785	if (unlikely(se == pse))
 6786		return;
 6787
 6788	/*
 6789	 * This is possible from callers such as attach_tasks(), in which we
 6790	 * unconditionally check_prempt_curr() after an enqueue (which may have
 6791	 * lead to a throttle).  This both saves work and prevents false
 6792	 * next-buddy nomination below.
 6793	 */
 6794	if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
 6795		return;
 6796
 6797	if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
 6798		set_next_buddy(pse);
 6799		next_buddy_marked = 1;
 6800	}
 6801
 6802	/*
 6803	 * We can come here with TIF_NEED_RESCHED already set from new task
 6804	 * wake up path.
 6805	 *
 6806	 * Note: this also catches the edge-case of curr being in a throttled
 6807	 * group (e.g. via set_curr_task), since update_curr() (in the
 6808	 * enqueue of curr) will have resulted in resched being set.  This
 6809	 * prevents us from potentially nominating it as a false LAST_BUDDY
 6810	 * below.
 6811	 */
 6812	if (test_tsk_need_resched(curr))
 6813		return;
 6814
 6815	/* Idle tasks are by definition preempted by non-idle tasks. */
 6816	if (unlikely(curr->policy == SCHED_IDLE) &&
 6817	    likely(p->policy != SCHED_IDLE))
 6818		goto preempt;
 6819
 6820	/*
 6821	 * Batch and idle tasks do not preempt non-idle tasks (their preemption
 6822	 * is driven by the tick):
 6823	 */
 6824	if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))
 6825		return;
 6826
 6827	find_matching_se(&se, &pse);
 6828	update_curr(cfs_rq_of(se));
 6829	BUG_ON(!pse);
 6830	if (wakeup_preempt_entity(se, pse) == 1) {
 6831		/*
 6832		 * Bias pick_next to pick the sched entity that is
 6833		 * triggering this preemption.
 6834		 */
 6835		if (!next_buddy_marked)
 6836			set_next_buddy(pse);
 6837		goto preempt;
 6838	}
 6839
 6840	return;
 6841
 6842preempt:
 6843	resched_curr(rq);
 6844	/*
 6845	 * Only set the backward buddy when the current task is still
 6846	 * on the rq. This can happen when a wakeup gets interleaved
 6847	 * with schedule on the ->pre_schedule() or idle_balance()
 6848	 * point, either of which can * drop the rq lock.
 6849	 *
 6850	 * Also, during early boot the idle thread is in the fair class,
 6851	 * for obvious reasons its a bad idea to schedule back to it.
 6852	 */
 6853	if (unlikely(!se->on_rq || curr == rq->idle))
 6854		return;
 6855
 6856	if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
 6857		set_last_buddy(se);
 6858}
 6859
 6860static struct task_struct *
 6861pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 6862{
 6863	struct cfs_rq *cfs_rq = &rq->cfs;
 6864	struct sched_entity *se;
 6865	struct task_struct *p;
 6866	int new_tasks;
 6867
 6868again:
 6869	if (!cfs_rq->nr_running)
 6870		goto idle;
 6871
 6872#ifdef CONFIG_FAIR_GROUP_SCHED
 6873	if (prev->sched_class != &fair_sched_class)
 6874		goto simple;
 6875
 6876	/*
 6877	 * Because of the set_next_buddy() in dequeue_task_fair() it is rather
 6878	 * likely that a next task is from the same cgroup as the current.
 6879	 *
 6880	 * Therefore attempt to avoid putting and setting the entire cgroup
 6881	 * hierarchy, only change the part that actually changes.
 6882	 */
 6883
 6884	do {
 6885		struct sched_entity *curr = cfs_rq->curr;
 6886
 6887		/*
 6888		 * Since we got here without doing put_prev_entity() we also
 6889		 * have to consider cfs_rq->curr. If it is still a runnable
 6890		 * entity, update_curr() will update its vruntime, otherwise
 6891		 * forget we've ever seen it.
 6892		 */
 6893		if (curr) {
 6894			if (curr->on_rq)
 6895				update_curr(cfs_rq);
 6896			else
 6897				curr = NULL;
 6898
 6899			/*
 6900			 * This call to check_cfs_rq_runtime() will do the
 6901			 * throttle and dequeue its entity in the parent(s).
 6902			 * Therefore the nr_running test will indeed
 6903			 * be correct.
 6904			 */
 6905			if (unlikely(check_cfs_rq_runtime(cfs_rq))) {
 6906				cfs_rq = &rq->cfs;
 6907
 6908				if (!cfs_rq->nr_running)
 6909					goto idle;
 6910
 6911				goto simple;
 6912			}
 6913		}
 6914
 6915		se = pick_next_entity(cfs_rq, curr);
 6916		cfs_rq = group_cfs_rq(se);
 6917	} while (cfs_rq);
 6918
 6919	p = task_of(se);
 6920
 6921	/*
 6922	 * Since we haven't yet done put_prev_entity and if the selected task
 6923	 * is a different task than we started out with, try and touch the
 6924	 * least amount of cfs_rqs.
 6925	 */
 6926	if (prev != p) {
 6927		struct sched_entity *pse = &prev->se;
 6928
 6929		while (!(cfs_rq = is_same_group(se, pse))) {
 6930			int se_depth = se->depth;
 6931			int pse_depth = pse->depth;
 6932
 6933			if (se_depth <= pse_depth) {
 6934				put_prev_entity(cfs_rq_of(pse), pse);
 6935				pse = parent_entity(pse);
 6936			}
 6937			if (se_depth >= pse_depth) {
 6938				set_next_entity(cfs_rq_of(se), se);
 6939				se = parent_entity(se);
 6940			}
 6941		}
 6942
 6943		put_prev_entity(cfs_rq, pse);
 6944		set_next_entity(cfs_rq, se);
 6945	}
 6946
 6947	goto done;
 6948simple:
 6949#endif
 6950
 6951	put_prev_task(rq, prev);
 6952
 6953	do {
 6954		se = pick_next_entity(cfs_rq, NULL);
 6955		set_next_entity(cfs_rq, se);
 6956		cfs_rq = group_cfs_rq(se);
 6957	} while (cfs_rq);
 6958
 6959	p = task_of(se);
 6960
 6961done: __maybe_unused;
 6962#ifdef CONFIG_SMP
 6963	/*
 6964	 * Move the next running task to the front of
 6965	 * the list, so our cfs_tasks list becomes MRU
 6966	 * one.
 6967	 */
 6968	list_move(&p->se.group_node, &rq->cfs_tasks);
 6969#endif
 6970
 6971	if (hrtick_enabled(rq))
 6972		hrtick_start_fair(rq, p);
 6973
 6974	return p;
 6975
 6976idle:
 6977	new_tasks = idle_balance(rq, rf);
 6978
 6979	/*
 6980	 * Because idle_balance() releases (and re-acquires) rq->lock, it is
 6981	 * possible for any higher priority task to appear. In that case we
 6982	 * must re-start the pick_next_entity() loop.
 6983	 */
 6984	if (new_tasks < 0)
 6985		return RETRY_TASK;
 6986
 6987	if (new_tasks > 0)
 6988		goto again;
 6989
 6990	return NULL;
 6991}
 6992
 6993/*
 6994 * Account for a descheduled task:
 6995 */
 6996static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
 6997{
 6998	struct sched_entity *se = &prev->se;
 6999	struct cfs_rq *cfs_rq;
 7000
 7001	for_each_sched_entity(se) {
 7002		cfs_rq = cfs_rq_of(se);
 7003		put_prev_entity(cfs_rq, se);
 7004	}
 7005}
 7006
 7007/*
 7008 * sched_yield() is very simple
 7009 *
 7010 * The magic of dealing with the ->skip buddy is in pick_next_entity.
 7011 */
 7012static void yield_task_fair(struct rq *rq)
 7013{
 7014	struct task_struct *curr = rq->curr;
 7015	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
 7016	struct sched_entity *se = &curr->se;
 7017
 7018	/*
 7019	 * Are we the only task in the tree?
 7020	 */
 7021	if (unlikely(rq->nr_running == 1))
 7022		return;
 7023
 7024	clear_buddies(cfs_rq, se);
 7025
 7026	if (curr->policy != SCHED_BATCH) {
 7027		update_rq_clock(rq);
 7028		/*
 7029		 * Update run-time statistics of the 'current'.
 7030		 */
 7031		update_curr(cfs_rq);
 7032		/*
 7033		 * Tell update_rq_clock() that we've just updated,
 7034		 * so we don't do microscopic update in schedule()
 7035		 * and double the fastpath cost.
 7036		 */
 7037		rq_clock_skip_update(rq);
 7038	}
 7039
 7040	set_skip_buddy(se);
 7041}
 7042
 7043static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
 7044{
 7045	struct sched_entity *se = &p->se;
 7046
 7047	/* throttled hierarchies are not runnable */
 7048	if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
 7049		return false;
 7050
 7051	/* Tell the scheduler that we'd really like pse to run next. */
 7052	set_next_buddy(se);
 7053
 7054	yield_task_fair(rq);
 7055
 7056	return true;
 7057}
 7058
 7059#ifdef CONFIG_SMP
 7060/**************************************************
 7061 * Fair scheduling class load-balancing methods.
 7062 *
 7063 * BASICS
 7064 *
 7065 * The purpose of load-balancing is to achieve the same basic fairness the
 7066 * per-CPU scheduler provides, namely provide a proportional amount of compute
 7067 * time to each task. This is expressed in the following equation:
 7068 *
 7069 *   W_i,n/P_i == W_j,n/P_j for all i,j                               (1)
 7070 *
 7071 * Where W_i,n is the n-th weight average for CPU i. The instantaneous weight
 7072 * W_i,0 is defined as:
 7073 *
 7074 *   W_i,0 = \Sum_j w_i,j                                             (2)
 7075 *
 7076 * Where w_i,j is the weight of the j-th runnable task on CPU i. This weight
 7077 * is derived from the nice value as per sched_prio_to_weight[].
 7078 *
 7079 * The weight average is an exponential decay average of the instantaneous
 7080 * weight:
 7081 *
 7082 *   W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0               (3)
 7083 *
 7084 * C_i is the compute capacity of CPU i, typically it is the
 7085 * fraction of 'recent' time available for SCHED_OTHER task execution. But it
 7086 * can also include other factors [XXX].
 7087 *
 7088 * To achieve this balance we define a measure of imbalance which follows
 7089 * directly from (1):
 7090 *
 7091 *   imb_i,j = max{ avg(W/C), W_i/C_i } - min{ avg(W/C), W_j/C_j }    (4)
 7092 *
 7093 * We them move tasks around to minimize the imbalance. In the continuous
 7094 * function space it is obvious this converges, in the discrete case we get
 7095 * a few fun cases generally called infeasible weight scenarios.
 7096 *
 7097 * [XXX expand on:
 7098 *     - infeasible weights;
 7099 *     - local vs global optima in the discrete case. ]
 7100 *
 7101 *
 7102 * SCHED DOMAINS
 7103 *
 7104 * In order to solve the imbalance equation (4), and avoid the obvious O(n^2)
 7105 * for all i,j solution, we create a tree of CPUs that follows the hardware
 7106 * topology where each level pairs two lower groups (or better). This results
 7107 * in O(log n) layers. Furthermore we reduce the number of CPUs going up the
 7108 * tree to only the first of the previous level and we decrease the frequency
 7109 * of load-balance at each level inv. proportional to the number of CPUs in
 7110 * the groups.
 7111 *
 7112 * This yields:
 7113 *
 7114 *     log_2 n     1     n
 7115 *   \Sum       { --- * --- * 2^i } = O(n)                            (5)
 7116 *     i = 0      2^i   2^i
 7117 *                               `- size of each group
 7118 *         |         |     `- number of CPUs doing load-balance
 7119 *         |         `- freq
 7120 *         `- sum over all levels
 7121 *
 7122 * Coupled with a limit on how many tasks we can migrate every balance pass,
 7123 * this makes (5) the runtime complexity of the balancer.
 7124 *
 7125 * An important property here is that each CPU is still (indirectly) connected
 7126 * to every other CPU in at most O(log n) steps:
 7127 *
 7128 * The adjacency matrix of the resulting graph is given by:
 7129 *
 7130 *             log_2 n
 7131 *   A_i,j = \Union     (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1)  (6)
 7132 *             k = 0
 7133 *
 7134 * And you'll find that:
 7135 *
 7136 *   A^(log_2 n)_i,j != 0  for all i,j                                (7)
 7137 *
 7138 * Showing there's indeed a path between every CPU in at most O(log n) steps.
 7139 * The task movement gives a factor of O(m), giving a convergence complexity
 7140 * of:
 7141 *
 7142 *   O(nm log n),  n := nr_cpus, m := nr_tasks                        (8)
 7143 *
 7144 *
 7145 * WORK CONSERVING
 7146 *
 7147 * In order to avoid CPUs going idle while there's still work to do, new idle
 7148 * balancing is more aggressive and has the newly idle CPU iterate up the domain
 7149 * tree itself instead of relying on other CPUs to bring it work.
 7150 *
 7151 * This adds some complexity to both (5) and (8) but it reduces the total idle
 7152 * time.
 7153 *
 7154 * [XXX more?]
 7155 *
 7156 *
 7157 * CGROUPS
 7158 *
 7159 * Cgroups make a horror show out of (2), instead of a simple sum we get:
 7160 *
 7161 *                                s_k,i
 7162 *   W_i,0 = \Sum_j \Prod_k w_k * -----                               (9)
 7163 *                                 S_k
 7164 *
 7165 * Where
 7166 *
 7167 *   s_k,i = \Sum_j w_i,j,k  and  S_k = \Sum_i s_k,i                 (10)
 7168 *
 7169 * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on CPU i.
 7170 *
 7171 * The big problem is S_k, its a global sum needed to compute a local (W_i)
 7172 * property.
 7173 *
 7174 * [XXX write more on how we solve this.. _after_ merging pjt's patches that
 7175 *      rewrite all of this once again.]
 7176 */
 7177
 7178static unsigned long __read_mostly max_load_balance_interval = HZ/10;
 7179
 7180enum fbq_type { regular, remote, all };
 7181
 7182#define LBF_ALL_PINNED	0x01
 7183#define LBF_NEED_BREAK	0x02
 7184#define LBF_DST_PINNED  0x04
 7185#define LBF_SOME_PINNED	0x08
 7186#define LBF_NOHZ_STATS	0x10
 7187#define LBF_NOHZ_AGAIN	0x20
 7188
 7189struct lb_env {
 7190	struct sched_domain	*sd;
 7191
 7192	struct rq		*src_rq;
 7193	int			src_cpu;
 7194
 7195	int			dst_cpu;
 7196	struct rq		*dst_rq;
 7197
 7198	struct cpumask		*dst_grpmask;
 7199	int			new_dst_cpu;
 7200	enum cpu_idle_type	idle;
 7201	long			imbalance;
 7202	/* The set of CPUs under consideration for load-balancing */
 7203	struct cpumask		*cpus;
 7204
 7205	unsigned int		flags;
 7206
 7207	unsigned int		loop;
 7208	unsigned int		loop_break;
 7209	unsigned int		loop_max;
 7210
 7211	enum fbq_type		fbq_type;
 7212	struct list_head	tasks;
 7213};
 7214
 7215/*
 7216 * Is this task likely cache-hot:
 7217 */
 7218static int task_hot(struct task_struct *p, struct lb_env *env)
 7219{
 7220	s64 delta;
 7221
 7222	lockdep_assert_held(&env->src_rq->lock);
 7223
 7224	if (p->sched_class != &fair_sched_class)
 7225		return 0;
 7226
 7227	if (unlikely(p->policy == SCHED_IDLE))
 7228		return 0;
 7229
 7230	/*
 7231	 * Buddy candidates are cache hot:
 7232	 */
 7233	if (sched_feat(CACHE_HOT_BUDDY) && env->dst_rq->nr_running &&
 7234			(&p->se == cfs_rq_of(&p->se)->next ||
 7235			 &p->se == cfs_rq_of(&p->se)->last))
 7236		return 1;
 7237
 7238	if (sysctl_sched_migration_cost == -1)
 7239		return 1;
 7240	if (sysctl_sched_migration_cost == 0)
 7241		return 0;
 7242
 7243	delta = rq_clock_task(env->src_rq) - p->se.exec_start;
 7244
 7245	return delta < (s64)sysctl_sched_migration_cost;
 7246}
 7247
 7248#ifdef CONFIG_NUMA_BALANCING
 7249/*
 7250 * Returns 1, if task migration degrades locality
 7251 * Returns 0, if task migration improves locality i.e migration preferred.
 7252 * Returns -1, if task migration is not affected by locality.
 7253 */
 7254static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
 7255{
 7256	struct numa_group *numa_group = rcu_dereference(p->numa_group);
 7257	unsigned long src_faults, dst_faults;
 7258	int src_nid, dst_nid;
 7259
 7260	if (!static_branch_likely(&sched_numa_balancing))
 7261		return -1;
 7262
 7263	if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
 7264		return -1;
 7265
 7266	src_nid = cpu_to_node(env->src_cpu);
 7267	dst_nid = cpu_to_node(env->dst_cpu);
 7268
 7269	if (src_nid == dst_nid)
 7270		return -1;
 7271
 7272	/* Migrating away from the preferred node is always bad. */
 7273	if (src_nid == p->numa_preferred_nid) {
 7274		if (env->src_rq->nr_running > env->src_rq->nr_preferred_running)
 7275			return 1;
 7276		else
 7277			return -1;
 7278	}
 7279
 7280	/* Encourage migration to the preferred node. */
 7281	if (dst_nid == p->numa_preferred_nid)
 7282		return 0;
 7283
 7284	/* Leaving a core idle is often worse than degrading locality. */
 7285	if (env->idle != CPU_NOT_IDLE)
 7286		return -1;
 7287
 7288	if (numa_group) {
 7289		src_faults = group_faults(p, src_nid);
 7290		dst_faults = group_faults(p, dst_nid);
 7291	} else {
 7292		src_faults = task_faults(p, src_nid);
 7293		dst_faults = task_faults(p, dst_nid);
 7294	}
 7295
 7296	return dst_faults < src_faults;
 7297}
 7298
 7299#else
 7300static inline int migrate_degrades_locality(struct task_struct *p,
 7301					     struct lb_env *env)
 7302{
 7303	return -1;
 7304}
 7305#endif
 7306
 7307/*
 7308 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
 7309 */
 7310static
 7311int can_migrate_task(struct task_struct *p, struct lb_env *env)
 7312{
 7313	int tsk_cache_hot;
 7314
 7315	lockdep_assert_held(&env->src_rq->lock);
 7316
 7317	/*
 7318	 * We do not migrate tasks that are:
 7319	 * 1) throttled_lb_pair, or
 7320	 * 2) cannot be migrated to this CPU due to cpus_allowed, or
 7321	 * 3) running (obviously), or
 7322	 * 4) are cache-hot on their current CPU.
 7323	 */
 7324	if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
 7325		return 0;
 7326
 7327	if (!cpumask_test_cpu(env->dst_cpu, &p->cpus_allowed)) {
 7328		int cpu;
 7329
 7330		schedstat_inc(p->se.statistics.nr_failed_migrations_affine);
 7331
 7332		env->flags |= LBF_SOME_PINNED;
 7333
 7334		/*
 7335		 * Remember if this task can be migrated to any other CPU in
 7336		 * our sched_group. We may want to revisit it if we couldn't
 7337		 * meet load balance goals by pulling other tasks on src_cpu.
 7338		 *
 7339		 * Avoid computing new_dst_cpu for NEWLY_IDLE or if we have
 7340		 * already computed one in current iteration.
 7341		 */
 7342		if (env->idle == CPU_NEWLY_IDLE || (env->flags & LBF_DST_PINNED))
 7343			return 0;
 7344
 7345		/* Prevent to re-select dst_cpu via env's CPUs: */
 7346		for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
 7347			if (cpumask_test_cpu(cpu, &p->cpus_allowed)) {
 7348				env->flags |= LBF_DST_PINNED;
 7349				env->new_dst_cpu = cpu;
 7350				break;
 7351			}
 7352		}
 7353
 7354		return 0;
 7355	}
 7356
 7357	/* Record that we found atleast one task that could run on dst_cpu */
 7358	env->flags &= ~LBF_ALL_PINNED;
 7359
 7360	if (task_running(env->src_rq, p)) {
 7361		schedstat_inc(p->se.statistics.nr_failed_migrations_running);
 7362		return 0;
 7363	}
 7364
 7365	/*
 7366	 * Aggressive migration if:
 7367	 * 1) destination numa is preferred
 7368	 * 2) task is cache cold, or
 7369	 * 3) too many balance attempts have failed.
 7370	 */
 7371	tsk_cache_hot = migrate_degrades_locality(p, env);
 7372	if (tsk_cache_hot == -1)
 7373		tsk_cache_hot = task_hot(p, env);
 7374
 7375	if (tsk_cache_hot <= 0 ||
 7376	    env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
 7377		if (tsk_cache_hot == 1) {
 7378			schedstat_inc(env->sd->lb_hot_gained[env->idle]);
 7379			schedstat_inc(p->se.statistics.nr_forced_migrations);
 7380		}
 7381		return 1;
 7382	}
 7383
 7384	schedstat_inc(p->se.statistics.nr_failed_migrations_hot);
 7385	return 0;
 7386}
 7387
 7388/*
 7389 * detach_task() -- detach the task for the migration specified in env
 7390 */
 7391static void detach_task(struct task_struct *p, struct lb_env *env)
 7392{
 7393	lockdep_assert_held(&env->src_rq->lock);
 7394
 7395	p->on_rq = TASK_ON_RQ_MIGRATING;
 7396	deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK);
 7397	set_task_cpu(p, env->dst_cpu);
 7398}
 7399
 7400/*
 7401 * detach_one_task() -- tries to dequeue exactly one task from env->src_rq, as
 7402 * part of active balancing operations within "domain".
 7403 *
 7404 * Returns a task if successful and NULL otherwise.
 7405 */
 7406static struct task_struct *detach_one_task(struct lb_env *env)
 7407{
 7408	struct task_struct *p;
 7409
 7410	lockdep_assert_held(&env->src_rq->lock);
 7411
 7412	list_for_each_entry_reverse(p,
 7413			&env->src_rq->cfs_tasks, se.group_node) {
 7414		if (!can_migrate_task(p, env))
 7415			continue;
 7416
 7417		detach_task(p, env);
 7418
 7419		/*
 7420		 * Right now, this is only the second place where
 7421		 * lb_gained[env->idle] is updated (other is detach_tasks)
 7422		 * so we can safely collect stats here rather than
 7423		 * inside detach_tasks().
 7424		 */
 7425		schedstat_inc(env->sd->lb_gained[env->idle]);
 7426		return p;
 7427	}
 7428	return NULL;
 7429}
 7430
 7431static const unsigned int sched_nr_migrate_break = 32;
 7432
 7433/*
 7434 * detach_tasks() -- tries to detach up to imbalance weighted load from
 7435 * busiest_rq, as part of a balancing operation within domain "sd".
 7436 *
 7437 * Returns number of detached tasks if successful and 0 otherwise.
 7438 */
 7439static int detach_tasks(struct lb_env *env)
 7440{
 7441	struct list_head *tasks = &env->src_rq->cfs_tasks;
 7442	struct task_struct *p;
 7443	unsigned long load;
 7444	int detached = 0;
 7445
 7446	lockdep_assert_held(&env->src_rq->lock);
 7447
 7448	if (env->imbalance <= 0)
 7449		return 0;
 7450
 7451	while (!list_empty(tasks)) {
 7452		/*
 7453		 * We don't want to steal all, otherwise we may be treated likewise,
 7454		 * which could at worst lead to a livelock crash.
 7455		 */
 7456		if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1)
 7457			break;
 7458
 7459		p = list_last_entry(tasks, struct task_struct, se.group_node);
 7460
 7461		env->loop++;
 7462		/* We've more or less seen every task there is, call it quits */
 7463		if (env->loop > env->loop_max)
 7464			break;
 7465
 7466		/* take a breather every nr_migrate tasks */
 7467		if (env->loop > env->loop_break) {
 7468			env->loop_break += sched_nr_migrate_break;
 7469			env->flags |= LBF_NEED_BREAK;
 7470			break;
 7471		}
 7472
 7473		if (!can_migrate_task(p, env))
 7474			goto next;
 7475
 7476		load = task_h_load(p);
 7477
 7478		if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
 7479			goto next;
 7480
 7481		if ((load / 2) > env->imbalance)
 7482			goto next;
 7483
 7484		detach_task(p, env);
 7485		list_add(&p->se.group_node, &env->tasks);
 7486
 7487		detached++;
 7488		env->imbalance -= load;
 7489
 7490#ifdef CONFIG_PREEMPT
 7491		/*
 7492		 * NEWIDLE balancing is a source of latency, so preemptible
 7493		 * kernels will stop after the first task is detached to minimize
 7494		 * the critical section.
 7495		 */
 7496		if (env->idle == CPU_NEWLY_IDLE)
 7497			break;
 7498#endif
 7499
 7500		/*
 7501		 * We only want to steal up to the prescribed amount of
 7502		 * weighted load.
 7503		 */
 7504		if (env->imbalance <= 0)
 7505			break;
 7506
 7507		continue;
 7508next:
 7509		list_move(&p->se.group_node, tasks);
 7510	}
 7511
 7512	/*
 7513	 * Right now, this is one of only two places we collect this stat
 7514	 * so we can safely collect detach_one_task() stats here rather
 7515	 * than inside detach_one_task().
 7516	 */
 7517	schedstat_add(env->sd->lb_gained[env->idle], detached);
 7518
 7519	return detached;
 7520}
 7521
 7522/*
 7523 * attach_task() -- attach the task detached by detach_task() to its new rq.
 7524 */
 7525static void attach_task(struct rq *rq, struct task_struct *p)
 7526{
 7527	lockdep_assert_held(&rq->lock);
 7528
 7529	BUG_ON(task_rq(p) != rq);
 7530	activate_task(rq, p, ENQUEUE_NOCLOCK);
 7531	p->on_rq = TASK_ON_RQ_QUEUED;
 7532	check_preempt_curr(rq, p, 0);
 7533}
 7534
 7535/*
 7536 * attach_one_task() -- attaches the task returned from detach_one_task() to
 7537 * its new rq.
 7538 */
 7539static void attach_one_task(struct rq *rq, struct task_struct *p)
 7540{
 7541	struct rq_flags rf;
 7542
 7543	rq_lock(rq, &rf);
 7544	update_rq_clock(rq);
 7545	attach_task(rq, p);
 7546	rq_unlock(rq, &rf);
 7547}
 7548
 7549/*
 7550 * attach_tasks() -- attaches all tasks detached by detach_tasks() to their
 7551 * new rq.
 7552 */
 7553static void attach_tasks(struct lb_env *env)
 7554{
 7555	struct list_head *tasks = &env->tasks;
 7556	struct task_struct *p;
 7557	struct rq_flags rf;
 7558
 7559	rq_lock(env->dst_rq, &rf);
 7560	update_rq_clock(env->dst_rq);
 7561
 7562	while (!list_empty(tasks)) {
 7563		p = list_first_entry(tasks, struct task_struct, se.group_node);
 7564		list_del_init(&p->se.group_node);
 7565
 7566		attach_task(env->dst_rq, p);
 7567	}
 7568
 7569	rq_unlock(env->dst_rq, &rf);
 7570}
 7571
 7572static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq)
 7573{
 7574	if (cfs_rq->avg.load_avg)
 7575		return true;
 7576
 7577	if (cfs_rq->avg.util_avg)
 7578		return true;
 7579
 7580	return false;
 7581}
 7582
 7583#ifdef CONFIG_FAIR_GROUP_SCHED
 7584
 7585static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
 7586{
 7587	if (cfs_rq->load.weight)
 7588		return false;
 7589
 7590	if (cfs_rq->avg.load_sum)
 7591		return false;
 7592
 7593	if (cfs_rq->avg.util_sum)
 7594		return false;
 7595
 7596	if (cfs_rq->avg.runnable_load_sum)
 7597		return false;
 7598
 7599	return true;
 7600}
 7601
 7602static void update_blocked_averages(int cpu)
 7603{
 7604	struct rq *rq = cpu_rq(cpu);
 7605	struct cfs_rq *cfs_rq, *pos;
 7606	struct rq_flags rf;
 7607	bool done = true;
 7608
 7609	rq_lock_irqsave(rq, &rf);
 7610	update_rq_clock(rq);
 7611
 7612	/*
 7613	 * Iterates the task_group tree in a bottom up fashion, see
 7614	 * list_add_leaf_cfs_rq() for details.
 7615	 */
 7616	for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) {
 7617		struct sched_entity *se;
 7618
 7619		/* throttled entities do not contribute to load */
 7620		if (throttled_hierarchy(cfs_rq))
 7621			continue;
 7622
 7623		if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq))
 7624			update_tg_load_avg(cfs_rq, 0);
 7625
 7626		/* Propagate pending load changes to the parent, if any: */
 7627		se = cfs_rq->tg->se[cpu];
 7628		if (se && !skip_blocked_update(se))
 7629			update_load_avg(cfs_rq_of(se), se, 0);
 7630
 7631		/*
 7632		 * There can be a lot of idle CPU cgroups.  Don't let fully
 7633		 * decayed cfs_rqs linger on the list.
 7634		 */
 7635		if (cfs_rq_is_decayed(cfs_rq))
 7636			list_del_leaf_cfs_rq(cfs_rq);
 7637
 7638		/* Don't need periodic decay once load/util_avg are null */
 7639		if (cfs_rq_has_blocked(cfs_rq))
 7640			done = false;
 7641	}
 7642
 7643#ifdef CONFIG_NO_HZ_COMMON
 7644	rq->last_blocked_load_update_tick = jiffies;
 7645	if (done)
 7646		rq->has_blocked_load = 0;
 7647#endif
 7648	rq_unlock_irqrestore(rq, &rf);
 7649}
 7650
 7651/*
 7652 * Compute the hierarchical load factor for cfs_rq and all its ascendants.
 7653 * This needs to be done in a top-down fashion because the load of a child
 7654 * group is a fraction of its parents load.
 7655 */
 7656static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
 7657{
 7658	struct rq *rq = rq_of(cfs_rq);
 7659	struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
 7660	unsigned long now = jiffies;
 7661	unsigned long load;
 7662
 7663	if (cfs_rq->last_h_load_update == now)
 7664		return;
 7665
 7666	cfs_rq->h_load_next = NULL;
 7667	for_each_sched_entity(se) {
 7668		cfs_rq = cfs_rq_of(se);
 7669		cfs_rq->h_load_next = se;
 7670		if (cfs_rq->last_h_load_update == now)
 7671			break;
 7672	}
 7673
 7674	if (!se) {
 7675		cfs_rq->h_load = cfs_rq_load_avg(cfs_rq);
 7676		cfs_rq->last_h_load_update = now;
 7677	}
 7678
 7679	while ((se = cfs_rq->h_load_next) != NULL) {
 7680		load = cfs_rq->h_load;
 7681		load = div64_ul(load * se->avg.load_avg,
 7682			cfs_rq_load_avg(cfs_rq) + 1);
 7683		cfs_rq = group_cfs_rq(se);
 7684		cfs_rq->h_load = load;
 7685		cfs_rq->last_h_load_update = now;
 7686	}
 7687}
 7688
 7689static unsigned long task_h_load(struct task_struct *p)
 7690{
 7691	struct cfs_rq *cfs_rq = task_cfs_rq(p);
 7692
 7693	update_cfs_rq_h_load(cfs_rq);
 7694	return div64_ul(p->se.avg.load_avg * cfs_rq->h_load,
 7695			cfs_rq_load_avg(cfs_rq) + 1);
 7696}
 7697#else
 7698static inline void update_blocked_averages(int cpu)
 7699{
 7700	struct rq *rq = cpu_rq(cpu);
 7701	struct cfs_rq *cfs_rq = &rq->cfs;
 7702	struct rq_flags rf;
 7703
 7704	rq_lock_irqsave(rq, &rf);
 7705	update_rq_clock(rq);
 7706	update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
 7707#ifdef CONFIG_NO_HZ_COMMON
 7708	rq->last_blocked_load_update_tick = jiffies;
 7709	if (!cfs_rq_has_blocked(cfs_rq))
 7710		rq->has_blocked_load = 0;
 7711#endif
 7712	rq_unlock_irqrestore(rq, &rf);
 7713}
 7714
 7715static unsigned long task_h_load(struct task_struct *p)
 7716{
 7717	return p->se.avg.load_avg;
 7718}
 7719#endif
 7720
 7721/********** Helpers for find_busiest_group ************************/
 7722
 7723enum group_type {
 7724	group_other = 0,
 7725	group_imbalanced,
 7726	group_overloaded,
 7727};
 7728
 7729/*
 7730 * sg_lb_stats - stats of a sched_group required for load_balancing
 7731 */
 7732struct sg_lb_stats {
 7733	unsigned long avg_load; /*Avg load across the CPUs of the group */
 7734	unsigned long group_load; /* Total load over the CPUs of the group */
 7735	unsigned long sum_weighted_load; /* Weighted load of group's tasks */
 7736	unsigned long load_per_task;
 7737	unsigned long group_capacity;
 7738	unsigned long group_util; /* Total utilization of the group */
 7739	unsigned int sum_nr_running; /* Nr tasks running in the group */
 7740	unsigned int idle_cpus;
 7741	unsigned int group_weight;
 7742	enum group_type group_type;
 7743	int group_no_capacity;
 7744#ifdef CONFIG_NUMA_BALANCING
 7745	unsigned int nr_numa_running;
 7746	unsigned int nr_preferred_running;
 7747#endif
 7748};
 7749
 7750/*
 7751 * sd_lb_stats - Structure to store the statistics of a sched_domain
 7752 *		 during load balancing.
 7753 */
 7754struct sd_lb_stats {
 7755	struct sched_group *busiest;	/* Busiest group in this sd */
 7756	struct sched_group *local;	/* Local group in this sd */
 7757	unsigned long total_running;
 7758	unsigned long total_load;	/* Total load of all groups in sd */
 7759	unsigned long total_capacity;	/* Total capacity of all groups in sd */
 7760	unsigned long avg_load;	/* Average load across all groups in sd */
 7761
 7762	struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
 7763	struct sg_lb_stats local_stat;	/* Statistics of the local group */
 7764};
 7765
 7766static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
 7767{
 7768	/*
 7769	 * Skimp on the clearing to avoid duplicate work. We can avoid clearing
 7770	 * local_stat because update_sg_lb_stats() does a full clear/assignment.
 7771	 * We must however clear busiest_stat::avg_load because
 7772	 * update_sd_pick_busiest() reads this before assignment.
 7773	 */
 7774	*sds = (struct sd_lb_stats){
 7775		.busiest = NULL,
 7776		.local = NULL,
 7777		.total_running = 0UL,
 7778		.total_load = 0UL,
 7779		.total_capacity = 0UL,
 7780		.busiest_stat = {
 7781			.avg_load = 0UL,
 7782			.sum_nr_running = 0,
 7783			.group_type = group_other,
 7784		},
 7785	};
 7786}
 7787
 7788/**
 7789 * get_sd_load_idx - Obtain the load index for a given sched domain.
 7790 * @sd: The sched_domain whose load_idx is to be obtained.
 7791 * @idle: The idle status of the CPU for whose sd load_idx is obtained.
 7792 *
 7793 * Return: The load index.
 7794 */
 7795static inline int get_sd_load_idx(struct sched_domain *sd,
 7796					enum cpu_idle_type idle)
 7797{
 7798	int load_idx;
 7799
 7800	switch (idle) {
 7801	case CPU_NOT_IDLE:
 7802		load_idx = sd->busy_idx;
 7803		break;
 7804
 7805	case CPU_NEWLY_IDLE:
 7806		load_idx = sd->newidle_idx;
 7807		break;
 7808	default:
 7809		load_idx = sd->idle_idx;
 7810		break;
 7811	}
 7812
 7813	return load_idx;
 7814}
 7815
 7816static unsigned long scale_rt_capacity(int cpu)
 7817{
 7818	struct rq *rq = cpu_rq(cpu);
 7819	u64 total, used, age_stamp, avg;
 7820	s64 delta;
 7821
 7822	/*
 7823	 * Since we're reading these variables without serialization make sure
 7824	 * we read them once before doing sanity checks on them.
 7825	 */
 7826	age_stamp = READ_ONCE(rq->age_stamp);
 7827	avg = READ_ONCE(rq->rt_avg);
 7828	delta = __rq_clock_broken(rq) - age_stamp;
 7829
 7830	if (unlikely(delta < 0))
 7831		delta = 0;
 7832
 7833	total = sched_avg_period() + delta;
 7834
 7835	used = div_u64(avg, total);
 7836
 7837	if (likely(used < SCHED_CAPACITY_SCALE))
 7838		return SCHED_CAPACITY_SCALE - used;
 7839
 7840	return 1;
 7841}
 7842
 7843static void update_cpu_capacity(struct sched_domain *sd, int cpu)
 7844{
 7845	unsigned long capacity = arch_scale_cpu_capacity(sd, cpu);
 7846	struct sched_group *sdg = sd->groups;
 7847
 7848	cpu_rq(cpu)->cpu_capacity_orig = capacity;
 7849
 7850	capacity *= scale_rt_capacity(cpu);
 7851	capacity >>= SCHED_CAPACITY_SHIFT;
 7852
 7853	if (!capacity)
 7854		capacity = 1;
 7855
 7856	cpu_rq(cpu)->cpu_capacity = capacity;
 7857	sdg->sgc->capacity = capacity;
 7858	sdg->sgc->min_capacity = capacity;
 7859}
 7860
 7861void update_group_capacity(struct sched_domain *sd, int cpu)
 7862{
 7863	struct sched_domain *child = sd->child;
 7864	struct sched_group *group, *sdg = sd->groups;
 7865	unsigned long capacity, min_capacity;
 7866	unsigned long interval;
 7867
 7868	interval = msecs_to_jiffies(sd->balance_interval);
 7869	interval = clamp(interval, 1UL, max_load_balance_interval);
 7870	sdg->sgc->next_update = jiffies + interval;
 7871
 7872	if (!child) {
 7873		update_cpu_capacity(sd, cpu);
 7874		return;
 7875	}
 7876
 7877	capacity = 0;
 7878	min_capacity = ULONG_MAX;
 7879
 7880	if (child->flags & SD_OVERLAP) {
 7881		/*
 7882		 * SD_OVERLAP domains cannot assume that child groups
 7883		 * span the current group.
 7884		 */
 7885
 7886		for_each_cpu(cpu, sched_group_span(sdg)) {
 7887			struct sched_group_capacity *sgc;
 7888			struct rq *rq = cpu_rq(cpu);
 7889
 7890			/*
 7891			 * build_sched_domains() -> init_sched_groups_capacity()
 7892			 * gets here before we've attached the domains to the
 7893			 * runqueues.
 7894			 *
 7895			 * Use capacity_of(), which is set irrespective of domains
 7896			 * in update_cpu_capacity().
 7897			 *
 7898			 * This avoids capacity from being 0 and
 7899			 * causing divide-by-zero issues on boot.
 7900			 */
 7901			if (unlikely(!rq->sd)) {
 7902				capacity += capacity_of(cpu);
 7903			} else {
 7904				sgc = rq->sd->groups->sgc;
 7905				capacity += sgc->capacity;
 7906			}
 7907
 7908			min_capacity = min(capacity, min_capacity);
 7909		}
 7910	} else  {
 7911		/*
 7912		 * !SD_OVERLAP domains can assume that child groups
 7913		 * span the current group.
 7914		 */
 7915
 7916		group = child->groups;
 7917		do {
 7918			struct sched_group_capacity *sgc = group->sgc;
 7919
 7920			capacity += sgc->capacity;
 7921			min_capacity = min(sgc->min_capacity, min_capacity);
 7922			group = group->next;
 7923		} while (group != child->groups);
 7924	}
 7925
 7926	sdg->sgc->capacity = capacity;
 7927	sdg->sgc->min_capacity = min_capacity;
 7928}
 7929
 7930/*
 7931 * Check whether the capacity of the rq has been noticeably reduced by side
 7932 * activity. The imbalance_pct is used for the threshold.
 7933 * Return true is the capacity is reduced
 7934 */
 7935static inline int
 7936check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
 7937{
 7938	return ((rq->cpu_capacity * sd->imbalance_pct) <
 7939				(rq->cpu_capacity_orig * 100));
 7940}
 7941
 7942/*
 7943 * Group imbalance indicates (and tries to solve) the problem where balancing
 7944 * groups is inadequate due to ->cpus_allowed constraints.
 7945 *
 7946 * Imagine a situation of two groups of 4 CPUs each and 4 tasks each with a
 7947 * cpumask covering 1 CPU of the first group and 3 CPUs of the second group.
 7948 * Something like:
 7949 *
 7950 *	{ 0 1 2 3 } { 4 5 6 7 }
 7951 *	        *     * * *
 7952 *
 7953 * If we were to balance group-wise we'd place two tasks in the first group and
 7954 * two tasks in the second group. Clearly this is undesired as it will overload
 7955 * cpu 3 and leave one of the CPUs in the second group unused.
 7956 *
 7957 * The current solution to this issue is detecting the skew in the first group
 7958 * by noticing the lower domain failed to reach balance and had difficulty
 7959 * moving tasks due to affinity constraints.
 7960 *
 7961 * When this is so detected; this group becomes a candidate for busiest; see
 7962 * update_sd_pick_busiest(). And calculate_imbalance() and
 7963 * find_busiest_group() avoid some of the usual balance conditions to allow it
 7964 * to create an effective group imbalance.
 7965 *
 7966 * This is a somewhat tricky proposition since the next run might not find the
 7967 * group imbalance and decide the groups need to be balanced again. A most
 7968 * subtle and fragile situation.
 7969 */
 7970
 7971static inline int sg_imbalanced(struct sched_group *group)
 7972{
 7973	return group->sgc->imbalance;
 7974}
 7975
 7976/*
 7977 * group_has_capacity returns true if the group has spare capacity that could
 7978 * be used by some tasks.
 7979 * We consider that a group has spare capacity if the  * number of task is
 7980 * smaller than the number of CPUs or if the utilization is lower than the
 7981 * available capacity for CFS tasks.
 7982 * For the latter, we use a threshold to stabilize the state, to take into
 7983 * account the variance of the tasks' load and to return true if the available
 7984 * capacity in meaningful for the load balancer.
 7985 * As an example, an available capacity of 1% can appear but it doesn't make
 7986 * any benefit for the load balance.
 7987 */
 7988static inline bool
 7989group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs)
 7990{
 7991	if (sgs->sum_nr_running < sgs->group_weight)
 7992		return true;
 7993
 7994	if ((sgs->group_capacity * 100) >
 7995			(sgs->group_util * env->sd->imbalance_pct))
 7996		return true;
 7997
 7998	return false;
 7999}
 8000
 8001/*
 8002 *  group_is_overloaded returns true if the group has more tasks than it can
 8003 *  handle.
 8004 *  group_is_overloaded is not equals to !group_has_capacity because a group
 8005 *  with the exact right number of tasks, has no more spare capacity but is not
 8006 *  overloaded so both group_has_capacity and group_is_overloaded return
 8007 *  false.
 8008 */
 8009static inline bool
 8010group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs)
 8011{
 8012	if (sgs->sum_nr_running <= sgs->group_weight)
 8013		return false;
 8014
 8015	if ((sgs->group_capacity * 100) <
 8016			(sgs->group_util * env->sd->imbalance_pct))
 8017		return true;
 8018
 8019	return false;
 8020}
 8021
 8022/*
 8023 * group_smaller_cpu_capacity: Returns true if sched_group sg has smaller
 8024 * per-CPU capacity than sched_group ref.
 8025 */
 8026static inline bool
 8027group_smaller_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
 8028{
 8029	return sg->sgc->min_capacity * capacity_margin <
 8030						ref->sgc->min_capacity * 1024;
 8031}
 8032
 8033static inline enum
 8034group_type group_classify(struct sched_group *group,
 8035			  struct sg_lb_stats *sgs)
 8036{
 8037	if (sgs->group_no_capacity)
 8038		return group_overloaded;
 8039
 8040	if (sg_imbalanced(group))
 8041		return group_imbalanced;
 8042
 8043	return group_other;
 8044}
 8045
 8046static bool update_nohz_stats(struct rq *rq, bool force)
 8047{
 8048#ifdef CONFIG_NO_HZ_COMMON
 8049	unsigned int cpu = rq->cpu;
 8050
 8051	if (!rq->has_blocked_load)
 8052		return false;
 8053
 8054	if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
 8055		return false;
 8056
 8057	if (!force && !time_after(jiffies, rq->last_blocked_load_update_tick))
 8058		return true;
 8059
 8060	update_blocked_averages(cpu);
 8061
 8062	return rq->has_blocked_load;
 8063#else
 8064	return false;
 8065#endif
 8066}
 8067
 8068/**
 8069 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
 8070 * @env: The load balancing environment.
 8071 * @group: sched_group whose statistics are to be updated.
 8072 * @load_idx: Load index of sched_domain of this_cpu for load calc.
 8073 * @local_group: Does group contain this_cpu.
 8074 * @sgs: variable to hold the statistics for this group.
 8075 * @overload: Indicate more than one runnable task for any CPU.
 8076 */
 8077static inline void update_sg_lb_stats(struct lb_env *env,
 8078			struct sched_group *group, int load_idx,
 8079			int local_group, struct sg_lb_stats *sgs,
 8080			bool *overload)
 8081{
 8082	unsigned long load;
 8083	int i, nr_running;
 8084
 8085	memset(sgs, 0, sizeof(*sgs));
 8086
 8087	for_each_cpu_and(i, sched_group_span(group), env->cpus) {
 8088		struct rq *rq = cpu_rq(i);
 8089
 8090		if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq, false))
 8091			env->flags |= LBF_NOHZ_AGAIN;
 8092
 8093		/* Bias balancing toward CPUs of our domain: */
 8094		if (local_group)
 8095			load = target_load(i, load_idx);
 8096		else
 8097			load = source_load(i, load_idx);
 8098
 8099		sgs->group_load += load;
 8100		sgs->group_util += cpu_util(i);
 8101		sgs->sum_nr_running += rq->cfs.h_nr_running;
 8102
 8103		nr_running = rq->nr_running;
 8104		if (nr_running > 1)
 8105			*overload = true;
 8106
 8107#ifdef CONFIG_NUMA_BALANCING
 8108		sgs->nr_numa_running += rq->nr_numa_running;
 8109		sgs->nr_preferred_running += rq->nr_preferred_running;
 8110#endif
 8111		sgs->sum_weighted_load += weighted_cpuload(rq);
 8112		/*
 8113		 * No need to call idle_cpu() if nr_running is not 0
 8114		 */
 8115		if (!nr_running && idle_cpu(i))
 8116			sgs->idle_cpus++;
 8117	}
 8118
 8119	/* Adjust by relative CPU capacity of the group */
 8120	sgs->group_capacity = group->sgc->capacity;
 8121	sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity;
 8122
 8123	if (sgs->sum_nr_running)
 8124		sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
 8125
 8126	sgs->group_weight = group->group_weight;
 8127
 8128	sgs->group_no_capacity = group_is_overloaded(env, sgs);
 8129	sgs->group_type = group_classify(group, sgs);
 8130}
 8131
 8132/**
 8133 * update_sd_pick_busiest - return 1 on busiest group
 8134 * @env: The load balancing environment.
 8135 * @sds: sched_domain statistics
 8136 * @sg: sched_group candidate to be checked for being the busiest
 8137 * @sgs: sched_group statistics
 8138 *
 8139 * Determine if @sg is a busier group than the previously selected
 8140 * busiest group.
 8141 *
 8142 * Return: %true if @sg is a busier group than the previously selected
 8143 * busiest group. %false otherwise.
 8144 */
 8145static bool update_sd_pick_busiest(struct lb_env *env,
 8146				   struct sd_lb_stats *sds,
 8147				   struct sched_group *sg,
 8148				   struct sg_lb_stats *sgs)
 8149{
 8150	struct sg_lb_stats *busiest = &sds->busiest_stat;
 8151
 8152	if (sgs->group_type > busiest->group_type)
 8153		return true;
 8154
 8155	if (sgs->group_type < busiest->group_type)
 8156		return false;
 8157
 8158	if (sgs->avg_load <= busiest->avg_load)
 8159		return false;
 8160
 8161	if (!(env->sd->flags & SD_ASYM_CPUCAPACITY))
 8162		goto asym_packing;
 8163
 8164	/*
 8165	 * Candidate sg has no more than one task per CPU and
 8166	 * has higher per-CPU capacity. Migrating tasks to less
 8167	 * capable CPUs may harm throughput. Maximize throughput,
 8168	 * power/energy consequences are not considered.
 8169	 */
 8170	if (sgs->sum_nr_running <= sgs->group_weight &&
 8171	    group_smaller_cpu_capacity(sds->local, sg))
 8172		return false;
 8173
 8174asym_packing:
 8175	/* This is the busiest node in its class. */
 8176	if (!(env->sd->flags & SD_ASYM_PACKING))
 8177		return true;
 8178
 8179	/* No ASYM_PACKING if target CPU is already busy */
 8180	if (env->idle == CPU_NOT_IDLE)
 8181		return true;
 8182	/*
 8183	 * ASYM_PACKING needs to move all the work to the highest
 8184	 * prority CPUs in the group, therefore mark all groups
 8185	 * of lower priority than ourself as busy.
 8186	 */
 8187	if (sgs->sum_nr_running &&
 8188	    sched_asym_prefer(env->dst_cpu, sg->asym_prefer_cpu)) {
 8189		if (!sds->busiest)
 8190			return true;
 8191
 8192		/* Prefer to move from lowest priority CPU's work */
 8193		if (sched_asym_prefer(sds->busiest->asym_prefer_cpu,
 8194				      sg->asym_prefer_cpu))
 8195			return true;
 8196	}
 8197
 8198	return false;
 8199}
 8200
 8201#ifdef CONFIG_NUMA_BALANCING
 8202static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
 8203{
 8204	if (sgs->sum_nr_running > sgs->nr_numa_running)
 8205		return regular;
 8206	if (sgs->sum_nr_running > sgs->nr_preferred_running)
 8207		return remote;
 8208	return all;
 8209}
 8210
 8211static inline enum fbq_type fbq_classify_rq(struct rq *rq)
 8212{
 8213	if (rq->nr_running > rq->nr_numa_running)
 8214		return regular;
 8215	if (rq->nr_running > rq->nr_preferred_running)
 8216		return remote;
 8217	return all;
 8218}
 8219#else
 8220static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
 8221{
 8222	return all;
 8223}
 8224
 8225static inline enum fbq_type fbq_classify_rq(struct rq *rq)
 8226{
 8227	return regular;
 8228}
 8229#endif /* CONFIG_NUMA_BALANCING */
 8230
 8231/**
 8232 * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
 8233 * @env: The load balancing environment.
 8234 * @sds: variable to hold the statistics for this sched_domain.
 8235 */
 8236static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
 8237{
 8238	struct sched_domain *child = env->sd->child;
 8239	struct sched_group *sg = env->sd->groups;
 8240	struct sg_lb_stats *local = &sds->local_stat;
 8241	struct sg_lb_stats tmp_sgs;
 8242	int load_idx, prefer_sibling = 0;
 8243	bool overload = false;
 8244
 8245	if (child && child->flags & SD_PREFER_SIBLING)
 8246		prefer_sibling = 1;
 8247
 8248#ifdef CONFIG_NO_HZ_COMMON
 8249	if (env->idle == CPU_NEWLY_IDLE && READ_ONCE(nohz.has_blocked))
 8250		env->flags |= LBF_NOHZ_STATS;
 8251#endif
 8252
 8253	load_idx = get_sd_load_idx(env->sd, env->idle);
 8254
 8255	do {
 8256		struct sg_lb_stats *sgs = &tmp_sgs;
 8257		int local_group;
 8258
 8259		local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(sg));
 8260		if (local_group) {
 8261			sds->local = sg;
 8262			sgs = local;
 8263
 8264			if (env->idle != CPU_NEWLY_IDLE ||
 8265			    time_after_eq(jiffies, sg->sgc->next_update))
 8266				update_group_capacity(env->sd, env->dst_cpu);
 8267		}
 8268
 8269		update_sg_lb_stats(env, sg, load_idx, local_group, sgs,
 8270						&overload);
 8271
 8272		if (local_group)
 8273			goto next_group;
 8274
 8275		/*
 8276		 * In case the child domain prefers tasks go to siblings
 8277		 * first, lower the sg capacity so that we'll try
 8278		 * and move all the excess tasks away. We lower the capacity
 8279		 * of a group only if the local group has the capacity to fit
 8280		 * these excess tasks. The extra check prevents the case where
 8281		 * you always pull from the heaviest group when it is already
 8282		 * under-utilized (possible with a large weight task outweighs
 8283		 * the tasks on the system).
 8284		 */
 8285		if (prefer_sibling && sds->local &&
 8286		    group_has_capacity(env, local) &&
 8287		    (sgs->sum_nr_running > local->sum_nr_running + 1)) {
 8288			sgs->group_no_capacity = 1;
 8289			sgs->group_type = group_classify(sg, sgs);
 8290		}
 8291
 8292		if (update_sd_pick_busiest(env, sds, sg, sgs)) {
 8293			sds->busiest = sg;
 8294			sds->busiest_stat = *sgs;
 8295		}
 8296
 8297next_group:
 8298		/* Now, start updating sd_lb_stats */
 8299		sds->total_running += sgs->sum_nr_running;
 8300		sds->total_load += sgs->group_load;
 8301		sds->total_capacity += sgs->group_capacity;
 8302
 8303		sg = sg->next;
 8304	} while (sg != env->sd->groups);
 8305
 8306#ifdef CONFIG_NO_HZ_COMMON
 8307	if ((env->flags & LBF_NOHZ_AGAIN) &&
 8308	    cpumask_subset(nohz.idle_cpus_mask, sched_domain_span(env->sd))) {
 8309
 8310		WRITE_ONCE(nohz.next_blocked,
 8311			   jiffies + msecs_to_jiffies(LOAD_AVG_PERIOD));
 8312	}
 8313#endif
 8314
 8315	if (env->sd->flags & SD_NUMA)
 8316		env->fbq_type = fbq_classify_group(&sds->busiest_stat);
 8317
 8318	if (!env->sd->parent) {
 8319		/* update overload indicator if we are at root domain */
 8320		if (env->dst_rq->rd->overload != overload)
 8321			env->dst_rq->rd->overload = overload;
 8322	}
 8323}
 8324
 8325/**
 8326 * check_asym_packing - Check to see if the group is packed into the
 8327 *			sched domain.
 8328 *
 8329 * This is primarily intended to used at the sibling level.  Some
 8330 * cores like POWER7 prefer to use lower numbered SMT threads.  In the
 8331 * case of POWER7, it can move to lower SMT modes only when higher
 8332 * threads are idle.  When in lower SMT modes, the threads will
 8333 * perform better since they share less core resources.  Hence when we
 8334 * have idle threads, we want them to be the higher ones.
 8335 *
 8336 * This packing function is run on idle threads.  It checks to see if
 8337 * the busiest CPU in this domain (core in the P7 case) has a higher
 8338 * CPU number than the packing function is being run on.  Here we are
 8339 * assuming lower CPU number will be equivalent to lower a SMT thread
 8340 * number.
 8341 *
 8342 * Return: 1 when packing is required and a task should be moved to
 8343 * this CPU.  The amount of the imbalance is returned in env->imbalance.
 8344 *
 8345 * @env: The load balancing environment.
 8346 * @sds: Statistics of the sched_domain which is to be packed
 8347 */
 8348static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
 8349{
 8350	int busiest_cpu;
 8351
 8352	if (!(env->sd->flags & SD_ASYM_PACKING))
 8353		return 0;
 8354
 8355	if (env->idle == CPU_NOT_IDLE)
 8356		return 0;
 8357
 8358	if (!sds->busiest)
 8359		return 0;
 8360
 8361	busiest_cpu = sds->busiest->asym_prefer_cpu;
 8362	if (sched_asym_prefer(busiest_cpu, env->dst_cpu))
 8363		return 0;
 8364
 8365	env->imbalance = DIV_ROUND_CLOSEST(
 8366		sds->busiest_stat.avg_load * sds->busiest_stat.group_capacity,
 8367		SCHED_CAPACITY_SCALE);
 8368
 8369	return 1;
 8370}
 8371
 8372/**
 8373 * fix_small_imbalance - Calculate the minor imbalance that exists
 8374 *			amongst the groups of a sched_domain, during
 8375 *			load balancing.
 8376 * @env: The load balancing environment.
 8377 * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
 8378 */
 8379static inline
 8380void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
 8381{
 8382	unsigned long tmp, capa_now = 0, capa_move = 0;
 8383	unsigned int imbn = 2;
 8384	unsigned long scaled_busy_load_per_task;
 8385	struct sg_lb_stats *local, *busiest;
 8386
 8387	local = &sds->local_stat;
 8388	busiest = &sds->busiest_stat;
 8389
 8390	if (!local->sum_nr_running)
 8391		local->load_per_task = cpu_avg_load_per_task(env->dst_cpu);
 8392	else if (busiest->load_per_task > local->load_per_task)
 8393		imbn = 1;
 8394
 8395	scaled_busy_load_per_task =
 8396		(busiest->load_per_task * SCHED_CAPACITY_SCALE) /
 8397		busiest->group_capacity;
 8398
 8399	if (busiest->avg_load + scaled_busy_load_per_task >=
 8400	    local->avg_load + (scaled_busy_load_per_task * imbn)) {
 8401		env->imbalance = busiest->load_per_task;
 8402		return;
 8403	}
 8404
 8405	/*
 8406	 * OK, we don't have enough imbalance to justify moving tasks,
 8407	 * however we may be able to increase total CPU capacity used by
 8408	 * moving them.
 8409	 */
 8410
 8411	capa_now += busiest->group_capacity *
 8412			min(busiest->load_per_task, busiest->avg_load);
 8413	capa_now += local->group_capacity *
 8414			min(local->load_per_task, local->avg_load);
 8415	capa_now /= SCHED_CAPACITY_SCALE;
 8416
 8417	/* Amount of load we'd subtract */
 8418	if (busiest->avg_load > scaled_busy_load_per_task) {
 8419		capa_move += busiest->group_capacity *
 8420			    min(busiest->load_per_task,
 8421				busiest->avg_load - scaled_busy_load_per_task);
 8422	}
 8423
 8424	/* Amount of load we'd add */
 8425	if (busiest->avg_load * busiest->group_capacity <
 8426	    busiest->load_per_task * SCHED_CAPACITY_SCALE) {
 8427		tmp = (busiest->avg_load * busiest->group_capacity) /
 8428		      local->group_capacity;
 8429	} else {
 8430		tmp = (busiest->load_per_task * SCHED_CAPACITY_SCALE) /
 8431		      local->group_capacity;
 8432	}
 8433	capa_move += local->group_capacity *
 8434		    min(local->load_per_task, local->avg_load + tmp);
 8435	capa_move /= SCHED_CAPACITY_SCALE;
 8436
 8437	/* Move if we gain throughput */
 8438	if (capa_move > capa_now)
 8439		env->imbalance = busiest->load_per_task;
 8440}
 8441
 8442/**
 8443 * calculate_imbalance - Calculate the amount of imbalance present within the
 8444 *			 groups of a given sched_domain during load balance.
 8445 * @env: load balance environment
 8446 * @sds: statistics of the sched_domain whose imbalance is to be calculated.
 8447 */
 8448static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
 8449{
 8450	unsigned long max_pull, load_above_capacity = ~0UL;
 8451	struct sg_lb_stats *local, *busiest;
 8452
 8453	local = &sds->local_stat;
 8454	busiest = &sds->busiest_stat;
 8455
 8456	if (busiest->group_type == group_imbalanced) {
 8457		/*
 8458		 * In the group_imb case we cannot rely on group-wide averages
 8459		 * to ensure CPU-load equilibrium, look at wider averages. XXX
 8460		 */
 8461		busiest->load_per_task =
 8462			min(busiest->load_per_task, sds->avg_load);
 8463	}
 8464
 8465	/*
 8466	 * Avg load of busiest sg can be less and avg load of local sg can
 8467	 * be greater than avg load across all sgs of sd because avg load
 8468	 * factors in sg capacity and sgs with smaller group_type are
 8469	 * skipped when updating the busiest sg:
 8470	 */
 8471	if (busiest->avg_load <= sds->avg_load ||
 8472	    local->avg_load >= sds->avg_load) {
 8473		env->imbalance = 0;
 8474		return fix_small_imbalance(env, sds);
 8475	}
 8476
 8477	/*
 8478	 * If there aren't any idle CPUs, avoid creating some.
 8479	 */
 8480	if (busiest->group_type == group_overloaded &&
 8481	    local->group_type   == group_overloaded) {
 8482		load_above_capacity = busiest->sum_nr_running * SCHED_CAPACITY_SCALE;
 8483		if (load_above_capacity > busiest->group_capacity) {
 8484			load_above_capacity -= busiest->group_capacity;
 8485			load_above_capacity *= scale_load_down(NICE_0_LOAD);
 8486			load_above_capacity /= busiest->group_capacity;
 8487		} else
 8488			load_above_capacity = ~0UL;
 8489	}
 8490
 8491	/*
 8492	 * We're trying to get all the CPUs to the average_load, so we don't
 8493	 * want to push ourselves above the average load, nor do we wish to
 8494	 * reduce the max loaded CPU below the average load. At the same time,
 8495	 * we also don't want to reduce the group load below the group
 8496	 * capacity. Thus we look for the minimum possible imbalance.
 8497	 */
 8498	max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity);
 8499
 8500	/* How much load to actually move to equalise the imbalance */
 8501	env->imbalance = min(
 8502		max_pull * busiest->group_capacity,
 8503		(sds->avg_load - local->avg_load) * local->group_capacity
 8504	) / SCHED_CAPACITY_SCALE;
 8505
 8506	/*
 8507	 * if *imbalance is less than the average load per runnable task
 8508	 * there is no guarantee that any tasks will be moved so we'll have
 8509	 * a think about bumping its value to force at least one task to be
 8510	 * moved
 8511	 */
 8512	if (env->imbalance < busiest->load_per_task)
 8513		return fix_small_imbalance(env, sds);
 8514}
 8515
 8516/******* find_busiest_group() helpers end here *********************/
 8517
 8518/**
 8519 * find_busiest_group - Returns the busiest group within the sched_domain
 8520 * if there is an imbalance.
 8521 *
 8522 * Also calculates the amount of weighted load which should be moved
 8523 * to restore balance.
 8524 *
 8525 * @env: The load balancing environment.
 8526 *
 8527 * Return:	- The busiest group if imbalance exists.
 8528 */
 8529static struct sched_group *find_busiest_group(struct lb_env *env)
 8530{
 8531	struct sg_lb_stats *local, *busiest;
 8532	struct sd_lb_stats sds;
 8533
 8534	init_sd_lb_stats(&sds);
 8535
 8536	/*
 8537	 * Compute the various statistics relavent for load balancing at
 8538	 * this level.
 8539	 */
 8540	update_sd_lb_stats(env, &sds);
 8541	local = &sds.local_stat;
 8542	busiest = &sds.busiest_stat;
 8543
 8544	/* ASYM feature bypasses nice load balance check */
 8545	if (check_asym_packing(env, &sds))
 8546		return sds.busiest;
 8547
 8548	/* There is no busy sibling group to pull tasks from */
 8549	if (!sds.busiest || busiest->sum_nr_running == 0)
 8550		goto out_balanced;
 8551
 8552	/* XXX broken for overlapping NUMA groups */
 8553	sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load)
 8554						/ sds.total_capacity;
 8555
 8556	/*
 8557	 * If the busiest group is imbalanced the below checks don't
 8558	 * work because they assume all things are equal, which typically
 8559	 * isn't true due to cpus_allowed constraints and the like.
 8560	 */
 8561	if (busiest->group_type == group_imbalanced)
 8562		goto force_balance;
 8563
 8564	/*
 8565	 * When dst_cpu is idle, prevent SMP nice and/or asymmetric group
 8566	 * capacities from resulting in underutilization due to avg_load.
 8567	 */
 8568	if (env->idle != CPU_NOT_IDLE && group_has_capacity(env, local) &&
 8569	    busiest->group_no_capacity)
 8570		goto force_balance;
 8571
 8572	/*
 8573	 * If the local group is busier than the selected busiest group
 8574	 * don't try and pull any tasks.
 8575	 */
 8576	if (local->avg_load >= busiest->avg_load)
 8577		goto out_balanced;
 8578
 8579	/*
 8580	 * Don't pull any tasks if this group is already above the domain
 8581	 * average load.
 8582	 */
 8583	if (local->avg_load >= sds.avg_load)
 8584		goto out_balanced;
 8585
 8586	if (env->idle == CPU_IDLE) {
 8587		/*
 8588		 * This CPU is idle. If the busiest group is not overloaded
 8589		 * and there is no imbalance between this and busiest group
 8590		 * wrt idle CPUs, it is balanced. The imbalance becomes
 8591		 * significant if the diff is greater than 1 otherwise we
 8592		 * might end up to just move the imbalance on another group
 8593		 */
 8594		if ((busiest->group_type != group_overloaded) &&
 8595				(local->idle_cpus <= (busiest->idle_cpus + 1)))
 8596			goto out_balanced;
 8597	} else {
 8598		/*
 8599		 * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
 8600		 * imbalance_pct to be conservative.
 8601		 */
 8602		if (100 * busiest->avg_load <=
 8603				env->sd->imbalance_pct * local->avg_load)
 8604			goto out_balanced;
 8605	}
 8606
 8607force_balance:
 8608	/* Looks like there is an imbalance. Compute it */
 8609	calculate_imbalance(env, &sds);
 8610	return sds.busiest;
 8611
 8612out_balanced:
 8613	env->imbalance = 0;
 8614	return NULL;
 8615}
 8616
 8617/*
 8618 * find_busiest_queue - find the busiest runqueue among the CPUs in the group.
 8619 */
 8620static struct rq *find_busiest_queue(struct lb_env *env,
 8621				     struct sched_group *group)
 8622{
 8623	struct rq *busiest = NULL, *rq;
 8624	unsigned long busiest_load = 0, busiest_capacity = 1;
 8625	int i;
 8626
 8627	for_each_cpu_and(i, sched_group_span(group), env->cpus) {
 8628		unsigned long capacity, wl;
 8629		enum fbq_type rt;
 8630
 8631		rq = cpu_rq(i);
 8632		rt = fbq_classify_rq(rq);
 8633
 8634		/*
 8635		 * We classify groups/runqueues into three groups:
 8636		 *  - regular: there are !numa tasks
 8637		 *  - remote:  there are numa tasks that run on the 'wrong' node
 8638		 *  - all:     there is no distinction
 8639		 *
 8640		 * In order to avoid migrating ideally placed numa tasks,
 8641		 * ignore those when there's better options.
 8642		 *
 8643		 * If we ignore the actual busiest queue to migrate another
 8644		 * task, the next balance pass can still reduce the busiest
 8645		 * queue by moving tasks around inside the node.
 8646		 *
 8647		 * If we cannot move enough load due to this classification
 8648		 * the next pass will adjust the group classification and
 8649		 * allow migration of more tasks.
 8650		 *
 8651		 * Both cases only affect the total convergence complexity.
 8652		 */
 8653		if (rt > env->fbq_type)
 8654			continue;
 8655
 8656		capacity = capacity_of(i);
 8657
 8658		wl = weighted_cpuload(rq);
 8659
 8660		/*
 8661		 * When comparing with imbalance, use weighted_cpuload()
 8662		 * which is not scaled with the CPU capacity.
 8663		 */
 8664
 8665		if (rq->nr_running == 1 && wl > env->imbalance &&
 8666		    !check_cpu_capacity(rq, env->sd))
 8667			continue;
 8668
 8669		/*
 8670		 * For the load comparisons with the other CPU's, consider
 8671		 * the weighted_cpuload() scaled with the CPU capacity, so
 8672		 * that the load can be moved away from the CPU that is
 8673		 * potentially running at a lower capacity.
 8674		 *
 8675		 * Thus we're looking for max(wl_i / capacity_i), crosswise
 8676		 * multiplication to rid ourselves of the division works out
 8677		 * to: wl_i * capacity_j > wl_j * capacity_i;  where j is
 8678		 * our previous maximum.
 8679		 */
 8680		if (wl * busiest_capacity > busiest_load * capacity) {
 8681			busiest_load = wl;
 8682			busiest_capacity = capacity;
 8683			busiest = rq;
 8684		}
 8685	}
 8686
 8687	return busiest;
 8688}
 8689
 8690/*
 8691 * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
 8692 * so long as it is large enough.
 8693 */
 8694#define MAX_PINNED_INTERVAL	512
 8695
 8696static int need_active_balance(struct lb_env *env)
 8697{
 8698	struct sched_domain *sd = env->sd;
 8699
 8700	if (env->idle == CPU_NEWLY_IDLE) {
 8701
 8702		/*
 8703		 * ASYM_PACKING needs to force migrate tasks from busy but
 8704		 * lower priority CPUs in order to pack all tasks in the
 8705		 * highest priority CPUs.
 8706		 */
 8707		if ((sd->flags & SD_ASYM_PACKING) &&
 8708		    sched_asym_prefer(env->dst_cpu, env->src_cpu))
 8709			return 1;
 8710	}
 8711
 8712	/*
 8713	 * The dst_cpu is idle and the src_cpu CPU has only 1 CFS task.
 8714	 * It's worth migrating the task if the src_cpu's capacity is reduced
 8715	 * because of other sched_class or IRQs if more capacity stays
 8716	 * available on dst_cpu.
 8717	 */
 8718	if ((env->idle != CPU_NOT_IDLE) &&
 8719	    (env->src_rq->cfs.h_nr_running == 1)) {
 8720		if ((check_cpu_capacity(env->src_rq, sd)) &&
 8721		    (capacity_of(env->src_cpu)*sd->imbalance_pct < capacity_of(env->dst_cpu)*100))
 8722			return 1;
 8723	}
 8724
 8725	return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
 8726}
 8727
 8728static int active_load_balance_cpu_stop(void *data);
 8729
 8730static int should_we_balance(struct lb_env *env)
 8731{
 8732	struct sched_group *sg = env->sd->groups;
 8733	int cpu, balance_cpu = -1;
 8734
 8735	/*
 8736	 * Ensure the balancing environment is consistent; can happen
 8737	 * when the softirq triggers 'during' hotplug.
 8738	 */
 8739	if (!cpumask_test_cpu(env->dst_cpu, env->cpus))
 8740		return 0;
 8741
 8742	/*
 8743	 * In the newly idle case, we will allow all the CPUs
 8744	 * to do the newly idle load balance.
 8745	 */
 8746	if (env->idle == CPU_NEWLY_IDLE)
 8747		return 1;
 8748
 8749	/* Try to find first idle CPU */
 8750	for_each_cpu_and(cpu, group_balance_mask(sg), env->cpus) {
 8751		if (!idle_cpu(cpu))
 8752			continue;
 8753
 8754		balance_cpu = cpu;
 8755		break;
 8756	}
 8757
 8758	if (balance_cpu == -1)
 8759		balance_cpu = group_balance_cpu(sg);
 8760
 8761	/*
 8762	 * First idle CPU or the first CPU(busiest) in this sched group
 8763	 * is eligible for doing load balancing at this and above domains.
 8764	 */
 8765	return balance_cpu == env->dst_cpu;
 8766}
 8767
 8768/*
 8769 * Check this_cpu to ensure it is balanced within domain. Attempt to move
 8770 * tasks if there is an imbalance.
 8771 */
 8772static int load_balance(int this_cpu, struct rq *this_rq,
 8773			struct sched_domain *sd, enum cpu_idle_type idle,
 8774			int *continue_balancing)
 8775{
 8776	int ld_moved, cur_ld_moved, active_balance = 0;
 8777	struct sched_domain *sd_parent = sd->parent;
 8778	struct sched_group *group;
 8779	struct rq *busiest;
 8780	struct rq_flags rf;
 8781	struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);
 8782
 8783	struct lb_env env = {
 8784		.sd		= sd,
 8785		.dst_cpu	= this_cpu,
 8786		.dst_rq		= this_rq,
 8787		.dst_grpmask    = sched_group_span(sd->groups),
 8788		.idle		= idle,
 8789		.loop_break	= sched_nr_migrate_break,
 8790		.cpus		= cpus,
 8791		.fbq_type	= all,
 8792		.tasks		= LIST_HEAD_INIT(env.tasks),
 8793	};
 8794
 8795	cpumask_and(cpus, sched_domain_span(sd), cpu_active_mask);
 8796
 8797	schedstat_inc(sd->lb_count[idle]);
 8798
 8799redo:
 8800	if (!should_we_balance(&env)) {
 8801		*continue_balancing = 0;
 8802		goto out_balanced;
 8803	}
 8804
 8805	group = find_busiest_group(&env);
 8806	if (!group) {
 8807		schedstat_inc(sd->lb_nobusyg[idle]);
 8808		goto out_balanced;
 8809	}
 8810
 8811	busiest = find_busiest_queue(&env, group);
 8812	if (!busiest) {
 8813		schedstat_inc(sd->lb_nobusyq[idle]);
 8814		goto out_balanced;
 8815	}
 8816
 8817	BUG_ON(busiest == env.dst_rq);
 8818
 8819	schedstat_add(sd->lb_imbalance[idle], env.imbalance);
 8820
 8821	env.src_cpu = busiest->cpu;
 8822	env.src_rq = busiest;
 8823
 8824	ld_moved = 0;
 8825	if (busiest->nr_running > 1) {
 8826		/*
 8827		 * Attempt to move tasks. If find_busiest_group has found
 8828		 * an imbalance but busiest->nr_running <= 1, the group is
 8829		 * still unbalanced. ld_moved simply stays zero, so it is
 8830		 * correctly treated as an imbalance.
 8831		 */
 8832		env.flags |= LBF_ALL_PINNED;
 8833		env.loop_max  = min(sysctl_sched_nr_migrate, busiest->nr_running);
 8834
 8835more_balance:
 8836		rq_lock_irqsave(busiest, &rf);
 8837		update_rq_clock(busiest);
 8838
 8839		/*
 8840		 * cur_ld_moved - load moved in current iteration
 8841		 * ld_moved     - cumulative load moved across iterations
 8842		 */
 8843		cur_ld_moved = detach_tasks(&env);
 8844
 8845		/*
 8846		 * We've detached some tasks from busiest_rq. Every
 8847		 * task is masked "TASK_ON_RQ_MIGRATING", so we can safely
 8848		 * unlock busiest->lock, and we are able to be sure
 8849		 * that nobody can manipulate the tasks in parallel.
 8850		 * See task_rq_lock() family for the details.
 8851		 */
 8852
 8853		rq_unlock(busiest, &rf);
 8854
 8855		if (cur_ld_moved) {
 8856			attach_tasks(&env);
 8857			ld_moved += cur_ld_moved;
 8858		}
 8859
 8860		local_irq_restore(rf.flags);
 8861
 8862		if (env.flags & LBF_NEED_BREAK) {
 8863			env.flags &= ~LBF_NEED_BREAK;
 8864			goto more_balance;
 8865		}
 8866
 8867		/*
 8868		 * Revisit (affine) tasks on src_cpu that couldn't be moved to
 8869		 * us and move them to an alternate dst_cpu in our sched_group
 8870		 * where they can run. The upper limit on how many times we
 8871		 * iterate on same src_cpu is dependent on number of CPUs in our
 8872		 * sched_group.
 8873		 *
 8874		 * This changes load balance semantics a bit on who can move
 8875		 * load to a given_cpu. In addition to the given_cpu itself
 8876		 * (or a ilb_cpu acting on its behalf where given_cpu is
 8877		 * nohz-idle), we now have balance_cpu in a position to move
 8878		 * load to given_cpu. In rare situations, this may cause
 8879		 * conflicts (balance_cpu and given_cpu/ilb_cpu deciding
 8880		 * _independently_ and at _same_ time to move some load to
 8881		 * given_cpu) causing exceess load to be moved to given_cpu.
 8882		 * This however should not happen so much in practice and
 8883		 * moreover subsequent load balance cycles should correct the
 8884		 * excess load moved.
 8885		 */
 8886		if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
 8887
 8888			/* Prevent to re-select dst_cpu via env's CPUs */
 8889			cpumask_clear_cpu(env.dst_cpu, env.cpus);
 8890
 8891			env.dst_rq	 = cpu_rq(env.new_dst_cpu);
 8892			env.dst_cpu	 = env.new_dst_cpu;
 8893			env.flags	&= ~LBF_DST_PINNED;
 8894			env.loop	 = 0;
 8895			env.loop_break	 = sched_nr_migrate_break;
 8896
 8897			/*
 8898			 * Go back to "more_balance" rather than "redo" since we
 8899			 * need to continue with same src_cpu.
 8900			 */
 8901			goto more_balance;
 8902		}
 8903
 8904		/*
 8905		 * We failed to reach balance because of affinity.
 8906		 */
 8907		if (sd_parent) {
 8908			int *group_imbalance = &sd_parent->groups->sgc->imbalance;
 8909
 8910			if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0)
 8911				*group_imbalance = 1;
 8912		}
 8913
 8914		/* All tasks on this runqueue were pinned by CPU affinity */
 8915		if (unlikely(env.flags & LBF_ALL_PINNED)) {
 8916			cpumask_clear_cpu(cpu_of(busiest), cpus);
 8917			/*
 8918			 * Attempting to continue load balancing at the current
 8919			 * sched_domain level only makes sense if there are
 8920			 * active CPUs remaining as possible busiest CPUs to
 8921			 * pull load from which are not contained within the
 8922			 * destination group that is receiving any migrated
 8923			 * load.
 8924			 */
 8925			if (!cpumask_subset(cpus, env.dst_grpmask)) {
 8926				env.loop = 0;
 8927				env.loop_break = sched_nr_migrate_break;
 8928				goto redo;
 8929			}
 8930			goto out_all_pinned;
 8931		}
 8932	}
 8933
 8934	if (!ld_moved) {
 8935		schedstat_inc(sd->lb_failed[idle]);
 8936		/*
 8937		 * Increment the failure counter only on periodic balance.
 8938		 * We do not want newidle balance, which can be very
 8939		 * frequent, pollute the failure counter causing
 8940		 * excessive cache_hot migrations and active balances.
 8941		 */
 8942		if (idle != CPU_NEWLY_IDLE)
 8943			sd->nr_balance_failed++;
 8944
 8945		if (need_active_balance(&env)) {
 8946			unsigned long flags;
 8947
 8948			raw_spin_lock_irqsave(&busiest->lock, flags);
 8949
 8950			/*
 8951			 * Don't kick the active_load_balance_cpu_stop,
 8952			 * if the curr task on busiest CPU can't be
 8953			 * moved to this_cpu:
 8954			 */
 8955			if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) {
 8956				raw_spin_unlock_irqrestore(&busiest->lock,
 8957							    flags);
 8958				env.flags |= LBF_ALL_PINNED;
 8959				goto out_one_pinned;
 8960			}
 8961
 8962			/*
 8963			 * ->active_balance synchronizes accesses to
 8964			 * ->active_balance_work.  Once set, it's cleared
 8965			 * only after active load balance is finished.
 8966			 */
 8967			if (!busiest->active_balance) {
 8968				busiest->active_balance = 1;
 8969				busiest->push_cpu = this_cpu;
 8970				active_balance = 1;
 8971			}
 8972			raw_spin_unlock_irqrestore(&busiest->lock, flags);
 8973
 8974			if (active_balance) {
 8975				stop_one_cpu_nowait(cpu_of(busiest),
 8976					active_load_balance_cpu_stop, busiest,
 8977					&busiest->active_balance_work);
 8978			}
 8979
 8980			/* We've kicked active balancing, force task migration. */
 8981			sd->nr_balance_failed = sd->cache_nice_tries+1;
 8982		}
 8983	} else
 8984		sd->nr_balance_failed = 0;
 8985
 8986	if (likely(!active_balance)) {
 8987		/* We were unbalanced, so reset the balancing interval */
 8988		sd->balance_interval = sd->min_interval;
 8989	} else {
 8990		/*
 8991		 * If we've begun active balancing, start to back off. This
 8992		 * case may not be covered by the all_pinned logic if there
 8993		 * is only 1 task on the busy runqueue (because we don't call
 8994		 * detach_tasks).
 8995		 */
 8996		if (sd->balance_interval < sd->max_interval)
 8997			sd->balance_interval *= 2;
 8998	}
 8999
 9000	goto out;
 9001
 9002out_balanced:
 9003	/*
 9004	 * We reach balance although we may have faced some affinity
 9005	 * constraints. Clear the imbalance flag if it was set.
 9006	 */
 9007	if (sd_parent) {
 9008		int *group_imbalance = &sd_parent->groups->sgc->imbalance;
 9009
 9010		if (*group_imbalance)
 9011			*group_imbalance = 0;
 9012	}
 9013
 9014out_all_pinned:
 9015	/*
 9016	 * We reach balance because all tasks are pinned at this level so
 9017	 * we can't migrate them. Let the imbalance flag set so parent level
 9018	 * can try to migrate them.
 9019	 */
 9020	schedstat_inc(sd->lb_balanced[idle]);
 9021
 9022	sd->nr_balance_failed = 0;
 9023
 9024out_one_pinned:
 9025	/* tune up the balancing interval */
 9026	if (((env.flags & LBF_ALL_PINNED) &&
 9027			sd->balance_interval < MAX_PINNED_INTERVAL) ||
 9028			(sd->balance_interval < sd->max_interval))
 9029		sd->balance_interval *= 2;
 9030
 9031	ld_moved = 0;
 9032out:
 9033	return ld_moved;
 9034}
 9035
 9036static inline unsigned long
 9037get_sd_balance_interval(struct sched_domain *sd, int cpu_busy)
 9038{
 9039	unsigned long interval = sd->balance_interval;
 9040
 9041	if (cpu_busy)
 9042		interval *= sd->busy_factor;
 9043
 9044	/* scale ms to jiffies */
 9045	interval = msecs_to_jiffies(interval);
 9046	interval = clamp(interval, 1UL, max_load_balance_interval);
 9047
 9048	return interval;
 9049}
 9050
 9051static inline void
 9052update_next_balance(struct sched_domain *sd, unsigned long *next_balance)
 9053{
 9054	unsigned long interval, next;
 9055
 9056	/* used by idle balance, so cpu_busy = 0 */
 9057	interval = get_sd_balance_interval(sd, 0);
 9058	next = sd->last_balance + interval;
 9059
 9060	if (time_after(*next_balance, next))
 9061		*next_balance = next;
 9062}
 9063
 9064/*
 9065 * active_load_balance_cpu_stop is run by the CPU stopper. It pushes
 9066 * running tasks off the busiest CPU onto idle CPUs. It requires at
 9067 * least 1 task to be running on each physical CPU where possible, and
 9068 * avoids physical / logical imbalances.
 9069 */
 9070static int active_load_balance_cpu_stop(void *data)
 9071{
 9072	struct rq *busiest_rq = data;
 9073	int busiest_cpu = cpu_of(busiest_rq);
 9074	int target_cpu = busiest_rq->push_cpu;
 9075	struct rq *target_rq = cpu_rq(target_cpu);
 9076	struct sched_domain *sd;
 9077	struct task_struct *p = NULL;
 9078	struct rq_flags rf;
 9079
 9080	rq_lock_irq(busiest_rq, &rf);
 9081	/*
 9082	 * Between queueing the stop-work and running it is a hole in which
 9083	 * CPUs can become inactive. We should not move tasks from or to
 9084	 * inactive CPUs.
 9085	 */
 9086	if (!cpu_active(busiest_cpu) || !cpu_active(target_cpu))
 9087		goto out_unlock;
 9088
 9089	/* Make sure the requested CPU hasn't gone down in the meantime: */
 9090	if (unlikely(busiest_cpu != smp_processor_id() ||
 9091		     !busiest_rq->active_balance))
 9092		goto out_unlock;
 9093
 9094	/* Is there any task to move? */
 9095	if (busiest_rq->nr_running <= 1)
 9096		goto out_unlock;
 9097
 9098	/*
 9099	 * This condition is "impossible", if it occurs
 9100	 * we need to fix it. Originally reported by
 9101	 * Bjorn Helgaas on a 128-CPU setup.
 9102	 */
 9103	BUG_ON(busiest_rq == target_rq);
 9104
 9105	/* Search for an sd spanning us and the target CPU. */
 9106	rcu_read_lock();
 9107	for_each_domain(target_cpu, sd) {
 9108		if ((sd->flags & SD_LOAD_BALANCE) &&
 9109		    cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
 9110				break;
 9111	}
 9112
 9113	if (likely(sd)) {
 9114		struct lb_env env = {
 9115			.sd		= sd,
 9116			.dst_cpu	= target_cpu,
 9117			.dst_rq		= target_rq,
 9118			.src_cpu	= busiest_rq->cpu,
 9119			.src_rq		= busiest_rq,
 9120			.idle		= CPU_IDLE,
 9121			/*
 9122			 * can_migrate_task() doesn't need to compute new_dst_cpu
 9123			 * for active balancing. Since we have CPU_IDLE, but no
 9124			 * @dst_grpmask we need to make that test go away with lying
 9125			 * about DST_PINNED.
 9126			 */
 9127			.flags		= LBF_DST_PINNED,
 9128		};
 9129
 9130		schedstat_inc(sd->alb_count);
 9131		update_rq_clock(busiest_rq);
 9132
 9133		p = detach_one_task(&env);
 9134		if (p) {
 9135			schedstat_inc(sd->alb_pushed);
 9136			/* Active balancing done, reset the failure counter. */
 9137			sd->nr_balance_failed = 0;
 9138		} else {
 9139			schedstat_inc(sd->alb_failed);
 9140		}
 9141	}
 9142	rcu_read_unlock();
 9143out_unlock:
 9144	busiest_rq->active_balance = 0;
 9145	rq_unlock(busiest_rq, &rf);
 9146
 9147	if (p)
 9148		attach_one_task(target_rq, p);
 9149
 9150	local_irq_enable();
 9151
 9152	return 0;
 9153}
 9154
 9155static DEFINE_SPINLOCK(balancing);
 9156
 9157/*
 9158 * Scale the max load_balance interval with the number of CPUs in the system.
 9159 * This trades load-balance latency on larger machines for less cross talk.
 9160 */
 9161void update_max_interval(void)
 9162{
 9163	max_load_balance_interval = HZ*num_online_cpus()/10;
 9164}
 9165
 9166/*
 9167 * It checks each scheduling domain to see if it is due to be balanced,
 9168 * and initiates a balancing operation if so.
 9169 *
 9170 * Balancing parameters are set up in init_sched_domains.
 9171 */
 9172static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
 9173{
 9174	int continue_balancing = 1;
 9175	int cpu = rq->cpu;
 9176	unsigned long interval;
 9177	struct sched_domain *sd;
 9178	/* Earliest time when we have to do rebalance again */
 9179	unsigned long next_balance = jiffies + 60*HZ;
 9180	int update_next_balance = 0;
 9181	int need_serialize, need_decay = 0;
 9182	u64 max_cost = 0;
 9183
 9184	rcu_read_lock();
 9185	for_each_domain(cpu, sd) {
 9186		/*
 9187		 * Decay the newidle max times here because this is a regular
 9188		 * visit to all the domains. Decay ~1% per second.
 9189		 */
 9190		if (time_after(jiffies, sd->next_decay_max_lb_cost)) {
 9191			sd->max_newidle_lb_cost =
 9192				(sd->max_newidle_lb_cost * 253) / 256;
 9193			sd->next_decay_max_lb_cost = jiffies + HZ;
 9194			need_decay = 1;
 9195		}
 9196		max_cost += sd->max_newidle_lb_cost;
 9197
 9198		if (!(sd->flags & SD_LOAD_BALANCE))
 9199			continue;
 9200
 9201		/*
 9202		 * Stop the load balance at this level. There is another
 9203		 * CPU in our sched group which is doing load balancing more
 9204		 * actively.
 9205		 */
 9206		if (!continue_balancing) {
 9207			if (need_decay)
 9208				continue;
 9209			break;
 9210		}
 9211
 9212		interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
 9213
 9214		need_serialize = sd->flags & SD_SERIALIZE;
 9215		if (need_serialize) {
 9216			if (!spin_trylock(&balancing))
 9217				goto out;
 9218		}
 9219
 9220		if (time_after_eq(jiffies, sd->last_balance + interval)) {
 9221			if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {
 9222				/*
 9223				 * The LBF_DST_PINNED logic could have changed
 9224				 * env->dst_cpu, so we can't know our idle
 9225				 * state even if we migrated tasks. Update it.
 9226				 */
 9227				idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
 9228			}
 9229			sd->last_balance = jiffies;
 9230			interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
 9231		}
 9232		if (need_serialize)
 9233			spin_unlock(&balancing);
 9234out:
 9235		if (time_after(next_balance, sd->last_balance + interval)) {
 9236			next_balance = sd->last_balance + interval;
 9237			update_next_balance = 1;
 9238		}
 9239	}
 9240	if (need_decay) {
 9241		/*
 9242		 * Ensure the rq-wide value also decays but keep it at a
 9243		 * reasonable floor to avoid funnies with rq->avg_idle.
 9244		 */
 9245		rq->max_idle_balance_cost =
 9246			max((u64)sysctl_sched_migration_cost, max_cost);
 9247	}
 9248	rcu_read_unlock();
 9249
 9250	/*
 9251	 * next_balance will be updated only when there is a need.
 9252	 * When the cpu is attached to null domain for ex, it will not be
 9253	 * updated.
 9254	 */
 9255	if (likely(update_next_balance)) {
 9256		rq->next_balance = next_balance;
 9257
 9258#ifdef CONFIG_NO_HZ_COMMON
 9259		/*
 9260		 * If this CPU has been elected to perform the nohz idle
 9261		 * balance. Other idle CPUs have already rebalanced with
 9262		 * nohz_idle_balance() and nohz.next_balance has been
 9263		 * updated accordingly. This CPU is now running the idle load
 9264		 * balance for itself and we need to update the
 9265		 * nohz.next_balance accordingly.
 9266		 */
 9267		if ((idle == CPU_IDLE) && time_after(nohz.next_balance, rq->next_balance))
 9268			nohz.next_balance = rq->next_balance;
 9269#endif
 9270	}
 9271}
 9272
 9273static inline int on_null_domain(struct rq *rq)
 9274{
 9275	return unlikely(!rcu_dereference_sched(rq->sd));
 9276}
 9277
 9278#ifdef CONFIG_NO_HZ_COMMON
 9279/*
 9280 * idle load balancing details
 9281 * - When one of the busy CPUs notice that there may be an idle rebalancing
 9282 *   needed, they will kick the idle load balancer, which then does idle
 9283 *   load balancing for all the idle CPUs.
 9284 */
 9285
 9286static inline int find_new_ilb(void)
 9287{
 9288	int ilb = cpumask_first(nohz.idle_cpus_mask);
 9289
 9290	if (ilb < nr_cpu_ids && idle_cpu(ilb))
 9291		return ilb;
 9292
 9293	return nr_cpu_ids;
 9294}
 9295
 9296/*
 9297 * Kick a CPU to do the nohz balancing, if it is time for it. We pick the
 9298 * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
 9299 * CPU (if there is one).
 9300 */
 9301static void kick_ilb(unsigned int flags)
 9302{
 9303	int ilb_cpu;
 9304
 9305	nohz.next_balance++;
 9306
 9307	ilb_cpu = find_new_ilb();
 9308
 9309	if (ilb_cpu >= nr_cpu_ids)
 9310		return;
 9311
 9312	flags = atomic_fetch_or(flags, nohz_flags(ilb_cpu));
 9313	if (flags & NOHZ_KICK_MASK)
 9314		return;
 9315
 9316	/*
 9317	 * Use smp_send_reschedule() instead of resched_cpu().
 9318	 * This way we generate a sched IPI on the target CPU which
 9319	 * is idle. And the softirq performing nohz idle load balance
 9320	 * will be run before returning from the IPI.
 9321	 */
 9322	smp_send_reschedule(ilb_cpu);
 9323}
 9324
 9325/*
 9326 * Current heuristic for kicking the idle load balancer in the presence
 9327 * of an idle cpu in the system.
 9328 *   - This rq has more than one task.
 9329 *   - This rq has at least one CFS task and the capacity of the CPU is
 9330 *     significantly reduced because of RT tasks or IRQs.
 9331 *   - At parent of LLC scheduler domain level, this cpu's scheduler group has
 9332 *     multiple busy cpu.
 9333 *   - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
 9334 *     domain span are idle.
 9335 */
 9336static void nohz_balancer_kick(struct rq *rq)
 9337{
 9338	unsigned long now = jiffies;
 9339	struct sched_domain_shared *sds;
 9340	struct sched_domain *sd;
 9341	int nr_busy, i, cpu = rq->cpu;
 9342	unsigned int flags = 0;
 9343
 9344	if (unlikely(rq->idle_balance))
 9345		return;
 9346
 9347	/*
 9348	 * We may be recently in ticked or tickless idle mode. At the first
 9349	 * busy tick after returning from idle, we will update the busy stats.
 9350	 */
 9351	nohz_balance_exit_idle(rq);
 9352
 9353	/*
 9354	 * None are in tickless mode and hence no need for NOHZ idle load
 9355	 * balancing.
 9356	 */
 9357	if (likely(!atomic_read(&nohz.nr_cpus)))
 9358		return;
 9359
 9360	if (READ_ONCE(nohz.has_blocked) &&
 9361	    time_after(now, READ_ONCE(nohz.next_blocked)))
 9362		flags = NOHZ_STATS_KICK;
 9363
 9364	if (time_before(now, nohz.next_balance))
 9365		goto out;
 9366
 9367	if (rq->nr_running >= 2) {
 9368		flags = NOHZ_KICK_MASK;
 9369		goto out;
 9370	}
 9371
 9372	rcu_read_lock();
 9373	sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
 9374	if (sds) {
 9375		/*
 9376		 * XXX: write a coherent comment on why we do this.
 9377		 * See also: http://lkml.kernel.org/r/20111202010832.602203411@sbsiddha-desk.sc.intel.com
 9378		 */
 9379		nr_busy = atomic_read(&sds->nr_busy_cpus);
 9380		if (nr_busy > 1) {
 9381			flags = NOHZ_KICK_MASK;
 9382			goto unlock;
 9383		}
 9384
 9385	}
 9386
 9387	sd = rcu_dereference(rq->sd);
 9388	if (sd) {
 9389		if ((rq->cfs.h_nr_running >= 1) &&
 9390				check_cpu_capacity(rq, sd)) {
 9391			flags = NOHZ_KICK_MASK;
 9392			goto unlock;
 9393		}
 9394	}
 9395
 9396	sd = rcu_dereference(per_cpu(sd_asym, cpu));
 9397	if (sd) {
 9398		for_each_cpu(i, sched_domain_span(sd)) {
 9399			if (i == cpu ||
 9400			    !cpumask_test_cpu(i, nohz.idle_cpus_mask))
 9401				continue;
 9402
 9403			if (sched_asym_prefer(i, cpu)) {
 9404				flags = NOHZ_KICK_MASK;
 9405				goto unlock;
 9406			}
 9407		}
 9408	}
 9409unlock:
 9410	rcu_read_unlock();
 9411out:
 9412	if (flags)
 9413		kick_ilb(flags);
 9414}
 9415
 9416static void set_cpu_sd_state_busy(int cpu)
 9417{
 9418	struct sched_domain *sd;
 9419
 9420	rcu_read_lock();
 9421	sd = rcu_dereference(per_cpu(sd_llc, cpu));
 9422
 9423	if (!sd || !sd->nohz_idle)
 9424		goto unlock;
 9425	sd->nohz_idle = 0;
 9426
 9427	atomic_inc(&sd->shared->nr_busy_cpus);
 9428unlock:
 9429	rcu_read_unlock();
 9430}
 9431
 9432void nohz_balance_exit_idle(struct rq *rq)
 9433{
 9434	SCHED_WARN_ON(rq != this_rq());
 9435
 9436	if (likely(!rq->nohz_tick_stopped))
 9437		return;
 9438
 9439	rq->nohz_tick_stopped = 0;
 9440	cpumask_clear_cpu(rq->cpu, nohz.idle_cpus_mask);
 9441	atomic_dec(&nohz.nr_cpus);
 9442
 9443	set_cpu_sd_state_busy(rq->cpu);
 9444}
 9445
 9446static void set_cpu_sd_state_idle(int cpu)
 9447{
 9448	struct sched_domain *sd;
 9449
 9450	rcu_read_lock();
 9451	sd = rcu_dereference(per_cpu(sd_llc, cpu));
 9452
 9453	if (!sd || sd->nohz_idle)
 9454		goto unlock;
 9455	sd->nohz_idle = 1;
 9456
 9457	atomic_dec(&sd->shared->nr_busy_cpus);
 9458unlock:
 9459	rcu_read_unlock();
 9460}
 9461
 9462/*
 9463 * This routine will record that the CPU is going idle with tick stopped.
 9464 * This info will be used in performing idle load balancing in the future.
 9465 */
 9466void nohz_balance_enter_idle(int cpu)
 9467{
 9468	struct rq *rq = cpu_rq(cpu);
 9469
 9470	SCHED_WARN_ON(cpu != smp_processor_id());
 9471
 9472	/* If this CPU is going down, then nothing needs to be done: */
 9473	if (!cpu_active(cpu))
 9474		return;
 9475
 9476	/* Spare idle load balancing on CPUs that don't want to be disturbed: */
 9477	if (!housekeeping_cpu(cpu, HK_FLAG_SCHED))
 9478		return;
 9479
 9480	/*
 9481	 * Can be set safely without rq->lock held
 9482	 * If a clear happens, it will have evaluated last additions because
 9483	 * rq->lock is held during the check and the clear
 9484	 */
 9485	rq->has_blocked_load = 1;
 9486
 9487	/*
 9488	 * The tick is still stopped but load could have been added in the
 9489	 * meantime. We set the nohz.has_blocked flag to trig a check of the
 9490	 * *_avg. The CPU is already part of nohz.idle_cpus_mask so the clear
 9491	 * of nohz.has_blocked can only happen after checking the new load
 9492	 */
 9493	if (rq->nohz_tick_stopped)
 9494		goto out;
 9495
 9496	/* If we're a completely isolated CPU, we don't play: */
 9497	if (on_null_domain(rq))
 9498		return;
 9499
 9500	rq->nohz_tick_stopped = 1;
 9501
 9502	cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
 9503	atomic_inc(&nohz.nr_cpus);
 9504
 9505	/*
 9506	 * Ensures that if nohz_idle_balance() fails to observe our
 9507	 * @idle_cpus_mask store, it must observe the @has_blocked
 9508	 * store.
 9509	 */
 9510	smp_mb__after_atomic();
 9511
 9512	set_cpu_sd_state_idle(cpu);
 9513
 9514out:
 9515	/*
 9516	 * Each time a cpu enter idle, we assume that it has blocked load and
 9517	 * enable the periodic update of the load of idle cpus
 9518	 */
 9519	WRITE_ONCE(nohz.has_blocked, 1);
 9520}
 9521
 9522/*
 9523 * Internal function that runs load balance for all idle cpus. The load balance
 9524 * can be a simple update of blocked load or a complete load balance with
 9525 * tasks movement depending of flags.
 9526 * The function returns false if the loop has stopped before running
 9527 * through all idle CPUs.
 9528 */
 9529static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
 9530			       enum cpu_idle_type idle)
 9531{
 9532	/* Earliest time when we have to do rebalance again */
 9533	unsigned long now = jiffies;
 9534	unsigned long next_balance = now + 60*HZ;
 9535	bool has_blocked_load = false;
 9536	int update_next_balance = 0;
 9537	int this_cpu = this_rq->cpu;
 9538	int balance_cpu;
 9539	int ret = false;
 9540	struct rq *rq;
 9541
 9542	SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
 9543
 9544	/*
 9545	 * We assume there will be no idle load after this update and clear
 9546	 * the has_blocked flag. If a cpu enters idle in the mean time, it will
 9547	 * set the has_blocked flag and trig another update of idle load.
 9548	 * Because a cpu that becomes idle, is added to idle_cpus_mask before
 9549	 * setting the flag, we are sure to not clear the state and not
 9550	 * check the load of an idle cpu.
 9551	 */
 9552	WRITE_ONCE(nohz.has_blocked, 0);
 9553
 9554	/*
 9555	 * Ensures that if we miss the CPU, we must see the has_blocked
 9556	 * store from nohz_balance_enter_idle().
 9557	 */
 9558	smp_mb();
 9559
 9560	for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
 9561		if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
 9562			continue;
 9563
 9564		/*
 9565		 * If this CPU gets work to do, stop the load balancing
 9566		 * work being done for other CPUs. Next load
 9567		 * balancing owner will pick it up.
 9568		 */
 9569		if (need_resched()) {
 9570			has_blocked_load = true;
 9571			goto abort;
 9572		}
 9573
 9574		rq = cpu_rq(balance_cpu);
 9575
 9576		has_blocked_load |= update_nohz_stats(rq, true);
 9577
 9578		/*
 9579		 * If time for next balance is due,
 9580		 * do the balance.
 9581		 */
 9582		if (time_after_eq(jiffies, rq->next_balance)) {
 9583			struct rq_flags rf;
 9584
 9585			rq_lock_irqsave(rq, &rf);
 9586			update_rq_clock(rq);
 9587			cpu_load_update_idle(rq);
 9588			rq_unlock_irqrestore(rq, &rf);
 9589
 9590			if (flags & NOHZ_BALANCE_KICK)
 9591				rebalance_domains(rq, CPU_IDLE);
 9592		}
 9593
 9594		if (time_after(next_balance, rq->next_balance)) {
 9595			next_balance = rq->next_balance;
 9596			update_next_balance = 1;
 9597		}
 9598	}
 9599
 9600	/* Newly idle CPU doesn't need an update */
 9601	if (idle != CPU_NEWLY_IDLE) {
 9602		update_blocked_averages(this_cpu);
 9603		has_blocked_load |= this_rq->has_blocked_load;
 9604	}
 9605
 9606	if (flags & NOHZ_BALANCE_KICK)
 9607		rebalance_domains(this_rq, CPU_IDLE);
 9608
 9609	WRITE_ONCE(nohz.next_blocked,
 9610		now + msecs_to_jiffies(LOAD_AVG_PERIOD));
 9611
 9612	/* The full idle balance loop has been done */
 9613	ret = true;
 9614
 9615abort:
 9616	/* There is still blocked load, enable periodic update */
 9617	if (has_blocked_load)
 9618		WRITE_ONCE(nohz.has_blocked, 1);
 9619
 9620	/*
 9621	 * next_balance will be updated only when there is a need.
 9622	 * When the CPU is attached to null domain for ex, it will not be
 9623	 * updated.
 9624	 */
 9625	if (likely(update_next_balance))
 9626		nohz.next_balance = next_balance;
 9627
 9628	return ret;
 9629}
 9630
 9631/*
 9632 * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
 9633 * rebalancing for all the cpus for whom scheduler ticks are stopped.
 9634 */
 9635static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
 9636{
 9637	int this_cpu = this_rq->cpu;
 9638	unsigned int flags;
 9639
 9640	if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_KICK_MASK))
 9641		return false;
 9642
 9643	if (idle != CPU_IDLE) {
 9644		atomic_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
 9645		return false;
 9646	}
 9647
 9648	/*
 9649	 * barrier, pairs with nohz_balance_enter_idle(), ensures ...
 9650	 */
 9651	flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
 9652	if (!(flags & NOHZ_KICK_MASK))
 9653		return false;
 9654
 9655	_nohz_idle_balance(this_rq, flags, idle);
 9656
 9657	return true;
 9658}
 9659
 9660static void nohz_newidle_balance(struct rq *this_rq)
 9661{
 9662	int this_cpu = this_rq->cpu;
 9663
 9664	/*
 9665	 * This CPU doesn't want to be disturbed by scheduler
 9666	 * housekeeping
 9667	 */
 9668	if (!housekeeping_cpu(this_cpu, HK_FLAG_SCHED))
 9669		return;
 9670
 9671	/* Will wake up very soon. No time for doing anything else*/
 9672	if (this_rq->avg_idle < sysctl_sched_migration_cost)
 9673		return;
 9674
 9675	/* Don't need to update blocked load of idle CPUs*/
 9676	if (!READ_ONCE(nohz.has_blocked) ||
 9677	    time_before(jiffies, READ_ONCE(nohz.next_blocked)))
 9678		return;
 9679
 9680	raw_spin_unlock(&this_rq->lock);
 9681	/*
 9682	 * This CPU is going to be idle and blocked load of idle CPUs
 9683	 * need to be updated. Run the ilb locally as it is a good
 9684	 * candidate for ilb instead of waking up another idle CPU.
 9685	 * Kick an normal ilb if we failed to do the update.
 9686	 */
 9687	if (!_nohz_idle_balance(this_rq, NOHZ_STATS_KICK, CPU_NEWLY_IDLE))
 9688		kick_ilb(NOHZ_STATS_KICK);
 9689	raw_spin_lock(&this_rq->lock);
 9690}
 9691
 9692#else /* !CONFIG_NO_HZ_COMMON */
 9693static inline void nohz_balancer_kick(struct rq *rq) { }
 9694
 9695static inline bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
 9696{
 9697	return false;
 9698}
 9699
 9700static inline void nohz_newidle_balance(struct rq *this_rq) { }
 9701#endif /* CONFIG_NO_HZ_COMMON */
 9702
 9703/*
 9704 * idle_balance is called by schedule() if this_cpu is about to become
 9705 * idle. Attempts to pull tasks from other CPUs.
 9706 */
 9707static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
 9708{
 9709	unsigned long next_balance = jiffies + HZ;
 9710	int this_cpu = this_rq->cpu;
 9711	struct sched_domain *sd;
 9712	int pulled_task = 0;
 9713	u64 curr_cost = 0;
 9714
 9715	/*
 9716	 * We must set idle_stamp _before_ calling idle_balance(), such that we
 9717	 * measure the duration of idle_balance() as idle time.
 9718	 */
 9719	this_rq->idle_stamp = rq_clock(this_rq);
 9720
 9721	/*
 9722	 * Do not pull tasks towards !active CPUs...
 9723	 */
 9724	if (!cpu_active(this_cpu))
 9725		return 0;
 9726
 9727	/*
 9728	 * This is OK, because current is on_cpu, which avoids it being picked
 9729	 * for load-balance and preemption/IRQs are still disabled avoiding
 9730	 * further scheduler activity on it and we're being very careful to
 9731	 * re-start the picking loop.
 9732	 */
 9733	rq_unpin_lock(this_rq, rf);
 9734
 9735	if (this_rq->avg_idle < sysctl_sched_migration_cost ||
 9736	    !this_rq->rd->overload) {
 9737
 9738		rcu_read_lock();
 9739		sd = rcu_dereference_check_sched_domain(this_rq->sd);
 9740		if (sd)
 9741			update_next_balance(sd, &next_balance);
 9742		rcu_read_unlock();
 9743
 9744		nohz_newidle_balance(this_rq);
 9745
 9746		goto out;
 9747	}
 9748
 9749	raw_spin_unlock(&this_rq->lock);
 9750
 9751	update_blocked_averages(this_cpu);
 9752	rcu_read_lock();
 9753	for_each_domain(this_cpu, sd) {
 9754		int continue_balancing = 1;
 9755		u64 t0, domain_cost;
 9756
 9757		if (!(sd->flags & SD_LOAD_BALANCE))
 9758			continue;
 9759
 9760		if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
 9761			update_next_balance(sd, &next_balance);
 9762			break;
 9763		}
 9764
 9765		if (sd->flags & SD_BALANCE_NEWIDLE) {
 9766			t0 = sched_clock_cpu(this_cpu);
 9767
 9768			pulled_task = load_balance(this_cpu, this_rq,
 9769						   sd, CPU_NEWLY_IDLE,
 9770						   &continue_balancing);
 9771
 9772			domain_cost = sched_clock_cpu(this_cpu) - t0;
 9773			if (domain_cost > sd->max_newidle_lb_cost)
 9774				sd->max_newidle_lb_cost = domain_cost;
 9775
 9776			curr_cost += domain_cost;
 9777		}
 9778
 9779		update_next_balance(sd, &next_balance);
 9780
 9781		/*
 9782		 * Stop searching for tasks to pull if there are
 9783		 * now runnable tasks on this rq.
 9784		 */
 9785		if (pulled_task || this_rq->nr_running > 0)
 9786			break;
 9787	}
 9788	rcu_read_unlock();
 9789
 9790	raw_spin_lock(&this_rq->lock);
 9791
 9792	if (curr_cost > this_rq->max_idle_balance_cost)
 9793		this_rq->max_idle_balance_cost = curr_cost;
 9794
 9795out:
 9796	/*
 9797	 * While browsing the domains, we released the rq lock, a task could
 9798	 * have been enqueued in the meantime. Since we're not going idle,
 9799	 * pretend we pulled a task.
 9800	 */
 9801	if (this_rq->cfs.h_nr_running && !pulled_task)
 9802		pulled_task = 1;
 9803
 9804	/* Move the next balance forward */
 9805	if (time_after(this_rq->next_balance, next_balance))
 9806		this_rq->next_balance = next_balance;
 9807
 9808	/* Is there a task of a high priority class? */
 9809	if (this_rq->nr_running != this_rq->cfs.h_nr_running)
 9810		pulled_task = -1;
 9811
 9812	if (pulled_task)
 9813		this_rq->idle_stamp = 0;
 9814
 9815	rq_repin_lock(this_rq, rf);
 9816
 9817	return pulled_task;
 9818}
 9819
 9820/*
 9821 * run_rebalance_domains is triggered when needed from the scheduler tick.
 9822 * Also triggered for nohz idle balancing (with nohz_balancing_kick set).
 9823 */
 9824static __latent_entropy void run_rebalance_domains(struct softirq_action *h)
 9825{
 9826	struct rq *this_rq = this_rq();
 9827	enum cpu_idle_type idle = this_rq->idle_balance ?
 9828						CPU_IDLE : CPU_NOT_IDLE;
 9829
 9830	/*
 9831	 * If this CPU has a pending nohz_balance_kick, then do the
 9832	 * balancing on behalf of the other idle CPUs whose ticks are
 9833	 * stopped. Do nohz_idle_balance *before* rebalance_domains to
 9834	 * give the idle CPUs a chance to load balance. Else we may
 9835	 * load balance only within the local sched_domain hierarchy
 9836	 * and abort nohz_idle_balance altogether if we pull some load.
 9837	 */
 9838	if (nohz_idle_balance(this_rq, idle))
 9839		return;
 9840
 9841	/* normal load balance */
 9842	update_blocked_averages(this_rq->cpu);
 9843	rebalance_domains(this_rq, idle);
 9844}
 9845
 9846/*
 9847 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
 9848 */
 9849void trigger_load_balance(struct rq *rq)
 9850{
 9851	/* Don't need to rebalance while attached to NULL domain */
 9852	if (unlikely(on_null_domain(rq)))
 9853		return;
 9854
 9855	if (time_after_eq(jiffies, rq->next_balance))
 9856		raise_softirq(SCHED_SOFTIRQ);
 9857
 9858	nohz_balancer_kick(rq);
 9859}
 9860
 9861static void rq_online_fair(struct rq *rq)
 9862{
 9863	update_sysctl();
 9864
 9865	update_runtime_enabled(rq);
 9866}
 9867
 9868static void rq_offline_fair(struct rq *rq)
 9869{
 9870	update_sysctl();
 9871
 9872	/* Ensure any throttled groups are reachable by pick_next_task */
 9873	unthrottle_offline_cfs_rqs(rq);
 9874}
 9875
 9876#endif /* CONFIG_SMP */
 9877
 9878/*
 9879 * scheduler tick hitting a task of our scheduling class.
 9880 *
 9881 * NOTE: This function can be called remotely by the tick offload that
 9882 * goes along full dynticks. Therefore no local assumption can be made
 9883 * and everything must be accessed through the @rq and @curr passed in
 9884 * parameters.
 9885 */
 9886static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
 9887{
 9888	struct cfs_rq *cfs_rq;
 9889	struct sched_entity *se = &curr->se;
 9890
 9891	for_each_sched_entity(se) {
 9892		cfs_rq = cfs_rq_of(se);
 9893		entity_tick(cfs_rq, se, queued);
 9894	}
 9895
 9896	if (static_branch_unlikely(&sched_numa_balancing))
 9897		task_tick_numa(rq, curr);
 9898}
 9899
 9900/*
 9901 * called on fork with the child task as argument from the parent's context
 9902 *  - child not yet on the tasklist
 9903 *  - preemption disabled
 9904 */
 9905static void task_fork_fair(struct task_struct *p)
 9906{
 9907	struct cfs_rq *cfs_rq;
 9908	struct sched_entity *se = &p->se, *curr;
 9909	struct rq *rq = this_rq();
 9910	struct rq_flags rf;
 9911
 9912	rq_lock(rq, &rf);
 9913	update_rq_clock(rq);
 9914
 9915	cfs_rq = task_cfs_rq(current);
 9916	curr = cfs_rq->curr;
 9917	if (curr) {
 9918		update_curr(cfs_rq);
 9919		se->vruntime = curr->vruntime;
 9920	}
 9921	place_entity(cfs_rq, se, 1);
 9922
 9923	if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
 9924		/*
 9925		 * Upon rescheduling, sched_class::put_prev_task() will place
 9926		 * 'current' within the tree based on its new key value.
 9927		 */
 9928		swap(curr->vruntime, se->vruntime);
 9929		resched_curr(rq);
 9930	}
 9931
 9932	se->vruntime -= cfs_rq->min_vruntime;
 9933	rq_unlock(rq, &rf);
 9934}
 9935
 9936/*
 9937 * Priority of the task has changed. Check to see if we preempt
 9938 * the current task.
 9939 */
 9940static void
 9941prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
 9942{
 9943	if (!task_on_rq_queued(p))
 9944		return;
 9945
 9946	/*
 9947	 * Reschedule if we are currently running on this runqueue and
 9948	 * our priority decreased, or if we are not currently running on
 9949	 * this runqueue and our priority is higher than the current's
 9950	 */
 9951	if (rq->curr == p) {
 9952		if (p->prio > oldprio)
 9953			resched_curr(rq);
 9954	} else
 9955		check_preempt_curr(rq, p, 0);
 9956}
 9957
 9958static inline bool vruntime_normalized(struct task_struct *p)
 9959{
 9960	struct sched_entity *se = &p->se;
 9961
 9962	/*
 9963	 * In both the TASK_ON_RQ_QUEUED and TASK_ON_RQ_MIGRATING cases,
 9964	 * the dequeue_entity(.flags=0) will already have normalized the
 9965	 * vruntime.
 9966	 */
 9967	if (p->on_rq)
 9968		return true;
 9969
 9970	/*
 9971	 * When !on_rq, vruntime of the task has usually NOT been normalized.
 9972	 * But there are some cases where it has already been normalized:
 9973	 *
 9974	 * - A forked child which is waiting for being woken up by
 9975	 *   wake_up_new_task().
 9976	 * - A task which has been woken up by try_to_wake_up() and
 9977	 *   waiting for actually being woken up by sched_ttwu_pending().
 9978	 */
 9979	if (!se->sum_exec_runtime || p->state == TASK_WAKING)
 9980		return true;
 9981
 9982	return false;
 9983}
 9984
 9985#ifdef CONFIG_FAIR_GROUP_SCHED
 9986/*
 9987 * Propagate the changes of the sched_entity across the tg tree to make it
 9988 * visible to the root
 9989 */
 9990static void propagate_entity_cfs_rq(struct sched_entity *se)
 9991{
 9992	struct cfs_rq *cfs_rq;
 9993
 9994	/* Start to propagate at parent */
 9995	se = se->parent;
 9996
 9997	for_each_sched_entity(se) {
 9998		cfs_rq = cfs_rq_of(se);
 9999
10000		if (cfs_rq_throttled(cfs_rq))
10001			break;
10002
10003		update_load_avg(cfs_rq, se, UPDATE_TG);
10004	}
10005}
10006#else
10007static void propagate_entity_cfs_rq(struct sched_entity *se) { }
10008#endif
10009
10010static void detach_entity_cfs_rq(struct sched_entity *se)
10011{
10012	struct cfs_rq *cfs_rq = cfs_rq_of(se);
10013
10014	/* Catch up with the cfs_rq and remove our load when we leave */
10015	update_load_avg(cfs_rq, se, 0);
10016	detach_entity_load_avg(cfs_rq, se);
10017	update_tg_load_avg(cfs_rq, false);
10018	propagate_entity_cfs_rq(se);
10019}
10020
10021static void attach_entity_cfs_rq(struct sched_entity *se)
10022{
10023	struct cfs_rq *cfs_rq = cfs_rq_of(se);
10024
10025#ifdef CONFIG_FAIR_GROUP_SCHED
10026	/*
10027	 * Since the real-depth could have been changed (only FAIR
10028	 * class maintain depth value), reset depth properly.
10029	 */
10030	se->depth = se->parent ? se->parent->depth + 1 : 0;
10031#endif
10032
10033	/* Synchronize entity with its cfs_rq */
10034	update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
10035	attach_entity_load_avg(cfs_rq, se, 0);
10036	update_tg_load_avg(cfs_rq, false);
10037	propagate_entity_cfs_rq(se);
10038}
10039
10040static void detach_task_cfs_rq(struct task_struct *p)
10041{
10042	struct sched_entity *se = &p->se;
10043	struct cfs_rq *cfs_rq = cfs_rq_of(se);
10044
10045	if (!vruntime_normalized(p)) {
10046		/*
10047		 * Fix up our vruntime so that the current sleep doesn't
10048		 * cause 'unlimited' sleep bonus.
10049		 */
10050		place_entity(cfs_rq, se, 0);
10051		se->vruntime -= cfs_rq->min_vruntime;
10052	}
10053
10054	detach_entity_cfs_rq(se);
10055}
10056
10057static void attach_task_cfs_rq(struct task_struct *p)
10058{
10059	struct sched_entity *se = &p->se;
10060	struct cfs_rq *cfs_rq = cfs_rq_of(se);
10061
10062	attach_entity_cfs_rq(se);
10063
10064	if (!vruntime_normalized(p))
10065		se->vruntime += cfs_rq->min_vruntime;
10066}
10067
10068static void switched_from_fair(struct rq *rq, struct task_struct *p)
10069{
10070	detach_task_cfs_rq(p);
10071}
10072
10073static void switched_to_fair(struct rq *rq, struct task_struct *p)
10074{
10075	attach_task_cfs_rq(p);
10076
10077	if (task_on_rq_queued(p)) {
10078		/*
10079		 * We were most likely switched from sched_rt, so
10080		 * kick off the schedule if running, otherwise just see
10081		 * if we can still preempt the current task.
10082		 */
10083		if (rq->curr == p)
10084			resched_curr(rq);
10085		else
10086			check_preempt_curr(rq, p, 0);
10087	}
10088}
10089
10090/* Account for a task changing its policy or group.
10091 *
10092 * This routine is mostly called to set cfs_rq->curr field when a task
10093 * migrates between groups/classes.
10094 */
10095static void set_curr_task_fair(struct rq *rq)
10096{
10097	struct sched_entity *se = &rq->curr->se;
10098
10099	for_each_sched_entity(se) {
10100		struct cfs_rq *cfs_rq = cfs_rq_of(se);
10101
10102		set_next_entity(cfs_rq, se);
10103		/* ensure bandwidth has been allocated on our new cfs_rq */
10104		account_cfs_rq_runtime(cfs_rq, 0);
10105	}
10106}
10107
10108void init_cfs_rq(struct cfs_rq *cfs_rq)
10109{
10110	cfs_rq->tasks_timeline = RB_ROOT_CACHED;
10111	cfs_rq->min_vruntime = (u64)(-(1LL << 20));
10112#ifndef CONFIG_64BIT
10113	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
10114#endif
10115#ifdef CONFIG_SMP
10116	raw_spin_lock_init(&cfs_rq->removed.lock);
10117#endif
10118}
10119
10120#ifdef CONFIG_FAIR_GROUP_SCHED
10121static void task_set_group_fair(struct task_struct *p)
10122{
10123	struct sched_entity *se = &p->se;
10124
10125	set_task_rq(p, task_cpu(p));
10126	se->depth = se->parent ? se->parent->depth + 1 : 0;
10127}
10128
10129static void task_move_group_fair(struct task_struct *p)
10130{
10131	detach_task_cfs_rq(p);
10132	set_task_rq(p, task_cpu(p));
10133
10134#ifdef CONFIG_SMP
10135	/* Tell se's cfs_rq has been changed -- migrated */
10136	p->se.avg.last_update_time = 0;
10137#endif
10138	attach_task_cfs_rq(p);
10139}
10140
10141static void task_change_group_fair(struct task_struct *p, int type)
10142{
10143	switch (type) {
10144	case TASK_SET_GROUP:
10145		task_set_group_fair(p);
10146		break;
10147
10148	case TASK_MOVE_GROUP:
10149		task_move_group_fair(p);
10150		break;
10151	}
10152}
10153
10154void free_fair_sched_group(struct task_group *tg)
10155{
10156	int i;
10157
10158	destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
10159
10160	for_each_possible_cpu(i) {
10161		if (tg->cfs_rq)
10162			kfree(tg->cfs_rq[i]);
10163		if (tg->se)
10164			kfree(tg->se[i]);
10165	}
10166
10167	kfree(tg->cfs_rq);
10168	kfree(tg->se);
10169}
10170
10171int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
10172{
10173	struct sched_entity *se;
10174	struct cfs_rq *cfs_rq;
10175	int i;
10176
10177	tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
10178	if (!tg->cfs_rq)
10179		goto err;
10180	tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
10181	if (!tg->se)
10182		goto err;
10183
10184	tg->shares = NICE_0_LOAD;
10185
10186	init_cfs_bandwidth(tg_cfs_bandwidth(tg));
10187
10188	for_each_possible_cpu(i) {
10189		cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
10190				      GFP_KERNEL, cpu_to_node(i));
10191		if (!cfs_rq)
10192			goto err;
10193
10194		se = kzalloc_node(sizeof(struct sched_entity),
10195				  GFP_KERNEL, cpu_to_node(i));
10196		if (!se)
10197			goto err_free_rq;
10198
10199		init_cfs_rq(cfs_rq);
10200		init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
10201		init_entity_runnable_average(se);
10202	}
10203
10204	return 1;
10205
10206err_free_rq:
10207	kfree(cfs_rq);
10208err:
10209	return 0;
10210}
10211
10212void online_fair_sched_group(struct task_group *tg)
10213{
10214	struct sched_entity *se;
10215	struct rq *rq;
10216	int i;
10217
10218	for_each_possible_cpu(i) {
10219		rq = cpu_rq(i);
10220		se = tg->se[i];
10221
10222		raw_spin_lock_irq(&rq->lock);
10223		update_rq_clock(rq);
10224		attach_entity_cfs_rq(se);
10225		sync_throttle(tg, i);
10226		raw_spin_unlock_irq(&rq->lock);
10227	}
10228}
10229
10230void unregister_fair_sched_group(struct task_group *tg)
10231{
10232	unsigned long flags;
10233	struct rq *rq;
10234	int cpu;
10235
10236	for_each_possible_cpu(cpu) {
10237		if (tg->se[cpu])
10238			remove_entity_load_avg(tg->se[cpu]);
10239
10240		/*
10241		 * Only empty task groups can be destroyed; so we can speculatively
10242		 * check on_list without danger of it being re-added.
10243		 */
10244		if (!tg->cfs_rq[cpu]->on_list)
10245			continue;
10246
10247		rq = cpu_rq(cpu);
10248
10249		raw_spin_lock_irqsave(&rq->lock, flags);
10250		list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
10251		raw_spin_unlock_irqrestore(&rq->lock, flags);
10252	}
10253}
10254
10255void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
10256			struct sched_entity *se, int cpu,
10257			struct sched_entity *parent)
10258{
10259	struct rq *rq = cpu_rq(cpu);
10260
10261	cfs_rq->tg = tg;
10262	cfs_rq->rq = rq;
10263	init_cfs_rq_runtime(cfs_rq);
10264
10265	tg->cfs_rq[cpu] = cfs_rq;
10266	tg->se[cpu] = se;
10267
10268	/* se could be NULL for root_task_group */
10269	if (!se)
10270		return;
10271
10272	if (!parent) {
10273		se->cfs_rq = &rq->cfs;
10274		se->depth = 0;
10275	} else {
10276		se->cfs_rq = parent->my_q;
10277		se->depth = parent->depth + 1;
10278	}
10279
10280	se->my_q = cfs_rq;
10281	/* guarantee group entities always have weight */
10282	update_load_set(&se->load, NICE_0_LOAD);
10283	se->parent = parent;
10284}
10285
10286static DEFINE_MUTEX(shares_mutex);
10287
10288int sched_group_set_shares(struct task_group *tg, unsigned long shares)
10289{
10290	int i;
10291
10292	/*
10293	 * We can't change the weight of the root cgroup.
10294	 */
10295	if (!tg->se[0])
10296		return -EINVAL;
10297
10298	shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
10299
10300	mutex_lock(&shares_mutex);
10301	if (tg->shares == shares)
10302		goto done;
10303
10304	tg->shares = shares;
10305	for_each_possible_cpu(i) {
10306		struct rq *rq = cpu_rq(i);
10307		struct sched_entity *se = tg->se[i];
10308		struct rq_flags rf;
10309
10310		/* Propagate contribution to hierarchy */
10311		rq_lock_irqsave(rq, &rf);
10312		update_rq_clock(rq);
10313		for_each_sched_entity(se) {
10314			update_load_avg(cfs_rq_of(se), se, UPDATE_TG);
10315			update_cfs_group(se);
10316		}
10317		rq_unlock_irqrestore(rq, &rf);
10318	}
10319
10320done:
10321	mutex_unlock(&shares_mutex);
10322	return 0;
10323}
10324#else /* CONFIG_FAIR_GROUP_SCHED */
10325
10326void free_fair_sched_group(struct task_group *tg) { }
10327
10328int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
10329{
10330	return 1;
10331}
10332
10333void online_fair_sched_group(struct task_group *tg) { }
10334
10335void unregister_fair_sched_group(struct task_group *tg) { }
10336
10337#endif /* CONFIG_FAIR_GROUP_SCHED */
10338
10339
10340static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
10341{
10342	struct sched_entity *se = &task->se;
10343	unsigned int rr_interval = 0;
10344
10345	/*
10346	 * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
10347	 * idle runqueue:
10348	 */
10349	if (rq->cfs.load.weight)
10350		rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));
10351
10352	return rr_interval;
10353}
10354
10355/*
10356 * All the scheduling class methods:
10357 */
10358const struct sched_class fair_sched_class = {
10359	.next			= &idle_sched_class,
10360	.enqueue_task		= enqueue_task_fair,
10361	.dequeue_task		= dequeue_task_fair,
10362	.yield_task		= yield_task_fair,
10363	.yield_to_task		= yield_to_task_fair,
10364
10365	.check_preempt_curr	= check_preempt_wakeup,
10366
10367	.pick_next_task		= pick_next_task_fair,
10368	.put_prev_task		= put_prev_task_fair,
10369
10370#ifdef CONFIG_SMP
10371	.select_task_rq		= select_task_rq_fair,
10372	.migrate_task_rq	= migrate_task_rq_fair,
10373
10374	.rq_online		= rq_online_fair,
10375	.rq_offline		= rq_offline_fair,
10376
10377	.task_dead		= task_dead_fair,
10378	.set_cpus_allowed	= set_cpus_allowed_common,
10379#endif
10380
10381	.set_curr_task          = set_curr_task_fair,
10382	.task_tick		= task_tick_fair,
10383	.task_fork		= task_fork_fair,
10384
10385	.prio_changed		= prio_changed_fair,
10386	.switched_from		= switched_from_fair,
10387	.switched_to		= switched_to_fair,
10388
10389	.get_rr_interval	= get_rr_interval_fair,
10390
10391	.update_curr		= update_curr_fair,
10392
10393#ifdef CONFIG_FAIR_GROUP_SCHED
10394	.task_change_group	= task_change_group_fair,
10395#endif
10396};
10397
10398#ifdef CONFIG_SCHED_DEBUG
10399void print_cfs_stats(struct seq_file *m, int cpu)
10400{
10401	struct cfs_rq *cfs_rq, *pos;
10402
10403	rcu_read_lock();
10404	for_each_leaf_cfs_rq_safe(cpu_rq(cpu), cfs_rq, pos)
10405		print_cfs_rq(m, cpu, cfs_rq);
10406	rcu_read_unlock();
10407}
10408
10409#ifdef CONFIG_NUMA_BALANCING
10410void show_numa_stats(struct task_struct *p, struct seq_file *m)
10411{
10412	int node;
10413	unsigned long tsf = 0, tpf = 0, gsf = 0, gpf = 0;
10414
10415	for_each_online_node(node) {
10416		if (p->numa_faults) {
10417			tsf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 0)];
10418			tpf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 1)];
10419		}
10420		if (p->numa_group) {
10421			gsf = p->numa_group->faults[task_faults_idx(NUMA_MEM, node, 0)],
10422			gpf = p->numa_group->faults[task_faults_idx(NUMA_MEM, node, 1)];
10423		}
10424		print_numa_stats(m, node, tsf, tpf, gsf, gpf);
10425	}
10426}
10427#endif /* CONFIG_NUMA_BALANCING */
10428#endif /* CONFIG_SCHED_DEBUG */
10429
10430__init void init_sched_fair_class(void)
10431{
10432#ifdef CONFIG_SMP
10433	open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
10434
10435#ifdef CONFIG_NO_HZ_COMMON
10436	nohz.next_balance = jiffies;
10437	nohz.next_blocked = jiffies;
10438	zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
10439#endif
10440#endif /* SMP */
10441
10442}