fair.c - kernel/sched/fair.c - Linux diff v5.4 - Bootlin Elixir Cross Referencer

    1// SPDX-License-Identifier: GPL-2.0
    2/*
    3 * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
    4 *
    5 *  Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
    6 *
    7 *  Interactivity improvements by Mike Galbraith
    8 *  (C) 2007 Mike Galbraith <efault@gmx.de>
    9 *
   10 *  Various enhancements by Dmitry Adamushko.
   11 *  (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
   12 *
   13 *  Group scheduling enhancements by Srivatsa Vaddagiri
   14 *  Copyright IBM Corporation, 2007
   15 *  Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
   16 *
   17 *  Scaled math optimizations by Thomas Gleixner
   18 *  Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
   19 *
   20 *  Adaptive scheduling granularity, math enhancements by Peter Zijlstra
   21 *  Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
   22 */
   23#include "sched.h"
 
 
 
 
 
 
   24
   25#include <trace/events/sched.h>
   26
 
 
   27/*
   28 * Targeted preemption latency for CPU-bound tasks:
 
   29 *
   30 * NOTE: this latency value is not the same as the concept of
   31 * 'timeslice length' - timeslices in CFS are of variable length
   32 * and have no persistent notion like in traditional, time-slice
   33 * based scheduling concepts.
   34 *
   35 * (to see the precise effective timeslice length of your workload,
   36 *  run vmstat and monitor the context-switches (cs) field)
   37 *
   38 * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
   39 */
   40unsigned int sysctl_sched_latency			= 6000000ULL;
   41static unsigned int normalized_sysctl_sched_latency	= 6000000ULL;
   42
   43/*
   44 * The initial- and re-scaling of tunables is configurable
 
   45 *
   46 * Options are:
   47 *
   48 *   SCHED_TUNABLESCALING_NONE - unscaled, always *1
   49 *   SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus)
   50 *   SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus
   51 *
   52 * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))
   53 */
   54enum sched_tunable_scaling sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG;
 
   55
   56/*
   57 * Minimal preemption granularity for CPU-bound tasks:
   58 *
   59 * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
   60 */
   61unsigned int sysctl_sched_min_granularity			= 750000ULL;
   62static unsigned int normalized_sysctl_sched_min_granularity	= 750000ULL;
   63
   64/*
   65 * This value is kept at sysctl_sched_latency/sysctl_sched_min_granularity
   66 */
   67static unsigned int sched_nr_latency = 8;
   68
   69/*
   70 * After fork, child runs first. If set to 0 (default) then
   71 * parent will (try to) run first.
   72 */
   73unsigned int sysctl_sched_child_runs_first __read_mostly;
   74
   75/*
   76 * SCHED_OTHER wake-up granularity.
 
   77 *
   78 * This option delays the preemption effects of decoupled workloads
   79 * and reduces their over-scheduling. Synchronous workloads will still
   80 * have immediate wakeup/sleep latencies.
   81 *
   82 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
   83 */
   84unsigned int sysctl_sched_wakeup_granularity			= 1000000UL;
   85static unsigned int normalized_sysctl_sched_wakeup_granularity	= 1000000UL;
   86
   87const_debug unsigned int sysctl_sched_migration_cost	= 500000UL;
   88
   89#ifdef CONFIG_SMP
   90/*
   91 * For asym packing, by default the lower numbered CPU has higher priority.
 
 
   92 */
   93int __weak arch_asym_cpu_priority(int cpu)
   94{
   95	return -cpu;
   96}
   97
   98/*
   99 * The margin used when comparing utilization with CPU capacity.
  100 *
  101 * (default: ~20%)
  102 */
  103#define fits_capacity(cap, max)	((cap) * 1280 < (max) * 1024)
  104
  105#endif
  106
  107#ifdef CONFIG_CFS_BANDWIDTH
  108/*
  109 * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
  110 * each time a cfs_rq requests quota.
  111 *
  112 * Note: in the case that the slice exceeds the runtime remaining (either due
  113 * to consumption or the quota being specified to be smaller than the slice)
  114 * we will always only issue the remaining available time.
  115 *
  116 * (default: 5 msec, units: microseconds)
  117 */
  118unsigned int sysctl_sched_cfs_bandwidth_slice		= 5000UL;
  119#endif
  120
  121static inline void update_load_add(struct load_weight *lw, unsigned long inc)
  122{
  123	lw->weight += inc;
  124	lw->inv_weight = 0;
  125}
  126
  127static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
  128{
  129	lw->weight -= dec;
  130	lw->inv_weight = 0;
  131}
  132
  133static inline void update_load_set(struct load_weight *lw, unsigned long w)
  134{
  135	lw->weight = w;
  136	lw->inv_weight = 0;
  137}
  138
  139/*
  140 * Increase the granularity value when there are more CPUs,
  141 * because with more CPUs the 'effective latency' as visible
  142 * to users decreases. But the relationship is not linear,
  143 * so pick a second-best guess by going with the log2 of the
  144 * number of CPUs.
  145 *
  146 * This idea comes from the SD scheduler of Con Kolivas:
  147 */
  148static unsigned int get_update_sysctl_factor(void)
  149{
  150	unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8);
  151	unsigned int factor;
  152
  153	switch (sysctl_sched_tunable_scaling) {
  154	case SCHED_TUNABLESCALING_NONE:
  155		factor = 1;
  156		break;
  157	case SCHED_TUNABLESCALING_LINEAR:
  158		factor = cpus;
  159		break;
  160	case SCHED_TUNABLESCALING_LOG:
  161	default:
  162		factor = 1 + ilog2(cpus);
  163		break;
  164	}
  165
  166	return factor;
  167}
  168
  169static void update_sysctl(void)
  170{
  171	unsigned int factor = get_update_sysctl_factor();
  172
  173#define SET_SYSCTL(name) \
  174	(sysctl_##name = (factor) * normalized_sysctl_##name)
  175	SET_SYSCTL(sched_min_granularity);
  176	SET_SYSCTL(sched_latency);
  177	SET_SYSCTL(sched_wakeup_granularity);
  178#undef SET_SYSCTL
  179}
  180
  181void sched_init_granularity(void)
  182{
  183	update_sysctl();
  184}
  185
  186#define WMULT_CONST	(~0U)
  187#define WMULT_SHIFT	32
  188
  189static void __update_inv_weight(struct load_weight *lw)
  190{
  191	unsigned long w;
  192
  193	if (likely(lw->inv_weight))
  194		return;
  195
  196	w = scale_load_down(lw->weight);
  197
  198	if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
  199		lw->inv_weight = 1;
  200	else if (unlikely(!w))
  201		lw->inv_weight = WMULT_CONST;
  202	else
  203		lw->inv_weight = WMULT_CONST / w;
  204}
  205
  206/*
  207 * delta_exec * weight / lw.weight
  208 *   OR
  209 * (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT
  210 *
  211 * Either weight := NICE_0_LOAD and lw \e sched_prio_to_wmult[], in which case
  212 * we're guaranteed shift stays positive because inv_weight is guaranteed to
  213 * fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22.
  214 *
  215 * Or, weight =< lw.weight (because lw.weight is the runqueue weight), thus
  216 * weight/lw.weight <= 1, and therefore our shift will also be positive.
  217 */
  218static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
 
 
  219{
  220	u64 fact = scale_load_down(weight);
  221	int shift = WMULT_SHIFT;
  222
  223	__update_inv_weight(lw);
  224
  225	if (unlikely(fact >> 32)) {
  226		while (fact >> 32) {
  227			fact >>= 1;
  228			shift--;
  229		}
  230	}
 
  231
  232	/* hint to use a 32x32->64 mul */
  233	fact = (u64)(u32)fact * lw->inv_weight;
  234
  235	while (fact >> 32) {
  236		fact >>= 1;
  237		shift--;
 
 
 
  238	}
  239
  240	return mul_u64_u32_shr(delta_exec, fact, shift);
 
 
 
 
 
 
 
 
 
  241}
  242
  243
  244const struct sched_class fair_sched_class;
  245
  246/**************************************************************
  247 * CFS operations on generic schedulable entities:
  248 */
  249
  250#ifdef CONFIG_FAIR_GROUP_SCHED
 
 
 
 
 
 
 
 
 
 
  251static inline struct task_struct *task_of(struct sched_entity *se)
  252{
  253	SCHED_WARN_ON(!entity_is_task(se));
 
 
  254	return container_of(se, struct task_struct, se);
  255}
  256
  257/* Walk up scheduling entities hierarchy */
  258#define for_each_sched_entity(se) \
  259		for (; se; se = se->parent)
  260
  261static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
  262{
  263	return p->se.cfs_rq;
  264}
  265
  266/* runqueue on which this entity is (to be) queued */
  267static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
  268{
  269	return se->cfs_rq;
  270}
  271
  272/* runqueue "owned" by this group */
  273static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
  274{
  275	return grp->my_q;
  276}
  277
  278static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len)
  279{
  280	if (!path)
  281		return;
  282
  283	if (cfs_rq && task_group_is_autogroup(cfs_rq->tg))
  284		autogroup_path(cfs_rq->tg, path, len);
  285	else if (cfs_rq && cfs_rq->tg->css.cgroup)
  286		cgroup_path(cfs_rq->tg->css.cgroup, path, len);
  287	else
  288		strlcpy(path, "(null)", len);
  289}
  290
  291static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
  292{
  293	struct rq *rq = rq_of(cfs_rq);
  294	int cpu = cpu_of(rq);
  295
  296	if (cfs_rq->on_list)
  297		return rq->tmp_alone_branch == &rq->leaf_cfs_rq_list;
  298
  299	cfs_rq->on_list = 1;
  300
  301	/*
  302	 * Ensure we either appear before our parent (if already
  303	 * enqueued) or force our parent to appear after us when it is
  304	 * enqueued. The fact that we always enqueue bottom-up
  305	 * reduces this to two cases and a special case for the root
  306	 * cfs_rq. Furthermore, it also means that we will always reset
  307	 * tmp_alone_branch either when the branch is connected
  308	 * to a tree or when we reach the top of the tree
  309	 */
  310	if (cfs_rq->tg->parent &&
  311	    cfs_rq->tg->parent->cfs_rq[cpu]->on_list) {
  312		/*
  313		 * If parent is already on the list, we add the child
  314		 * just before. Thanks to circular linked property of
  315		 * the list, this means to put the child at the tail
  316		 * of the list that starts by parent.
  317		 */
  318		list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
  319			&(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list));
  320		/*
  321		 * The branch is now connected to its tree so we can
  322		 * reset tmp_alone_branch to the beginning of the
  323		 * list.
  324		 */
  325		rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
  326		return true;
  327	}
  328
  329	if (!cfs_rq->tg->parent) {
  330		/*
  331		 * cfs rq without parent should be put
  332		 * at the tail of the list.
  333		 */
  334		list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
  335			&rq->leaf_cfs_rq_list);
  336		/*
  337		 * We have reach the top of a tree so we can reset
  338		 * tmp_alone_branch to the beginning of the list.
  339		 */
  340		rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
  341		return true;
  342	}
  343
  344	/*
  345	 * The parent has not already been added so we want to
  346	 * make sure that it will be put after us.
  347	 * tmp_alone_branch points to the begin of the branch
  348	 * where we will add parent.
  349	 */
  350	list_add_rcu(&cfs_rq->leaf_cfs_rq_list, rq->tmp_alone_branch);
  351	/*
  352	 * update tmp_alone_branch to points to the new begin
  353	 * of the branch
  354	 */
  355	rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list;
  356	return false;
  357}
  358
  359static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
  360{
  361	if (cfs_rq->on_list) {
  362		struct rq *rq = rq_of(cfs_rq);
  363
  364		/*
  365		 * With cfs_rq being unthrottled/throttled during an enqueue,
  366		 * it can happen the tmp_alone_branch points the a leaf that
  367		 * we finally want to del. In this case, tmp_alone_branch moves
  368		 * to the prev element but it will point to rq->leaf_cfs_rq_list
  369		 * at the end of the enqueue.
  370		 */
  371		if (rq->tmp_alone_branch == &cfs_rq->leaf_cfs_rq_list)
  372			rq->tmp_alone_branch = cfs_rq->leaf_cfs_rq_list.prev;
  373
  374		list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
  375		cfs_rq->on_list = 0;
  376	}
  377}
  378
  379static inline void assert_list_leaf_cfs_rq(struct rq *rq)
  380{
  381	SCHED_WARN_ON(rq->tmp_alone_branch != &rq->leaf_cfs_rq_list);
  382}
  383
  384/* Iterate thr' all leaf cfs_rq's on a runqueue */
  385#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos)			\
  386	list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list,	\
  387				 leaf_cfs_rq_list)
  388
  389/* Do the two (enqueued) entities belong to the same group ? */
  390static inline struct cfs_rq *
  391is_same_group(struct sched_entity *se, struct sched_entity *pse)
  392{
  393	if (se->cfs_rq == pse->cfs_rq)
  394		return se->cfs_rq;
  395
  396	return NULL;
  397}
  398
  399static inline struct sched_entity *parent_entity(struct sched_entity *se)
  400{
  401	return se->parent;
  402}
  403
 
 
 
 
 
 
 
 
 
 
 
  404static void
  405find_matching_se(struct sched_entity **se, struct sched_entity **pse)
  406{
  407	int se_depth, pse_depth;
  408
  409	/*
  410	 * preemption test can be made between sibling entities who are in the
  411	 * same cfs_rq i.e who have a common parent. Walk up the hierarchy of
  412	 * both tasks until we find their ancestors who are siblings of common
  413	 * parent.
  414	 */
  415
  416	/* First walk up until both entities are at same depth */
  417	se_depth = (*se)->depth;
  418	pse_depth = (*pse)->depth;
  419
  420	while (se_depth > pse_depth) {
  421		se_depth--;
  422		*se = parent_entity(*se);
  423	}
  424
  425	while (pse_depth > se_depth) {
  426		pse_depth--;
  427		*pse = parent_entity(*pse);
  428	}
  429
  430	while (!is_same_group(*se, *pse)) {
  431		*se = parent_entity(*se);
  432		*pse = parent_entity(*pse);
  433	}
  434}
  435
  436#else	/* !CONFIG_FAIR_GROUP_SCHED */
  437
  438static inline struct task_struct *task_of(struct sched_entity *se)
  439{
  440	return container_of(se, struct task_struct, se);
  441}
  442
 
 
 
 
 
 
 
  443#define for_each_sched_entity(se) \
  444		for (; se; se = NULL)
  445
  446static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
  447{
  448	return &task_rq(p)->cfs;
  449}
  450
  451static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
  452{
  453	struct task_struct *p = task_of(se);
  454	struct rq *rq = task_rq(p);
  455
  456	return &rq->cfs;
  457}
  458
  459/* runqueue "owned" by this group */
  460static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
  461{
  462	return NULL;
  463}
  464
  465static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len)
  466{
  467	if (path)
  468		strlcpy(path, "(null)", len);
  469}
  470
  471static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
  472{
  473	return true;
  474}
  475
  476static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
  477{
  478}
  479
  480static inline void assert_list_leaf_cfs_rq(struct rq *rq)
 
 
 
 
  481{
 
  482}
  483
  484#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos)	\
  485		for (cfs_rq = &rq->cfs, pos = NULL; cfs_rq; cfs_rq = pos)
  486
  487static inline struct sched_entity *parent_entity(struct sched_entity *se)
  488{
  489	return NULL;
  490}
  491
  492static inline void
  493find_matching_se(struct sched_entity **se, struct sched_entity **pse)
  494{
  495}
  496
  497#endif	/* CONFIG_FAIR_GROUP_SCHED */
  498
  499static __always_inline
  500void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
  501
  502/**************************************************************
  503 * Scheduling class tree data structure manipulation methods:
  504 */
  505
  506static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
  507{
  508	s64 delta = (s64)(vruntime - max_vruntime);
  509	if (delta > 0)
  510		max_vruntime = vruntime;
  511
  512	return max_vruntime;
  513}
  514
  515static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
  516{
  517	s64 delta = (s64)(vruntime - min_vruntime);
  518	if (delta < 0)
  519		min_vruntime = vruntime;
  520
  521	return min_vruntime;
  522}
  523
  524static inline int entity_before(struct sched_entity *a,
  525				struct sched_entity *b)
  526{
  527	return (s64)(a->vruntime - b->vruntime) < 0;
  528}
  529
  530static void update_min_vruntime(struct cfs_rq *cfs_rq)
  531{
  532	struct sched_entity *curr = cfs_rq->curr;
  533	struct rb_node *leftmost = rb_first_cached(&cfs_rq->tasks_timeline);
  534
  535	u64 vruntime = cfs_rq->min_vruntime;
  536
  537	if (curr) {
  538		if (curr->on_rq)
  539			vruntime = curr->vruntime;
  540		else
  541			curr = NULL;
  542	}
  543
  544	if (leftmost) { /* non-empty tree */
  545		struct sched_entity *se;
  546		se = rb_entry(leftmost, struct sched_entity, run_node);
 
  547
  548		if (!curr)
  549			vruntime = se->vruntime;
  550		else
  551			vruntime = min_vruntime(vruntime, se->vruntime);
  552	}
  553
  554	/* ensure we never gain time by being placed backwards. */
  555	cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
  556#ifndef CONFIG_64BIT
  557	smp_wmb();
  558	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
  559#endif
  560}
  561
  562/*
  563 * Enqueue an entity into the rb-tree:
  564 */
  565static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
  566{
  567	struct rb_node **link = &cfs_rq->tasks_timeline.rb_root.rb_node;
  568	struct rb_node *parent = NULL;
  569	struct sched_entity *entry;
  570	bool leftmost = true;
  571
  572	/*
  573	 * Find the right place in the rbtree:
  574	 */
  575	while (*link) {
  576		parent = *link;
  577		entry = rb_entry(parent, struct sched_entity, run_node);
  578		/*
  579		 * We dont care about collisions. Nodes with
  580		 * the same key stay together.
  581		 */
  582		if (entity_before(se, entry)) {
  583			link = &parent->rb_left;
  584		} else {
  585			link = &parent->rb_right;
  586			leftmost = false;
  587		}
  588	}
  589
 
 
 
 
 
 
 
  590	rb_link_node(&se->run_node, parent, link);
  591	rb_insert_color_cached(&se->run_node,
  592			       &cfs_rq->tasks_timeline, leftmost);
  593}
  594
  595static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
  596{
  597	rb_erase_cached(&se->run_node, &cfs_rq->tasks_timeline);
 
 
 
 
 
 
 
  598}
  599
  600struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
  601{
  602	struct rb_node *left = rb_first_cached(&cfs_rq->tasks_timeline);
  603
  604	if (!left)
  605		return NULL;
  606
  607	return rb_entry(left, struct sched_entity, run_node);
  608}
  609
  610static struct sched_entity *__pick_next_entity(struct sched_entity *se)
  611{
  612	struct rb_node *next = rb_next(&se->run_node);
  613
  614	if (!next)
  615		return NULL;
  616
  617	return rb_entry(next, struct sched_entity, run_node);
  618}
  619
  620#ifdef CONFIG_SCHED_DEBUG
  621struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
  622{
  623	struct rb_node *last = rb_last(&cfs_rq->tasks_timeline.rb_root);
  624
  625	if (!last)
  626		return NULL;
  627
  628	return rb_entry(last, struct sched_entity, run_node);
  629}
  630
  631/**************************************************************
  632 * Scheduling class statistics methods:
  633 */
  634
  635int sched_proc_update_handler(struct ctl_table *table, int write,
  636		void __user *buffer, size_t *lenp,
  637		loff_t *ppos)
  638{
  639	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
  640	unsigned int factor = get_update_sysctl_factor();
  641
  642	if (ret || !write)
  643		return ret;
  644
  645	sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
  646					sysctl_sched_min_granularity);
  647
  648#define WRT_SYSCTL(name) \
  649	(normalized_sysctl_##name = sysctl_##name / (factor))
  650	WRT_SYSCTL(sched_min_granularity);
  651	WRT_SYSCTL(sched_latency);
  652	WRT_SYSCTL(sched_wakeup_granularity);
  653#undef WRT_SYSCTL
  654
  655	return 0;
  656}
  657#endif
  658
  659/*
  660 * delta /= w
  661 */
  662static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
 
  663{
  664	if (unlikely(se->load.weight != NICE_0_LOAD))
  665		delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
  666
  667	return delta;
  668}
  669
  670/*
  671 * The idea is to set a period in which each task runs once.
  672 *
  673 * When there are too many tasks (sched_nr_latency) we have to stretch
  674 * this period because otherwise the slices get too small.
  675 *
  676 * p = (nr <= nl) ? l : l*nr/nl
  677 */
  678static u64 __sched_period(unsigned long nr_running)
  679{
  680	if (unlikely(nr_running > sched_nr_latency))
  681		return nr_running * sysctl_sched_min_granularity;
  682	else
  683		return sysctl_sched_latency;
 
 
 
 
 
  684}
  685
  686/*
  687 * We calculate the wall-time slice from the period by taking a part
  688 * proportional to the weight.
  689 *
  690 * s = p*P[w/rw]
  691 */
  692static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
  693{
  694	u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);
  695
  696	for_each_sched_entity(se) {
  697		struct load_weight *load;
  698		struct load_weight lw;
  699
  700		cfs_rq = cfs_rq_of(se);
  701		load = &cfs_rq->load;
  702
  703		if (unlikely(!se->on_rq)) {
  704			lw = cfs_rq->load;
  705
  706			update_load_add(&lw, se->load.weight);
  707			load = &lw;
  708		}
  709		slice = __calc_delta(slice, se->load.weight, load);
  710	}
  711	return slice;
  712}
  713
  714/*
  715 * We calculate the vruntime slice of a to-be-inserted task.
  716 *
  717 * vs = s/w
  718 */
  719static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
  720{
  721	return calc_delta_fair(sched_slice(cfs_rq, se), se);
  722}
  723
  724#include "pelt.h"
  725#ifdef CONFIG_SMP
  726
  727static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
  728static unsigned long task_h_load(struct task_struct *p);
  729static unsigned long capacity_of(int cpu);
  730
  731/* Give new sched_entity start runnable values to heavy its load in infant time */
  732void init_entity_runnable_average(struct sched_entity *se)
  733{
  734	struct sched_avg *sa = &se->avg;
  735
  736	memset(sa, 0, sizeof(*sa));
  737
  738	/*
  739	 * Tasks are initialized with full load to be seen as heavy tasks until
  740	 * they get a chance to stabilize to their real load level.
  741	 * Group entities are initialized with zero load to reflect the fact that
  742	 * nothing has been attached to the task group yet.
  743	 */
  744	if (entity_is_task(se))
  745		sa->runnable_load_avg = sa->load_avg = scale_load_down(se->load.weight);
  746
  747	se->runnable_weight = se->load.weight;
  748
  749	/* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
  750}
  751
  752static void attach_entity_cfs_rq(struct sched_entity *se);
  753
  754/*
  755 * With new tasks being created, their initial util_avgs are extrapolated
  756 * based on the cfs_rq's current util_avg:
  757 *
  758 *   util_avg = cfs_rq->util_avg / (cfs_rq->load_avg + 1) * se.load.weight
  759 *
  760 * However, in many cases, the above util_avg does not give a desired
  761 * value. Moreover, the sum of the util_avgs may be divergent, such
  762 * as when the series is a harmonic series.
  763 *
  764 * To solve this problem, we also cap the util_avg of successive tasks to
  765 * only 1/2 of the left utilization budget:
  766 *
  767 *   util_avg_cap = (cpu_scale - cfs_rq->avg.util_avg) / 2^n
  768 *
  769 * where n denotes the nth task and cpu_scale the CPU capacity.
  770 *
  771 * For example, for a CPU with 1024 of capacity, a simplest series from
  772 * the beginning would be like:
  773 *
  774 *  task  util_avg: 512, 256, 128,  64,  32,   16,    8, ...
  775 * cfs_rq util_avg: 512, 768, 896, 960, 992, 1008, 1016, ...
  776 *
  777 * Finally, that extrapolated util_avg is clamped to the cap (util_avg_cap)
  778 * if util_avg > util_avg_cap.
  779 */
  780void post_init_entity_util_avg(struct task_struct *p)
 
 
  781{
  782	struct sched_entity *se = &p->se;
  783	struct cfs_rq *cfs_rq = cfs_rq_of(se);
  784	struct sched_avg *sa = &se->avg;
  785	long cpu_scale = arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq)));
  786	long cap = (long)(cpu_scale - cfs_rq->avg.util_avg) / 2;
  787
  788	if (cap > 0) {
  789		if (cfs_rq->avg.util_avg != 0) {
  790			sa->util_avg  = cfs_rq->avg.util_avg * se->load.weight;
  791			sa->util_avg /= (cfs_rq->avg.load_avg + 1);
  792
  793			if (sa->util_avg > cap)
  794				sa->util_avg = cap;
  795		} else {
  796			sa->util_avg = cap;
  797		}
  798	}
  799
  800	if (p->sched_class != &fair_sched_class) {
  801		/*
  802		 * For !fair tasks do:
  803		 *
  804		update_cfs_rq_load_avg(now, cfs_rq);
  805		attach_entity_load_avg(cfs_rq, se, 0);
  806		switched_from_fair(rq, p);
  807		 *
  808		 * such that the next switched_to_fair() has the
  809		 * expected state.
  810		 */
  811		se->avg.last_update_time = cfs_rq_clock_pelt(cfs_rq);
  812		return;
  813	}
  814
  815	attach_entity_cfs_rq(se);
  816}
  817
  818#else /* !CONFIG_SMP */
  819void init_entity_runnable_average(struct sched_entity *se)
  820{
  821}
  822void post_init_entity_util_avg(struct task_struct *p)
  823{
  824}
  825static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
  826{
  827}
  828#endif /* CONFIG_SMP */
  829
  830/*
  831 * Update the current task's runtime statistics.
  832 */
  833static void update_curr(struct cfs_rq *cfs_rq)
  834{
  835	struct sched_entity *curr = cfs_rq->curr;
  836	u64 now = rq_clock_task(rq_of(cfs_rq));
  837	u64 delta_exec;
  838
  839	if (unlikely(!curr))
  840		return;
  841
  842	delta_exec = now - curr->exec_start;
  843	if (unlikely((s64)delta_exec <= 0))
 
 
 
 
 
  844		return;
  845
 
  846	curr->exec_start = now;
  847
  848	schedstat_set(curr->statistics.exec_max,
  849		      max(delta_exec, curr->statistics.exec_max));
  850
  851	curr->sum_exec_runtime += delta_exec;
  852	schedstat_add(cfs_rq->exec_clock, delta_exec);
  853
  854	curr->vruntime += calc_delta_fair(delta_exec, curr);
  855	update_min_vruntime(cfs_rq);
  856
  857	if (entity_is_task(curr)) {
  858		struct task_struct *curtask = task_of(curr);
  859
  860		trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
  861		cgroup_account_cputime(curtask, delta_exec);
  862		account_group_exec_runtime(curtask, delta_exec);
  863	}
  864
  865	account_cfs_rq_runtime(cfs_rq, delta_exec);
  866}
  867
  868static void update_curr_fair(struct rq *rq)
  869{
  870	update_curr(cfs_rq_of(&rq->curr->se));
  871}
  872
  873static inline void
  874update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
  875{
  876	u64 wait_start, prev_wait_start;
  877
  878	if (!schedstat_enabled())
  879		return;
  880
  881	wait_start = rq_clock(rq_of(cfs_rq));
  882	prev_wait_start = schedstat_val(se->statistics.wait_start);
  883
  884	if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) &&
  885	    likely(wait_start > prev_wait_start))
  886		wait_start -= prev_wait_start;
  887
  888	__schedstat_set(se->statistics.wait_start, wait_start);
  889}
  890
  891static inline void
  892update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
  893{
  894	struct task_struct *p;
  895	u64 delta;
  896
  897	if (!schedstat_enabled())
  898		return;
  899
  900	delta = rq_clock(rq_of(cfs_rq)) - schedstat_val(se->statistics.wait_start);
  901
  902	if (entity_is_task(se)) {
  903		p = task_of(se);
  904		if (task_on_rq_migrating(p)) {
  905			/*
  906			 * Preserve migrating task's wait time so wait_start
  907			 * time stamp can be adjusted to accumulate wait time
  908			 * prior to migration.
  909			 */
  910			__schedstat_set(se->statistics.wait_start, delta);
  911			return;
  912		}
  913		trace_sched_stat_wait(p, delta);
  914	}
  915
  916	__schedstat_set(se->statistics.wait_max,
  917		      max(schedstat_val(se->statistics.wait_max), delta));
  918	__schedstat_inc(se->statistics.wait_count);
  919	__schedstat_add(se->statistics.wait_sum, delta);
  920	__schedstat_set(se->statistics.wait_start, 0);
  921}
  922
  923static inline void
  924update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
  925{
  926	struct task_struct *tsk = NULL;
  927	u64 sleep_start, block_start;
  928
  929	if (!schedstat_enabled())
  930		return;
  931
  932	sleep_start = schedstat_val(se->statistics.sleep_start);
  933	block_start = schedstat_val(se->statistics.block_start);
  934
  935	if (entity_is_task(se))
  936		tsk = task_of(se);
  937
  938	if (sleep_start) {
  939		u64 delta = rq_clock(rq_of(cfs_rq)) - sleep_start;
  940
  941		if ((s64)delta < 0)
  942			delta = 0;
  943
  944		if (unlikely(delta > schedstat_val(se->statistics.sleep_max)))
  945			__schedstat_set(se->statistics.sleep_max, delta);
  946
  947		__schedstat_set(se->statistics.sleep_start, 0);
  948		__schedstat_add(se->statistics.sum_sleep_runtime, delta);
  949
  950		if (tsk) {
  951			account_scheduler_latency(tsk, delta >> 10, 1);
  952			trace_sched_stat_sleep(tsk, delta);
  953		}
  954	}
  955	if (block_start) {
  956		u64 delta = rq_clock(rq_of(cfs_rq)) - block_start;
  957
  958		if ((s64)delta < 0)
  959			delta = 0;
  960
  961		if (unlikely(delta > schedstat_val(se->statistics.block_max)))
  962			__schedstat_set(se->statistics.block_max, delta);
  963
  964		__schedstat_set(se->statistics.block_start, 0);
  965		__schedstat_add(se->statistics.sum_sleep_runtime, delta);
  966
  967		if (tsk) {
  968			if (tsk->in_iowait) {
  969				__schedstat_add(se->statistics.iowait_sum, delta);
  970				__schedstat_inc(se->statistics.iowait_count);
  971				trace_sched_stat_iowait(tsk, delta);
  972			}
  973
  974			trace_sched_stat_blocked(tsk, delta);
  975
  976			/*
  977			 * Blocking time is in units of nanosecs, so shift by
  978			 * 20 to get a milliseconds-range estimation of the
  979			 * amount of time that the task spent sleeping:
  980			 */
  981			if (unlikely(prof_on == SLEEP_PROFILING)) {
  982				profile_hits(SLEEP_PROFILING,
  983						(void *)get_wchan(tsk),
  984						delta >> 20);
  985			}
  986			account_scheduler_latency(tsk, delta >> 10, 0);
  987		}
  988	}
  989}
  990
  991/*
  992 * Task is being enqueued - update stats:
  993 */
  994static inline void
  995update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
  996{
  997	if (!schedstat_enabled())
  998		return;
  999
 1000	/*
 1001	 * Are we enqueueing a waiting task? (for current tasks
 1002	 * a dequeue/enqueue event is a NOP)
 1003	 */
 1004	if (se != cfs_rq->curr)
 1005		update_stats_wait_start(cfs_rq, se);
 
 1006
 1007	if (flags & ENQUEUE_WAKEUP)
 1008		update_stats_enqueue_sleeper(cfs_rq, se);
 
 
 
 
 
 
 
 
 
 
 
 
 
 1009}
 1010
 1011static inline void
 1012update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 1013{
 1014
 1015	if (!schedstat_enabled())
 1016		return;
 1017
 1018	/*
 1019	 * Mark the end of the wait period if dequeueing a
 1020	 * waiting task:
 1021	 */
 1022	if (se != cfs_rq->curr)
 1023		update_stats_wait_end(cfs_rq, se);
 1024
 1025	if ((flags & DEQUEUE_SLEEP) && entity_is_task(se)) {
 1026		struct task_struct *tsk = task_of(se);
 1027
 1028		if (tsk->state & TASK_INTERRUPTIBLE)
 1029			__schedstat_set(se->statistics.sleep_start,
 1030				      rq_clock(rq_of(cfs_rq)));
 1031		if (tsk->state & TASK_UNINTERRUPTIBLE)
 1032			__schedstat_set(se->statistics.block_start,
 1033				      rq_clock(rq_of(cfs_rq)));
 1034	}
 1035}
 1036
 1037/*
 1038 * We are picking a new current task - update its stats:
 1039 */
 1040static inline void
 1041update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
 1042{
 1043	/*
 1044	 * We are starting a new run period:
 1045	 */
 1046	se->exec_start = rq_clock_task(rq_of(cfs_rq));
 1047}
 1048
 1049/**************************************************
 1050 * Scheduling class queueing methods:
 1051 */
 1052
 1053#ifdef CONFIG_NUMA_BALANCING
 1054/*
 1055 * Bounds, in ms, on the approximate time to scan a full NUMA task. The
 1056 * per-task scan period is derived from these by dividing by the number of
 1057 * scan windows the task needs (see task_scan_min() and task_scan_max()).
 1058 */
 1059unsigned int sysctl_numa_balancing_scan_period_min = 1000;
 1060unsigned int sysctl_numa_balancing_scan_period_max = 60000;
 1061
 1062/* Portion of address space to scan in MB; one "scan window" is this large. */
 1063unsigned int sysctl_numa_balancing_scan_size = 256;
 1064
 1065/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
 1066unsigned int sysctl_numa_balancing_scan_delay = 1000;
 1067
 1068struct numa_group {
 1069	refcount_t refcount;
 1070
 1071	spinlock_t lock; /* nr_tasks, tasks */
 1072	int nr_tasks;
 1073	pid_t gid;
 1074	int active_nodes;
 1075
 1076	struct rcu_head rcu;
 1077	unsigned long total_faults;
 1078	unsigned long max_faults_cpu;
 1079	/*
 1080	 * Faults_cpu is used to decide whether memory should move
 1081	 * towards the CPU. As a consequence, these stats are weighted
 1082	 * more by CPU use than by memory faults.
 1083	 */
 1084	unsigned long *faults_cpu;
 1085	unsigned long faults[0];
 1086};
 1087
 1088/*
 1089 * For functions that can be called in multiple contexts that permit reading
 1090 * ->numa_group (see struct task_struct for locking rules).
 *
 * The condition handed to rcu_dereference_check() accepts two situations:
 * @p is the calling task itself, or the caller holds @p's runqueue lock
 * while @p is not on a CPU.
 1091 */
 1092static struct numa_group *deref_task_numa_group(struct task_struct *p)
 1093{
 1094	return rcu_dereference_check(p->numa_group, p == current ||
 1095		(lockdep_is_held(&task_rq(p)->lock) && !READ_ONCE(p->on_cpu)));
 1096}
 1097
/*
 * Read ->numa_group of @p without an RCU read-side section. The protection
 * condition passed to rcu_dereference_protected() is that @p is the calling
 * task (p == current).
 */
 1098static struct numa_group *deref_curr_numa_group(struct task_struct *p)
 1099{
 1100	return rcu_dereference_protected(p->numa_group, p == current);
 1101}
 1102
/*
 * Forward declarations: the scan period helpers below use these before
 * their definitions appear.
 */
 1103static inline unsigned long group_faults_priv(struct numa_group *ng);
 1104static inline unsigned long group_faults_shared(struct numa_group *ng);
 1105
 1106static unsigned int task_nr_scan_windows(struct task_struct *p)
 1107{
 1108	unsigned long rss = 0;
 1109	unsigned long nr_scan_pages;
 1110
 1111	/*
 1112	 * Calculations based on RSS as non-present and empty pages are skipped
 1113	 * by the PTE scanner and NUMA hinting faults should be trapped based
 1114	 * on resident pages
 1115	 */
 1116	nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT);
 1117	rss = get_mm_rss(p->mm);
 1118	if (!rss)
 1119		rss = nr_scan_pages;
 1120
 1121	rss = round_up(rss, nr_scan_pages);
 1122	return rss / nr_scan_pages;
 1123}
 1124
 1125/* For sanitys sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */
 1126#define MAX_SCAN_WINDOW 2560
 1127
 1128static unsigned int task_scan_min(struct task_struct *p)
 1129{
 1130	unsigned int scan_size = READ_ONCE(sysctl_numa_balancing_scan_size);
 1131	unsigned int scan, floor;
 1132	unsigned int windows = 1;
 1133
 1134	if (scan_size < MAX_SCAN_WINDOW)
 1135		windows = MAX_SCAN_WINDOW / scan_size;
 1136	floor = 1000 / windows;
 1137
 1138	scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);
 1139	return max_t(unsigned int, floor, scan);
 1140}
 1141
 1142static unsigned int task_scan_start(struct task_struct *p)
 1143{
 1144	unsigned long smin = task_scan_min(p);
 1145	unsigned long period = smin;
 1146	struct numa_group *ng;
 1147
 1148	/* Scale the maximum scan period with the amount of shared memory. */
 1149	rcu_read_lock();
 1150	ng = rcu_dereference(p->numa_group);
 1151	if (ng) {
 1152		unsigned long shared = group_faults_shared(ng);
 1153		unsigned long private = group_faults_priv(ng);
 1154
 1155		period *= refcount_read(&ng->refcount);
 1156		period *= shared + 1;
 1157		period /= private + shared + 1;
 1158	}
 1159	rcu_read_unlock();
 1160
 1161	return max(smin, period);
 1162}
 1163
 1164static unsigned int task_scan_max(struct task_struct *p)
 1165{
 1166	unsigned long smin = task_scan_min(p);
 1167	unsigned long smax;
 1168	struct numa_group *ng;
 1169
 1170	/* Watch for min being lower than max due to floor calculations */
 1171	smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
 1172
 1173	/* Scale the maximum scan period with the amount of shared memory. */
 1174	ng = deref_curr_numa_group(p);
 1175	if (ng) {
 1176		unsigned long shared = group_faults_shared(ng);
 1177		unsigned long private = group_faults_priv(ng);
 1178		unsigned long period = smax;
 1179
 1180		period *= refcount_read(&ng->refcount);
 1181		period *= shared + 1;
 1182		period /= private + shared + 1;
 1183
 1184		smax = max(smax, period);
 1185	}
 1186
 1187	return max(smin, smax);
 1188}
 1189
 1190static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
 1191{
 1192	rq->nr_numa_running += (p->numa_preferred_nid != NUMA_NO_NODE);
 1193	rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
 1194}
 1195
 1196static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
 1197{
 1198	rq->nr_numa_running -= (p->numa_preferred_nid != NUMA_NO_NODE);
 1199	rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
 1200}
 1201
 1202/* Shared (priv index 0) or private (priv index 1) faults. */
 1203#define NR_NUMA_HINT_FAULT_TYPES 2
 1204
 1205/* Memory and CPU locality: one set of fault types for each. */
 1206#define NR_NUMA_HINT_FAULT_STATS (NR_NUMA_HINT_FAULT_TYPES * 2)
 1207
 1208/* Averaged statistics, and temporary buffers: two copies of the stats. */
 1209#define NR_NUMA_HINT_FAULT_BUCKETS (NR_NUMA_HINT_FAULT_STATS * 2)
 1210
 1211pid_t task_numa_group_id(struct task_struct *p)
 1212{
 1213	struct numa_group *ng;
 1214	pid_t gid = 0;
 1215
 1216	rcu_read_lock();
 1217	ng = rcu_dereference(p->numa_group);
 1218	if (ng)
 1219		gid = ng->gid;
 1220	rcu_read_unlock();
 1221
 1222	return gid;
 1223}
 1224
 1225/*
 1226 * The averaged statistics, shared & private, memory & CPU,
 1227 * occupy the first half of the array. The second half of the
 1228 * array is for current counters, which are averaged into the
 1229 * first set by task_numa_placement.
 1230 */
 1231static inline int task_faults_idx(enum numa_faults_stats s, int nid, int priv)
 1232{
 1233	return NR_NUMA_HINT_FAULT_TYPES * (s * nr_node_ids + nid) + priv;
 1234}
 1235
 1236static inline unsigned long task_faults(struct task_struct *p, int nid)
 1237{
 1238	if (!p->numa_faults)
 1239		return 0;
 1240
 1241	return p->numa_faults[task_faults_idx(NUMA_MEM, nid, 0)] +
 1242		p->numa_faults[task_faults_idx(NUMA_MEM, nid, 1)];
 1243}
 1244
 1245static inline unsigned long group_faults(struct task_struct *p, int nid)
 1246{
 1247	struct numa_group *ng = deref_task_numa_group(p);
 1248
 1249	if (!ng)
 1250		return 0;
 1251
 1252	return ng->faults[task_faults_idx(NUMA_MEM, nid, 0)] +
 1253		ng->faults[task_faults_idx(NUMA_MEM, nid, 1)];
 1254}
 1255
 1256static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
 1257{
 1258	return group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 0)] +
 1259		group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)];
 1260}
 1261
 1262static inline unsigned long group_faults_priv(struct numa_group *ng)
 1263{
 1264	unsigned long faults = 0;
 1265	int node;
 1266
 1267	for_each_online_node(node) {
 1268		faults += ng->faults[task_faults_idx(NUMA_MEM, node, 1)];
 1269	}
 1270
 1271	return faults;
 1272}
 1273
 1274static inline unsigned long group_faults_shared(struct numa_group *ng)
 1275{
 1276	unsigned long faults = 0;
 1277	int node;
 1278
 1279	for_each_online_node(node) {
 1280		faults += ng->faults[task_faults_idx(NUMA_MEM, node, 0)];
 1281	}
 1282
 1283	return faults;
 1284}
 1285
 1286/*
 1287 * A node triggering more than 1/3 as many NUMA faults as the maximum is
 1288 * considered part of a numa group's pseudo-interleaving set. Migrations
 1289 * between these nodes are slowed down, to allow things to settle down.
 1290 */
 1291#define ACTIVE_NODE_FRACTION 3
 1292
 1293static bool numa_is_active_node(int nid, struct numa_group *ng)
 1294{
 1295	return group_faults_cpu(ng, nid) * ACTIVE_NODE_FRACTION > ng->max_faults_cpu;
 1296}
 1297
 1298/* Handle placement on systems where not all nodes are directly connected. */
 1299static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
 1300					int maxdist, bool task)
 1301{
 1302	unsigned long score = 0;
 1303	int node;
 1304
 1305	/*
 1306	 * All nodes are directly connected, and the same distance
 1307	 * from each other. No need for fancy placement algorithms.
 1308	 */
 1309	if (sched_numa_topology_type == NUMA_DIRECT)
 1310		return 0;
 1311
 1312	/*
 1313	 * This code is called for each node, introducing N^2 complexity,
 1314	 * which should be ok given the number of nodes rarely exceeds 8.
 1315	 */
 1316	for_each_online_node(node) {
 1317		unsigned long faults;
 1318		int dist = node_distance(nid, node);
 1319
 1320		/*
 1321		 * The furthest away nodes in the system are not interesting
 1322		 * for placement; nid was already counted.
 1323		 */
 1324		if (dist == sched_max_numa_distance || node == nid)
 1325			continue;
 1326
 1327		/*
 1328		 * On systems with a backplane NUMA topology, compare groups
 1329		 * of nodes, and move tasks towards the group with the most
 1330		 * memory accesses. When comparing two nodes at distance
 1331		 * "hoplimit", only nodes closer by than "hoplimit" are part
 1332		 * of each group. Skip other nodes.
 1333		 */
 1334		if (sched_numa_topology_type == NUMA_BACKPLANE &&
 1335					dist >= maxdist)
 1336			continue;
 1337
 1338		/* Add up the faults from nearby nodes. */
 1339		if (task)
 1340			faults = task_faults(p, node);
 1341		else
 1342			faults = group_faults(p, node);
 1343
 1344		/*
 1345		 * On systems with a glueless mesh NUMA topology, there are
 1346		 * no fixed "groups of nodes". Instead, nodes that are not
 1347		 * directly connected bounce traffic through intermediate
 1348		 * nodes; a numa_group can occupy any set of nodes.
 1349		 * The further away a node is, the less the faults count.
 1350		 * This seems to result in good task placement.
 1351		 */
 1352		if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
 1353			faults *= (sched_max_numa_distance - dist);
 1354			faults /= (sched_max_numa_distance - LOCAL_DISTANCE);
 1355		}
 1356
 1357		score += faults;
 1358	}
 1359
 1360	return score;
 1361}
 1362
 1363/*
 1364 * These return the fraction of accesses done by a particular task, or
 1365 * task group, on a particular numa node.  The group weight is given a
 1366 * larger multiplier, in order to group tasks together that are almost
 1367 * evenly spread out between numa nodes.
 1368 */
 1369static inline unsigned long task_weight(struct task_struct *p, int nid,
 1370					int dist)
 1371{
 1372	unsigned long faults, total_faults;
 1373
 1374	if (!p->numa_faults)
 1375		return 0;
 1376
 1377	total_faults = p->total_numa_faults;
 1378
 1379	if (!total_faults)
 1380		return 0;
 1381
 1382	faults = task_faults(p, nid);
 1383	faults += score_nearby_nodes(p, nid, dist, true);
 1384
 1385	return 1000 * faults / total_faults;
 1386}
 1387
 1388static inline unsigned long group_weight(struct task_struct *p, int nid,
 1389					 int dist)
 1390{
 1391	struct numa_group *ng = deref_task_numa_group(p);
 1392	unsigned long faults, total_faults;
 1393
 1394	if (!ng)
 1395		return 0;
 1396
 1397	total_faults = ng->total_faults;
 1398
 1399	if (!total_faults)
 1400		return 0;
 1401
 1402	faults = group_faults(p, nid);
 1403	faults += score_nearby_nodes(p, nid, dist, false);
 1404
 1405	return 1000 * faults / total_faults;
 1406}
 1407
/*
 * Decide whether @page, currently on node @src_nid, should be migrated to
 * the node of @dst_cpu after @p (the calling task) faulted on it.
 *
 * As a side effect the page's last-cpupid record is replaced with
 * (@dst_cpu, current->pid); the previous value drives the decision.
 *
 * Returns true if the migration should go ahead, false otherwise.
 */
 1408bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
 1409				int src_nid, int dst_cpu)
 1410{
 1411	struct numa_group *ng = deref_curr_numa_group(p);
 1412	int dst_nid = cpu_to_node(dst_cpu);
 1413	int last_cpupid, this_cpupid;
 1414
 1415	this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
 1416	last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
 1417
 1418	/*
 1419	 * Allow first faults or private faults to migrate immediately early in
 1420	 * the lifetime of a task. The magic number 4 is based on waiting for
 1421	 * two full passes of the "multi-stage node selection" test that is
 1422	 * executed below.
 1423	 */
 1424	if ((p->numa_preferred_nid == NUMA_NO_NODE || p->numa_scan_seq <= 4) &&
 1425	    (cpupid_pid_unset(last_cpupid) || cpupid_match_pid(p, last_cpupid)))
 1426		return true;
 1427
 1428	/*
 1429	 * Multi-stage node selection is used in conjunction with a periodic
 1430	 * migration fault to build a temporal task<->page relation. By using
 1431	 * a two-stage filter we remove short/unlikely relations.
 1432	 *
 1433	 * Using P(p) ~ n_p / n_t as per frequentist probability, we can equate
 1434	 * a task's usage of a particular page (n_p) per total usage of this
 1435	 * page (n_t) (in a given time-span) to a probability.
 1436	 *
 1437	 * Our periodic faults will sample this probability and getting the
 1438	 * same result twice in a row, given these samples are fully
 1439	 * independent, is then given by P(n)^2, provided our sample period
 1440	 * is sufficiently short compared to the usage pattern.
 1441	 *
 1442	 * This quadratic squishes small probabilities, making it less likely we
 1443	 * act on an unlikely task<->page relation.
 1444	 */
 1445	if (!cpupid_pid_unset(last_cpupid) &&
 1446				cpupid_to_nid(last_cpupid) != dst_nid)
 1447		return false;
 1448
 1449	/* Always allow migrate on private faults */
 1450	if (cpupid_match_pid(p, last_cpupid))
 1451		return true;
 1452
 1453	/* A shared fault, but p->numa_group has not been set up yet. */
 1454	if (!ng)
 1455		return true;
 1456
 1457	/*
 1458	 * Destination node is much more heavily used than the source
 1459	 * node? Allow migration.
 1460	 */
 1461	if (group_faults_cpu(ng, dst_nid) > group_faults_cpu(ng, src_nid) *
 1462					ACTIVE_NODE_FRACTION)
 1463		return true;
 1464
 1465	/*
 1466	 * Distribute memory according to CPU & memory use on each node,
 1467	 * with 3/4 hysteresis to avoid unnecessary memory migrations:
 1468	 *
 1469	 * faults_cpu(dst)   3   faults_cpu(src)
 1470	 * --------------- * - > ---------------
 1471	 * faults_mem(dst)   4   faults_mem(src)
	 *
	 * The comparison below is the same inequality cross-multiplied so
	 * that no division is needed.
 1472	 */
 1473	return group_faults_cpu(ng, dst_nid) * group_faults(p, src_nid) * 3 >
 1474	       group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
 1475}
 1476
/* Forward declaration: used by update_numa_stats() below. */
 1477static unsigned long cpu_runnable_load(struct rq *rq);
 1478
 1479/* Cached statistics for all CPUs within a node */
 1480struct numa_stats {
	/* Sum of cpu_runnable_load() over the node's CPUs */
 1481	unsigned long load;
 1482
 1483	/* Total compute capacity of CPUs on a node */
 1484	unsigned long compute_capacity;
 1485};
 1486
 1487/*
 1488 * XXX borrowed from update_sg_lb_stats
 1489 */
 1490static void update_numa_stats(struct numa_stats *ns, int nid)
 1491{
 1492	int cpu;
 1493
 1494	memset(ns, 0, sizeof(*ns));
 1495	for_each_cpu(cpu, cpumask_of_node(nid)) {
 1496		struct rq *rq = cpu_rq(cpu);
 1497
 1498		ns->load += cpu_runnable_load(rq);
 1499		ns->compute_capacity += capacity_of(cpu);
 1500	}
 1501
 1502}
 1503
/* Working state for one task_numa_migrate() attempt. */
 1504struct task_numa_env {
	/* Task that is looking for a better placement */
 1505	struct task_struct *p;
 1506
	/* Where the task is now, and the candidate currently being evaluated */
 1507	int src_cpu, src_nid;
 1508	int dst_cpu, dst_nid;
 1509
	/* Cached node statistics for src_nid and dst_nid */
 1510	struct numa_stats src_stats, dst_stats;
 1511
 1512	int imbalance_pct;
	/* Node distance used for the weight calculations of this candidate */
 1513	int dist;
 1514
	/*
	 * Best candidate so far, recorded by task_numa_assign(): the task to
	 * swap with (NULL for a plain move), its improvement score and the
	 * CPU (-1 while nothing has been found).
	 */
 1515	struct task_struct *best_task;
 1516	long best_imp;
 1517	int best_cpu;
 1518};
 1519
 1520static void task_numa_assign(struct task_numa_env *env,
 1521			     struct task_struct *p, long imp)
 1522{
 1523	struct rq *rq = cpu_rq(env->dst_cpu);
 1524
 1525	/* Bail out if run-queue part of active NUMA balance. */
 1526	if (xchg(&rq->numa_migrate_on, 1))
 1527		return;
 1528
 1529	/*
 1530	 * Clear previous best_cpu/rq numa-migrate flag, since task now
 1531	 * found a better CPU to move/swap.
 1532	 */
 1533	if (env->best_cpu != -1) {
 1534		rq = cpu_rq(env->best_cpu);
 1535		WRITE_ONCE(rq->numa_migrate_on, 0);
 1536	}
 1537
 1538	if (env->best_task)
 1539		put_task_struct(env->best_task);
 1540	if (p)
 1541		get_task_struct(p);
 1542
 1543	env->best_task = p;
 1544	env->best_imp = imp;
 1545	env->best_cpu = env->dst_cpu;
 1546}
 1547
 1548static bool load_too_imbalanced(long src_load, long dst_load,
 1549				struct task_numa_env *env)
 1550{
 1551	long imb, old_imb;
 1552	long orig_src_load, orig_dst_load;
 1553	long src_capacity, dst_capacity;
 1554
 1555	/*
 1556	 * The load is corrected for the CPU capacity available on each node.
 1557	 *
 1558	 * src_load        dst_load
 1559	 * ------------ vs ---------
 1560	 * src_capacity    dst_capacity
 1561	 */
 1562	src_capacity = env->src_stats.compute_capacity;
 1563	dst_capacity = env->dst_stats.compute_capacity;
 1564
 1565	imb = abs(dst_load * src_capacity - src_load * dst_capacity);
 1566
 1567	orig_src_load = env->src_stats.load;
 1568	orig_dst_load = env->dst_stats.load;
 1569
 1570	old_imb = abs(orig_dst_load * src_capacity - orig_src_load * dst_capacity);
 1571
 1572	/* Would this change make things worse? */
 1573	return (imb > old_imb);
 1574}
 1575
 1576/*
 1577 * Maximum NUMA importance can be 1998 (2*999);
 1578 * SMALLIMP @ 30 would be close to 1998/64.
 1579 * Used to deter task migration.
 1580 */
 1581#define SMALLIMP	30
 1582
 1583/*
 1584 * This checks if the overall compute and NUMA accesses of the system would
 1585 * be improved if the source task was migrated to the target dst_cpu taking
 1586 * into account that it might be best if task running on the dst_cpu should
 1587 * be exchanged with the source task
 *
 * @taskimp/@groupimp: improvement of env->p on the candidate node, by task
 *	and by group weight; the group value is used when env->p is in a
 *	numa_group.
 * @maymove: a plain move (without swapping) is allowed load-wise.
 *
 * A candidate that is accepted is recorded through task_numa_assign().
 1588 */
 1589static void task_numa_compare(struct task_numa_env *env,
 1590			      long taskimp, long groupimp, bool maymove)
 1591{
 1592	struct numa_group *cur_ng, *p_ng = deref_curr_numa_group(env->p);
 1593	struct rq *dst_rq = cpu_rq(env->dst_cpu);
 1594	long imp = p_ng ? groupimp : taskimp;
 1595	struct task_struct *cur;
 1596	long src_load, dst_load;
 1597	int dist = env->dist;
 1598	long moveimp = imp;
 1599	long load;
 1600
	/* Another NUMA migration has already claimed this runqueue. */
 1601	if (READ_ONCE(dst_rq->numa_migrate_on))
 1602		return;
 1603
 1604	rcu_read_lock();
 1605	cur = rcu_dereference(dst_rq->curr);
	/* Exiting and idle tasks are not swap candidates: treat the CPU as free. */
 1606	if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur)))
 1607		cur = NULL;
 1608
 1609	/*
 1610	 * Because we have preemption enabled we can get migrated around and
 1611	 * end up trying to select ourselves (current == env->p) as a swap
	 * candidate.
 1612	 */
 1613	if (cur == env->p)
 1614		goto unlock;
 1615
 1616	if (!cur) {
 1617		if (maymove && moveimp >= env->best_imp)
 1618			goto assign;
 1619		else
 1620			goto unlock;
 1621	}
 1622
 1623	/*
 1624	 * "imp" is the fault differential for the source task between the
 1625	 * source and destination node. Calculate the total differential for
 1626	 * the source task and potential destination task. The more negative
 1627	 * the value is, the more remote accesses that would be expected to
 1628	 * be incurred if the tasks were swapped.
 1629	 */
 1630	/* Skip this swap candidate if cannot move to the source cpu */
 1631	if (!cpumask_test_cpu(env->src_cpu, cur->cpus_ptr))
 1632		goto unlock;
 1633
 1634	/*
 1635	 * If dst and source tasks are in the same NUMA group, or not
 1636	 * in any group then look only at task weights.
 1637	 */
 1638	cur_ng = rcu_dereference(cur->numa_group);
 1639	if (cur_ng == p_ng) {
 1640		imp = taskimp + task_weight(cur, env->src_nid, dist) -
 1641		      task_weight(cur, env->dst_nid, dist);
 1642		/*
 1643		 * Add some hysteresis to prevent swapping the
 1644		 * tasks within a group over tiny differences.
 1645		 */
 1646		if (cur_ng)
 1647			imp -= imp / 16;
 1648	} else {
 1649		/*
 1650		 * Compare the group weights. If a task is all by itself
 1651		 * (not part of a group), use the task weight instead.
 1652		 */
 1653		if (cur_ng && p_ng)
 1654			imp += group_weight(cur, env->src_nid, dist) -
 1655			       group_weight(cur, env->dst_nid, dist);
 1656		else
 1657			imp += task_weight(cur, env->src_nid, dist) -
 1658			       task_weight(cur, env->dst_nid, dist);
 1659	}
 1660
	/* A plain move beats both the swap and the best candidate so far. */
 1661	if (maymove && moveimp > imp && moveimp > env->best_imp) {
 1662		imp = moveimp;
 1663		cur = NULL;
 1664		goto assign;
 1665	}
 1666
 1667	/*
 1668	 * If the NUMA importance is less than SMALLIMP,
 1669	 * task migration might only result in ping pong
 1670	 * of tasks and also hurt performance due to cache
 1671	 * misses.
 1672	 */
 1673	if (imp < SMALLIMP || imp <= env->best_imp + SMALLIMP / 2)
 1674		goto unlock;
 1675
 1676	/*
 1677	 * In the overloaded case, try and keep the load balanced.
 1678	 */
 1679	load = task_h_load(env->p) - task_h_load(cur);
 1680	if (!load)
 1681		goto assign;
 1682
 1683	dst_load = env->dst_stats.load + load;
 1684	src_load = env->src_stats.load - load;
 1685
 1686	if (load_too_imbalanced(src_load, dst_load, env))
 1687		goto unlock;
 1688
 1689assign:
 1690	/*
 1691	 * One idle CPU per node is evaluated for a task numa move.
 1692	 * Call select_idle_sibling to maybe find a better one.
 1693	 */
 1694	if (!cur) {
 1695		/*
 1696		 * select_idle_siblings() uses an per-CPU cpumask that
 1697		 * can be used from IRQ context.
 1698		 */
 1699		local_irq_disable();
 1700		env->dst_cpu = select_idle_sibling(env->p, env->src_cpu,
 1701						   env->dst_cpu);
 1702		local_irq_enable();
 1703	}
 1704
 1705	task_numa_assign(env, cur, imp);
 1706unlock:
 1707	rcu_read_unlock();
 1708}
 1709
 1710static void task_numa_find_cpu(struct task_numa_env *env,
 1711				long taskimp, long groupimp)
 1712{
 1713	long src_load, dst_load, load;
 1714	bool maymove = false;
 1715	int cpu;
 1716
 1717	load = task_h_load(env->p);
 1718	dst_load = env->dst_stats.load + load;
 1719	src_load = env->src_stats.load - load;
 1720
 1721	/*
 1722	 * If the improvement from just moving env->p direction is better
 1723	 * than swapping tasks around, check if a move is possible.
 1724	 */
 1725	maymove = !load_too_imbalanced(src_load, dst_load, env);
 1726
 1727	for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
 1728		/* Skip this CPU if the source task cannot migrate */
 1729		if (!cpumask_test_cpu(cpu, env->p->cpus_ptr))
 1730			continue;
 1731
 1732		env->dst_cpu = cpu;
 1733		task_numa_compare(env, taskimp, groupimp, maymove);
 1734	}
 1735}
 1736
/*
 * Try to move @p (the calling task) to a better CPU, starting with its
 * preferred node and falling back to other online nodes.
 *
 * Returns -EINVAL if the source CPU has no sd_numa domain, -EAGAIN if no
 * better CPU was found, and otherwise the result of migrate_task_to() or
 * migrate_swap().
 */
 1737static int task_numa_migrate(struct task_struct *p)
 1738{
 1739	struct task_numa_env env = {
 1740		.p = p,
 1741
 1742		.src_cpu = task_cpu(p),
 1743		.src_nid = task_node(p),
 1744
 1745		.imbalance_pct = 112,
 1746
 1747		.best_task = NULL,
 1748		.best_imp = 0,
 1749		.best_cpu = -1,
 1750	};
 1751	unsigned long taskweight, groupweight;
 1752	struct sched_domain *sd;
 1753	long taskimp, groupimp;
 1754	struct numa_group *ng;
 1755	struct rq *best_rq;
 1756	int nid, ret, dist;
 1757
 1758	/*
 1759	 * Pick the lowest SD_NUMA domain, as that would have the smallest
 1760	 * imbalance and would be the first to start moving tasks about.
 1761	 *
 1762	 * And we want to avoid any moving of tasks about, as that would create
 1763	 * random movement of tasks -- counter the numa conditions we're trying
 1764	 * to satisfy here.
 1765	 */
 1766	rcu_read_lock();
 1767	sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
 1768	if (sd)
 1769		env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
 1770	rcu_read_unlock();
 1771
 1772	/*
 1773	 * Cpusets can break the scheduler domain tree into smaller
 1774	 * balance domains, some of which do not cross NUMA boundaries.
 1775	 * Tasks that are "trapped" in such domains cannot be migrated
 1776	 * elsewhere, so there is no point in (re)trying.
 1777	 */
 1778	if (unlikely(!sd)) {
 1779		sched_setnuma(p, task_node(p));
 1780		return -EINVAL;
 1781	}
 1782
 1783	env.dst_nid = p->numa_preferred_nid;
 1784	dist = env.dist = node_distance(env.src_nid, env.dst_nid);
 1785	taskweight = task_weight(p, env.src_nid, dist);
 1786	groupweight = group_weight(p, env.src_nid, dist);
 1787	update_numa_stats(&env.src_stats, env.src_nid);
 1788	taskimp = task_weight(p, env.dst_nid, dist) - taskweight;
 1789	groupimp = group_weight(p, env.dst_nid, dist) - groupweight;
 1790	update_numa_stats(&env.dst_stats, env.dst_nid);
 1791
 1792	/* Try to find a spot on the preferred nid. */
 1793	task_numa_find_cpu(&env, taskimp, groupimp);
 1794
 1795	/*
 1796	 * Look at other nodes in these cases:
 1797	 * - there is no space available on the preferred_nid
 1798	 * - the task is part of a numa_group that is interleaved across
 1799	 *   multiple NUMA nodes; in order to better consolidate the group,
 1800	 *   we need to check other locations.
 1801	 */
 1802	ng = deref_curr_numa_group(p);
 1803	if (env.best_cpu == -1 || (ng && ng->active_nodes > 1)) {
 1804		for_each_online_node(nid) {
 1805			if (nid == env.src_nid || nid == p->numa_preferred_nid)
 1806				continue;
 1807
			/*
			 * NOTE(review): this measures the distance to
			 * env.dst_nid, i.e. the previously accepted candidate
			 * (initially the preferred node), not to the node
			 * @nid being examined. Confirm whether that is
			 * intended.
			 */
 1808			dist = node_distance(env.src_nid, env.dst_nid);
 1809			if (sched_numa_topology_type == NUMA_BACKPLANE &&
 1810						dist != env.dist) {
 1811				taskweight = task_weight(p, env.src_nid, dist);
 1812				groupweight = group_weight(p, env.src_nid, dist);
 1813			}
 1814
			/* Skip nodes where both the task and its group would lose */
 1816			taskimp = task_weight(p, nid, dist) - taskweight;
 1817			groupimp = group_weight(p, nid, dist) - groupweight;
 1818			if (taskimp < 0 && groupimp < 0)
 1819				continue;
 1820
 1821			env.dist = dist;
 1822			env.dst_nid = nid;
 1823			update_numa_stats(&env.dst_stats, env.dst_nid);
 1824			task_numa_find_cpu(&env, taskimp, groupimp);
 1825		}
 1826	}
 1827
 1828	/*
 1829	 * If the task is part of a workload that spans multiple NUMA nodes,
 1830	 * and is migrating into one of the workload's active nodes, remember
 1831	 * this node as the task's preferred numa node, so the workload can
 1832	 * settle down.
 1833	 * A task that migrated to a second choice node will be better off
 1834	 * trying for a better one later. Do not set the preferred node here.
 1835	 */
 1836	if (ng) {
 1837		if (env.best_cpu == -1)
 1838			nid = env.src_nid;
 1839		else
 1840			nid = cpu_to_node(env.best_cpu);
 1841
 1842		if (nid != p->numa_preferred_nid)
 1843			sched_setnuma(p, nid);
 1844	}
 1845
 1846	/* No better CPU than the current one was found. */
 1847	if (env.best_cpu == -1)
 1848		return -EAGAIN;
 1849
 1850	best_rq = cpu_rq(env.best_cpu);
	/* No swap partner: plain move, then release the runqueue claim. */
 1851	if (env.best_task == NULL) {
 1852		ret = migrate_task_to(p, env.best_cpu);
 1853		WRITE_ONCE(best_rq->numa_migrate_on, 0);
 1854		if (ret != 0)
 1855			trace_sched_stick_numa(p, env.src_cpu, env.best_cpu);
 1856		return ret;
 1857	}
 1858
 1859	ret = migrate_swap(p, env.best_task, env.best_cpu, env.src_cpu);
 1860	WRITE_ONCE(best_rq->numa_migrate_on, 0);
 1861
 1862	if (ret != 0)
 1863		trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task));
	/* Drop the reference taken in task_numa_assign(). */
 1864	put_task_struct(env.best_task);
 1865	return ret;
 1866}
 1867
 1868/* Attempt to migrate a task to a CPU on the preferred node. */
 1869static void numa_migrate_preferred(struct task_struct *p)
 1870{
 1871	unsigned long interval = HZ;
 1872
 1873	/* This task has no NUMA fault statistics yet */
 1874	if (unlikely(p->numa_preferred_nid == NUMA_NO_NODE || !p->numa_faults))
 1875		return;
 1876
 1877	/* Periodically retry migrating the task to the preferred node */
 1878	interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16);
 1879	p->numa_migrate_retry = jiffies + interval;
 1880
 1881	/* Success if task is already running on preferred CPU */
 1882	if (task_node(p) == p->numa_preferred_nid)
 1883		return;
 1884
 1885	/* Otherwise, try migrate to a CPU on the preferred node */
 1886	task_numa_migrate(p);
 1887}
 1888
 1889/*
 1890 * Find out how many nodes on the workload is actively running on. Do this by
 1891 * tracking the nodes from which NUMA hinting faults are triggered. This can
 1892 * be different from the set of nodes where the workload's memory is currently
 1893 * located.
 1894 */
 1895static void numa_group_count_active_nodes(struct numa_group *numa_group)
 1896{
 1897	unsigned long faults, max_faults = 0;
 1898	int nid, active_nodes = 0;
 1899
 1900	for_each_online_node(nid) {
 1901		faults = group_faults_cpu(numa_group, nid);
 1902		if (faults > max_faults)
 1903			max_faults = faults;
 1904	}
 1905
 1906	for_each_online_node(nid) {
 1907		faults = group_faults_cpu(numa_group, nid);
 1908		if (faults * ACTIVE_NODE_FRACTION > max_faults)
 1909			active_nodes++;
 1910	}
 1911
 1912	numa_group->max_faults_cpu = max_faults;
 1913	numa_group->active_nodes = active_nodes;
 1914}
 1915
 1916/*
 1917 * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS
 1918 * increments. The more local the fault statistics are, the higher the scan
 1919 * period will be for the next scan window. If local/(local+remote) ratio is
 1920 * below NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS)
 1921 * the scan period will decrease. Aim for 70% local accesses.
 1922 */
 1923#define NUMA_PERIOD_SLOTS 10
 1924#define NUMA_PERIOD_THRESHOLD 7
 1925
 1926/*
 1927 * Adapt p->numa_scan_period from the fault statistics of the last window.
 *
 * @shared/@private: shared and private fault counts supplied by the caller;
 *	they form ps_ratio. The local/remote counts that form lr_ratio are
 *	read from p->numa_faults_locality[].
 *
 * The period is increased (scanning slows down) when either ratio reaches
 1928 * NUMA_PERIOD_THRESHOLD and decreased otherwise; the result is clamped to
 1929 * [task_scan_min(p), task_scan_max(p)] and the locality counters are then
 1930 * reset.
 1931 */
 1932static void update_task_scan_period(struct task_struct *p,
 1933			unsigned long shared, unsigned long private)
 1934{
 1935	unsigned int period_slot;
 1936	int lr_ratio, ps_ratio;
 1937	int diff;
 1938
 1939	unsigned long remote = p->numa_faults_locality[0];
 1940	unsigned long local = p->numa_faults_locality[1];
 1941
 1942	/*
 1943	 * If there were no record hinting faults then either the task is
 1944	 * completely idle or all activity is areas that are not of interest
 1945	 * to automatic numa balancing. Related to that, if there were failed
 1946	 * migration then it implies we are migrating too quickly or the local
 1947	 * node is overloaded. In either case, scan slower
	 *
	 * Note that this path returns without resetting numa_faults_locality.
 1948	 */
 1949	if (local + shared == 0 || p->numa_faults_locality[2]) {
 1950		p->numa_scan_period = min(p->numa_scan_period_max,
 1951			p->numa_scan_period << 1);
 1952
 1953		p->mm->numa_next_scan = jiffies +
 1954			msecs_to_jiffies(p->numa_scan_period);
 1955
 1956		return;
 1957	}
 1958
 1959	/*
 1960	 * Prepare to scale scan period relative to the current period.
 1961	 *	 >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower),
 1962	 *	    by at least one slot even when the ratio equals the threshold
 1963	 *	 <  NUMA_PERIOD_THRESHOLD scan period decreases (scan faster)
 1964	 */
 1965	period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS);
 1966	lr_ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
 1967	ps_ratio = (private * NUMA_PERIOD_SLOTS) / (private + shared);
 1968
 1969	if (ps_ratio >= NUMA_PERIOD_THRESHOLD) {
 1970		/*
 1971		 * The share of private faults (ps_ratio) reached the
 1972		 * threshold: slow scanning down, by at least one slot.
		 * NOTE(review): this comment used to describe the "mostly
		 * local" case, which is what lr_ratio measures - confirm
		 * the intended wording.
 1973		 */
 1974		int slot = ps_ratio - NUMA_PERIOD_THRESHOLD;
 1975		if (!slot)
 1976			slot = 1;
 1977		diff = slot * period_slot;
 1978	} else if (lr_ratio >= NUMA_PERIOD_THRESHOLD) {
 1979		/*
 1980		 * The share of local faults (lr_ratio) reached the
 1981		 * threshold: slow scanning down, by at least one slot.
 1982		 * NOTE(review): this comment used to describe the "mostly
		 * shared" case - confirm the intended wording.
 1983		 */
 1984		int slot = lr_ratio - NUMA_PERIOD_THRESHOLD;
 1985		if (!slot)
 1986			slot = 1;
 1987		diff = slot * period_slot;
 1988	} else {
 1989		/*
 1990		 * Neither ratio reaches the threshold. Speed up NUMA
 1991		 * scanning, by one slot for every point the larger of the
 1992		 * two ratios falls short of the threshold.
 1993		 */
 1994		int ratio = max(lr_ratio, ps_ratio);
 1995		diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
 1996	}
 1997
 1998	p->numa_scan_period = clamp(p->numa_scan_period + diff,
 1999			task_scan_min(p), task_scan_max(p));
	/* Start the next window with fresh locality counters. */
 2000	memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
 2001}
 2002
 2003/*
 2004 * Get the fraction of time the task has been running since the last
 2005 * NUMA placement cycle. The scheduler keeps similar statistics, but
 2006 * decays those on a 32ms period, which is orders of magnitude off
 2007 * from the dozens-of-seconds NUMA balancing period. Use the scheduler
 2008 * stats only if the task is so new there are no NUMA statistics yet.
 2009 */
 2010static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
 2011{
 2012	u64 runtime, delta, now;
 2013	/* Use the start of this time slice to avoid calculations. */
 2014	now = p->se.exec_start;
 2015	runtime = p->se.sum_exec_runtime;
 2016
 2017	if (p->last_task_numa_placement) {
 2018		delta = runtime - p->last_sum_exec_runtime;
 2019		*period = now - p->last_task_numa_placement;
 2020
 2021		/* Avoid time going backwards, prevent potential divide error: */
 2022		if (unlikely((s64)*period < 0))
 2023			*period = 0;
 2024	} else {
 2025		delta = p->se.avg.load_sum;
 2026		*period = LOAD_AVG_MAX;
 2027	}
 2028
 2029	p->last_sum_exec_runtime = runtime;
 2030	p->last_task_numa_placement = now;
 2031
 2032	return delta;
 2033}
 2034
/*
 * Determine the preferred nid for a task in a numa_group. This needs to
 * be done in a way that produces consistent results with group_weight,
 * otherwise workloads might not converge.
 *
 * @p:   task whose numa_group statistics are consulted
 * @nid: candidate node; it is returned unchanged on NUMA_DIRECT systems
 *       and whenever no node or group scores above zero
 *
 * Returns the node id selected for the topology type in use.
 */
static int preferred_group_nid(struct task_struct *p, int nid)
{
	nodemask_t nodes;
	int dist;

	/* Direct connections between all NUMA nodes. */
	if (sched_numa_topology_type == NUMA_DIRECT)
		return nid;

	/*
	 * On a system with glueless mesh NUMA topology, group_weight
	 * scores nodes according to the number of NUMA hinting faults on
	 * both the node itself, and on nearby nodes.
	 */
	if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
		unsigned long score, max_score = 0;
		int node, max_node = nid;

		dist = sched_max_numa_distance;

		/* Keep the first node with the strictly highest score. */
		for_each_online_node(node) {
			score = group_weight(p, node, dist);
			if (score > max_score) {
				max_score = score;
				max_node = node;
			}
		}
		return max_node;
	}

	/*
	 * Finding the preferred nid in a system with NUMA backplane
	 * interconnect topology is more involved. The goal is to locate
	 * tasks from numa_groups near each other in the system, and
	 * untangle workloads from different sides of the system. This requires
	 * searching down the hierarchy of node groups, recursively searching
	 * inside the highest scoring group of nodes. The nodemask tricks
	 * keep the complexity of the search down.
	 */
	nodes = node_online_map;
	for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) {
		unsigned long max_faults = 0;
		nodemask_t max_group = NODE_MASK_NONE;
		int a, b;

		/* Are there nodes at this distance from each other? */
		if (!find_numa_distance(dist))
			continue;

		for_each_node_mask(a, nodes) {
			unsigned long faults = 0;
			nodemask_t this_group;
			nodes_clear(this_group);

			/* Sum group's NUMA faults; includes a==b case. */
			for_each_node_mask(b, nodes) {
				if (node_distance(a, b) < dist) {
					faults += group_faults(p, b);
					node_set(b, this_group);
					/*
					 * Drop b from the candidate mask so it
					 * is not placed into a second group
					 * during this pass.
					 */
					node_clear(b, nodes);
				}
			}

			/* Remember the top group. */
			if (faults > max_faults) {
				max_faults = faults;
				max_group = this_group;
				/*
				 * subtle: at the smallest distance there is
				 * just one node left in each "group", the
				 * winner is the preferred nid.
				 */
				nid = a;
			}
		}
		/* Next round, evaluate the nodes within max_group. */
		if (!max_faults)
			break;
		nodes = max_group;
	}
	return nid;
}
 2122
/*
 * Fold the NUMA hinting faults buffered since the previous scan into the
 * decayed per-node statistics of @p (and of its numa_group, if it has one),
 * pick the node with the most faults as the new preferred node, and then
 * hand the private/shared totals to update_task_scan_period().
 *
 * Returns immediately when p->mm->numa_scan_seq has not changed since the
 * last time this task ran placement.
 */
static void task_numa_placement(struct task_struct *p)
{
	int seq, nid, max_nid = NUMA_NO_NODE;
	unsigned long max_faults = 0;
	unsigned long fault_types[2] = { 0, 0 };
	unsigned long total_faults;
	u64 runtime, period;
	spinlock_t *group_lock = NULL;
	struct numa_group *ng;

	/*
	 * The p->mm->numa_scan_seq field gets updated without
	 * exclusive access. Use READ_ONCE() here to ensure
	 * that the field is read in a single access:
	 */
	seq = READ_ONCE(p->mm->numa_scan_seq);
	if (p->numa_scan_seq == seq)
		return;
	p->numa_scan_seq = seq;
	p->numa_scan_period_max = task_scan_max(p);

	total_faults = p->numa_faults_locality[0] +
		       p->numa_faults_locality[1];
	runtime = numa_get_avg_runtime(p, &period);

	/* If the task is part of a group prevent parallel updates to group stats */
	ng = deref_curr_numa_group(p);
	if (ng) {
		group_lock = &ng->lock;
		spin_lock_irq(group_lock);
	}

	/* Find the node with the highest number of faults */
	for_each_online_node(nid) {
		/* Keep track of the offsets in numa_faults array */
		int mem_idx, membuf_idx, cpu_idx, cpubuf_idx;
		unsigned long faults = 0, group_faults = 0;
		int priv;

		for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) {
			long diff, f_diff, f_weight;

			mem_idx = task_faults_idx(NUMA_MEM, nid, priv);
			membuf_idx = task_faults_idx(NUMA_MEMBUF, nid, priv);
			cpu_idx = task_faults_idx(NUMA_CPU, nid, priv);
			cpubuf_idx = task_faults_idx(NUMA_CPUBUF, nid, priv);

			/* Decay existing window, copy faults since last scan */
			diff = p->numa_faults[membuf_idx] - p->numa_faults[mem_idx] / 2;
			fault_types[priv] += p->numa_faults[membuf_idx];
			p->numa_faults[membuf_idx] = 0;

			/*
			 * Normalize the faults_from, so all tasks in a group
			 * count according to CPU use, instead of by the raw
			 * number of faults. Tasks with little runtime have
			 * little over-all impact on throughput, and thus their
			 * faults are less important.
			 */
			/* The "+ 1" terms keep both divisors non-zero. */
			f_weight = div64_u64(runtime << 16, period + 1);
			f_weight = (f_weight * p->numa_faults[cpubuf_idx]) /
				   (total_faults + 1);
			f_diff = f_weight - p->numa_faults[cpu_idx] / 2;
			p->numa_faults[cpubuf_idx] = 0;

			p->numa_faults[mem_idx] += diff;
			p->numa_faults[cpu_idx] += f_diff;
			faults += p->numa_faults[mem_idx];
			p->total_numa_faults += diff;
			if (ng) {
				/*
				 * safe because we can only change our own group
				 *
				 * mem_idx represents the offset for a given
				 * nid and priv in a specific region because it
				 * is at the beginning of the numa_faults array.
				 */
				ng->faults[mem_idx] += diff;
				ng->faults_cpu[mem_idx] += f_diff;
				ng->total_faults += diff;
				group_faults += ng->faults[mem_idx];
			}
		}

		/*
		 * Rank nodes by the task's own faults when it has no group,
		 * and by the group's faults otherwise.
		 */
		if (!ng) {
			if (faults > max_faults) {
				max_faults = faults;
				max_nid = nid;
			}
		} else if (group_faults > max_faults) {
			max_faults = group_faults;
			max_nid = nid;
		}
	}

	if (ng) {
		numa_group_count_active_nodes(ng);
		spin_unlock_irq(group_lock);
		/* Refine the choice once the group lock has been released. */
		max_nid = preferred_group_nid(p, max_nid);
	}

	/* The preferred node is left alone when no node recorded any faults. */
	if (max_faults) {
		/* Set the new preferred node */
		if (max_nid != p->numa_preferred_nid)
			sched_setnuma(p, max_nid);
	}

	update_task_scan_period(p, fault_types[0], fault_types[1]);
}
 2232
 2233static inline int get_numa_group(struct numa_group *grp)
 2234{
 2235	return refcount_inc_not_zero(&grp->refcount);
 2236}
 2237
 2238static inline void put_numa_group(struct numa_group *grp)
 2239{
 2240	if (refcount_dec_and_test(&grp->refcount))
 2241		kfree_rcu(grp, rcu);
 
 
 
 
 
 
 2242}
 2243
 2244static void task_numa_group(struct task_struct *p, int cpupid, int flags,
 2245			int *priv)
 2246{
 2247	struct numa_group *grp, *my_grp;
 2248	struct task_struct *tsk;
 2249	bool join = false;
 2250	int cpu = cpupid_to_cpu(cpupid);
 2251	int i;
 2252
 2253	if (unlikely(!deref_curr_numa_group(p))) {
 2254		unsigned int size = sizeof(struct numa_group) +
 2255				    4*nr_node_ids*sizeof(unsigned long);
 2256
 2257		grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
 2258		if (!grp)
 2259			return;
 2260
 2261		refcount_set(&grp->refcount, 1);
 2262		grp->active_nodes = 1;
 2263		grp->max_faults_cpu = 0;
 2264		spin_lock_init(&grp->lock);
 2265		grp->gid = p->pid;
 2266		/* Second half of the array tracks nids where faults happen */
 2267		grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES *
 2268						nr_node_ids;
 2269
 2270		for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
 2271			grp->faults[i] = p->numa_faults[i];
 2272
 2273		grp->total_faults = p->total_numa_faults;
 2274
 2275		grp->nr_tasks++;
 2276		rcu_assign_pointer(p->numa_group, grp);
 2277	}
 2278
 2279	rcu_read_lock();
 2280	tsk = READ_ONCE(cpu_rq(cpu)->curr);
 2281
 2282	if (!cpupid_match_pid(tsk, cpupid))
 2283		goto no_join;
 2284
 2285	grp = rcu_dereference(tsk->numa_group);
 2286	if (!grp)
 2287		goto no_join;
 2288
 2289	my_grp = deref_curr_numa_group(p);
 2290	if (grp == my_grp)
 2291		goto no_join;
 2292
 2293	/*
 2294	 * Only join the other group if its bigger; if we're the bigger group,
 2295	 * the other task will join us.
 2296	 */
 2297	if (my_grp->nr_tasks > grp->nr_tasks)
 2298		goto no_join;
 2299
 2300	/*
 2301	 * Tie-break on the grp address.
 2302	 */
 2303	if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp)
 2304		goto no_join;
 2305
 2306	/* Always join threads in the same process. */
 2307	if (tsk->mm == current->mm)
 2308		join = true;
 2309
 2310	/* Simple filter to avoid false positives due to PID collisions */
 2311	if (flags & TNF_SHARED)
 2312		join = true;
 2313
 2314	/* Update priv based on whether false sharing was detected */
 2315	*priv = !join;
 2316
 2317	if (join && !get_numa_group(grp))
 2318		goto no_join;
 2319
 2320	rcu_read_unlock();
 2321
 2322	if (!join)
 2323		return;
 2324
 2325	BUG_ON(irqs_disabled());
 2326	double_lock_irq(&my_grp->lock, &grp->lock);
 2327
 2328	for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {
 2329		my_grp->faults[i] -= p->numa_faults[i];
 2330		grp->faults[i] += p->numa_faults[i];
 2331	}
 2332	my_grp->total_faults -= p->total_numa_faults;
 2333	grp->total_faults += p->total_numa_faults;
 2334
 2335	my_grp->nr_tasks--;
 2336	grp->nr_tasks++;
 2337
 2338	spin_unlock(&my_grp->lock);
 2339	spin_unlock_irq(&grp->lock);
 2340
 2341	rcu_assign_pointer(p->numa_group, grp);
 2342
 2343	put_numa_group(my_grp);
 2344	return;
 2345
 2346no_join:
 2347	rcu_read_unlock();
 2348	return;
 2349}
 2350
 2351/*
 2352 * Get rid of NUMA staticstics associated with a task (either current or dead).
 2353 * If @final is set, the task is dead and has reached refcount zero, so we can
 2354 * safely free all relevant data structures. Otherwise, there might be
 2355 * concurrent reads from places like load balancing and procfs, and we should
 2356 * reset the data back to default state without freeing ->numa_faults.
 2357 */
 2358void task_numa_free(struct task_struct *p, bool final)
 2359{
 2360	/* safe: p either is current or is being freed by current */
 2361	struct numa_group *grp = rcu_dereference_raw(p->numa_group);
 2362	unsigned long *numa_faults = p->numa_faults;
 2363	unsigned long flags;
 2364	int i;
 2365
 2366	if (!numa_faults)
 2367		return;
 2368
 2369	if (grp) {
 2370		spin_lock_irqsave(&grp->lock, flags);
 2371		for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
 2372			grp->faults[i] -= p->numa_faults[i];
 2373		grp->total_faults -= p->total_numa_faults;
 2374
 2375		grp->nr_tasks--;
 2376		spin_unlock_irqrestore(&grp->lock, flags);
 2377		RCU_INIT_POINTER(p->numa_group, NULL);
 2378		put_numa_group(grp);
 2379	}
 2380
 2381	if (final) {
 2382		p->numa_faults = NULL;
 2383		kfree(numa_faults);
 2384	} else {
 2385		p->total_numa_faults = 0;
 2386		for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
 2387			numa_faults[i] = 0;
 2388	}
 2389}
 2390
 2391/*
 2392 * Got a PROT_NONE fault for a page on @node.
 2393 */
 2394void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
 
 
 2395{
 2396	struct task_struct *p = current;
 2397	bool migrated = flags & TNF_MIGRATED;
 2398	int cpu_node = task_node(current);
 2399	int local = !!(flags & TNF_FAULT_LOCAL);
 2400	struct numa_group *ng;
 2401	int priv;
 2402
 2403	if (!static_branch_likely(&sched_numa_balancing))
 2404		return;
 2405
 2406	/* for example, ksmd faulting in a user's mm */
 2407	if (!p->mm)
 2408		return;
 2409
 2410	/* Allocate buffer to track faults on a per-node basis */
 2411	if (unlikely(!p->numa_faults)) {
 2412		int size = sizeof(*p->numa_faults) *
 2413			   NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids;
 2414
 2415		p->numa_faults = kzalloc(size, GFP_KERNEL|__GFP_NOWARN);
 2416		if (!p->numa_faults)
 2417			return;
 2418
 2419		p->total_numa_faults = 0;
 2420		memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
 2421	}
 2422
 2423	/*
 2424	 * First accesses are treated as private, otherwise consider accesses
 2425	 * to be private if the accessing pid has not changed
 2426	 */
 2427	if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) {
 2428		priv = 1;
 2429	} else {
 2430		priv = cpupid_match_pid(p, last_cpupid);
 2431		if (!priv && !(flags & TNF_NO_GROUP))
 2432			task_numa_group(p, last_cpupid, flags, &priv);
 2433	}
 2434
 2435	/*
 2436	 * If a workload spans multiple NUMA nodes, a shared fault that
 2437	 * occurs wholly within the set of nodes that the workload is
 2438	 * actively using should be counted as local. This allows the
 2439	 * scan rate to slow down when a workload has settled down.
 2440	 */
 2441	ng = deref_curr_numa_group(p);
 2442	if (!priv && !local && ng && ng->active_nodes > 1 &&
 2443				numa_is_active_node(cpu_node, ng) &&
 2444				numa_is_active_node(mem_node, ng))
 2445		local = 1;
 2446
 2447	/*
 2448	 * Retry to migrate task to preferred node periodically, in case it
 2449	 * previously failed, or the scheduler moved us.
 2450	 */
 2451	if (time_after(jiffies, p->numa_migrate_retry)) {
 2452		task_numa_placement(p);
 2453		numa_migrate_preferred(p);
 2454	}
 2455
 2456	if (migrated)
 2457		p->numa_pages_migrated += pages;
 2458	if (flags & TNF_MIGRATE_FAIL)
 2459		p->numa_faults_locality[2] += pages;
 2460
 2461	p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages;
 2462	p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages;
 2463	p->numa_faults_locality[local] += pages;
 2464}
 2465
 2466static void reset_ptenuma_scan(struct task_struct *p)
 2467{
 2468	/*
 2469	 * We only did a read acquisition of the mmap sem, so
 2470	 * p->mm->numa_scan_seq is written to without exclusive access
 2471	 * and the update is not guaranteed to be atomic. That's not
 2472	 * much of an issue though, since this is just used for
 2473	 * statistical sampling. Use READ_ONCE/WRITE_ONCE, which are not
 2474	 * expensive, to avoid any form of compiler optimizations:
 2475	 */
 2476	WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1);
 2477	p->mm->numa_scan_offset = 0;
 2478}
 2479
/*
 * The expensive part of numa migration is done from task_work context.
 * Triggered from task_tick_numa().
 *
 * @work: the numa_work callback head embedded in the current task
 *
 * Walks the VMAs of the current mm starting at mm->numa_scan_offset and
 * calls change_prot_numa() on them, bounded by
 * sysctl_numa_balancing_scan_size, then records where to resume.
 */
static void task_numa_work(struct callback_head *work)
{
	unsigned long migrate, next_scan, now = jiffies;
	struct task_struct *p = current;
	struct mm_struct *mm = p->mm;
	u64 runtime = p->se.sum_exec_runtime;
	struct vm_area_struct *vma;
	unsigned long start, end;
	unsigned long nr_pte_updates = 0;
	long pages, virtpages;

	SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work));

	/*
	 * Self-link the callback again; task_tick_numa() only queues the work
	 * while work->next == work.
	 */
	work->next = work;
	/*
	 * Who cares about NUMA placement when they're dying.
	 *
	 * NOTE: make sure not to dereference p->mm before this check,
	 * exit_task_work() happens _after_ exit_mm() so we could be called
	 * without p->mm even though we still had it when we enqueued this
	 * work.
	 */
	if (p->flags & PF_EXITING)
		return;

	if (!mm->numa_next_scan) {
		mm->numa_next_scan = now +
			msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
	}

	/*
	 * Enforce maximal scan/migration frequency..
	 */
	migrate = mm->numa_next_scan;
	if (time_before(now, migrate))
		return;

	if (p->numa_scan_period == 0) {
		p->numa_scan_period_max = task_scan_max(p);
		p->numa_scan_period = task_scan_start(p);
	}

	/*
	 * Only the task whose cmpxchg installs the new deadline goes on to
	 * scan; any other task that raced with it returns here.
	 */
	next_scan = now + msecs_to_jiffies(p->numa_scan_period);
	if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
		return;

	/*
	 * Delay this task enough that another task of this mm will likely win
	 * the next time around.
	 */
	p->node_stamp += 2 * TICK_NSEC;

	start = mm->numa_scan_offset;
	pages = sysctl_numa_balancing_scan_size;
	pages <<= 20 - PAGE_SHIFT; /* MB in pages */
	virtpages = pages * 8;	   /* Scan up to this much virtual space */
	if (!pages)
		return;


	if (!down_read_trylock(&mm->mmap_sem))
		return;
	vma = find_vma(mm, start);
	if (!vma) {
		/* Nothing at or above the saved offset: restart from the first VMA. */
		reset_ptenuma_scan(p);
		start = 0;
		vma = mm->mmap;
	}
	for (; vma; vma = vma->vm_next) {
		if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
			is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) {
			continue;
		}

		/*
		 * Shared library pages mapped by multiple processes are not
		 * migrated as it is expected they are cache replicated. Avoid
		 * hinting faults in read-only file-backed mappings or the vdso
		 * as migrating the pages will be of marginal benefit.
		 */
		if (!vma->vm_mm ||
		    (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
			continue;

		/*
		 * Skip inaccessible VMAs to avoid any confusion between
		 * PROT_NONE and NUMA hinting ptes
		 */
		if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
			continue;

		do {
			start = max(start, vma->vm_start);
			end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
			end = min(end, vma->vm_end);
			nr_pte_updates = change_prot_numa(vma, start, end);

			/*
			 * Try to scan sysctl_numa_balancing_scan_size worth of
			 * hpages that have at least one present PTE that
			 * is not already pte-numa. If the VMA contains
			 * areas that are unused or already full of prot_numa
			 * PTEs, scan up to virtpages, to skip through those
			 * areas faster.
			 */
			if (nr_pte_updates)
				pages -= (end - start) >> PAGE_SHIFT;
			virtpages -= (end - start) >> PAGE_SHIFT;

			start = end;
			if (pages <= 0 || virtpages <= 0)
				goto out;

			cond_resched();
		} while (end != vma->vm_end);
	}

out:
	/*
	 * It is possible to reach the end of the VMA list but the last few
	 * VMAs are not guaranteed to the vma_migratable. If they are not, we
	 * would find the !migratable VMA on the next scan but not reset the
	 * scanner to the start so check it now.
	 */
	if (vma)
		mm->numa_scan_offset = start;
	else
		reset_ptenuma_scan(p);
	up_read(&mm->mmap_sem);

	/*
	 * Make sure tasks use at least 32x as much time to run other code
	 * than they used here, to limit NUMA PTE scanning overhead to 3% max.
	 * Usually update_task_scan_period slows down scanning enough; on an
	 * overloaded system we need to limit overhead on a per task basis.
	 */
	if (unlikely(p->se.sum_exec_runtime != runtime)) {
		u64 diff = p->se.sum_exec_runtime - runtime;
		p->node_stamp += 32 * diff;
	}
}
 2625
 2626void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
 2627{
 2628	int mm_users = 0;
 2629	struct mm_struct *mm = p->mm;
 2630
 2631	if (mm) {
 2632		mm_users = atomic_read(&mm->mm_users);
 2633		if (mm_users == 1) {
 2634			mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
 2635			mm->numa_scan_seq = 0;
 2636		}
 2637	}
 2638	p->node_stamp			= 0;
 2639	p->numa_scan_seq		= mm ? mm->numa_scan_seq : 0;
 2640	p->numa_scan_period		= sysctl_numa_balancing_scan_delay;
 2641	/* Protect against double add, see task_tick_numa and task_numa_work */
 2642	p->numa_work.next		= &p->numa_work;
 2643	p->numa_faults			= NULL;
 2644	RCU_INIT_POINTER(p->numa_group, NULL);
 2645	p->last_task_numa_placement	= 0;
 2646	p->last_sum_exec_runtime	= 0;
 2647
 2648	init_task_work(&p->numa_work, task_numa_work);
 2649
 2650	/* New address space, reset the preferred nid */
 2651	if (!(clone_flags & CLONE_VM)) {
 2652		p->numa_preferred_nid = NUMA_NO_NODE;
 2653		return;
 2654	}
 2655
 2656	/*
 2657	 * New thread, keep existing numa_preferred_nid which should be copied
 2658	 * already by arch_dup_task_struct but stagger when scans start.
 2659	 */
 2660	if (mm) {
 2661		unsigned int delay;
 2662
 2663		delay = min_t(unsigned int, task_scan_max(current),
 2664			current->numa_scan_period * mm_users * NSEC_PER_MSEC);
 2665		delay += 2 * TICK_NSEC;
 2666		p->node_stamp = delay;
 2667	}
 2668}
 2669
 2670/*
 2671 * Drive the periodic memory faults..
 2672 */
 2673static void task_tick_numa(struct rq *rq, struct task_struct *curr)
 2674{
 2675	struct callback_head *work = &curr->numa_work;
 2676	u64 period, now;
 2677
 2678	/*
 2679	 * We don't care about NUMA placement if we don't have memory.
 2680	 */
 2681	if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)
 2682		return;
 2683
 2684	/*
 2685	 * Using runtime rather than walltime has the dual advantage that
 2686	 * we (mostly) drive the selection from busy threads and that the
 2687	 * task needs to have done some actual work before we bother with
 2688	 * NUMA placement.
 2689	 */
 2690	now = curr->se.sum_exec_runtime;
 2691	period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
 
 2692
 2693	if (now > curr->node_stamp + period) {
 2694		if (!curr->node_stamp)
 2695			curr->numa_scan_period = task_scan_start(curr);
 2696		curr->node_stamp += period;
 2697
 2698		if (!time_before(jiffies, curr->mm->numa_next_scan))
 2699			task_work_add(curr, work, true);
 2700	}
 2701}
 2702
 2703static void update_scan_period(struct task_struct *p, int new_cpu)
 2704{
 2705	int src_nid = cpu_to_node(task_cpu(p));
 2706	int dst_nid = cpu_to_node(new_cpu);
 2707
 2708	if (!static_branch_likely(&sched_numa_balancing))
 2709		return;
 2710
 2711	if (!p->mm || !p->numa_faults || (p->flags & PF_EXITING))
 2712		return;
 2713
 2714	if (src_nid == dst_nid)
 2715		return;
 2716
 2717	/*
 2718	 * Allow resets if faults have been trapped before one scan
 2719	 * has completed. This is most likely due to a new task that
 2720	 * is pulled cross-node due to wakeups or load balancing.
 2721	 */
 2722	if (p->numa_scan_seq) {
 2723		/*
 2724		 * Avoid scan adjustments if moving to the preferred
 2725		 * node or if the task was not previously running on
 2726		 * the preferred node.
 2727		 */
 2728		if (dst_nid == p->numa_preferred_nid ||
 2729		    (p->numa_preferred_nid != NUMA_NO_NODE &&
 2730			src_nid != p->numa_preferred_nid))
 2731			return;
 2732	}
 2733
 2734	p->numa_scan_period = task_scan_start(p);
 2735}
 2736
 2737#else
 2738static void task_tick_numa(struct rq *rq, struct task_struct *curr)
 2739{
 2740}
 2741
 2742static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p)
 2743{
 2744}
 
 2745
 2746static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
 2747{
 2748}
 2749
 2750static inline void update_scan_period(struct task_struct *p, int new_cpu)
 2751{
 2752}
 2753
 2754#endif /* CONFIG_NUMA_BALANCING */
 2755
 2756static void
 2757account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 2758{
 2759	update_load_add(&cfs_rq->load, se->load.weight);
 2760#ifdef CONFIG_SMP
 2761	if (entity_is_task(se)) {
 2762		struct rq *rq = rq_of(cfs_rq);
 2763
 2764		account_numa_enqueue(rq, task_of(se));
 2765		list_add(&se->group_node, &rq->cfs_tasks);
 2766	}
 2767#endif
 2768	cfs_rq->nr_running++;
 2769}
 2770
 2771static void
 2772account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 2773{
 2774	update_load_sub(&cfs_rq->load, se->load.weight);
 2775#ifdef CONFIG_SMP
 2776	if (entity_is_task(se)) {
 2777		account_numa_dequeue(rq_of(cfs_rq), task_of(se));
 2778		list_del_init(&se->group_node);
 2779	}
 2780#endif
 2781	cfs_rq->nr_running--;
 2782}
 2783
/*
 * Signed add and clamp on underflow.
 *
 * @_ptr: pointer to the value to update
 * @_val: delta to add; it may be negative
 *
 * If @_val is negative and the sum ends up greater than the old value, the
 * addition wrapped and 0 is stored instead (this check assumes *@_ptr has
 * an unsigned type -- confirm against callers).
 *
 * Explicitly do a load-store to ensure the intermediate value never hits
 * memory. This allows lockless observations without ever seeing the negative
 * values.
 */
#define add_positive(_ptr, _val) do {                           \
	typeof(_ptr) ptr = (_ptr);                              \
	typeof(_val) val = (_val);                              \
	typeof(*ptr) res, var = READ_ONCE(*ptr);                \
								\
	res = var + val;                                        \
								\
	if (val < 0 && res > var)                               \
		res = 0;                                        \
								\
	WRITE_ONCE(*ptr, res);                                  \
} while (0)
 2803
/*
 * Unsigned subtract and clamp on underflow.
 *
 * @_ptr: pointer to the value to update
 * @_val: amount to subtract; it is converted to the type of *@_ptr first
 *
 * If the difference ends up greater than the old value, the subtraction
 * wrapped and 0 is stored instead.
 *
 * Explicitly do a load-store to ensure the intermediate value never hits
 * memory. This allows lockless observations without ever seeing the negative
 * values.
 */
#define sub_positive(_ptr, _val) do {				\
	typeof(_ptr) ptr = (_ptr);				\
	typeof(*ptr) val = (_val);				\
	typeof(*ptr) res, var = READ_ONCE(*ptr);		\
	res = var - val;					\
	if (res > var)						\
		res = 0;					\
	WRITE_ONCE(*ptr, res);					\
} while (0)
 2820
/*
 * Remove and clamp on negative, from a local variable.
 *
 * @_ptr: pointer to the value to update
 * @_val: amount to subtract
 *
 * Subtracts the smaller of *@_ptr and @_val, so the result never drops
 * below zero.
 *
 * A variant of sub_positive(), which does not use explicit load-store
 * and is thus optimized for local variable updates.
 */
#define lsub_positive(_ptr, _val) do {				\
	typeof(_ptr) ptr = (_ptr);				\
	*ptr -= min_t(typeof(*ptr), *ptr, _val);		\
} while (0)
 2831
 2832#ifdef CONFIG_SMP
 2833static inline void
 2834enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
 2835{
 2836	cfs_rq->runnable_weight += se->runnable_weight;
 2837
 2838	cfs_rq->avg.runnable_load_avg += se->avg.runnable_load_avg;
 2839	cfs_rq->avg.runnable_load_sum += se_runnable(se) * se->avg.runnable_load_sum;
 2840}
 2841
 2842static inline void
 2843dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
 2844{
 2845	cfs_rq->runnable_weight -= se->runnable_weight;
 2846
 2847	sub_positive(&cfs_rq->avg.runnable_load_avg, se->avg.runnable_load_avg);
 2848	sub_positive(&cfs_rq->avg.runnable_load_sum,
 2849		     se_runnable(se) * se->avg.runnable_load_sum);
 2850}
 2851
 2852static inline void
 2853enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
 2854{
 2855	cfs_rq->avg.load_avg += se->avg.load_avg;
 2856	cfs_rq->avg.load_sum += se_weight(se) * se->avg.load_sum;
 2857}
 2858
 2859static inline void
 2860dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
 2861{
 2862	sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
 2863	sub_positive(&cfs_rq->avg.load_sum, se_weight(se) * se->avg.load_sum);
 2864}
 2865#else
/* Without CONFIG_SMP the load-average enqueue/dequeue helpers are empty. */
static inline void
enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
static inline void
dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
static inline void
enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
static inline void
dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
 2874#endif
 2875
/*
 * Change the load weight and runnable weight of @se on @cfs_rq.
 *
 * @cfs_rq:   runqueue whose totals include @se's contributions
 * @se:       entity being reweighted
 * @weight:   new value passed to update_load_set() for se->load
 * @runnable: new value for se->runnable_weight
 *
 * The entity's contributions are taken out of @cfs_rq before the weights
 * change and put back afterwards, so the runqueue totals are rebuilt from
 * the new values. The on_rq-only accounting is skipped for an entity that
 * is not queued.
 */
static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
			    unsigned long weight, unsigned long runnable)
{
	if (se->on_rq) {
		/* commit outstanding execution time */
		if (cfs_rq->curr == se)
			update_curr(cfs_rq);
		account_entity_dequeue(cfs_rq, se);
		dequeue_runnable_load_avg(cfs_rq, se);
	}
	dequeue_load_avg(cfs_rq, se);

	se->runnable_weight = runnable;
	update_load_set(&se->load, weight);

#ifdef CONFIG_SMP
	do {
		/* Recompute the averages from the existing sums and the new weights. */
		u32 divider = LOAD_AVG_MAX - 1024 + se->avg.period_contrib;

		se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider);
		se->avg.runnable_load_avg =
			div_u64(se_runnable(se) * se->avg.runnable_load_sum, divider);
	} while (0);
#endif

	enqueue_load_avg(cfs_rq, se);
	if (se->on_rq) {
		account_entity_enqueue(cfs_rq, se);
		enqueue_runnable_load_avg(cfs_rq, se);
	}
}
 2907
 2908void reweight_task(struct task_struct *p, int prio)
 2909{
 2910	struct sched_entity *se = &p->se;
 2911	struct cfs_rq *cfs_rq = cfs_rq_of(se);
 2912	struct load_weight *load = &se->load;
 2913	unsigned long weight = scale_load(sched_prio_to_weight[prio]);
 2914
 2915	reweight_entity(cfs_rq, se, weight, weight);
 2916	load->inv_weight = sched_prio_to_wmult[prio];
 2917}
 2918
 2919#ifdef CONFIG_FAIR_GROUP_SCHED
 2920#ifdef CONFIG_SMP
 2921/*
 2922 * All this does is approximate the hierarchical proportion which includes that
 2923 * global sum we all love to hate.
 2924 *
 2925 * That is, the weight of a group entity, is the proportional share of the
 2926 * group weight based on the group runqueue weights. That is:
 2927 *
 2928 *                     tg->weight * grq->load.weight
 2929 *   ge->load.weight = -----------------------------               (1)
 2930 *			  \Sum grq->load.weight
 2931 *
 2932 * Now, because computing that sum is prohibitively expensive to compute (been
 2933 * there, done that) we approximate it with this average stuff. The average
 2934 * moves slower and therefore the approximation is cheaper and more stable.
 2935 *
 2936 * So instead of the above, we substitute:
 2937 *
 2938 *   grq->load.weight -> grq->avg.load_avg                         (2)
 2939 *
 2940 * which yields the following:
 2941 *
 2942 *                     tg->weight * grq->avg.load_avg
 2943 *   ge->load.weight = ------------------------------              (3)
 2944 *				tg->load_avg
 2945 *
 2946 * Where: tg->load_avg ~= \Sum grq->avg.load_avg
 2947 *
 2948 * That is shares_avg, and it is right (given the approximation (2)).
 2949 *
 2950 * The problem with it is that because the average is slow -- it was designed
 2951 * to be exactly that of course -- this leads to transients in boundary
 2952 * conditions. In specific, the case where the group was idle and we start the
 2953 * one task. It takes time for our CPU's grq->avg.load_avg to build up,
 2954 * yielding bad latency etc..
 2955 *
 2956 * Now, in that special case (1) reduces to:
 2957 *
 2958 *                     tg->weight * grq->load.weight
 2959 *   ge->load.weight = ----------------------------- = tg->weight   (4)
 2960 *			    grq->load.weight
 2961 *
 2962 * That is, the sum collapses because all other CPUs are idle; the UP scenario.
 2963 *
 2964 * So what we do is modify our approximation (3) to approach (4) in the (near)
 2965 * UP case, like:
 2966 *
 2967 *   ge->load.weight =
 2968 *
 2969 *              tg->weight * grq->load.weight
 2970 *     ---------------------------------------------------         (5)
 2971 *     tg->load_avg - grq->avg.load_avg + grq->load.weight
 2972 *
 2973 * But because grq->load.weight can drop to 0, resulting in a divide by zero,
 2974 * we need to use grq->avg.load_avg as its lower bound, which then gives:
 2975 *
 2976 *
 2977 *                     tg->weight * grq->load.weight
 2978 *   ge->load.weight = -----------------------------		   (6)
 2979 *				tg_load_avg'
 2980 *
 2981 * Where:
 2982 *
 2983 *   tg_load_avg' = tg->load_avg - grq->avg.load_avg +
 2984 *                  max(grq->load.weight, grq->avg.load_avg)
 2985 *
 2986 * And that is shares_weight and is icky. In the (near) UP case it approaches
 2987 * (4) while in the normal case it approaches (3). It consistently
 2988 * overestimates the ge->load.weight and therefore:
 2989 *
 2990 *   \Sum ge->load.weight >= tg->weight
 2991 *
 2992 * hence icky!
 2993 */
 2994static long calc_group_shares(struct cfs_rq *cfs_rq)
 2995{
 2996	long tg_weight, tg_shares, load, shares;
 2997	struct task_group *tg = cfs_rq->tg;
 2998
 2999	tg_shares = READ_ONCE(tg->shares);
 3000
 3001	load = max(scale_load_down(cfs_rq->load.weight), cfs_rq->avg.load_avg);
 3002
 3003	tg_weight = atomic_long_read(&tg->load_avg);
 3004
 3005	/* Ensure tg_weight >= load */
 3006	tg_weight -= cfs_rq->tg_load_avg_contrib;
 3007	tg_weight += load;
 3008
 3009	shares = (tg_shares * load);
 3010	if (tg_weight)
 3011		shares /= tg_weight;
 3012
 3013	/*
 3014	 * MIN_SHARES has to be unscaled here to support per-CPU partitioning
 3015	 * of a group with small tg->shares value. It is a floor value which is
 3016	 * assigned as a minimum load.weight to the sched_entity representing
 3017	 * the group on a CPU.
 3018	 *
 3019	 * E.g. on 64-bit for a group with tg->shares of scale_load(15)=15*1024
 3020	 * on an 8-core system with 8 tasks each runnable on one CPU shares has
 3021	 * to be 15*1024*1/8=1920 instead of scale_load(MIN_SHARES)=2*1024. In
 3022	 * case no task is runnable on a CPU MIN_SHARES=2 should be returned
 3023	 * instead of 0.
 3024	 */
 3025	return clamp_t(long, shares, MIN_SHARES, tg_shares);
 3026}
 3027
 3028/*
 3029 * This calculates the effective runnable weight for a group entity based on
 3030 * the group entity weight calculated above.
 3031 *
 3032 * Because of the above approximation (2), our group entity weight is
 3033 * an load_avg based ratio (3). This means that it includes blocked load and
 3034 * does not represent the runnable weight.
 3035 *
 3036 * Approximate the group entity's runnable weight per ratio from the group
 3037 * runqueue:
 3038 *
 3039 *					     grq->avg.runnable_load_avg
 3040 *   ge->runnable_weight = ge->load.weight * -------------------------- (7)
 3041 *						 grq->avg.load_avg
 3042 *
 3043 * However, analogous to above, since the avg numbers are slow, this leads to
 3044 * transients in the from-idle case. Instead we use:
 3045 *
 3046 *   ge->runnable_weight = ge->load.weight *
 3047 *
 3048 *		max(grq->avg.runnable_load_avg, grq->runnable_weight)
 3049 *		-----------------------------------------------------	(8)
 3050 *		      max(grq->avg.load_avg, grq->load.weight)
 3051 *
 3052 * Where these max() serve both to use the 'instant' values to fix the slow
 3053 * from-idle and avoid the /0 on to-idle, similar to (6).
 3054 */
 3055static long calc_group_runnable(struct cfs_rq *cfs_rq, long shares)
 3056{
 3057	long runnable, load_avg;
 3058
 3059	load_avg = max(cfs_rq->avg.load_avg,
 3060		       scale_load_down(cfs_rq->load.weight));
 3061
 3062	runnable = max(cfs_rq->avg.runnable_load_avg,
 3063		       scale_load_down(cfs_rq->runnable_weight));
 3064
 3065	runnable *= shares;
 3066	if (load_avg)
 3067		runnable /= load_avg;
 3068
 3069	return clamp_t(long, runnable, MIN_SHARES, shares);
 3070}
 3071#endif /* CONFIG_SMP */
 3072
 3073static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
 3074
 3075/*
 3076 * Recomputes the group entity based on the current state of its group
 3077 * runqueue.
 3078 */
 3079static void update_cfs_group(struct sched_entity *se)
 3080{
 3081	struct cfs_rq *gcfs_rq = group_cfs_rq(se);
 3082	long shares, runnable;
 3083
 3084	if (!gcfs_rq)
 3085		return;
 3086
 3087	if (throttled_hierarchy(gcfs_rq))
 
 
 3088		return;
 3089
 3090#ifndef CONFIG_SMP
 3091	runnable = shares = READ_ONCE(gcfs_rq->tg->shares);
 3092
 3093	if (likely(se->load.weight == shares))
 3094		return;
 3095#else
 3096	shares   = calc_group_shares(gcfs_rq);
 3097	runnable = calc_group_runnable(gcfs_rq, shares);
 3098#endif
 
 3099
 3100	reweight_entity(cfs_rq_of(se), se, shares, runnable);
 3101}
 3102
 3103#else /* CONFIG_FAIR_GROUP_SCHED */
 3104static inline void update_cfs_group(struct sched_entity *se)
 3105{
 3106}
 3107#endif /* CONFIG_FAIR_GROUP_SCHED */
 3108
 3109static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags)
 3110{
 3111	struct rq *rq = rq_of(cfs_rq);
 3112
 3113	if (&rq->cfs == cfs_rq || (flags & SCHED_CPUFREQ_MIGRATION)) {
 3114		/*
 3115		 * There are a few boundary cases this might miss but it should
 3116		 * get called often enough that that should (hopefully) not be
 3117		 * a real problem.
 3118		 *
 3119		 * It will not get called when we go idle, because the idle
 3120		 * thread is a different class (!fair), nor will the utilization
 3121		 * number include things like RT tasks.
 3122		 *
 3123		 * As is, the util number is not freq-invariant (we'd have to
 3124		 * implement arch_scale_freq_capacity() for that).
 3125		 *
 3126		 * See cpu_util().
 3127		 */
 3128		cpufreq_update_util(rq, flags);
 3129	}
 3130}
 3131
 3132#ifdef CONFIG_SMP
 3133#ifdef CONFIG_FAIR_GROUP_SCHED
 3134/**
 3135 * update_tg_load_avg - update the tg's load avg
 3136 * @cfs_rq: the cfs_rq whose avg changed
 3137 * @force: update regardless of how small the difference
 3138 *
 3139 * This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load.
 3140 * However, because tg->load_avg is a global value there are performance
 3141 * considerations.
 3142 *
 3143 * In order to avoid having to look at the other cfs_rq's, we use a
 3144 * differential update where we store the last value we propagated. This in
 3145 * turn allows skipping updates if the differential is 'small'.
 3146 *
 3147 * Updating tg's load_avg is necessary before update_cfs_share().
 3148 */
 3149static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
 3150{
 3151	long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
 3152
 3153	/*
 3154	 * No need to update load_avg for root_task_group as it is not used.
 3155	 */
 3156	if (cfs_rq->tg == &root_task_group)
 3157		return;
 3158
 3159	if (force || abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
 3160		atomic_long_add(delta, &cfs_rq->tg->load_avg);
 3161		cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
 3162	}
 3163}
 3164
 3165/*
 3166 * Called within set_task_rq() right before setting a task's CPU. The
 3167 * caller only guarantees p->pi_lock is held; no other assumptions,
 3168 * including the state of rq->lock, should be made.
 3169 */
 3170void set_task_rq_fair(struct sched_entity *se,
 3171		      struct cfs_rq *prev, struct cfs_rq *next)
 3172{
 3173	u64 p_last_update_time;
 3174	u64 n_last_update_time;
 3175
 3176	if (!sched_feat(ATTACH_AGE_LOAD))
 3177		return;
 3178
 3179	/*
 3180	 * We are supposed to update the task to "current" time, then its up to
 3181	 * date and ready to go to new CPU/cfs_rq. But we have difficulty in
 3182	 * getting what current time is, so simply throw away the out-of-date
 3183	 * time. This will result in the wakee task is less decayed, but giving
 3184	 * the wakee more load sounds not bad.
 3185	 */
 3186	if (!(se->avg.last_update_time && prev))
 3187		return;
 3188
 3189#ifndef CONFIG_64BIT
 3190	{
 3191		u64 p_last_update_time_copy;
 3192		u64 n_last_update_time_copy;
 3193
 3194		do {
 3195			p_last_update_time_copy = prev->load_last_update_time_copy;
 3196			n_last_update_time_copy = next->load_last_update_time_copy;
 3197
 3198			smp_rmb();
 3199
 3200			p_last_update_time = prev->avg.last_update_time;
 3201			n_last_update_time = next->avg.last_update_time;
 3202
 3203		} while (p_last_update_time != p_last_update_time_copy ||
 3204			 n_last_update_time != n_last_update_time_copy);
 3205	}
 3206#else
 3207	p_last_update_time = prev->avg.last_update_time;
 3208	n_last_update_time = next->avg.last_update_time;
 3209#endif
 3210	__update_load_avg_blocked_se(p_last_update_time, se);
 3211	se->avg.last_update_time = n_last_update_time;
 3212}
 3213
 3214
 3215/*
 3216 * When on migration a sched_entity joins/leaves the PELT hierarchy, we need to
 3217 * propagate its contribution. The key to this propagation is the invariant
 3218 * that for each group:
 3219 *
 3220 *   ge->avg == grq->avg						(1)
 3221 *
 3222 * _IFF_ we look at the pure running and runnable sums. Because they
 3223 * represent the very same entity, just at different points in the hierarchy.
 3224 *
 3225 * Per the above update_tg_cfs_util() is trivial and simply copies the running
 3226 * sum over (but still wrong, because the group entity and group rq do not have
 3227 * their PELT windows aligned).
 3228 *
 3229 * However, update_tg_cfs_runnable() is more complex. So we have:
 3230 *
 3231 *   ge->avg.load_avg = ge->load.weight * ge->avg.runnable_avg		(2)
 3232 *
 3233 * And since, like util, the runnable part should be directly transferable,
 3234 * the following would _appear_ to be the straight forward approach:
 3235 *
 3236 *   grq->avg.load_avg = grq->load.weight * grq->avg.runnable_avg	(3)
 3237 *
 3238 * And per (1) we have:
 3239 *
 3240 *   ge->avg.runnable_avg == grq->avg.runnable_avg
 3241 *
 3242 * Which gives:
 3243 *
 3244 *                      ge->load.weight * grq->avg.load_avg
 3245 *   ge->avg.load_avg = -----------------------------------		(4)
 3246 *                               grq->load.weight
 3247 *
 3248 * Except that is wrong!
 3249 *
 3250 * Because while for entities historical weight is not important and we
 3251 * really only care about our future and therefore can consider a pure
 3252 * runnable sum, runqueues can NOT do this.
 3253 *
 3254 * We specifically want runqueues to have a load_avg that includes
 3255 * historical weights. Those represent the blocked load, the load we expect
 3256 * to (shortly) return to us. This only works by keeping the weights as
 3257 * integral part of the sum. We therefore cannot decompose as per (3).
 3258 *
 3259 * Another reason this doesn't work is that runnable isn't a 0-sum entity.
 3260 * Imagine a rq with 2 tasks that each are runnable 2/3 of the time. Then the
 3261 * rq itself is runnable anywhere between 2/3 and 1 depending on how the
 3262 * runnable section of these tasks overlap (or not). If they were to perfectly
 3263 * align the rq as a whole would be runnable 2/3 of the time. If however we
 3264 * always have at least 1 runnable task, the rq as a whole is always runnable.
 3265 *
 3266 * So we'll have to approximate.. :/
 3267 *
 3268 * Given the constraint:
 3269 *
 3270 *   ge->avg.running_sum <= ge->avg.runnable_sum <= LOAD_AVG_MAX
 3271 *
 3272 * We can construct a rule that adds runnable to a rq by assuming minimal
 3273 * overlap.
 3274 *
 3275 * On removal, we'll assume each task is equally runnable; which yields:
 3276 *
 3277 *   grq->avg.runnable_sum = grq->avg.load_sum / grq->load.weight
 3278 *
 3279 * XXX: only do this for the part of runnable > running ?
 3280 *
 3281 */
 3282
 3283static inline void
 3284update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
 3285{
 3286	long delta = gcfs_rq->avg.util_avg - se->avg.util_avg;
 3287
 3288	/* Nothing to update */
 3289	if (!delta)
 3290		return;
 3291
 3292	/*
 3293	 * The relation between sum and avg is:
 3294	 *
 3295	 *   LOAD_AVG_MAX - 1024 + sa->period_contrib
 3296	 *
 3297	 * however, the PELT windows are not aligned between grq and gse.
 3298	 */
 3299
 3300	/* Set new sched_entity's utilization */
 3301	se->avg.util_avg = gcfs_rq->avg.util_avg;
 3302	se->avg.util_sum = se->avg.util_avg * LOAD_AVG_MAX;
 3303
 3304	/* Update parent cfs_rq utilization */
 3305	add_positive(&cfs_rq->avg.util_avg, delta);
 3306	cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX;
 3307}
 3308
 3309static inline void
 3310update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
 3311{
 3312	long delta_avg, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum;
 3313	unsigned long runnable_load_avg, load_avg;
 3314	u64 runnable_load_sum, load_sum = 0;
 3315	s64 delta_sum;
 3316
 3317	if (!runnable_sum)
 3318		return;
 3319
 3320	gcfs_rq->prop_runnable_sum = 0;
 3321
 3322	if (runnable_sum >= 0) {
 3323		/*
 3324		 * Add runnable; clip at LOAD_AVG_MAX. Reflects that until
 3325		 * the CPU is saturated running == runnable.
 3326		 */
 3327		runnable_sum += se->avg.load_sum;
 3328		runnable_sum = min(runnable_sum, (long)LOAD_AVG_MAX);
 3329	} else {
 3330		/*
 3331		 * Estimate the new unweighted runnable_sum of the gcfs_rq by
 3332		 * assuming all tasks are equally runnable.
 3333		 */
 3334		if (scale_load_down(gcfs_rq->load.weight)) {
 3335			load_sum = div_s64(gcfs_rq->avg.load_sum,
 3336				scale_load_down(gcfs_rq->load.weight));
 3337		}
 3338
 3339		/* But make sure to not inflate se's runnable */
 3340		runnable_sum = min(se->avg.load_sum, load_sum);
 3341	}
 3342
 3343	/*
 3344	 * runnable_sum can't be lower than running_sum
 3345	 * Rescale running sum to be in the same range as runnable sum
 3346	 * running_sum is in [0 : LOAD_AVG_MAX <<  SCHED_CAPACITY_SHIFT]
 3347	 * runnable_sum is in [0 : LOAD_AVG_MAX]
 3348	 */
 3349	running_sum = se->avg.util_sum >> SCHED_CAPACITY_SHIFT;
 3350	runnable_sum = max(runnable_sum, running_sum);
 3351
 3352	load_sum = (s64)se_weight(se) * runnable_sum;
 3353	load_avg = div_s64(load_sum, LOAD_AVG_MAX);
 3354
 3355	delta_sum = load_sum - (s64)se_weight(se) * se->avg.load_sum;
 3356	delta_avg = load_avg - se->avg.load_avg;
 3357
 3358	se->avg.load_sum = runnable_sum;
 3359	se->avg.load_avg = load_avg;
 3360	add_positive(&cfs_rq->avg.load_avg, delta_avg);
 3361	add_positive(&cfs_rq->avg.load_sum, delta_sum);
 3362
 3363	runnable_load_sum = (s64)se_runnable(se) * runnable_sum;
 3364	runnable_load_avg = div_s64(runnable_load_sum, LOAD_AVG_MAX);
 3365	delta_sum = runnable_load_sum - se_weight(se) * se->avg.runnable_load_sum;
 3366	delta_avg = runnable_load_avg - se->avg.runnable_load_avg;
 3367
 3368	se->avg.runnable_load_sum = runnable_sum;
 3369	se->avg.runnable_load_avg = runnable_load_avg;
 3370
 3371	if (se->on_rq) {
 3372		add_positive(&cfs_rq->avg.runnable_load_avg, delta_avg);
 3373		add_positive(&cfs_rq->avg.runnable_load_sum, delta_sum);
 3374	}
 3375}
 3376
 3377static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum)
 3378{
 3379	cfs_rq->propagate = 1;
 3380	cfs_rq->prop_runnable_sum += runnable_sum;
 3381}
 3382
 3383/* Update task and its cfs_rq load average */
 3384static inline int propagate_entity_load_avg(struct sched_entity *se)
 3385{
 3386	struct cfs_rq *cfs_rq, *gcfs_rq;
 3387
 3388	if (entity_is_task(se))
 3389		return 0;
 3390
 3391	gcfs_rq = group_cfs_rq(se);
 3392	if (!gcfs_rq->propagate)
 3393		return 0;
 3394
 3395	gcfs_rq->propagate = 0;
 3396
 3397	cfs_rq = cfs_rq_of(se);
 3398
 3399	add_tg_cfs_propagate(cfs_rq, gcfs_rq->prop_runnable_sum);
 3400
 3401	update_tg_cfs_util(cfs_rq, se, gcfs_rq);
 3402	update_tg_cfs_runnable(cfs_rq, se, gcfs_rq);
 3403
 3404	trace_pelt_cfs_tp(cfs_rq);
 3405	trace_pelt_se_tp(se);
 3406
 3407	return 1;
 3408}
 3409
 3410/*
 3411 * Check if we need to update the load and the utilization of a blocked
 3412 * group_entity:
 3413 */
 3414static inline bool skip_blocked_update(struct sched_entity *se)
 3415{
 3416	struct cfs_rq *gcfs_rq = group_cfs_rq(se);
 3417
 3418	/*
 3419	 * If sched_entity still have not zero load or utilization, we have to
 3420	 * decay it:
 3421	 */
 3422	if (se->avg.load_avg || se->avg.util_avg)
 3423		return false;
 3424
 3425	/*
 3426	 * If there is a pending propagation, we have to update the load and
 3427	 * the utilization of the sched_entity:
 3428	 */
 3429	if (gcfs_rq->propagate)
 3430		return false;
 3431
 3432	/*
 3433	 * Otherwise, the load and the utilization of the sched_entity is
 3434	 * already zero and there is no pending propagation, so it will be a
 3435	 * waste of time to try to decay it:
 3436	 */
 3437	return true;
 3438}
 3439
 3440#else /* CONFIG_FAIR_GROUP_SCHED */
 3441
 3442static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
 3443
 3444static inline int propagate_entity_load_avg(struct sched_entity *se)
 3445{
 3446	return 0;
 3447}
 3448
 3449static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) {}
 3450
 3451#endif /* CONFIG_FAIR_GROUP_SCHED */
 3452
 3453/**
 3454 * update_cfs_rq_load_avg - update the cfs_rq's load/util averages
 3455 * @now: current time, as per cfs_rq_clock_pelt()
 3456 * @cfs_rq: cfs_rq to update
 3457 *
 3458 * The cfs_rq avg is the direct sum of all its entities (blocked and runnable)
 3459 * avg. The immediate corollary is that all (fair) tasks must be attached, see
 3460 * post_init_entity_util_avg().
 3461 *
 3462 * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example.
 3463 *
 3464 * Returns true if the load decayed or we removed load.
 3465 *
 3466 * Since both these conditions indicate a changed cfs_rq->avg.load we should
 3467 * call update_tg_load_avg() when this function returns true.
 3468 */
 3469static inline int
 3470update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
 3471{
 3472	unsigned long removed_load = 0, removed_util = 0, removed_runnable_sum = 0;
 3473	struct sched_avg *sa = &cfs_rq->avg;
 3474	int decayed = 0;
 3475
 3476	if (cfs_rq->removed.nr) {
 3477		unsigned long r;
 3478		u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib;
 3479
 3480		raw_spin_lock(&cfs_rq->removed.lock);
 3481		swap(cfs_rq->removed.util_avg, removed_util);
 3482		swap(cfs_rq->removed.load_avg, removed_load);
 3483		swap(cfs_rq->removed.runnable_sum, removed_runnable_sum);
 3484		cfs_rq->removed.nr = 0;
 3485		raw_spin_unlock(&cfs_rq->removed.lock);
 3486
 3487		r = removed_load;
 3488		sub_positive(&sa->load_avg, r);
 3489		sub_positive(&sa->load_sum, r * divider);
 3490
 3491		r = removed_util;
 3492		sub_positive(&sa->util_avg, r);
 3493		sub_positive(&sa->util_sum, r * divider);
 3494
 3495		add_tg_cfs_propagate(cfs_rq, -(long)removed_runnable_sum);
 
 3496
 3497		decayed = 1;
 
 
 
 3498	}
 
 
 3499
 3500	decayed |= __update_load_avg_cfs_rq(now, cfs_rq);
 3501
 3502#ifndef CONFIG_64BIT
 3503	smp_wmb();
 3504	cfs_rq->load_last_update_time_copy = sa->last_update_time;
 3505#endif
 3506
 3507	if (decayed)
 3508		cfs_rq_util_change(cfs_rq, 0);
 3509
 3510	return decayed;
 3511}
 3512
 3513/**
 3514 * attach_entity_load_avg - attach this entity to its cfs_rq load avg
 3515 * @cfs_rq: cfs_rq to attach to
 3516 * @se: sched_entity to attach
 3517 * @flags: migration hints
 3518 *
 3519 * Must call update_cfs_rq_load_avg() before this, since we rely on
 3520 * cfs_rq->avg.last_update_time being current.
 3521 */
 3522static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 3523{
 3524	u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib;
 3525
 3526	/*
 3527	 * When we attach the @se to the @cfs_rq, we must align the decay
 3528	 * window because without that, really weird and wonderful things can
 3529	 * happen.
 3530	 *
 3531	 * XXX illustrate
 3532	 */
 3533	se->avg.last_update_time = cfs_rq->avg.last_update_time;
 3534	se->avg.period_contrib = cfs_rq->avg.period_contrib;
 3535
 3536	/*
 3537	 * Hell(o) Nasty stuff.. we need to recompute _sum based on the new
 3538	 * period_contrib. This isn't strictly correct, but since we're
 3539	 * entirely outside of the PELT hierarchy, nobody cares if we truncate
 3540	 * _sum a little.
 3541	 */
 3542	se->avg.util_sum = se->avg.util_avg * divider;
 3543
 3544	se->avg.load_sum = divider;
 3545	if (se_weight(se)) {
 3546		se->avg.load_sum =
 3547			div_u64(se->avg.load_avg * se->avg.load_sum, se_weight(se));
 
 
 
 
 
 
 
 
 3548	}
 3549
 3550	se->avg.runnable_load_sum = se->avg.load_sum;
 3551
 3552	enqueue_load_avg(cfs_rq, se);
 3553	cfs_rq->avg.util_avg += se->avg.util_avg;
 3554	cfs_rq->avg.util_sum += se->avg.util_sum;
 3555
 3556	add_tg_cfs_propagate(cfs_rq, se->avg.load_sum);
 3557
 3558	cfs_rq_util_change(cfs_rq, flags);
 3559
 3560	trace_pelt_cfs_tp(cfs_rq);
 3561}
 3562
 3563/**
 3564 * detach_entity_load_avg - detach this entity from its cfs_rq load avg
 3565 * @cfs_rq: cfs_rq to detach from
 3566 * @se: sched_entity to detach
 3567 *
 3568 * Must call update_cfs_rq_load_avg() before this, since we rely on
 3569 * cfs_rq->avg.last_update_time being current.
 3570 */
 3571static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
 3572{
 3573	dequeue_load_avg(cfs_rq, se);
 3574	sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
 3575	sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
 3576
 3577	add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum);
 3578
 3579	cfs_rq_util_change(cfs_rq, 0);
 3580
 3581	trace_pelt_cfs_tp(cfs_rq);
 3582}
 3583
 3584/*
 3585 * Optional action to be done while updating the load average
 3586 */
 3587#define UPDATE_TG	0x1
 3588#define SKIP_AGE_LOAD	0x2
 3589#define DO_ATTACH	0x4
 3590
 3591/* Update task and its cfs_rq load average */
 3592static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 3593{
 3594	u64 now = cfs_rq_clock_pelt(cfs_rq);
 3595	int decayed;
 3596
 3597	/*
 3598	 * Track task load average for carrying it to new CPU after migrated, and
 3599	 * track group sched_entity load average for task_h_load calc in migration
 3600	 */
 3601	if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
 3602		__update_load_avg_se(now, cfs_rq, se);
 3603
 3604	decayed  = update_cfs_rq_load_avg(now, cfs_rq);
 3605	decayed |= propagate_entity_load_avg(se);
 3606
 3607	if (!se->avg.last_update_time && (flags & DO_ATTACH)) {
 3608
 3609		/*
 3610		 * DO_ATTACH means we're here from enqueue_entity().
 3611		 * !last_update_time means we've passed through
 3612		 * migrate_task_rq_fair() indicating we migrated.
 3613		 *
 3614		 * IOW we're enqueueing a task on a new CPU.
 3615		 */
 3616		attach_entity_load_avg(cfs_rq, se, SCHED_CPUFREQ_MIGRATION);
 3617		update_tg_load_avg(cfs_rq, 0);
 3618
 3619	} else if (decayed && (flags & UPDATE_TG))
 3620		update_tg_load_avg(cfs_rq, 0);
 3621}
 3622
 3623#ifndef CONFIG_64BIT
 3624static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
 3625{
 3626	u64 last_update_time_copy;
 3627	u64 last_update_time;
 3628
 3629	do {
 3630		last_update_time_copy = cfs_rq->load_last_update_time_copy;
 3631		smp_rmb();
 3632		last_update_time = cfs_rq->avg.last_update_time;
 3633	} while (last_update_time != last_update_time_copy);
 3634
 3635	return last_update_time;
 3636}
 3637#else
 3638static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
 3639{
 3640	return cfs_rq->avg.last_update_time;
 3641}
 3642#endif
 3643
 3644/*
 3645 * Synchronize entity load avg of dequeued entity without locking
 3646 * the previous rq.
 3647 */
 3648static void sync_entity_load_avg(struct sched_entity *se)
 3649{
 3650	struct cfs_rq *cfs_rq = cfs_rq_of(se);
 3651	u64 last_update_time;
 3652
 3653	last_update_time = cfs_rq_last_update_time(cfs_rq);
 3654	__update_load_avg_blocked_se(last_update_time, se);
 3655}
 3656
 3657/*
 3658 * Task first catches up with cfs_rq, and then subtract
 3659 * itself from the cfs_rq (task must be off the queue now).
 3660 */
 3661static void remove_entity_load_avg(struct sched_entity *se)
 3662{
 3663	struct cfs_rq *cfs_rq = cfs_rq_of(se);
 3664	unsigned long flags;
 3665
 3666	/*
 3667	 * tasks cannot exit without having gone through wake_up_new_task() ->
 3668	 * post_init_entity_util_avg() which will have added things to the
 3669	 * cfs_rq, so we can remove unconditionally.
 3670	 */
 3671
 3672	sync_entity_load_avg(se);
 3673
 3674	raw_spin_lock_irqsave(&cfs_rq->removed.lock, flags);
 3675	++cfs_rq->removed.nr;
 3676	cfs_rq->removed.util_avg	+= se->avg.util_avg;
 3677	cfs_rq->removed.load_avg	+= se->avg.load_avg;
 3678	cfs_rq->removed.runnable_sum	+= se->avg.load_sum; /* == runnable_sum */
 3679	raw_spin_unlock_irqrestore(&cfs_rq->removed.lock, flags);
 3680}
 3681
 3682static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq)
 3683{
 3684	return cfs_rq->avg.runnable_load_avg;
 3685}
 3686
 3687static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
 3688{
 3689	return cfs_rq->avg.load_avg;
 3690}
 3691
 3692static inline unsigned long task_util(struct task_struct *p)
 3693{
 3694	return READ_ONCE(p->se.avg.util_avg);
 3695}
 3696
 3697static inline unsigned long _task_util_est(struct task_struct *p)
 3698{
 3699	struct util_est ue = READ_ONCE(p->se.avg.util_est);
 3700
 3701	return (max(ue.ewma, ue.enqueued) | UTIL_AVG_UNCHANGED);
 3702}
 3703
 3704static inline unsigned long task_util_est(struct task_struct *p)
 3705{
 3706	return max(task_util(p), _task_util_est(p));
 3707}
 3708
 3709static inline void util_est_enqueue(struct cfs_rq *cfs_rq,
 3710				    struct task_struct *p)
 3711{
 3712	unsigned int enqueued;
 3713
 3714	if (!sched_feat(UTIL_EST))
 3715		return;
 3716
 3717	/* Update root cfs_rq's estimated utilization */
 3718	enqueued  = cfs_rq->avg.util_est.enqueued;
 3719	enqueued += _task_util_est(p);
 3720	WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);
 3721}
 3722
 3723/*
 3724 * Check if a (signed) value is within a specified (unsigned) margin,
 3725 * based on the observation that:
 3726 *
 3727 *     abs(x) < y := (unsigned)(x + y - 1) < (2 * y - 1)
 3728 *
 3729 * NOTE: this only works when value + maring < INT_MAX.
 3730 */
 3731static inline bool within_margin(int value, int margin)
 3732{
 3733	return ((unsigned int)(value + margin - 1) < (2 * margin - 1));
 3734}
 3735
 3736static void
 3737util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
 3738{
 3739	long last_ewma_diff;
 3740	struct util_est ue;
 3741	int cpu;
 3742
 3743	if (!sched_feat(UTIL_EST))
 3744		return;
 3745
 3746	/* Update root cfs_rq's estimated utilization */
 3747	ue.enqueued  = cfs_rq->avg.util_est.enqueued;
 3748	ue.enqueued -= min_t(unsigned int, ue.enqueued, _task_util_est(p));
 3749	WRITE_ONCE(cfs_rq->avg.util_est.enqueued, ue.enqueued);
 3750
 3751	/*
 3752	 * Skip update of task's estimated utilization when the task has not
 3753	 * yet completed an activation, e.g. being migrated.
 3754	 */
 3755	if (!task_sleep)
 3756		return;
 3757
 3758	/*
 3759	 * If the PELT values haven't changed since enqueue time,
 3760	 * skip the util_est update.
 3761	 */
 3762	ue = p->se.avg.util_est;
 3763	if (ue.enqueued & UTIL_AVG_UNCHANGED)
 3764		return;
 3765
 3766	/*
 3767	 * Skip update of task's estimated utilization when its EWMA is
 3768	 * already ~1% close to its last activation value.
 3769	 */
 3770	ue.enqueued = (task_util(p) | UTIL_AVG_UNCHANGED);
 3771	last_ewma_diff = ue.enqueued - ue.ewma;
 3772	if (within_margin(last_ewma_diff, (SCHED_CAPACITY_SCALE / 100)))
 3773		return;
 3774
 3775	/*
 3776	 * To avoid overestimation of actual task utilization, skip updates if
 3777	 * we cannot grant there is idle time in this CPU.
 3778	 */
 3779	cpu = cpu_of(rq_of(cfs_rq));
 3780	if (task_util(p) > capacity_orig_of(cpu))
 3781		return;
 3782
 3783	/*
 3784	 * Update Task's estimated utilization
 3785	 *
 3786	 * When *p completes an activation we can consolidate another sample
 3787	 * of the task size. This is done by storing the current PELT value
 3788	 * as ue.enqueued and by using this value to update the Exponential
 3789	 * Weighted Moving Average (EWMA):
 3790	 *
 3791	 *  ewma(t) = w *  task_util(p) + (1-w) * ewma(t-1)
 3792	 *          = w *  task_util(p) +         ewma(t-1)  - w * ewma(t-1)
 3793	 *          = w * (task_util(p) -         ewma(t-1)) +     ewma(t-1)
 3794	 *          = w * (      last_ewma_diff            ) +     ewma(t-1)
 3795	 *          = w * (last_ewma_diff  +  ewma(t-1) / w)
 3796	 *
 3797	 * Where 'w' is the weight of new samples, which is configured to be
 3798	 * 0.25, thus making w=1/4 ( >>= UTIL_EST_WEIGHT_SHIFT)
 3799	 */
 3800	ue.ewma <<= UTIL_EST_WEIGHT_SHIFT;
 3801	ue.ewma  += last_ewma_diff;
 3802	ue.ewma >>= UTIL_EST_WEIGHT_SHIFT;
 3803	WRITE_ONCE(p->se.avg.util_est, ue);
 3804}
 3805
 3806static inline int task_fits_capacity(struct task_struct *p, long capacity)
 3807{
 3808	return fits_capacity(task_util_est(p), capacity);
 3809}
 3810
 3811static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
 3812{
 3813	if (!static_branch_unlikely(&sched_asym_cpucapacity))
 3814		return;
 3815
 3816	if (!p) {
 3817		rq->misfit_task_load = 0;
 3818		return;
 3819	}
 3820
 3821	if (task_fits_capacity(p, capacity_of(cpu_of(rq)))) {
 3822		rq->misfit_task_load = 0;
 3823		return;
 3824	}
 3825
 3826	rq->misfit_task_load = task_h_load(p);
 3827}
 3828
 3829#else /* CONFIG_SMP */
 3830
 3831#define UPDATE_TG	0x0
 3832#define SKIP_AGE_LOAD	0x0
 3833#define DO_ATTACH	0x0
 3834
 3835static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int not_used1)
 3836{
 3837	cfs_rq_util_change(cfs_rq, 0);
 3838}
 3839
 3840static inline void remove_entity_load_avg(struct sched_entity *se) {}
 3841
 3842static inline void
 3843attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) {}
 3844static inline void
 3845detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
 3846
 3847static inline int idle_balance(struct rq *rq, struct rq_flags *rf)
 3848{
 3849	return 0;
 3850}
 3851
 3852static inline void
 3853util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {}
 3854
 3855static inline void
 3856util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p,
 3857		 bool task_sleep) {}
 3858static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {}
 3859
 3860#endif /* CONFIG_SMP */
 3861
 3862static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
 3863{
 3864#ifdef CONFIG_SCHED_DEBUG
 3865	s64 d = se->vruntime - cfs_rq->min_vruntime;
 3866
 3867	if (d < 0)
 3868		d = -d;
 3869
 3870	if (d > 3*sysctl_sched_latency)
 3871		schedstat_inc(cfs_rq->nr_spread_over);
 3872#endif
 3873}
 3874
 3875static void
 3876place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
 3877{
 3878	u64 vruntime = cfs_rq->min_vruntime;
 3879
 3880	/*
 3881	 * The 'current' period is already promised to the current tasks,
 3882	 * however the extra weight of the new task will slow them down a
 3883	 * little, place the new task so that it fits in the slot that
 3884	 * stays open at the end.
 3885	 */
 3886	if (initial && sched_feat(START_DEBIT))
 3887		vruntime += sched_vslice(cfs_rq, se);
 3888
 3889	/* sleeps up to a single latency don't count. */
 3890	if (!initial) {
 3891		unsigned long thresh = sysctl_sched_latency;
 3892
 3893		/*
 3894		 * Halve their sleep time's effect, to allow
 3895		 * for a gentler effect of sleepers:
 3896		 */
 3897		if (sched_feat(GENTLE_FAIR_SLEEPERS))
 3898			thresh >>= 1;
 3899
 3900		vruntime -= thresh;
 3901	}
 3902
 3903	/* ensure we never gain time by being placed backwards. */
 3904	se->vruntime = max_vruntime(se->vruntime, vruntime);
 3905}
 3906
 3907static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
 3908
 3909static inline void check_schedstat_required(void)
 3910{
 3911#ifdef CONFIG_SCHEDSTATS
 3912	if (schedstat_enabled())
 3913		return;
 3914
 3915	/* Force schedstat enabled if a dependent tracepoint is active */
 3916	if (trace_sched_stat_wait_enabled()    ||
 3917			trace_sched_stat_sleep_enabled()   ||
 3918			trace_sched_stat_iowait_enabled()  ||
 3919			trace_sched_stat_blocked_enabled() ||
 3920			trace_sched_stat_runtime_enabled())  {
 3921		printk_deferred_once("Scheduler tracepoints stat_sleep, stat_iowait, "
 3922			     "stat_blocked and stat_runtime require the "
 3923			     "kernel parameter schedstats=enable or "
 3924			     "kernel.sched_schedstats=1\n");
 3925	}
 3926#endif
 3927}
 3928
 3929
 3930/*
 3931 * MIGRATION
 3932 *
 3933 *	dequeue
 3934 *	  update_curr()
 3935 *	    update_min_vruntime()
 3936 *	  vruntime -= min_vruntime
 3937 *
 3938 *	enqueue
 3939 *	  update_curr()
 3940 *	    update_min_vruntime()
 3941 *	  vruntime += min_vruntime
 3942 *
 3943 * this way the vruntime transition between RQs is done when both
 3944 * min_vruntime are up-to-date.
 3945 *
 3946 * WAKEUP (remote)
 3947 *
 3948 *	->migrate_task_rq_fair() (p->state == TASK_WAKING)
 3949 *	  vruntime -= min_vruntime
 3950 *
 3951 *	enqueue
 3952 *	  update_curr()
 3953 *	    update_min_vruntime()
 3954 *	  vruntime += min_vruntime
 3955 *
 * in this case we don't have the most up-to-date min_vruntime on the
 * originating CPU, but we do have an up-to-date min_vruntime on the
 * destination CPU.
 3958 */
 3959
 3960static void
 3961enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 3962{
 3963	bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATED);
 3964	bool curr = cfs_rq->curr == se;
 3965
 3966	/*
 3967	 * If we're the current task, we must renormalise before calling
 3968	 * update_curr().
 3969	 */
 3970	if (renorm && curr)
 3971		se->vruntime += cfs_rq->min_vruntime;
 3972
 3973	update_curr(cfs_rq);
 3974
 3975	/*
 3976	 * Otherwise, renormalise after, such that we're placed at the current
 3977	 * moment in time, instead of some random moment in the past. Being
 3978	 * placed in the past could significantly boost this task to the
 3979	 * fairness detriment of existing tasks.
 3980	 */
 3981	if (renorm && !curr)
 3982		se->vruntime += cfs_rq->min_vruntime;
 3983
 3984	/*
 3985	 * When enqueuing a sched_entity, we must:
 3986	 *   - Update loads to have both entity and cfs_rq synced with now.
 3987	 *   - Add its load to cfs_rq->runnable_avg
 3988	 *   - For group_entity, update its weight to reflect the new share of
 3989	 *     its group cfs_rq
 3990	 *   - Add its new weight to cfs_rq->load.weight
 3991	 */
 3992	update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH);
 3993	update_cfs_group(se);
 3994	enqueue_runnable_load_avg(cfs_rq, se);
 3995	account_entity_enqueue(cfs_rq, se);
 
 3996
 3997	if (flags & ENQUEUE_WAKEUP)
 3998		place_entity(cfs_rq, se, 0);
 
 
 3999
 4000	check_schedstat_required();
 4001	update_stats_enqueue(cfs_rq, se, flags);
 4002	check_spread(cfs_rq, se);
 4003	if (!curr)
 4004		__enqueue_entity(cfs_rq, se);
 4005	se->on_rq = 1;
 4006
 4007	if (cfs_rq->nr_running == 1) {
 4008		list_add_leaf_cfs_rq(cfs_rq);
 4009		check_enqueue_throttle(cfs_rq);
 4010	}
 4011}
 4012
 4013static void __clear_buddies_last(struct sched_entity *se)
 4014{
 4015	for_each_sched_entity(se) {
 4016		struct cfs_rq *cfs_rq = cfs_rq_of(se);
 4017		if (cfs_rq->last != se)
 
 
 4018			break;
 4019
 4020		cfs_rq->last = NULL;
 4021	}
 4022}
 4023
 4024static void __clear_buddies_next(struct sched_entity *se)
 4025{
 4026	for_each_sched_entity(se) {
 4027		struct cfs_rq *cfs_rq = cfs_rq_of(se);
 4028		if (cfs_rq->next != se)
 
 
 4029			break;
 4030
 4031		cfs_rq->next = NULL;
 4032	}
 4033}
 4034
 4035static void __clear_buddies_skip(struct sched_entity *se)
 4036{
 4037	for_each_sched_entity(se) {
 4038		struct cfs_rq *cfs_rq = cfs_rq_of(se);
 4039		if (cfs_rq->skip != se)
 
 
 4040			break;
 4041
 4042		cfs_rq->skip = NULL;
 4043	}
 4044}
 4045
 4046static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
 4047{
 4048	if (cfs_rq->last == se)
 4049		__clear_buddies_last(se);
 4050
 4051	if (cfs_rq->next == se)
 4052		__clear_buddies_next(se);
 4053
 4054	if (cfs_rq->skip == se)
 4055		__clear_buddies_skip(se);
 4056}
 4057
 4058static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
 4059
 4060static void
 4061dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 4062{
 4063	/*
 4064	 * Update run-time statistics of the 'current'.
 4065	 */
 4066	update_curr(cfs_rq);
 4067
 4068	/*
 4069	 * When dequeuing a sched_entity, we must:
 4070	 *   - Update loads to have both entity and cfs_rq synced with now.
 4071	 *   - Subtract its load from the cfs_rq->runnable_avg.
 4072	 *   - Subtract its previous weight from cfs_rq->load.weight.
 4073	 *   - For group entity, update its weight to reflect the new share
 4074	 *     of its group cfs_rq.
 4075	 */
 4076	update_load_avg(cfs_rq, se, UPDATE_TG);
 4077	dequeue_runnable_load_avg(cfs_rq, se);
 4078
 4079	update_stats_dequeue(cfs_rq, se, flags);
 
 
 
 
 
 
 4080
 4081	clear_buddies(cfs_rq, se);
 4082
 4083	if (se != cfs_rq->curr)
 4084		__dequeue_entity(cfs_rq, se);
 4085	se->on_rq = 0;
 
 4086	account_entity_dequeue(cfs_rq, se);
 4087
 4088	/*
 4089	 * Normalize after update_curr(); which will also have moved
 4090	 * min_vruntime if @se is the one holding it back. But before doing
 4091	 * update_min_vruntime() again, which will discount @se's position and
 4092	 * can move min_vruntime forward still more.
 4093	 */
 4094	if (!(flags & DEQUEUE_SLEEP))
 4095		se->vruntime -= cfs_rq->min_vruntime;
 4096
 4097	/* return excess runtime on last dequeue */
 4098	return_cfs_rq_runtime(cfs_rq);
 4099
 4100	update_cfs_group(se);
 4101
 4102	/*
 4103	 * Now advance min_vruntime if @se was the entity holding it back,
 4104	 * except when: DEQUEUE_SAVE && !DEQUEUE_MOVE, in this case we'll be
 4105	 * put back on, and if we advance min_vruntime, we'll be placed back
 4106	 * further than we started -- ie. we'll be penalized.
 4107	 */
 4108	if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE)
 4109		update_min_vruntime(cfs_rq);
 4110}
 4111
 4112/*
 4113 * Preempt the current task with a newly woken task if needed:
 4114 */
 4115static void
 4116check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
 4117{
 4118	unsigned long ideal_runtime, delta_exec;
 4119	struct sched_entity *se;
 4120	s64 delta;
 4121
 4122	ideal_runtime = sched_slice(cfs_rq, curr);
 4123	delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
 4124	if (delta_exec > ideal_runtime) {
 4125		resched_curr(rq_of(cfs_rq));
 4126		/*
 4127		 * The current task ran long enough, ensure it doesn't get
 4128		 * re-elected due to buddy favours.
 4129		 */
 4130		clear_buddies(cfs_rq, curr);
 4131		return;
 4132	}
 4133
 4134	/*
 4135	 * Ensure that a task that missed wakeup preemption by a
 4136	 * narrow margin doesn't have to wait for a full slice.
 4137	 * This also mitigates buddy induced latencies under load.
 4138	 */
 4139	if (delta_exec < sysctl_sched_min_granularity)
 4140		return;
 4141
 4142	se = __pick_first_entity(cfs_rq);
 4143	delta = curr->vruntime - se->vruntime;
 4144
 4145	if (delta < 0)
 4146		return;
 4147
 4148	if (delta > ideal_runtime)
 4149		resched_curr(rq_of(cfs_rq));
 4150}
 4151
 4152static void
 4153set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 4154{
 4155	/* 'current' is not kept within the tree. */
 4156	if (se->on_rq) {
 4157		/*
 4158		 * Any task has to be enqueued before it get to execute on
 4159		 * a CPU. So account for the time it spent waiting on the
 4160		 * runqueue.
 4161		 */
 4162		update_stats_wait_end(cfs_rq, se);
 4163		__dequeue_entity(cfs_rq, se);
 4164		update_load_avg(cfs_rq, se, UPDATE_TG);
 4165	}
 4166
 4167	update_stats_curr_start(cfs_rq, se);
 4168	cfs_rq->curr = se;
 4169
 4170	/*
 4171	 * Track our maximum slice length, if the CPU's load is at
 4172	 * least twice that of our own weight (i.e. dont track it
 4173	 * when there are only lesser-weight tasks around):
 4174	 */
 4175	if (schedstat_enabled() &&
 4176	    rq_of(cfs_rq)->cfs.load.weight >= 2*se->load.weight) {
 4177		schedstat_set(se->statistics.slice_max,
 4178			max((u64)schedstat_val(se->statistics.slice_max),
 4179			    se->sum_exec_runtime - se->prev_sum_exec_runtime));
 4180	}
 4181
 4182	se->prev_sum_exec_runtime = se->sum_exec_runtime;
 4183}
 4184
 4185static int
 4186wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
 4187
 4188/*
 4189 * Pick the next process, keeping these things in mind, in this order:
 4190 * 1) keep things fair between processes/task groups
 4191 * 2) pick the "next" process, since someone really wants that to run
 4192 * 3) pick the "last" process, for cache locality
 4193 * 4) do not run the "skip" process, if something else is available
 4194 */
 4195static struct sched_entity *
 4196pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
 4197{
 4198	struct sched_entity *left = __pick_first_entity(cfs_rq);
 4199	struct sched_entity *se;
 4200
 4201	/*
 4202	 * If curr is set we have to see if its left of the leftmost entity
 4203	 * still in the tree, provided there was anything in the tree at all.
 4204	 */
 4205	if (!left || (curr && entity_before(curr, left)))
 4206		left = curr;
 4207
 4208	se = left; /* ideally we run the leftmost entity */
 4209
 4210	/*
 4211	 * Avoid running the skip buddy, if running something else can
 4212	 * be done without getting too unfair.
 4213	 */
 4214	if (cfs_rq->skip == se) {
 4215		struct sched_entity *second;
 4216
 4217		if (se == curr) {
 4218			second = __pick_first_entity(cfs_rq);
 4219		} else {
 4220			second = __pick_next_entity(se);
 4221			if (!second || (curr && entity_before(curr, second)))
 4222				second = curr;
 4223		}
 4224
 4225		if (second && wakeup_preempt_entity(second, left) < 1)
 4226			se = second;
 4227	}
 4228
 4229	/*
 4230	 * Prefer last buddy, try to return the CPU to a preempted task.
 4231	 */
 4232	if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
 4233		se = cfs_rq->last;
 4234
 4235	/*
 4236	 * Someone really wants this to run. If it's not unfair, run it.
 4237	 */
 4238	if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
 4239		se = cfs_rq->next;
 4240
 4241	clear_buddies(cfs_rq, se);
 4242
 4243	return se;
 4244}
 4245
 4246static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
 4247
 4248static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
 4249{
 4250	/*
 4251	 * If still on the runqueue then deactivate_task()
 4252	 * was not called and update_curr() has to be done:
 4253	 */
 4254	if (prev->on_rq)
 4255		update_curr(cfs_rq);
 4256
 4257	/* throttle cfs_rqs exceeding runtime */
 4258	check_cfs_rq_runtime(cfs_rq);
 4259
 4260	check_spread(cfs_rq, prev);
 4261
 4262	if (prev->on_rq) {
 4263		update_stats_wait_start(cfs_rq, prev);
 4264		/* Put 'current' back into the tree. */
 4265		__enqueue_entity(cfs_rq, prev);
 4266		/* in !on_rq case, update occurred at dequeue */
 4267		update_load_avg(cfs_rq, prev, 0);
 4268	}
 4269	cfs_rq->curr = NULL;
 4270}
 4271
 4272static void
 4273entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
 4274{
 4275	/*
 4276	 * Update run-time statistics of the 'current'.
 4277	 */
 4278	update_curr(cfs_rq);
 4279
 4280	/*
 4281	 * Ensure that runnable average is periodically updated.
 4282	 */
 4283	update_load_avg(cfs_rq, curr, UPDATE_TG);
 4284	update_cfs_group(curr);
 4285
 4286#ifdef CONFIG_SCHED_HRTICK
 4287	/*
 4288	 * queued ticks are scheduled to match the slice, so don't bother
 4289	 * validating it and just reschedule.
 4290	 */
 4291	if (queued) {
 4292		resched_curr(rq_of(cfs_rq));
 4293		return;
 4294	}
 4295	/*
 4296	 * don't let the period tick interfere with the hrtick preemption
 4297	 */
 4298	if (!sched_feat(DOUBLE_TICK) &&
 4299			hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
 4300		return;
 4301#endif
 4302
 4303	if (cfs_rq->nr_running > 1)
 4304		check_preempt_tick(cfs_rq, curr);
 4305}
 4306
 4307
 4308/**************************************************
 4309 * CFS bandwidth control machinery
 4310 */
 4311
 4312#ifdef CONFIG_CFS_BANDWIDTH
 4313
 4314#ifdef CONFIG_JUMP_LABEL
 4315static struct static_key __cfs_bandwidth_used;
 4316
 4317static inline bool cfs_bandwidth_used(void)
 4318{
 4319	return static_key_false(&__cfs_bandwidth_used);
 4320}
 4321
 4322void cfs_bandwidth_usage_inc(void)
 4323{
 4324	static_key_slow_inc_cpuslocked(&__cfs_bandwidth_used);
 
 
 
 
 4325}
 4326
 4327void cfs_bandwidth_usage_dec(void)
 4328{
 4329	static_key_slow_dec_cpuslocked(&__cfs_bandwidth_used);
 4330}
 4331#else /* CONFIG_JUMP_LABEL */
 4332static bool cfs_bandwidth_used(void)
 4333{
 4334	return true;
 4335}
 4336
 4337void cfs_bandwidth_usage_inc(void) {}
 4338void cfs_bandwidth_usage_dec(void) {}
 4339#endif /* CONFIG_JUMP_LABEL */
 4340
 4341/*
 4342 * default period for cfs group bandwidth.
 4343 * default: 0.1s, units: nanoseconds
 4344 */
 4345static inline u64 default_cfs_period(void)
 4346{
 4347	return 100000000ULL;
 4348}
 4349
 4350static inline u64 sched_cfs_bandwidth_slice(void)
 4351{
 4352	return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
 4353}
 4354
 4355/*
 4356 * Replenish runtime according to assigned quota. We use sched_clock_cpu
 4357 * directly instead of rq->clock to avoid adding additional synchronization
 4358 * around rq->lock.
 4359 *
 4360 * requires cfs_b->lock
 4361 */
 4362void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
 4363{
 4364	if (cfs_b->quota != RUNTIME_INF)
 4365		cfs_b->runtime = cfs_b->quota;
 
 
 
 
 
 
 4366}
 4367
 4368static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
 4369{
 4370	return &tg->cfs_bandwidth;
 4371}
 4372
 4373/* returns 0 on failure to allocate runtime */
 4374static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 4375{
 4376	struct task_group *tg = cfs_rq->tg;
 4377	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
 4378	u64 amount = 0, min_amount;
 4379
 4380	/* note: this is a positive sum as runtime_remaining <= 0 */
 4381	min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
 4382
 4383	raw_spin_lock(&cfs_b->lock);
 4384	if (cfs_b->quota == RUNTIME_INF)
 4385		amount = min_amount;
 4386	else {
 4387		start_cfs_bandwidth(cfs_b);
 
 
 
 
 
 
 
 
 
 4388
 4389		if (cfs_b->runtime > 0) {
 4390			amount = min(cfs_b->runtime, min_amount);
 4391			cfs_b->runtime -= amount;
 4392			cfs_b->idle = 0;
 4393		}
 4394	}
 
 4395	raw_spin_unlock(&cfs_b->lock);
 4396
 4397	cfs_rq->runtime_remaining += amount;
 
 
 
 
 
 
 
 4398
 4399	return cfs_rq->runtime_remaining > 0;
 4400}
 4401
 4402static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 4403{
 4404	/* dock delta_exec before expiring quota (as it could span periods) */
 4405	cfs_rq->runtime_remaining -= delta_exec;
 
 4406
 4407	if (likely(cfs_rq->runtime_remaining > 0))
 4408		return;
 4409
 4410	if (cfs_rq->throttled)
 4411		return;
 4412	/*
 4413	 * if we're unable to extend our runtime we resched so that the active
 4414	 * hierarchy can be throttled
 4415	 */
 4416	if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
 4417		resched_curr(rq_of(cfs_rq));
 4418}
 4419
 4420static __always_inline
 4421void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
 4422{
 4423	if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
 4424		return;
 4425
 4426	__account_cfs_rq_runtime(cfs_rq, delta_exec);
 4427}
 4428
 4429static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
 4430{
 4431	return cfs_bandwidth_used() && cfs_rq->throttled;
 4432}
 4433
 4434/* check whether cfs_rq, or any parent, is throttled */
 4435static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
 4436{
 4437	return cfs_bandwidth_used() && cfs_rq->throttle_count;
 4438}
 4439
 4440/*
 4441 * Ensure that neither of the group entities corresponding to src_cpu or
 4442 * dest_cpu are members of a throttled hierarchy when performing group
 4443 * load-balance operations.
 4444 */
 4445static inline int throttled_lb_pair(struct task_group *tg,
 4446				    int src_cpu, int dest_cpu)
 4447{
 4448	struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
 4449
 4450	src_cfs_rq = tg->cfs_rq[src_cpu];
 4451	dest_cfs_rq = tg->cfs_rq[dest_cpu];
 4452
 4453	return throttled_hierarchy(src_cfs_rq) ||
 4454	       throttled_hierarchy(dest_cfs_rq);
 4455}
 4456
 
 4457static int tg_unthrottle_up(struct task_group *tg, void *data)
 4458{
 4459	struct rq *rq = data;
 4460	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
 4461
 4462	cfs_rq->throttle_count--;
 
 4463	if (!cfs_rq->throttle_count) {
 4464		cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
 4465					     cfs_rq->throttled_clock_task;
 4466
 4467		/* Add cfs_rq with already running entity in the list */
 4468		if (cfs_rq->nr_running >= 1)
 4469			list_add_leaf_cfs_rq(cfs_rq);
 
 
 
 4470	}
 
 4471
 4472	return 0;
 4473}
 4474
 4475static int tg_throttle_down(struct task_group *tg, void *data)
 4476{
 4477	struct rq *rq = data;
 4478	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
 4479
 4480	/* group is entering throttled state, stop time */
 4481	if (!cfs_rq->throttle_count) {
 4482		cfs_rq->throttled_clock_task = rq_clock_task(rq);
 4483		list_del_leaf_cfs_rq(cfs_rq);
 4484	}
 4485	cfs_rq->throttle_count++;
 4486
 4487	return 0;
 4488}
 4489
 4490static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
 4491{
 4492	struct rq *rq = rq_of(cfs_rq);
 4493	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
 4494	struct sched_entity *se;
 4495	long task_delta, idle_task_delta, dequeue = 1;
 4496	bool empty;
 4497
 4498	se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
 4499
 4500	/* freeze hierarchy runnable averages while throttled */
 4501	rcu_read_lock();
 4502	walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
 4503	rcu_read_unlock();
 4504
 4505	task_delta = cfs_rq->h_nr_running;
 4506	idle_task_delta = cfs_rq->idle_h_nr_running;
 4507	for_each_sched_entity(se) {
 4508		struct cfs_rq *qcfs_rq = cfs_rq_of(se);
 4509		/* throttled entity or throttle-on-deactivate */
 4510		if (!se->on_rq)
 4511			break;
 4512
 4513		if (dequeue)
 4514			dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
 4515		qcfs_rq->h_nr_running -= task_delta;
 4516		qcfs_rq->idle_h_nr_running -= idle_task_delta;
 4517
 4518		if (qcfs_rq->load.weight)
 4519			dequeue = 0;
 4520	}
 4521
 4522	if (!se)
 4523		sub_nr_running(rq, task_delta);
 4524
 4525	cfs_rq->throttled = 1;
 4526	cfs_rq->throttled_clock = rq_clock(rq);
 4527	raw_spin_lock(&cfs_b->lock);
 4528	empty = list_empty(&cfs_b->throttled_cfs_rq);
 4529
 4530	/*
 4531	 * Add to the _head_ of the list, so that an already-started
 4532	 * distribute_cfs_runtime will not see us. If disribute_cfs_runtime is
 4533	 * not running add to the tail so that later runqueues don't get starved.
 4534	 */
 4535	if (cfs_b->distribute_running)
 4536		list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
 4537	else
 4538		list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
 4539
 4540	/*
 4541	 * If we're the first throttled task, make sure the bandwidth
 4542	 * timer is running.
 4543	 */
 4544	if (empty)
 4545		start_cfs_bandwidth(cfs_b);
 4546
 4547	raw_spin_unlock(&cfs_b->lock);
 4548}
 4549
 4550void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
 4551{
 4552	struct rq *rq = rq_of(cfs_rq);
 4553	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
 4554	struct sched_entity *se;
 4555	int enqueue = 1;
 4556	long task_delta, idle_task_delta;
 4557
 4558	se = cfs_rq->tg->se[cpu_of(rq)];
 4559
 4560	cfs_rq->throttled = 0;
 4561
 4562	update_rq_clock(rq);
 4563
 4564	raw_spin_lock(&cfs_b->lock);
 4565	cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;
 4566	list_del_rcu(&cfs_rq->throttled_list);
 4567	raw_spin_unlock(&cfs_b->lock);
 
 4568
 
 4569	/* update hierarchical throttle state */
 4570	walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
 4571
 4572	if (!cfs_rq->load.weight)
 4573		return;
 4574
 4575	task_delta = cfs_rq->h_nr_running;
 4576	idle_task_delta = cfs_rq->idle_h_nr_running;
 4577	for_each_sched_entity(se) {
 4578		if (se->on_rq)
 4579			enqueue = 0;
 4580
 4581		cfs_rq = cfs_rq_of(se);
 4582		if (enqueue)
 4583			enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
 4584		cfs_rq->h_nr_running += task_delta;
 4585		cfs_rq->idle_h_nr_running += idle_task_delta;
 4586
 4587		if (cfs_rq_throttled(cfs_rq))
 4588			break;
 4589	}
 4590
 4591	assert_list_leaf_cfs_rq(rq);
 4592
 4593	if (!se)
 4594		add_nr_running(rq, task_delta);
 4595
 4596	/* Determine whether we need to wake up potentially idle CPU: */
 4597	if (rq->curr == rq->idle && rq->cfs.nr_running)
 4598		resched_curr(rq);
 4599}
 4600
 4601static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, u64 remaining)
 
 4602{
 4603	struct cfs_rq *cfs_rq;
 4604	u64 runtime;
 4605	u64 starting_runtime = remaining;
 4606
 4607	rcu_read_lock();
 4608	list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
 4609				throttled_list) {
 4610		struct rq *rq = rq_of(cfs_rq);
 4611		struct rq_flags rf;
 4612
 4613		rq_lock_irqsave(rq, &rf);
 4614		if (!cfs_rq_throttled(cfs_rq))
 4615			goto next;
 4616
 4617		/* By the above check, this should never be true */
 4618		SCHED_WARN_ON(cfs_rq->runtime_remaining > 0);
 4619
 4620		runtime = -cfs_rq->runtime_remaining + 1;
 4621		if (runtime > remaining)
 4622			runtime = remaining;
 4623		remaining -= runtime;
 4624
 4625		cfs_rq->runtime_remaining += runtime;
 
 4626
 4627		/* we check whether we're throttled above */
 4628		if (cfs_rq->runtime_remaining > 0)
 4629			unthrottle_cfs_rq(cfs_rq);
 4630
 4631next:
 4632		rq_unlock_irqrestore(rq, &rf);
 4633
 4634		if (!remaining)
 4635			break;
 4636	}
 4637	rcu_read_unlock();
 4638
 4639	return starting_runtime - remaining;
 4640}
 4641
 4642/*
 4643 * Responsible for refilling a task_group's bandwidth and unthrottling its
 4644 * cfs_rqs as appropriate. If there has been no activity within the last
 4645 * period the timer is deactivated until scheduling resumes; cfs_b->idle is
 4646 * used to track this state.
 4647 */
 4648static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, unsigned long flags)
 4649{
 4650	u64 runtime;
 4651	int throttled;
 4652
 
 4653	/* no need to continue the timer with no bandwidth constraint */
 4654	if (cfs_b->quota == RUNTIME_INF)
 4655		goto out_deactivate;
 4656
 4657	throttled = !list_empty(&cfs_b->throttled_cfs_rq);
 
 
 4658	cfs_b->nr_periods += overrun;
 4659
 4660	/*
 4661	 * idle depends on !throttled (for the case of a large deficit), and if
 4662	 * we're going inactive then everything else can be deferred
 4663	 */
 4664	if (cfs_b->idle && !throttled)
 4665		goto out_deactivate;
 4666
 4667	__refill_cfs_bandwidth_runtime(cfs_b);
 4668
 4669	if (!throttled) {
 4670		/* mark as potentially idle for the upcoming period */
 4671		cfs_b->idle = 1;
 4672		return 0;
 4673	}
 4674
 4675	/* account preceding periods in which throttling occurred */
 4676	cfs_b->nr_throttled += overrun;
 4677
 4678	/*
 4679	 * This check is repeated as we are holding onto the new bandwidth while
 4680	 * we unthrottle. This can potentially race with an unthrottled group
 4681	 * trying to acquire new bandwidth from the global pool. This can result
 4682	 * in us over-using our runtime if it is all used during this loop, but
 4683	 * only by limited amounts in that extreme case.
 4684	 */
 4685	while (throttled && cfs_b->runtime > 0 && !cfs_b->distribute_running) {
 4686		runtime = cfs_b->runtime;
 4687		cfs_b->distribute_running = 1;
 4688		raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
 
 
 
 
 
 
 
 4689		/* we can't nest cfs_b->lock while distributing bandwidth */
 4690		runtime = distribute_cfs_runtime(cfs_b, runtime);
 4691		raw_spin_lock_irqsave(&cfs_b->lock, flags);
 
 4692
 4693		cfs_b->distribute_running = 0;
 4694		throttled = !list_empty(&cfs_b->throttled_cfs_rq);
 4695
 4696		lsub_positive(&cfs_b->runtime, runtime);
 4697	}
 4698
 
 
 4699	/*
 4700	 * While we are ensured activity in the period following an
 4701	 * unthrottle, this also covers the case in which the new bandwidth is
 4702	 * insufficient to cover the existing bandwidth deficit.  (Forcing the
 4703	 * timer to remain active while there are any throttled entities.)
 4704	 */
 4705	cfs_b->idle = 0;
 
 
 
 
 4706
 4707	return 0;
 4708
 4709out_deactivate:
 4710	return 1;
 4711}
 4712
 4713/* a cfs_rq won't donate quota below this amount */
 4714static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
 4715/* minimum remaining period time to redistribute slack quota */
 4716static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
 4717/* how long we wait to gather additional slack before distributing */
 4718static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
 4719
 4720/*
 4721 * Are we near the end of the current quota period?
 4722 *
 4723 * Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the
 4724 * hrtimer base being cleared by hrtimer_start. In the case of
 4725 * migrate_hrtimers, base is never cleared, so we are fine.
 4726 */
 4727static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
 4728{
 4729	struct hrtimer *refresh_timer = &cfs_b->period_timer;
 4730	u64 remaining;
 4731
 4732	/* if the call-back is running a quota refresh is already occurring */
 4733	if (hrtimer_callback_running(refresh_timer))
 4734		return 1;
 4735
 4736	/* is a quota refresh about to occur? */
 4737	remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
 4738	if (remaining < min_expire)
 4739		return 1;
 4740
 4741	return 0;
 4742}
 4743
 4744static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
 4745{
 4746	u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
 4747
 4748	/* if there's a quota refresh soon don't bother with slack */
 4749	if (runtime_refresh_within(cfs_b, min_left))
 4750		return;
 4751
 4752	/* don't push forwards an existing deferred unthrottle */
 4753	if (cfs_b->slack_started)
 4754		return;
 4755	cfs_b->slack_started = true;
 4756
 4757	hrtimer_start(&cfs_b->slack_timer,
 4758			ns_to_ktime(cfs_bandwidth_slack_period),
 4759			HRTIMER_MODE_REL);
 4760}
 4761
 4762/* we know any runtime found here is valid as update_curr() precedes return */
 4763static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 4764{
 4765	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
 4766	s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
 4767
 4768	if (slack_runtime <= 0)
 4769		return;
 4770
 4771	raw_spin_lock(&cfs_b->lock);
 4772	if (cfs_b->quota != RUNTIME_INF) {
 
 4773		cfs_b->runtime += slack_runtime;
 4774
 4775		/* we are under rq->lock, defer unthrottling using a timer */
 4776		if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
 4777		    !list_empty(&cfs_b->throttled_cfs_rq))
 4778			start_cfs_slack_bandwidth(cfs_b);
 4779	}
 4780	raw_spin_unlock(&cfs_b->lock);
 4781
 4782	/* even if it's not valid for return we don't want to try again */
 4783	cfs_rq->runtime_remaining -= slack_runtime;
 4784}
 4785
 4786static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 4787{
 4788	if (!cfs_bandwidth_used())
 4789		return;
 4790
 4791	if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
 4792		return;
 4793
 4794	__return_cfs_rq_runtime(cfs_rq);
 4795}
 4796
 4797/*
 4798 * This is done with a timer (instead of inline with bandwidth return) since
 4799 * it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.
 4800 */
 4801static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
 4802{
 4803	u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
 4804	unsigned long flags;
 4805
 4806	/* confirm we're still not at a refresh boundary */
 4807	raw_spin_lock_irqsave(&cfs_b->lock, flags);
 4808	cfs_b->slack_started = false;
 4809	if (cfs_b->distribute_running) {
 4810		raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
 4811		return;
 4812	}
 4813
 4814	if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
 4815		raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
 4816		return;
 4817	}
 4818
 4819	if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice)
 4820		runtime = cfs_b->runtime;
 4821
 4822	if (runtime)
 4823		cfs_b->distribute_running = 1;
 4824
 4825	raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
 4826
 4827	if (!runtime)
 4828		return;
 4829
 4830	runtime = distribute_cfs_runtime(cfs_b, runtime);
 4831
 4832	raw_spin_lock_irqsave(&cfs_b->lock, flags);
 4833	lsub_positive(&cfs_b->runtime, runtime);
 4834	cfs_b->distribute_running = 0;
 4835	raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
 4836}
 4837
 4838/*
 4839 * When a group wakes up we want to make sure that its quota is not already
 4840 * expired/exceeded, otherwise it may be allowed to steal additional ticks of
 4841 * runtime as update_curr() throttling can not not trigger until it's on-rq.
 4842 */
 4843static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
 4844{
 4845	if (!cfs_bandwidth_used())
 4846		return;
 4847
 4848	/* an active group must be handled by the update_curr()->put() path */
 4849	if (!cfs_rq->runtime_enabled || cfs_rq->curr)
 4850		return;
 4851
 4852	/* ensure the group is not already throttled */
 4853	if (cfs_rq_throttled(cfs_rq))
 4854		return;
 4855
 4856	/* update runtime allocation */
 4857	account_cfs_rq_runtime(cfs_rq, 0);
 4858	if (cfs_rq->runtime_remaining <= 0)
 4859		throttle_cfs_rq(cfs_rq);
 4860}
 4861
 4862static void sync_throttle(struct task_group *tg, int cpu)
 4863{
 4864	struct cfs_rq *pcfs_rq, *cfs_rq;
 4865
 4866	if (!cfs_bandwidth_used())
 4867		return;
 4868
 4869	if (!tg->parent)
 4870		return;
 4871
 4872	cfs_rq = tg->cfs_rq[cpu];
 4873	pcfs_rq = tg->parent->cfs_rq[cpu];
 4874
 4875	cfs_rq->throttle_count = pcfs_rq->throttle_count;
 4876	cfs_rq->throttled_clock_task = rq_clock_task(cpu_rq(cpu));
 4877}
 4878
 4879/* conditionally throttle active cfs_rq's from put_prev_entity() */
 4880static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 4881{
 4882	if (!cfs_bandwidth_used())
 4883		return false;
 4884
 4885	if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
 4886		return false;
 4887
 4888	/*
 4889	 * it's possible for a throttled entity to be forced into a running
 4890	 * state (e.g. set_curr_task), in this case we're finished.
 4891	 */
 4892	if (cfs_rq_throttled(cfs_rq))
 4893		return true;
 4894
 4895	throttle_cfs_rq(cfs_rq);
 4896	return true;
 4897}
 4898
 
 
 
 
 4899static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
 4900{
 4901	struct cfs_bandwidth *cfs_b =
 4902		container_of(timer, struct cfs_bandwidth, slack_timer);
 4903
 4904	do_sched_cfs_slack_timer(cfs_b);
 4905
 4906	return HRTIMER_NORESTART;
 4907}
 4908
 4909extern const u64 max_cfs_quota_period;
 4910
 4911static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
 4912{
 4913	struct cfs_bandwidth *cfs_b =
 4914		container_of(timer, struct cfs_bandwidth, period_timer);
 4915	unsigned long flags;
 4916	int overrun;
 4917	int idle = 0;
 4918	int count = 0;
 4919
 4920	raw_spin_lock_irqsave(&cfs_b->lock, flags);
 4921	for (;;) {
 4922		overrun = hrtimer_forward_now(timer, cfs_b->period);
 
 
 4923		if (!overrun)
 4924			break;
 4925
 4926		if (++count > 3) {
 4927			u64 new, old = ktime_to_ns(cfs_b->period);
 4928
 4929			/*
 4930			 * Grow period by a factor of 2 to avoid losing precision.
 4931			 * Precision loss in the quota/period ratio can cause __cfs_schedulable
 4932			 * to fail.
 4933			 */
 4934			new = old * 2;
 4935			if (new < max_cfs_quota_period) {
 4936				cfs_b->period = ns_to_ktime(new);
 4937				cfs_b->quota *= 2;
 4938
 4939				pr_warn_ratelimited(
 4940	"cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us = %lld, cfs_quota_us = %lld)\n",
 4941					smp_processor_id(),
 4942					div_u64(new, NSEC_PER_USEC),
 4943					div_u64(cfs_b->quota, NSEC_PER_USEC));
 4944			} else {
 4945				pr_warn_ratelimited(
 4946	"cfs_period_timer[cpu%d]: period too short, but cannot scale up without losing precision (cfs_period_us = %lld, cfs_quota_us = %lld)\n",
 4947					smp_processor_id(),
 4948					div_u64(old, NSEC_PER_USEC),
 4949					div_u64(cfs_b->quota, NSEC_PER_USEC));
 4950			}
 4951
 4952			/* reset count so we don't come right back in here */
 4953			count = 0;
 4954		}
 4955
 4956		idle = do_sched_cfs_period_timer(cfs_b, overrun, flags);
 4957	}
 4958	if (idle)
 4959		cfs_b->period_active = 0;
 4960	raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
 4961
 4962	return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
 4963}
 4964
 4965void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
 4966{
 4967	raw_spin_lock_init(&cfs_b->lock);
 4968	cfs_b->runtime = 0;
 4969	cfs_b->quota = RUNTIME_INF;
 4970	cfs_b->period = ns_to_ktime(default_cfs_period());
 4971
 4972	INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
 4973	hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
 4974	cfs_b->period_timer.function = sched_cfs_period_timer;
 4975	hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 4976	cfs_b->slack_timer.function = sched_cfs_slack_timer;
 4977	cfs_b->distribute_running = 0;
 4978	cfs_b->slack_started = false;
 4979}
 4980
 4981static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 4982{
 4983	cfs_rq->runtime_enabled = 0;
 4984	INIT_LIST_HEAD(&cfs_rq->throttled_list);
 4985}
 4986
 4987void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
 
 4988{
 4989	lockdep_assert_held(&cfs_b->lock);
 
 
 
 
 
 
 
 
 
 4990
 4991	if (cfs_b->period_active)
 4992		return;
 
 
 
 4993
 4994	cfs_b->period_active = 1;
 4995	hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);
 4996	hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED);
 4997}
 4998
 4999static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
 5000{
 5001	/* init_cfs_bandwidth() was not called */
 5002	if (!cfs_b->throttled_cfs_rq.next)
 5003		return;
 5004
 5005	hrtimer_cancel(&cfs_b->period_timer);
 5006	hrtimer_cancel(&cfs_b->slack_timer);
 5007}
 5008
 5009/*
 5010 * Both these CPU hotplug callbacks race against unregister_fair_sched_group()
 5011 *
 5012 * The race is harmless, since modifying bandwidth settings of unhooked group
 5013 * bits doesn't do much.
 5014 */
 5015
 5016/* cpu online calback */
 5017static void __maybe_unused update_runtime_enabled(struct rq *rq)
 5018{
 5019	struct task_group *tg;
 5020
 5021	lockdep_assert_held(&rq->lock);
 5022
 5023	rcu_read_lock();
 5024	list_for_each_entry_rcu(tg, &task_groups, list) {
 5025		struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
 5026		struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
 5027
 5028		raw_spin_lock(&cfs_b->lock);
 5029		cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF;
 5030		raw_spin_unlock(&cfs_b->lock);
 5031	}
 5032	rcu_read_unlock();
 5033}
 5034
 5035/* cpu offline callback */
 5036static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
 5037{
 5038	struct task_group *tg;
 5039
 5040	lockdep_assert_held(&rq->lock);
 5041
 5042	rcu_read_lock();
 5043	list_for_each_entry_rcu(tg, &task_groups, list) {
 5044		struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
 5045
 5046		if (!cfs_rq->runtime_enabled)
 5047			continue;
 5048
 5049		/*
 5050		 * clock_task is not advancing so we just need to make sure
 5051		 * there's some valid quota amount
 5052		 */
 5053		cfs_rq->runtime_remaining = 1;
 5054		/*
 5055		 * Offline rq is schedulable till CPU is completely disabled
 5056		 * in take_cpu_down(), so we prevent new cfs throttling here.
 5057		 */
 5058		cfs_rq->runtime_enabled = 0;
 5059
 5060		if (cfs_rq_throttled(cfs_rq))
 5061			unthrottle_cfs_rq(cfs_rq);
 5062	}
 5063	rcu_read_unlock();
 5064}
 5065
 5066#else /* CONFIG_CFS_BANDWIDTH */
 5067
 5068static inline bool cfs_bandwidth_used(void)
 5069{
 5070	return false;
 5071}
 5072
 5073static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
 5074static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
 5075static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
 5076static inline void sync_throttle(struct task_group *tg, int cpu) {}
 5077static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
 5078
 5079static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
 5080{
 5081	return 0;
 5082}
 5083
 5084static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
 5085{
 5086	return 0;
 5087}
 5088
 5089static inline int throttled_lb_pair(struct task_group *tg,
 5090				    int src_cpu, int dest_cpu)
 5091{
 5092	return 0;
 5093}
 5094
 5095void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
 5096
 5097#ifdef CONFIG_FAIR_GROUP_SCHED
 5098static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
 5099#endif
 5100
 5101static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
 5102{
 5103	return NULL;
 5104}
 5105static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
 5106static inline void update_runtime_enabled(struct rq *rq) {}
 5107static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
 5108
 5109#endif /* CONFIG_CFS_BANDWIDTH */
 5110
 5111/**************************************************
 5112 * CFS operations on tasks:
 5113 */
 5114
 5115#ifdef CONFIG_SCHED_HRTICK
 5116static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
 5117{
 5118	struct sched_entity *se = &p->se;
 5119	struct cfs_rq *cfs_rq = cfs_rq_of(se);
 5120
 5121	SCHED_WARN_ON(task_rq(p) != rq);
 5122
 5123	if (rq->cfs.h_nr_running > 1) {
 5124		u64 slice = sched_slice(cfs_rq, se);
 5125		u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
 5126		s64 delta = slice - ran;
 5127
 5128		if (delta < 0) {
 5129			if (rq->curr == p)
 5130				resched_curr(rq);
 5131			return;
 5132		}
 
 
 
 
 
 
 
 
 5133		hrtick_start(rq, delta);
 5134	}
 5135}
 5136
 5137/*
 5138 * called from enqueue/dequeue and updates the hrtick when the
 5139 * current task is from our class and nr_running is low enough
 5140 * to matter.
 5141 */
 5142static void hrtick_update(struct rq *rq)
 5143{
 5144	struct task_struct *curr = rq->curr;
 5145
 5146	if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class)
 5147		return;
 5148
 5149	if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
 5150		hrtick_start_fair(rq, curr);
 5151}
 5152#else /* !CONFIG_SCHED_HRTICK */
 5153static inline void
 5154hrtick_start_fair(struct rq *rq, struct task_struct *p)
 5155{
 5156}
 5157
 5158static inline void hrtick_update(struct rq *rq)
 5159{
 5160}
 5161#endif
 5162
 5163#ifdef CONFIG_SMP
 5164static inline unsigned long cpu_util(int cpu);
 5165
 5166static inline bool cpu_overutilized(int cpu)
 5167{
 5168	return !fits_capacity(cpu_util(cpu), capacity_of(cpu));
 5169}
 5170
 5171static inline void update_overutilized_status(struct rq *rq)
 5172{
 5173	if (!READ_ONCE(rq->rd->overutilized) && cpu_overutilized(rq->cpu)) {
 5174		WRITE_ONCE(rq->rd->overutilized, SG_OVERUTILIZED);
 5175		trace_sched_overutilized_tp(rq->rd, SG_OVERUTILIZED);
 5176	}
 5177}
 5178#else
 5179static inline void update_overutilized_status(struct rq *rq) { }
 5180#endif
 5181
 5182/*
 5183 * The enqueue_task method is called before nr_running is
 5184 * increased. Here we update the fair scheduling stats and
 5185 * then put the task into the rbtree:
 5186 */
 5187static void
 5188enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 5189{
 5190	struct cfs_rq *cfs_rq;
 5191	struct sched_entity *se = &p->se;
 5192	int idle_h_nr_running = task_has_idle_policy(p);
 5193
 5194	/*
 5195	 * The code below (indirectly) updates schedutil which looks at
 5196	 * the cfs_rq utilization to select a frequency.
 5197	 * Let's add the task's estimated utilization to the cfs_rq's
 5198	 * estimated utilization, before we update schedutil.
 5199	 */
 5200	util_est_enqueue(&rq->cfs, p);
 5201
 5202	/*
 5203	 * If in_iowait is set, the code below may not trigger any cpufreq
 5204	 * utilization updates, so do it here explicitly with the IOWAIT flag
 5205	 * passed.
 5206	 */
 5207	if (p->in_iowait)
 5208		cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT);
 5209
 5210	for_each_sched_entity(se) {
 5211		if (se->on_rq)
 5212			break;
 5213		cfs_rq = cfs_rq_of(se);
 5214		enqueue_entity(cfs_rq, se, flags);
 5215
 5216		/*
 5217		 * end evaluation on encountering a throttled cfs_rq
 5218		 *
 5219		 * note: in the case of encountering a throttled cfs_rq we will
 5220		 * post the final h_nr_running increment below.
 5221		 */
 5222		if (cfs_rq_throttled(cfs_rq))
 5223			break;
 5224		cfs_rq->h_nr_running++;
 5225		cfs_rq->idle_h_nr_running += idle_h_nr_running;
 5226
 5227		flags = ENQUEUE_WAKEUP;
 5228	}
 5229
 5230	for_each_sched_entity(se) {
 5231		cfs_rq = cfs_rq_of(se);
 5232		cfs_rq->h_nr_running++;
 5233		cfs_rq->idle_h_nr_running += idle_h_nr_running;
 5234
 5235		if (cfs_rq_throttled(cfs_rq))
 5236			break;
 5237
 5238		update_load_avg(cfs_rq, se, UPDATE_TG);
 5239		update_cfs_group(se);
 5240	}
 5241
 5242	if (!se) {
 5243		add_nr_running(rq, 1);
 5244		/*
 5245		 * Since new tasks are assigned an initial util_avg equal to
 5246		 * half of the spare capacity of their CPU, tiny tasks have the
 5247		 * ability to cross the overutilized threshold, which will
 5248		 * result in the load balancer ruining all the task placement
 5249		 * done by EAS. As a way to mitigate that effect, do not account
 5250		 * for the first enqueue operation of new tasks during the
 5251		 * overutilized flag detection.
 5252		 *
 5253		 * A better way of solving this problem would be to wait for
 5254		 * the PELT signals of tasks to converge before taking them
 5255		 * into account, but that is not straightforward to implement,
 5256		 * and the following generally works well enough in practice.
 5257		 */
 5258		if (flags & ENQUEUE_WAKEUP)
 5259			update_overutilized_status(rq);
 5260
 5261	}
 5262
 5263	if (cfs_bandwidth_used()) {
 5264		/*
 5265		 * When bandwidth control is enabled; the cfs_rq_throttled()
 5266		 * breaks in the above iteration can result in incomplete
 5267		 * leaf list maintenance, resulting in triggering the assertion
 5268		 * below.
 5269		 */
 5270		for_each_sched_entity(se) {
 5271			cfs_rq = cfs_rq_of(se);
 5272
 5273			if (list_add_leaf_cfs_rq(cfs_rq))
 5274				break;
 5275		}
 5276	}
 5277
 5278	assert_list_leaf_cfs_rq(rq);
 5279
 5280	hrtick_update(rq);
 5281}
 5282
 5283static void set_next_buddy(struct sched_entity *se);
 5284
 5285/*
 5286 * The dequeue_task method is called before nr_running is
 5287 * decreased. We remove the task from the rbtree and
 5288 * update the fair scheduling stats:
 5289 */
 5290static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 5291{
 5292	struct cfs_rq *cfs_rq;
 5293	struct sched_entity *se = &p->se;
 5294	int task_sleep = flags & DEQUEUE_SLEEP;
 5295	int idle_h_nr_running = task_has_idle_policy(p);
 5296
 5297	for_each_sched_entity(se) {
 5298		cfs_rq = cfs_rq_of(se);
 5299		dequeue_entity(cfs_rq, se, flags);
 5300
 5301		/*
 5302		 * end evaluation on encountering a throttled cfs_rq
 5303		 *
 5304		 * note: in the case of encountering a throttled cfs_rq we will
 5305		 * post the final h_nr_running decrement below.
 5306		*/
 5307		if (cfs_rq_throttled(cfs_rq))
 5308			break;
 5309		cfs_rq->h_nr_running--;
 5310		cfs_rq->idle_h_nr_running -= idle_h_nr_running;
 5311
 5312		/* Don't dequeue parent if it has other entities besides us */
 5313		if (cfs_rq->load.weight) {
 5314			/* Avoid re-evaluating load for this entity: */
 5315			se = parent_entity(se);
 5316			/*
 5317			 * Bias pick_next to pick a task from this cfs_rq, as
 5318			 * p is sleeping when it is within its sched_slice.
 5319			 */
 5320			if (task_sleep && se && !throttled_hierarchy(cfs_rq))
 5321				set_next_buddy(se);
 
 
 
 5322			break;
 5323		}
 5324		flags |= DEQUEUE_SLEEP;
 5325	}
 5326
 5327	for_each_sched_entity(se) {
 5328		cfs_rq = cfs_rq_of(se);
 5329		cfs_rq->h_nr_running--;
 5330		cfs_rq->idle_h_nr_running -= idle_h_nr_running;
 5331
 5332		if (cfs_rq_throttled(cfs_rq))
 5333			break;
 5334
 5335		update_load_avg(cfs_rq, se, UPDATE_TG);
 5336		update_cfs_group(se);
 5337	}
 5338
 5339	if (!se)
 5340		sub_nr_running(rq, 1);
 5341
 5342	util_est_dequeue(&rq->cfs, p, task_sleep);
 5343	hrtick_update(rq);
 5344}
 5345
 5346#ifdef CONFIG_SMP
 
 
 
 
 
 5347
 5348/* Working cpumask for: load_balance, load_balance_newidle. */
 5349DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
/*
 * Per-CPU scratch cpumask; select_idle_core() fills it with the candidate
 * CPUs and clears the ones it has already examined.
 */
 5350DEFINE_PER_CPU(cpumask_var_t, select_idle_mask);
 
 
 
 
 
 
 
 
 5351
 5352#ifdef CONFIG_NO_HZ_COMMON
 
 5353
/*
 * File-local, cacheline-aligned state that only exists with
 * CONFIG_NO_HZ_COMMON.
 * NOTE(review): idle_cpus_mask / nr_cpus presumably describe the CPUs that
 * are currently in nohz idle -- confirm against their users in this file.
 */
 5354static struct {
 5355	cpumask_var_t idle_cpus_mask;
 5356	atomic_t nr_cpus;
 5357	int has_blocked;		/* Idle CPUS has blocked load */
 5358	unsigned long next_balance;     /* in jiffy units */
 5359	unsigned long next_blocked;	/* Next update of blocked load in jiffies */
 5360} nohz ____cacheline_aligned;
 5361
 5362#endif /* CONFIG_NO_HZ_COMMON */
 5363
 5364/* CPU only has SCHED_IDLE tasks enqueued */
 5365static int sched_idle_cpu(int cpu)
 
 
 
 5366{
 5367	struct rq *rq = cpu_rq(cpu);
 
 5368
 5369	return unlikely(rq->nr_running == rq->cfs.idle_h_nr_running &&
 5370			rq->nr_running);
 5371}
 5372
 5373static unsigned long cpu_runnable_load(struct rq *rq)
 5374{
 5375	return cfs_rq_runnable_load_avg(&rq->cfs);
 5376}
 5377
 5378static unsigned long capacity_of(int cpu)
 5379{
 5380	return cpu_rq(cpu)->cpu_capacity;
 5381}
 5382
 5383static unsigned long cpu_avg_load_per_task(int cpu)
 5384{
 5385	struct rq *rq = cpu_rq(cpu);
 5386	unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running);
 5387	unsigned long load_avg = cpu_runnable_load(rq);
 5388
 5389	if (nr_running)
 5390		return load_avg / nr_running;
 5391
 5392	return 0;
 5393}
 5394
 5395static void record_wakee(struct task_struct *p)
 
 5396{
 5397	/*
 5398	 * Only decay a single time; tasks that have less then 1 wakeup per
 5399	 * jiffy will not have built up many flips.
 5400	 */
 5401	if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) {
 5402		current->wakee_flips >>= 1;
 5403		current->wakee_flip_decay_ts = jiffies;
 5404	}
 5405
 5406	if (current->last_wakee != p) {
 5407		current->last_wakee = p;
 5408		current->wakee_flips++;
 5409	}
 
 
 
 
 
 
 
 
 
 5410}
 5411
 
 5412/*
 5413 * Detect M:N waker/wakee relationships via a switching-frequency heuristic.
 
 
 
 
 
 
 
 
 
 
 
 5414 *
 5415 * A waker of many should wake a different task than the one last awakened
 5416 * at a frequency roughly N times higher than one of its wakees.
 5417 *
 5418 * In order to determine whether we should let the load spread vs consolidating
 5419 * to shared cache, we look for a minimum 'flip' frequency of llc_size in one
 5420 * partner, and a factor of lls_size higher frequency in the other.
 5421 *
 5422 * With both conditions met, we can be relatively sure that the relationship is
 5423 * non-monogamous, with partner count exceeding socket size.
 5424 *
 5425 * Waker/wakee being client/server, worker/dispatcher, interrupt source or
 5426 * whatever is irrelevant, spread criteria is apparent partner count exceeds
 5427 * socket size.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 5428 */
 5429static int wake_wide(struct task_struct *p)
 5430{
 5431	unsigned int master = current->wakee_flips;
 5432	unsigned int slave = p->wakee_flips;
 5433	int factor = this_cpu_read(sd_llc_size);
 5434
 5435	if (master < slave)
 5436		swap(master, slave);
 5437	if (slave < factor || master < slave * factor)
 5438		return 0;
 5439	return 1;
 5440}
 5441
 5442/*
 5443 * The purpose of wake_affine() is to quickly determine on which CPU we can run
 5444 * soonest. For the purpose of speed we only consider the waking and previous
 5445 * CPU.
 5446 *
 5447 * wake_affine_idle() - only considers 'now', it check if the waking CPU is
 5448 *			cache-affine and is (or	will be) idle.
 5449 *
 5450 * wake_affine_weight() - considers the weight to reflect the average
 5451 *			  scheduling latency of the CPUs. This seems to work
 5452 *			  for the overloaded case.
 5453 */
 5454static int
 5455wake_affine_idle(int this_cpu, int prev_cpu, int sync)
 5456{
 5457	/*
 5458	 * If this_cpu is idle, it implies the wakeup is from interrupt
 5459	 * context. Only allow the move if cache is shared. Otherwise an
 5460	 * interrupt intensive workload could force all tasks onto one
 5461	 * node depending on the IO topology or IRQ affinity settings.
 5462	 *
 5463	 * If the prev_cpu is idle and cache affine then avoid a migration.
 5464	 * There is no guarantee that the cache hot data from an interrupt
 5465	 * is more important than cache hot data on the prev_cpu and from
 5466	 * a cpufreq perspective, it's better to have higher utilisation
 5467	 * on one CPU.
 5468	 */
 5469	if (available_idle_cpu(this_cpu) && cpus_share_cache(this_cpu, prev_cpu))
 5470		return available_idle_cpu(prev_cpu) ? prev_cpu : this_cpu;
 5471
 5472	if (sync && cpu_rq(this_cpu)->nr_running == 1)
 5473		return this_cpu;
 5474
 5475	return nr_cpumask_bits;
 5476}
 
 
 5477
 5478static int
 5479wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
 5480		   int this_cpu, int prev_cpu, int sync)
 5481{
 5482	s64 this_eff_load, prev_eff_load;
 5483	unsigned long task_load;
 5484
 5485	this_eff_load = cpu_runnable_load(cpu_rq(this_cpu));
 
 
 
 
 
 
 5486
 5487	if (sync) {
 5488		unsigned long current_load = task_h_load(current);
 
 
 
 
 
 5489
 5490		if (current_load > this_eff_load)
 5491			return this_cpu;
 
 
 5492
 5493		this_eff_load -= current_load;
 
 
 
 
 
 
 
 5494	}
 5495
 5496	task_load = task_h_load(p);
 
 
 5497
 5498	this_eff_load += task_load;
 5499	if (sched_feat(WA_BIAS))
 5500		this_eff_load *= 100;
 5501	this_eff_load *= capacity_of(prev_cpu);
 
 5502
 5503	prev_eff_load = cpu_runnable_load(cpu_rq(prev_cpu));
 5504	prev_eff_load -= task_load;
 5505	if (sched_feat(WA_BIAS))
 5506		prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2;
 5507	prev_eff_load *= capacity_of(this_cpu);
 
 
 
 
 
 
 
 
 
 
 
 5508
 5509	/*
 5510	 * If sync, adjust the weight of prev_eff_load such that if
 5511	 * prev_eff == this_eff that select_idle_sibling() will consider
 5512	 * stacking the wakee on top of the waker if no other CPU is
 5513	 * idle.
 5514	 */
 5515	if (sync)
 5516		prev_eff_load += 1;
 
 5517
 5518	return this_eff_load < prev_eff_load ? this_cpu : nr_cpumask_bits;
 5519}
 
 5520
 5521static int wake_affine(struct sched_domain *sd, struct task_struct *p,
 5522		       int this_cpu, int prev_cpu, int sync)
 5523{
 5524	int target = nr_cpumask_bits;
 5525
 5526	if (sched_feat(WA_IDLE))
 5527		target = wake_affine_idle(this_cpu, prev_cpu, sync);
 
 
 
 
 
 
 
 
 
 5528
 5529	if (sched_feat(WA_WEIGHT) && target == nr_cpumask_bits)
 5530		target = wake_affine_weight(sd, p, this_cpu, prev_cpu, sync);
 
 
 
 
 
 
 5531
 5532	schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);
 5533	if (target == nr_cpumask_bits)
 5534		return prev_cpu;
 5535
 5536	schedstat_inc(sd->ttwu_move_affine);
 5537	schedstat_inc(p->se.statistics.nr_wakeups_affine);
 5538	return target;
 5539}
 
 
 
 5540
 5541static unsigned long cpu_util_without(int cpu, struct task_struct *p);
 
 5542
 5543static unsigned long capacity_spare_without(int cpu, struct task_struct *p)
 5544{
 5545	return max_t(long, capacity_of(cpu) - cpu_util_without(cpu, p), 0);
 
 
 
 
 
 
 
 
 
 
 
 5546}
 5547
 5548/*
 5549 * find_idlest_group finds and returns the least busy CPU group within the
 5550 * domain.
 5551 *
 5552 * Assumes p is allowed on at least one CPU in sd.
 5553 */
 5554static struct sched_group *
 5555find_idlest_group(struct sched_domain *sd, struct task_struct *p,
 5556		  int this_cpu, int sd_flag)
 5557{
 5558	struct sched_group *idlest = NULL, *group = sd->groups;
 5559	struct sched_group *most_spare_sg = NULL;
 5560	unsigned long min_runnable_load = ULONG_MAX;
 5561	unsigned long this_runnable_load = ULONG_MAX;
 5562	unsigned long min_avg_load = ULONG_MAX, this_avg_load = ULONG_MAX;
 5563	unsigned long most_spare = 0, this_spare = 0;
 5564	int imbalance_scale = 100 + (sd->imbalance_pct-100)/2;
 5565	unsigned long imbalance = scale_load_down(NICE_0_LOAD) *
 5566				(sd->imbalance_pct-100) / 100;
 5567
 5568	do {
 5569		unsigned long load, avg_load, runnable_load;
 5570		unsigned long spare_cap, max_spare_cap;
 5571		int local_group;
 5572		int i;
 5573
 5574		/* Skip over this group if it has no CPUs allowed */
 5575		if (!cpumask_intersects(sched_group_span(group),
 5576					p->cpus_ptr))
 5577			continue;
 5578
 5579		local_group = cpumask_test_cpu(this_cpu,
 5580					       sched_group_span(group));
 5581
 5582		/*
 5583		 * Tally up the load of all CPUs in the group and find
 5584		 * the group containing the CPU with most spare capacity.
 5585		 */
 5586		avg_load = 0;
 5587		runnable_load = 0;
 5588		max_spare_cap = 0;
 5589
 5590		for_each_cpu(i, sched_group_span(group)) {
 5591			load = cpu_runnable_load(cpu_rq(i));
 5592			runnable_load += load;
 5593
 5594			avg_load += cfs_rq_load_avg(&cpu_rq(i)->cfs);
 5595
 5596			spare_cap = capacity_spare_without(i, p);
 5597
 5598			if (spare_cap > max_spare_cap)
 5599				max_spare_cap = spare_cap;
 5600		}
 5601
 5602		/* Adjust by relative CPU capacity of the group */
 5603		avg_load = (avg_load * SCHED_CAPACITY_SCALE) /
 5604					group->sgc->capacity;
 5605		runnable_load = (runnable_load * SCHED_CAPACITY_SCALE) /
 5606					group->sgc->capacity;
 5607
 5608		if (local_group) {
 5609			this_runnable_load = runnable_load;
 5610			this_avg_load = avg_load;
 5611			this_spare = max_spare_cap;
 5612		} else {
 5613			if (min_runnable_load > (runnable_load + imbalance)) {
 5614				/*
 5615				 * The runnable load is significantly smaller
 5616				 * so we can pick this new CPU:
 5617				 */
 5618				min_runnable_load = runnable_load;
 5619				min_avg_load = avg_load;
 5620				idlest = group;
 5621			} else if ((runnable_load < (min_runnable_load + imbalance)) &&
 5622				   (100*min_avg_load > imbalance_scale*avg_load)) {
 5623				/*
 5624				 * The runnable loads are close so take the
 5625				 * blocked load into account through avg_load:
 5626				 */
 5627				min_avg_load = avg_load;
 5628				idlest = group;
 5629			}
 5630
 5631			if (most_spare < max_spare_cap) {
 5632				most_spare = max_spare_cap;
 5633				most_spare_sg = group;
 5634			}
 5635		}
 5636	} while (group = group->next, group != sd->groups);
 5637
 5638	/*
 5639	 * The cross-over point between using spare capacity or least load
 5640	 * is too conservative for high utilization tasks on partially
 5641	 * utilized systems if we require spare_capacity > task_util(p),
 5642	 * so we allow for some task stuffing by using
 5643	 * spare_capacity > task_util(p)/2.
 5644	 *
 5645	 * Spare capacity can't be used for fork because the utilization has
 5646	 * not been set yet, we must first select a rq to compute the initial
 5647	 * utilization.
 5648	 */
 5649	if (sd_flag & SD_BALANCE_FORK)
 5650		goto skip_spare;
 5651
 5652	if (this_spare > task_util(p) / 2 &&
 5653	    imbalance_scale*this_spare > 100*most_spare)
 5654		return NULL;
 5655
 5656	if (most_spare > task_util(p) / 2)
 5657		return most_spare_sg;
 5658
 5659skip_spare:
 5660	if (!idlest)
 5661		return NULL;
 5662
 5663	/*
 5664	 * When comparing groups across NUMA domains, it's possible for the
 5665	 * local domain to be very lightly loaded relative to the remote
 5666	 * domains but "imbalance" skews the comparison making remote CPUs
 5667	 * look much more favourable. When considering cross-domain, add
 5668	 * imbalance to the runnable load on the remote node and consider
 5669	 * staying local.
 5670	 */
 5671	if ((sd->flags & SD_NUMA) &&
 5672	    min_runnable_load + imbalance >= this_runnable_load)
 5673		return NULL;
 5674
 5675	if (min_runnable_load > (this_runnable_load + imbalance))
 5676		return NULL;
 5677
 5678	if ((this_runnable_load < (min_runnable_load + imbalance)) &&
 5679	     (100*this_avg_load < imbalance_scale*min_avg_load))
 5680		return NULL;
 5681
 5682	return idlest;
 5683}
 5684
 5685/*
 5686 * find_idlest_group_cpu - find the idlest CPU among the CPUs in the group.
 5687 */
 5688static int
 5689find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
 5690{
 5691	unsigned long load, min_load = ULONG_MAX;
 5692	unsigned int min_exit_latency = UINT_MAX;
 5693	u64 latest_idle_timestamp = 0;
 5694	int least_loaded_cpu = this_cpu;
 5695	int shallowest_idle_cpu = -1, si_cpu = -1;
 5696	int i;
 5697
 5698	/* Check if we have any choice: */
 5699	if (group->group_weight == 1)
 5700		return cpumask_first(sched_group_span(group));
 5701
 5702	/* Traverse only the allowed CPUs */
 5703	for_each_cpu_and(i, sched_group_span(group), p->cpus_ptr) {
 5704		if (available_idle_cpu(i)) {
 5705			struct rq *rq = cpu_rq(i);
 5706			struct cpuidle_state *idle = idle_get_state(rq);
 5707			if (idle && idle->exit_latency < min_exit_latency) {
 5708				/*
 5709				 * We give priority to a CPU whose idle state
 5710				 * has the smallest exit latency irrespective
 5711				 * of any idle timestamp.
 5712				 */
 5713				min_exit_latency = idle->exit_latency;
 5714				latest_idle_timestamp = rq->idle_stamp;
 5715				shallowest_idle_cpu = i;
 5716			} else if ((!idle || idle->exit_latency == min_exit_latency) &&
 5717				   rq->idle_stamp > latest_idle_timestamp) {
 5718				/*
 5719				 * If equal or no active idle state, then
 5720				 * the most recently idled CPU might have
 5721				 * a warmer cache.
 5722				 */
 5723				latest_idle_timestamp = rq->idle_stamp;
 5724				shallowest_idle_cpu = i;
 5725			}
 5726		} else if (shallowest_idle_cpu == -1 && si_cpu == -1) {
 5727			if (sched_idle_cpu(i)) {
 5728				si_cpu = i;
 5729				continue;
 5730			}
 5731
 5732			load = cpu_runnable_load(cpu_rq(i));
 5733			if (load < min_load) {
 5734				min_load = load;
 5735				least_loaded_cpu = i;
 5736			}
 5737		}
 5738	}
 5739
 5740	if (shallowest_idle_cpu != -1)
 5741		return shallowest_idle_cpu;
 5742	if (si_cpu != -1)
 5743		return si_cpu;
 5744	return least_loaded_cpu;
 5745}
 5746
 5747static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p,
 5748				  int cpu, int prev_cpu, int sd_flag)
 5749{
 5750	int new_cpu = cpu;
 5751
 5752	if (!cpumask_intersects(sched_domain_span(sd), p->cpus_ptr))
 5753		return prev_cpu;
 5754
 5755	/*
 5756	 * We need task's util for capacity_spare_without, sync it up to
 5757	 * prev_cpu's last_update_time.
 5758	 */
 5759	if (!(sd_flag & SD_BALANCE_FORK))
 5760		sync_entity_load_avg(&p->se);
 5761
 5762	while (sd) {
 5763		struct sched_group *group;
 5764		struct sched_domain *tmp;
 5765		int weight;
 5766
 5767		if (!(sd->flags & sd_flag)) {
 5768			sd = sd->child;
 5769			continue;
 5770		}
 5771
 5772		group = find_idlest_group(sd, p, cpu, sd_flag);
 5773		if (!group) {
 5774			sd = sd->child;
 5775			continue;
 5776		}
 5777
 5778		new_cpu = find_idlest_group_cpu(group, p, cpu);
 5779		if (new_cpu == cpu) {
 5780			/* Now try balancing at a lower domain level of 'cpu': */
 5781			sd = sd->child;
 5782			continue;
 5783		}
 5784
 5785		/* Now try balancing at a lower domain level of 'new_cpu': */
 5786		cpu = new_cpu;
 5787		weight = sd->span_weight;
 5788		sd = NULL;
 5789		for_each_domain(cpu, tmp) {
 5790			if (weight <= tmp->span_weight)
 5791				break;
 5792			if (tmp->flags & sd_flag)
 5793				sd = tmp;
 5794		}
 5795	}
 5796
 5797	return new_cpu;
 5798}
 5799
 5800#ifdef CONFIG_SCHED_SMT
 5801DEFINE_STATIC_KEY_FALSE(sched_smt_present);
 5802EXPORT_SYMBOL_GPL(sched_smt_present);
 5803
 5804static inline void set_idle_cores(int cpu, int val)
 5805{
 5806	struct sched_domain_shared *sds;
 5807
 5808	sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
 5809	if (sds)
 5810		WRITE_ONCE(sds->has_idle_cores, val);
 5811}
 5812
 5813static inline bool test_idle_cores(int cpu, bool def)
 5814{
 5815	struct sched_domain_shared *sds;
 5816
 5817	sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
 5818	if (sds)
 5819		return READ_ONCE(sds->has_idle_cores);
 5820
 5821	return def;
 5822}
 5823
 5824/*
 5825 * Scans the local SMT mask to see if the entire core is idle, and records this
 5826 * information in sd_llc_shared->has_idle_cores.
 5827 *
 5828 * Since SMT siblings share all cache levels, inspecting this limited remote
 5829 * state should be fairly cheap.
 5830 */
 5831void __update_idle_core(struct rq *rq)
 5832{
 5833	int core = cpu_of(rq);
 5834	int cpu;
 5835
 5836	rcu_read_lock();
 5837	if (test_idle_cores(core, true))
 5838		goto unlock;
 5839
 5840	for_each_cpu(cpu, cpu_smt_mask(core)) {
 5841		if (cpu == core)
 5842			continue;
 5843
 5844		if (!available_idle_cpu(cpu))
 5845			goto unlock;
 5846	}
 5847
 5848	set_idle_cores(core, 1);
 5849unlock:
 5850	rcu_read_unlock();
 5851}
 5852
 5853/*
 5854 * Scan the entire LLC domain for idle cores; this dynamically switches off if
 5855 * there are no idle cores left in the system; tracked through
 5856 * sd_llc->shared->has_idle_cores and enabled through update_idle_core() above.
 5857 */
 5858static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
 5859{
 5860	struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
 5861	int core, cpu;
 5862
 5863	if (!static_branch_likely(&sched_smt_present))
 5864		return -1;
 5865
 5866	if (!test_idle_cores(target, false))
 5867		return -1;
 5868
 5869	cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
 5870
 5871	for_each_cpu_wrap(core, cpus, target) {
 5872		bool idle = true;
 5873
 5874		for_each_cpu(cpu, cpu_smt_mask(core)) {
 5875			__cpumask_clear_cpu(cpu, cpus);
 5876			if (!available_idle_cpu(cpu))
 5877				idle = false;
 5878		}
 5879
 5880		if (idle)
 5881			return core;
 5882	}
 5883
 5884	/*
 5885	 * Failed to find an idle core; stop looking for one.
 5886	 */
 5887	set_idle_cores(target, 0);
 5888
 5889	return -1;
 5890}
 5891
 5892/*
 5893 * Scan the local SMT mask for idle CPUs.
 5894 */
 5895static int select_idle_smt(struct task_struct *p, int target)
 5896{
 5897	int cpu, si_cpu = -1;
 5898
 5899	if (!static_branch_likely(&sched_smt_present))
 5900		return -1;
 5901
 5902	for_each_cpu(cpu, cpu_smt_mask(target)) {
 5903		if (!cpumask_test_cpu(cpu, p->cpus_ptr))
 5904			continue;
 5905		if (available_idle_cpu(cpu))
 5906			return cpu;
 5907		if (si_cpu == -1 && sched_idle_cpu(cpu))
 5908			si_cpu = cpu;
 5909	}
 5910
 5911	return si_cpu;
 5912}
 5913
 5914#else /* CONFIG_SCHED_SMT */
 5915
 5916static inline int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
 5917{
 5918	return -1;
 5919}
 5920
 5921static inline int select_idle_smt(struct task_struct *p, int target)
 5922{
 5923	return -1;
 5924}
 5925
 5926#endif /* CONFIG_SCHED_SMT */
 5927
 5928/*
 5929 * Scan the LLC domain for idle CPUs; this is dynamically regulated by
 5930 * comparing the average scan cost (tracked in sd->avg_scan_cost) against the
 5931 * average idle time for this rq (as found in rq->avg_idle).
 5932 */
 5933static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target)
 5934{
 5935	struct sched_domain *this_sd;
 5936	u64 avg_cost, avg_idle;
 5937	u64 time, cost;
 5938	s64 delta;
 5939	int this = smp_processor_id();
 5940	int cpu, nr = INT_MAX, si_cpu = -1;
 5941
 5942	this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
 5943	if (!this_sd)
 5944		return -1;
 5945
 5946	/*
 5947	 * Due to large variance we need a large fuzz factor; hackbench in
 5948	 * particularly is sensitive here.
 5949	 */
 5950	avg_idle = this_rq()->avg_idle / 512;
 5951	avg_cost = this_sd->avg_scan_cost + 1;
 5952
 5953	if (sched_feat(SIS_AVG_CPU) && avg_idle < avg_cost)
 5954		return -1;
 5955
 5956	if (sched_feat(SIS_PROP)) {
 5957		u64 span_avg = sd->span_weight * avg_idle;
 5958		if (span_avg > 4*avg_cost)
 5959			nr = div_u64(span_avg, avg_cost);
 5960		else
 5961			nr = 4;
 5962	}
 5963
 5964	time = cpu_clock(this);
 5965
 5966	for_each_cpu_wrap(cpu, sched_domain_span(sd), target) {
 5967		if (!--nr)
 5968			return si_cpu;
 5969		if (!cpumask_test_cpu(cpu, p->cpus_ptr))
 5970			continue;
 5971		if (available_idle_cpu(cpu))
 5972			break;
 5973		if (si_cpu == -1 && sched_idle_cpu(cpu))
 5974			si_cpu = cpu;
 5975	}
 5976
 5977	time = cpu_clock(this) - time;
 5978	cost = this_sd->avg_scan_cost;
 5979	delta = (s64)(time - cost) / 8;
 5980	this_sd->avg_scan_cost += delta;
 5981
 5982	return cpu;
 5983}
 5984
 5985/*
 5986 * Try and locate an idle core/thread in the LLC cache domain.
 5987 */
 5988static int select_idle_sibling(struct task_struct *p, int prev, int target)
 5989{
 
 
 5990	struct sched_domain *sd;
 5991	int i, recent_used_cpu;
 5992
 5993	if (available_idle_cpu(target) || sched_idle_cpu(target))
 5994		return target;
 5995
 5996	/*
 5997	 * If the previous CPU is cache affine and idle, don't be stupid:
 5998	 */
 5999	if (prev != target && cpus_share_cache(prev, target) &&
 6000	    (available_idle_cpu(prev) || sched_idle_cpu(prev)))
 6001		return prev;
 6002
 6003	/* Check a recently used CPU as a potential idle candidate: */
 6004	recent_used_cpu = p->recent_used_cpu;
 6005	if (recent_used_cpu != prev &&
 6006	    recent_used_cpu != target &&
 6007	    cpus_share_cache(recent_used_cpu, target) &&
 6008	    (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) &&
 6009	    cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr)) {
 6010		/*
 6011		 * Replace recent_used_cpu with prev as it is a potential
 6012		 * candidate for the next wake:
 6013		 */
 6014		p->recent_used_cpu = prev;
 6015		return recent_used_cpu;
 6016	}
 6017
 6018	sd = rcu_dereference(per_cpu(sd_llc, target));
 6019	if (!sd)
 6020		return target;
 6021
 6022	i = select_idle_core(p, sd, target);
 6023	if ((unsigned)i < nr_cpumask_bits)
 6024		return i;
 6025
 6026	i = select_idle_cpu(p, sd, target);
 6027	if ((unsigned)i < nr_cpumask_bits)
 6028		return i;
 6029
 6030	i = select_idle_smt(p, target);
 6031	if ((unsigned)i < nr_cpumask_bits)
 6032		return i;
 6033
 6034	return target;
 6035}
 6036
 6037/**
 6038 * Amount of capacity of a CPU that is (estimated to be) used by CFS tasks
 6039 * @cpu: the CPU to get the utilization of
 6040 *
 6041 * The unit of the return value must be the one of capacity so we can compare
 6042 * the utilization with the capacity of the CPU that is available for CFS task
 6043 * (ie cpu_capacity).
 6044 *
 6045 * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the
 6046 * recent utilization of currently non-runnable tasks on a CPU. It represents
 6047 * the amount of utilization of a CPU in the range [0..capacity_orig] where
 6048 * capacity_orig is the cpu_capacity available at the highest frequency
 6049 * (arch_scale_freq_capacity()).
 6050 * The utilization of a CPU converges towards a sum equal to or less than the
 6051 * current capacity (capacity_curr <= capacity_orig) of the CPU because it is
 6052 * the running time on this CPU scaled by capacity_curr.
 6053 *
 6054 * The estimated utilization of a CPU is defined to be the maximum between its
 6055 * cfs_rq.avg.util_avg and the sum of the estimated utilization of the tasks
 6056 * currently RUNNABLE on that CPU.
 6057 * This allows to properly represent the expected utilization of a CPU which
 6058 * has just got a big task running since a long sleep period. At the same time
 6059 * however it preserves the benefits of the "blocked utilization" in
 6060 * describing the potential for other tasks waking up on the same CPU.
 6061 *
 6062 * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even
 6063 * higher than capacity_orig because of unfortunate rounding in
 6064 * cfs.avg.util_avg or just after migrating tasks and new task wakeups until
 6065 * the average stabilizes with the new running time. We need to check that the
 6066 * utilization stays within the range of [0..capacity_orig] and cap it if
 6067 * necessary. Without utilization capping, a group could be seen as overloaded
 6068 * (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of
 6069 * available capacity. We allow utilization to overshoot capacity_curr (but not
 6070 * capacity_orig) as it useful for predicting the capacity required after task
 6071 * migrations (scheduler-driven DVFS).
 6072 *
 6073 * Return: the (estimated) utilization for the specified CPU
 6074 */
 6075static inline unsigned long cpu_util(int cpu)
 6076{
 6077	struct cfs_rq *cfs_rq;
 6078	unsigned int util;
 6079
 6080	cfs_rq = &cpu_rq(cpu)->cfs;
 6081	util = READ_ONCE(cfs_rq->avg.util_avg);
 6082
 6083	if (sched_feat(UTIL_EST))
 6084		util = max(util, READ_ONCE(cfs_rq->avg.util_est.enqueued));
 6085
 6086	return min_t(unsigned long, util, capacity_orig_of(cpu));
 6087}
 6088
 6089/*
 6090 * cpu_util_without: compute cpu utilization without any contributions from *p
 6091 * @cpu: the CPU which utilization is requested
 6092 * @p: the task which utilization should be discounted
 6093 *
 6094 * The utilization of a CPU is defined by the utilization of tasks currently
 6095 * enqueued on that CPU as well as tasks which are currently sleeping after an
 6096 * execution on that CPU.
 6097 *
 6098 * This method returns the utilization of the specified CPU by discounting the
 6099 * utilization of the specified task, whenever the task is currently
 6100 * contributing to the CPU utilization.
 6101 */
 6102static unsigned long cpu_util_without(int cpu, struct task_struct *p)
 6103{
 6104	struct cfs_rq *cfs_rq;
 6105	unsigned int util;
 6106
 6107	/* Task has no contribution or is new */
 6108	if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
 6109		return cpu_util(cpu);
 6110
 6111	cfs_rq = &cpu_rq(cpu)->cfs;
 6112	util = READ_ONCE(cfs_rq->avg.util_avg);
 6113
 6114	/* Discount task's util from CPU's util */
 6115	lsub_positive(&util, task_util(p));
 6116
 6117	/*
 6118	 * Covered cases:
 6119	 *
 6120	 * a) if *p is the only task sleeping on this CPU, then:
 6121	 *      cpu_util (== task_util) > util_est (== 0)
 6122	 *    and thus we return:
 6123	 *      cpu_util_without = (cpu_util - task_util) = 0
 6124	 *
 6125	 * b) if other tasks are SLEEPING on this CPU, which is now exiting
 6126	 *    IDLE, then:
 6127	 *      cpu_util >= task_util
 6128	 *      cpu_util > util_est (== 0)
 6129	 *    and thus we discount *p's blocked utilization to return:
 6130	 *      cpu_util_without = (cpu_util - task_util) >= 0
 6131	 *
 6132	 * c) if other tasks are RUNNABLE on that CPU and
 6133	 *      util_est > cpu_util
 6134	 *    then we use util_est since it returns a more restrictive
 6135	 *    estimation of the spare capacity on that CPU, by just
 6136	 *    considering the expected utilization of tasks already
 6137	 *    runnable on that CPU.
 6138	 *
 6139	 * Cases a) and b) are covered by the above code, while case c) is
 6140	 * covered by the following code when estimated utilization is
 6141	 * enabled.
 6142	 */
 6143	if (sched_feat(UTIL_EST)) {
 6144		unsigned int estimated =
 6145			READ_ONCE(cfs_rq->avg.util_est.enqueued);
 6146
 6147		/*
 6148		 * Despite the following checks we still have a small window
 6149		 * for a possible race, when an execl's select_task_rq_fair()
 6150		 * races with LB's detach_task():
 6151		 *
 6152		 *   detach_task()
 6153		 *     p->on_rq = TASK_ON_RQ_MIGRATING;
 6154		 *     ---------------------------------- A
 6155		 *     deactivate_task()                   \
 6156		 *       dequeue_task()                     + RaceTime
 6157		 *         util_est_dequeue()              /
 6158		 *     ---------------------------------- B
 6159		 *
 6160		 * The additional check on "current == p" it's required to
 6161		 * properly fix the execl regression and it helps in further
 6162		 * reducing the chances for the above race.
 6163		 */
 6164		if (unlikely(task_on_rq_queued(p) || current == p))
 6165			lsub_positive(&estimated, _task_util_est(p));
 6166
 6167		util = max(util, estimated);
 6168	}
 6169
 6170	/*
 6171	 * Utilization (estimated) can exceed the CPU capacity, thus let's
 6172	 * clamp to the maximum CPU capacity to ensure consistency with
 6173	 * the cpu_util call.
 6174	 */
 6175	return min_t(unsigned long, util, capacity_orig_of(cpu));
 6176}
 6177
 6178/*
 6179 * Disable WAKE_AFFINE in the case where task @p doesn't fit in the
 6180 * capacity of either the waking CPU @cpu or the previous CPU @prev_cpu.
 6181 *
 6182 * In that case WAKE_AFFINE doesn't make sense and we'll let
 6183 * BALANCE_WAKE sort things out.
 6184 */
 6185static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
 6186{
 6187	long min_cap, max_cap;
 6188
 6189	if (!static_branch_unlikely(&sched_asym_cpucapacity))
 6190		return 0;
 6191
 6192	min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu));
 6193	max_cap = cpu_rq(cpu)->rd->max_cpu_capacity;
 6194
 6195	/* Minimum capacity is close to max, no need to abort wake_affine */
 6196	if (max_cap - min_cap < max_cap >> 3)
 6197		return 0;
 6198
 6199	/* Bring task utilization in sync with prev_cpu */
 6200	sync_entity_load_avg(&p->se);
 6201
 6202	return !task_fits_capacity(p, min_cap);
 6203}
 6204
 6205/*
 6206 * Predicts what cpu_util(@cpu) would return if @p was migrated (and enqueued)
 6207 * to @dst_cpu.
 6208 */
 6209static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu)
 6210{
 6211	struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
 6212	unsigned long util_est, util = READ_ONCE(cfs_rq->avg.util_avg);
 6213
 6214	/*
 6215	 * If @p migrates from @cpu to another, remove its contribution. Or,
 6216	 * if @p migrates from another CPU to @cpu, add its contribution. In
 6217	 * the other cases, @cpu is not impacted by the migration, so the
 6218	 * util_avg should already be correct.
 6219	 */
 6220	if (task_cpu(p) == cpu && dst_cpu != cpu)
 6221		sub_positive(&util, task_util(p));
 6222	else if (task_cpu(p) != cpu && dst_cpu == cpu)
 6223		util += task_util(p);
 6224
 6225	if (sched_feat(UTIL_EST)) {
 6226		util_est = READ_ONCE(cfs_rq->avg.util_est.enqueued);
 6227
 6228		/*
 6229		 * During wake-up, the task isn't enqueued yet and doesn't
 6230		 * appear in the cfs_rq->avg.util_est.enqueued of any rq,
 6231		 * so just add it (if needed) to "simulate" what will be
 6232		 * cpu_util() after the task has been enqueued.
 6233		 */
 6234		if (dst_cpu == cpu)
 6235			util_est += _task_util_est(p);
 6236
 6237		util = max(util, util_est);
 6238	}
 6239
 6240	return min(util, capacity_orig_of(cpu));
 6241}
 6242
 6243/*
 6244 * compute_energy(): Estimates the energy that @pd would consume if @p was
 6245 * migrated to @dst_cpu. compute_energy() predicts what will be the utilization
 6246 * landscape of @pd's CPUs after the task migration, and uses the Energy Model
 6247 * to compute what would be the energy if we decided to actually migrate that
 6248 * task.
 6249 */
 6250static long
 6251compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
 6252{
 6253	struct cpumask *pd_mask = perf_domain_span(pd);
 6254	unsigned long cpu_cap = arch_scale_cpu_capacity(cpumask_first(pd_mask));
 6255	unsigned long max_util = 0, sum_util = 0;
 6256	int cpu;
 6257
 6258	/*
 6259	 * The capacity state of CPUs of the current rd can be driven by CPUs
 6260	 * of another rd if they belong to the same pd. So, account for the
 6261	 * utilization of these CPUs too by masking pd with cpu_online_mask
 6262	 * instead of the rd span.
 6263	 *
 6264	 * If an entire pd is outside of the current rd, it will not appear in
 6265	 * its pd list and will not be accounted by compute_energy().
 6266	 */
 6267	for_each_cpu_and(cpu, pd_mask, cpu_online_mask) {
 6268		unsigned long cpu_util, util_cfs = cpu_util_next(cpu, p, dst_cpu);
 6269		struct task_struct *tsk = cpu == dst_cpu ? p : NULL;
 6270
 6271		/*
 6272		 * Busy time computation: utilization clamping is not
 6273		 * required since the ratio (sum_util / cpu_capacity)
 6274		 * is already enough to scale the EM reported power
 6275		 * consumption at the (eventually clamped) cpu_capacity.
 6276		 */
 6277		sum_util += schedutil_cpu_util(cpu, util_cfs, cpu_cap,
 6278					       ENERGY_UTIL, NULL);
 6279
 6280		/*
 6281		 * Performance domain frequency: utilization clamping
 6282		 * must be considered since it affects the selection
 6283		 * of the performance domain frequency.
 6284		 * NOTE: in case RT tasks are running, by default the
 6285		 * FREQUENCY_UTIL's utilization can be max OPP.
 6286		 */
 6287		cpu_util = schedutil_cpu_util(cpu, util_cfs, cpu_cap,
 6288					      FREQUENCY_UTIL, tsk);
 6289		max_util = max(max_util, cpu_util);
 6290	}
 6291
 6292	return em_pd_energy(pd->em_pd, max_util, sum_util);
 6293}
 6294
 6295/*
 6296 * find_energy_efficient_cpu(): Find most energy-efficient target CPU for the
 6297 * waking task. find_energy_efficient_cpu() looks for the CPU with maximum
 6298 * spare capacity in each performance domain and uses it as a potential
 6299 * candidate to execute the task. Then, it uses the Energy Model to figure
 6300 * out which of the CPU candidates is the most energy-efficient.
 6301 *
 6302 * The rationale for this heuristic is as follows. In a performance domain,
 6303 * all the most energy efficient CPU candidates (according to the Energy
 6304 * Model) are those for which we'll request a low frequency. When there are
 6305 * several CPUs for which the frequency request will be the same, we don't
 6306 * have enough data to break the tie between them, because the Energy Model
 6307 * only includes active power costs. With this model, if we assume that
 6308 * frequency requests follow utilization (e.g. using schedutil), the CPU with
 6309 * the maximum spare capacity in a performance domain is guaranteed to be among
 6310 * the best candidates of the performance domain.
 6311 *
 6312 * In practice, it could be preferable from an energy standpoint to pack
 6313 * small tasks on a CPU in order to let other CPUs go in deeper idle states,
 6314 * but that could also hurt our chances to go cluster idle, and we have no
 6315 * ways to tell with the current Energy Model if this is actually a good
 6316 * idea or not. So, find_energy_efficient_cpu() basically favors
 6317 * cluster-packing, and spreading inside a cluster. That should at least be
 6318 * a good thing for latency, and this is consistent with the idea that most
 6319 * of the energy savings of EAS come from the asymmetry of the system, and
 6320 * not so much from breaking the tie between identical CPUs. That's also the
 6321 * reason why EAS is enabled in the topology code only for systems where
 6322 * SD_ASYM_CPUCAPACITY is set.
 6323 *
 6324 * NOTE: Forkees are not accepted in the energy-aware wake-up path because
 6325 * they don't have any useful utilization data yet and it's not possible to
 6326 * forecast their impact on energy consumption. Consequently, they will be
 6327 * placed by find_idlest_cpu() on the least loaded CPU, which might turn out
 6328 * to be energy-inefficient in some use-cases. The alternative would be to
 6329 * bias new tasks towards specific types of CPUs first, or to try to infer
 6330 * their util_avg from the parent task, but those heuristics could hurt
 6331 * other use-cases too. So, until someone finds a better way to solve this,
 6332 * let's keep things simple by re-using the existing slow path.
 6333 */
 6334static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
 6335{
 6336	unsigned long prev_delta = ULONG_MAX, best_delta = ULONG_MAX;
 6337	struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
 6338	unsigned long cpu_cap, util, base_energy = 0;
 6339	int cpu, best_energy_cpu = prev_cpu;
 6340	struct sched_domain *sd;
 6341	struct perf_domain *pd;
 6342
 6343	rcu_read_lock();
 6344	pd = rcu_dereference(rd->pd);
 6345	if (!pd || READ_ONCE(rd->overutilized))
 6346		goto fail;
 6347
 6348	/*
 6349	 * Energy-aware wake-up happens on the lowest sched_domain starting
 6350	 * from sd_asym_cpucapacity spanning over this_cpu and prev_cpu.
 6351	 */
 6352	sd = rcu_dereference(*this_cpu_ptr(&sd_asym_cpucapacity));
 6353	while (sd && !cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
 6354		sd = sd->parent;
 6355	if (!sd)
 6356		goto fail;
 6357
 6358	sync_entity_load_avg(&p->se);
 6359	if (!task_util_est(p))
 6360		goto unlock;
 6361
 6362	for (; pd; pd = pd->next) {
 6363		unsigned long cur_delta, spare_cap, max_spare_cap = 0;
 6364		unsigned long base_energy_pd;
 6365		int max_spare_cap_cpu = -1;
 6366
 6367		/* Compute the 'base' energy of the pd, without @p */
 6368		base_energy_pd = compute_energy(p, -1, pd);
 6369		base_energy += base_energy_pd;
 6370
 6371		for_each_cpu_and(cpu, perf_domain_span(pd), sched_domain_span(sd)) {
 6372			if (!cpumask_test_cpu(cpu, p->cpus_ptr))
 6373				continue;
 6374
 6375			/* Skip CPUs that will be overutilized. */
 6376			util = cpu_util_next(cpu, p, cpu);
 6377			cpu_cap = capacity_of(cpu);
 6378			if (!fits_capacity(util, cpu_cap))
 6379				continue;
 6380
 6381			/* Always use prev_cpu as a candidate. */
 6382			if (cpu == prev_cpu) {
 6383				prev_delta = compute_energy(p, prev_cpu, pd);
 6384				prev_delta -= base_energy_pd;
 6385				best_delta = min(best_delta, prev_delta);
 6386			}
 6387
 6388			/*
 6389			 * Find the CPU with the maximum spare capacity in
 6390			 * the performance domain
 6391			 */
 6392			spare_cap = cpu_cap - util;
 6393			if (spare_cap > max_spare_cap) {
 6394				max_spare_cap = spare_cap;
 6395				max_spare_cap_cpu = cpu;
 6396			}
 6397		}
 6398
 6399		/* Evaluate the energy impact of using this CPU. */
 6400		if (max_spare_cap_cpu >= 0 && max_spare_cap_cpu != prev_cpu) {
 6401			cur_delta = compute_energy(p, max_spare_cap_cpu, pd);
 6402			cur_delta -= base_energy_pd;
 6403			if (cur_delta < best_delta) {
 6404				best_delta = cur_delta;
 6405				best_energy_cpu = max_spare_cap_cpu;
 6406			}
 6407		}
 6408	}
 6409unlock:
 6410	rcu_read_unlock();
 6411
 6412	/*
 6413	 * Pick the best CPU if prev_cpu cannot be used, or if it saves at
 6414	 * least 6% of the energy used by prev_cpu.
 6415	 */
 6416	if (prev_delta == ULONG_MAX)
 6417		return best_energy_cpu;
 6418
 6419	if ((prev_delta - best_delta) > ((prev_delta + base_energy) >> 4))
 6420		return best_energy_cpu;
 6421
 6422	return prev_cpu;
 6423
 6424fail:
 6425	rcu_read_unlock();
 6426
 6427	return -1;
 6428}
 6429
 6430/*
 6431 * select_task_rq_fair: Select target runqueue for the waking task in domains
 6432 * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
 6433 * SD_BALANCE_FORK, or SD_BALANCE_EXEC.
 6434 *
 6435 * Balances load by selecting the idlest CPU in the idlest group, or under
 6436 * certain conditions an idle sibling CPU if the domain has SD_WAKE_AFFINE set.
 6437 *
 6438 * Returns the target CPU number.
 6439 *
 6440 * preempt must be disabled.
 6441 */
 6442static int
 6443select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
 6444{
 6445	struct sched_domain *tmp, *sd = NULL;
 6446	int cpu = smp_processor_id();
 6447	int new_cpu = prev_cpu;
 
 6448	int want_affine = 0;
 6449	int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING);
 6450
 6451	if (sd_flag & SD_BALANCE_WAKE) {
 6452		record_wakee(p);
 6453
 6454		if (sched_energy_enabled()) {
 6455			new_cpu = find_energy_efficient_cpu(p, prev_cpu);
 6456			if (new_cpu >= 0)
 6457				return new_cpu;
 6458			new_cpu = prev_cpu;
 6459		}
 6460
 6461		want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu) &&
 6462			      cpumask_test_cpu(cpu, p->cpus_ptr);
 
 
 6463	}
 6464
 6465	rcu_read_lock();
 6466	for_each_domain(cpu, tmp) {
 6467		if (!(tmp->flags & SD_LOAD_BALANCE))
 6468			break;
 6469
 6470		/*
 6471		 * If both 'cpu' and 'prev_cpu' are part of this domain,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 6472		 * cpu is a valid SD_WAKE_AFFINE target.
 6473		 */
 6474		if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
 6475		    cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
 6476			if (cpu != prev_cpu)
 6477				new_cpu = wake_affine(tmp, p, cpu, prev_cpu, sync);
 6478
 6479			sd = NULL; /* Prefer wake_affine over balance flags */
 6480			break;
 6481		}
 6482
 6483		if (tmp->flags & sd_flag)
 6484			sd = tmp;
 6485		else if (!want_affine)
 6486			break;
 6487	}
 6488
 6489	if (unlikely(sd)) {
 6490		/* Slow path */
 6491		new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
 6492	} else if (sd_flag & SD_BALANCE_WAKE) { /* XXX always ? */
 6493		/* Fast path */
 6494
 6495		new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
 6496
 6497		if (want_affine)
 6498			current->recent_used_cpu = cpu;
 6499	}
 6500	rcu_read_unlock();
 6501
 6502	return new_cpu;
 6503}
 6504
 6505static void detach_entity_cfs_rq(struct sched_entity *se);
 6506
 6507/*
 6508 * Called immediately before a task is migrated to a new CPU; task_cpu(p) and
 6509 * cfs_rq_of(p) references at time of call are still valid and identify the
 6510 * previous CPU. The caller guarantees p->pi_lock or task_rq(p)->lock is held.
 6511 */
 6512static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
 6513{
 6514	/*
 6515	 * As blocked tasks retain absolute vruntime the migration needs to
 6516	 * deal with this by subtracting the old and adding the new
 6517	 * min_vruntime -- the latter is done by enqueue_entity() when placing
 6518	 * the task on the new runqueue.
 6519	 */
 6520	if (p->state == TASK_WAKING) {
 6521		struct sched_entity *se = &p->se;
 6522		struct cfs_rq *cfs_rq = cfs_rq_of(se);
 6523		u64 min_vruntime;
 6524
 6525#ifndef CONFIG_64BIT
 6526		u64 min_vruntime_copy;
 6527
 6528		do {
 6529			min_vruntime_copy = cfs_rq->min_vruntime_copy;
 6530			smp_rmb();
 6531			min_vruntime = cfs_rq->min_vruntime;
 6532		} while (min_vruntime != min_vruntime_copy);
 6533#else
 6534		min_vruntime = cfs_rq->min_vruntime;
 6535#endif
 6536
 6537		se->vruntime -= min_vruntime;
 
 6538	}
 6539
 6540	if (p->on_rq == TASK_ON_RQ_MIGRATING) {
 6541		/*
 6542		 * In case of TASK_ON_RQ_MIGRATING we in fact hold the 'old'
 6543		 * rq->lock and can modify state directly.
 6544		 */
 6545		lockdep_assert_held(&task_rq(p)->lock);
 6546		detach_entity_cfs_rq(&p->se);
 6547
 6548	} else {
 6549		/*
 6550		 * We are supposed to update the task to "current" time, then
 6551		 * its up to date and ready to go to new CPU/cfs_rq. But we
 6552		 * have difficulty in getting what current time is, so simply
 6553		 * throw away the out-of-date time. This will result in the
 6554		 * wakee task is less decayed, but giving the wakee more load
 6555		 * sounds not bad.
 6556		 */
 6557		remove_entity_load_avg(&p->se);
 6558	}
 6559
 6560	/* Tell new CPU we are migrated */
 6561	p->se.avg.last_update_time = 0;
 
 
 6562
 6563	/* We have migrated, no longer consider this task hot */
 6564	p->se.exec_start = 0;
 6565
 6566	update_scan_period(p, new_cpu);
 6567}
 
 
 
 6568
 6569static void task_dead_fair(struct task_struct *p)
 6570{
 6571	remove_entity_load_avg(&p->se);
 6572}
 
 
 6573
 6574static int
 6575balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 6576{
 6577	if (rq->nr_running)
 6578		return 1;
 
 
 
 
 
 
 
 
 
 6579
 6580	return newidle_balance(rq, rf) != 0;
 6581}
 6582#endif /* CONFIG_SMP */
 6583
 6584static unsigned long wakeup_gran(struct sched_entity *se)
 
 6585{
 6586	unsigned long gran = sysctl_sched_wakeup_granularity;
 6587
 6588	/*
 6589	 * Since its curr running now, convert the gran from real-time
 6590	 * to virtual-time in his units.
 6591	 *
 6592	 * By using 'se' instead of 'curr' we penalize light tasks, so
 6593	 * they get preempted easier. That is, if 'se' < 'curr' then
 6594	 * the resulting gran will be larger, therefore penalizing the
 6595	 * lighter, if otoh 'se' > 'curr' then the resulting gran will
 6596	 * be smaller, again penalizing the lighter task.
 6597	 *
 6598	 * This is especially important for buddies when the leftmost
 6599	 * task is higher priority than the buddy.
 6600	 */
 6601	return calc_delta_fair(gran, se);
 6602}
 6603
 6604/*
 6605 * Should 'se' preempt 'curr'.
 6606 *
 6607 *             |s1
 6608 *        |s2
 6609 *   |s3
 6610 *         g
 6611 *      |<--->|c
 6612 *
 6613 *  w(c, s1) = -1
 6614 *  w(c, s2) =  0
 6615 *  w(c, s3) =  1
 6616 *
 6617 */
 6618static int
 6619wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
 6620{
 6621	s64 gran, vdiff = curr->vruntime - se->vruntime;
 6622
 6623	if (vdiff <= 0)
 6624		return -1;
 6625
 6626	gran = wakeup_gran(se);
 6627	if (vdiff > gran)
 6628		return 1;
 6629
 6630	return 0;
 6631}
 6632
 6633static void set_last_buddy(struct sched_entity *se)
 6634{
 6635	if (entity_is_task(se) && unlikely(task_has_idle_policy(task_of(se))))
 6636		return;
 6637
 6638	for_each_sched_entity(se) {
 6639		if (SCHED_WARN_ON(!se->on_rq))
 6640			return;
 6641		cfs_rq_of(se)->last = se;
 6642	}
 6643}
 6644
 6645static void set_next_buddy(struct sched_entity *se)
 6646{
 6647	if (entity_is_task(se) && unlikely(task_has_idle_policy(task_of(se))))
 6648		return;
 6649
 6650	for_each_sched_entity(se) {
 6651		if (SCHED_WARN_ON(!se->on_rq))
 6652			return;
 6653		cfs_rq_of(se)->next = se;
 6654	}
 6655}
 6656
 6657static void set_skip_buddy(struct sched_entity *se)
 6658{
 6659	for_each_sched_entity(se)
 6660		cfs_rq_of(se)->skip = se;
 6661}
 6662
 6663/*
 6664 * Preempt the current task with a newly woken task if needed:
 6665 */
 6666static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
 6667{
 6668	struct task_struct *curr = rq->curr;
 6669	struct sched_entity *se = &curr->se, *pse = &p->se;
 6670	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
 6671	int scale = cfs_rq->nr_running >= sched_nr_latency;
 6672	int next_buddy_marked = 0;
 6673
 6674	if (unlikely(se == pse))
 6675		return;
 6676
 6677	/*
 6678	 * This is possible from callers such as attach_tasks(), in which we
 6679	 * unconditionally check_prempt_curr() after an enqueue (which may have
 6680	 * lead to a throttle).  This both saves work and prevents false
 6681	 * next-buddy nomination below.
 6682	 */
 6683	if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
 6684		return;
 6685
 6686	if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
 6687		set_next_buddy(pse);
 6688		next_buddy_marked = 1;
 6689	}
 6690
 6691	/*
 6692	 * We can come here with TIF_NEED_RESCHED already set from new task
 6693	 * wake up path.
 6694	 *
 6695	 * Note: this also catches the edge-case of curr being in a throttled
 6696	 * group (e.g. via set_curr_task), since update_curr() (in the
 6697	 * enqueue of curr) will have resulted in resched being set.  This
 6698	 * prevents us from potentially nominating it as a false LAST_BUDDY
 6699	 * below.
 6700	 */
 6701	if (test_tsk_need_resched(curr))
 6702		return;
 6703
 6704	/* Idle tasks are by definition preempted by non-idle tasks. */
 6705	if (unlikely(task_has_idle_policy(curr)) &&
 6706	    likely(!task_has_idle_policy(p)))
 6707		goto preempt;
 6708
 6709	/*
 6710	 * Batch and idle tasks do not preempt non-idle tasks (their preemption
 6711	 * is driven by the tick):
 6712	 */
 6713	if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))
 6714		return;
 6715
 6716	find_matching_se(&se, &pse);
 6717	update_curr(cfs_rq_of(se));
 6718	BUG_ON(!pse);
 6719	if (wakeup_preempt_entity(se, pse) == 1) {
 6720		/*
 6721		 * Bias pick_next to pick the sched entity that is
 6722		 * triggering this preemption.
 6723		 */
 6724		if (!next_buddy_marked)
 6725			set_next_buddy(pse);
 6726		goto preempt;
 6727	}
 6728
 6729	return;
 6730
 6731preempt:
 6732	resched_curr(rq);
 6733	/*
 6734	 * Only set the backward buddy when the current task is still
 6735	 * on the rq. This can happen when a wakeup gets interleaved
 6736	 * with schedule on the ->pre_schedule() or idle_balance()
 6737	 * point, either of which can * drop the rq lock.
 6738	 *
 6739	 * Also, during early boot the idle thread is in the fair class,
 6740	 * for obvious reasons its a bad idea to schedule back to it.
 6741	 */
 6742	if (unlikely(!se->on_rq || curr == rq->idle))
 6743		return;
 6744
 6745	if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
 6746		set_last_buddy(se);
 6747}
 6748
 6749static struct task_struct *
 6750pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 6751{
 
 6752	struct cfs_rq *cfs_rq = &rq->cfs;
 6753	struct sched_entity *se;
 6754	struct task_struct *p;
 6755	int new_tasks;
 6756
 6757again:
 6758	if (!sched_fair_runnable(rq))
 6759		goto idle;
 6760
 6761#ifdef CONFIG_FAIR_GROUP_SCHED
 6762	if (!prev || prev->sched_class != &fair_sched_class)
 6763		goto simple;
 6764
 6765	/*
 6766	 * Because of the set_next_buddy() in dequeue_task_fair() it is rather
 6767	 * likely that a next task is from the same cgroup as the current.
 6768	 *
 6769	 * Therefore attempt to avoid putting and setting the entire cgroup
 6770	 * hierarchy, only change the part that actually changes.
 6771	 */
 6772
 6773	do {
 6774		struct sched_entity *curr = cfs_rq->curr;
 6775
 6776		/*
 6777		 * Since we got here without doing put_prev_entity() we also
 6778		 * have to consider cfs_rq->curr. If it is still a runnable
 6779		 * entity, update_curr() will update its vruntime, otherwise
 6780		 * forget we've ever seen it.
 6781		 */
 6782		if (curr) {
 6783			if (curr->on_rq)
 6784				update_curr(cfs_rq);
 6785			else
 6786				curr = NULL;
 6787
 6788			/*
 6789			 * This call to check_cfs_rq_runtime() will do the
 6790			 * throttle and dequeue its entity in the parent(s).
 6791			 * Therefore the nr_running test will indeed
 6792			 * be correct.
 6793			 */
 6794			if (unlikely(check_cfs_rq_runtime(cfs_rq))) {
 6795				cfs_rq = &rq->cfs;
 6796
 6797				if (!cfs_rq->nr_running)
 6798					goto idle;
 6799
 6800				goto simple;
 6801			}
 6802		}
 6803
 6804		se = pick_next_entity(cfs_rq, curr);
 6805		cfs_rq = group_cfs_rq(se);
 6806	} while (cfs_rq);
 6807
 6808	p = task_of(se);
 6809
 6810	/*
 6811	 * Since we haven't yet done put_prev_entity and if the selected task
 6812	 * is a different task than we started out with, try and touch the
 6813	 * least amount of cfs_rqs.
 6814	 */
 6815	if (prev != p) {
 6816		struct sched_entity *pse = &prev->se;
 6817
 6818		while (!(cfs_rq = is_same_group(se, pse))) {
 6819			int se_depth = se->depth;
 6820			int pse_depth = pse->depth;
 6821
 6822			if (se_depth <= pse_depth) {
 6823				put_prev_entity(cfs_rq_of(pse), pse);
 6824				pse = parent_entity(pse);
 6825			}
 6826			if (se_depth >= pse_depth) {
 6827				set_next_entity(cfs_rq_of(se), se);
 6828				se = parent_entity(se);
 6829			}
 6830		}
 6831
 6832		put_prev_entity(cfs_rq, pse);
 6833		set_next_entity(cfs_rq, se);
 6834	}
 6835
 6836	goto done;
 6837simple:
 6838#endif
 6839	if (prev)
 6840		put_prev_task(rq, prev);
 6841
 6842	do {
 6843		se = pick_next_entity(cfs_rq, NULL);
 6844		set_next_entity(cfs_rq, se);
 6845		cfs_rq = group_cfs_rq(se);
 6846	} while (cfs_rq);
 6847
 6848	p = task_of(se);
 6849
 6850done: __maybe_unused;
 6851#ifdef CONFIG_SMP
 6852	/*
 6853	 * Move the next running task to the front of
 6854	 * the list, so our cfs_tasks list becomes MRU
 6855	 * one.
 6856	 */
 6857	list_move(&p->se.group_node, &rq->cfs_tasks);
 6858#endif
 6859
 6860	if (hrtick_enabled(rq))
 6861		hrtick_start_fair(rq, p);
 6862
 6863	update_misfit_status(p, rq);
 6864
 6865	return p;
 6866
 6867idle:
 6868	if (!rf)
 6869		return NULL;
 6870
 6871	new_tasks = newidle_balance(rq, rf);
 6872
 6873	/*
 6874	 * Because newidle_balance() releases (and re-acquires) rq->lock, it is
 6875	 * possible for any higher priority task to appear. In that case we
 6876	 * must re-start the pick_next_entity() loop.
 6877	 */
 6878	if (new_tasks < 0)
 6879		return RETRY_TASK;
 6880
 6881	if (new_tasks > 0)
 6882		goto again;
 6883
 6884	/*
 6885	 * rq is about to be idle, check if we need to update the
 6886	 * lost_idle_time of clock_pelt
 6887	 */
 6888	update_idle_rq_clock_pelt(rq);
 6889
 6890	return NULL;
 6891}
 6892
 6893/*
 6894 * Account for a descheduled task:
 6895 */
 6896static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
 6897{
 6898	struct sched_entity *se = &prev->se;
 6899	struct cfs_rq *cfs_rq;
 6900
 6901	for_each_sched_entity(se) {
 6902		cfs_rq = cfs_rq_of(se);
 6903		put_prev_entity(cfs_rq, se);
 6904	}
 6905}
 6906
 6907/*
 6908 * sched_yield() is very simple
 6909 *
 6910 * The magic of dealing with the ->skip buddy is in pick_next_entity.
 6911 */
 6912static void yield_task_fair(struct rq *rq)
 6913{
 6914	struct task_struct *curr = rq->curr;
 6915	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
 6916	struct sched_entity *se = &curr->se;
 6917
 6918	/*
 6919	 * Are we the only task in the tree?
 6920	 */
 6921	if (unlikely(rq->nr_running == 1))
 6922		return;
 6923
 6924	clear_buddies(cfs_rq, se);
 6925
 6926	if (curr->policy != SCHED_BATCH) {
 6927		update_rq_clock(rq);
 6928		/*
 6929		 * Update run-time statistics of the 'current'.
 6930		 */
 6931		update_curr(cfs_rq);
 6932		/*
 6933		 * Tell update_rq_clock() that we've just updated,
 6934		 * so we don't do microscopic update in schedule()
 6935		 * and double the fastpath cost.
 6936		 */
 6937		rq_clock_skip_update(rq);
 6938	}
 6939
 6940	set_skip_buddy(se);
 6941}
 6942
 6943static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
 6944{
 6945	struct sched_entity *se = &p->se;
 6946
 6947	/* throttled hierarchies are not runnable */
 6948	if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
 6949		return false;
 6950
 6951	/* Tell the scheduler that we'd really like pse to run next. */
 6952	set_next_buddy(se);
 6953
 6954	yield_task_fair(rq);
 6955
 6956	return true;
 6957}
 6958
 6959#ifdef CONFIG_SMP
 6960/**************************************************
 6961 * Fair scheduling class load-balancing methods.
 6962 *
 6963 * BASICS
 6964 *
 6965 * The purpose of load-balancing is to achieve the same basic fairness the
 6966 * per-CPU scheduler provides, namely provide a proportional amount of compute
 6967 * time to each task. This is expressed in the following equation:
 6968 *
 6969 *   W_i,n/C_i == W_j,n/C_j for all i,j                               (1)
 6970 *
 6971 * Where W_i,n is the n-th weight average for CPU i. The instantaneous weight
 6972 * W_i,0 is defined as:
 6973 *
 6974 *   W_i,0 = \Sum_j w_i,j                                             (2)
 6975 *
 6976 * Where w_i,j is the weight of the j-th runnable task on CPU i. This weight
 6977 * is derived from the nice value as per sched_prio_to_weight[].
 6978 *
 6979 * The weight average is an exponential decay average of the instantaneous
 6980 * weight:
 6981 *
 6982 *   W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0               (3)
 6983 *
 6984 * C_i is the compute capacity of CPU i, typically it is the
 6985 * fraction of 'recent' time available for SCHED_OTHER task execution. But it
 6986 * can also include other factors [XXX].
 6987 *
 6988 * To achieve this balance we define a measure of imbalance which follows
 6989 * directly from (1):
 6990 *
 6991 *   imb_i,j = max{ avg(W/C), W_i/C_i } - min{ avg(W/C), W_j/C_j }    (4)
 6992 *
 6993 * We then move tasks around to minimize the imbalance. In the continuous
 6994 * function space it is obvious this converges, in the discrete case we get
 6995 * a few fun cases generally called infeasible weight scenarios.
 6996 *
 6997 * [XXX expand on:
 6998 *     - infeasible weights;
 6999 *     - local vs global optima in the discrete case. ]
 7000 *
 7001 *
 7002 * SCHED DOMAINS
 7003 *
 7004 * In order to solve the imbalance equation (4), and avoid the obvious O(n^2)
 7005 * for all i,j solution, we create a tree of CPUs that follows the hardware
 7006 * topology where each level pairs two lower groups (or better). This results
 7007 * in O(log n) layers. Furthermore we reduce the number of CPUs going up the
 7008 * tree to only the first of the previous level and we decrease the frequency
 7009 * of load-balance at each level inv. proportional to the number of CPUs in
 7010 * the groups.
 7011 *
 7012 * This yields:
 7013 *
 7014 *     log_2 n     1     n
 7015 *   \Sum       { --- * --- * 2^i } = O(n)                            (5)
 7016 *     i = 0      2^i   2^i
 7017 *                               `- size of each group
 7018 *         |         |     `- number of CPUs doing load-balance
 7019 *         |         `- freq
 7020 *         `- sum over all levels
 7021 *
 7022 * Coupled with a limit on how many tasks we can migrate every balance pass,
 7023 * this makes (5) the runtime complexity of the balancer.
 7024 *
 7025 * An important property here is that each CPU is still (indirectly) connected
 7026 * to every other CPU in at most O(log n) steps:
 7027 *
 7028 * The adjacency matrix of the resulting graph is given by:
 7029 *
 7030 *             log_2 n
 7031 *   A_i,j = \Union     (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1)  (6)
 7032 *             k = 0
 7033 *
 7034 * And you'll find that:
 7035 *
 7036 *   A^(log_2 n)_i,j != 0  for all i,j                                (7)
 7037 *
 7038 * Showing there's indeed a path between every CPU in at most O(log n) steps.
 7039 * The task movement gives a factor of O(m), giving a convergence complexity
 7040 * of:
 7041 *
 7042 *   O(nm log n),  n := nr_cpus, m := nr_tasks                        (8)
 7043 *
 7044 *
 7045 * WORK CONSERVING
 7046 *
 7047 * In order to avoid CPUs going idle while there's still work to do, new idle
 7048 * balancing is more aggressive and has the newly idle CPU iterate up the domain
 7049 * tree itself instead of relying on other CPUs to bring it work.
 7050 *
 7051 * This adds some complexity to both (5) and (8) but it reduces the total idle
 7052 * time.
 7053 *
 7054 * [XXX more?]
 7055 *
 7056 *
 7057 * CGROUPS
 7058 *
 7059 * Cgroups make a horror show out of (2), instead of a simple sum we get:
 7060 *
 7061 *                                s_k,i
 7062 *   W_i,0 = \Sum_j \Prod_k w_k * -----                               (9)
 7063 *                                 S_k
 7064 *
 7065 * Where
 7066 *
 7067 *   s_k,i = \Sum_j w_i,j,k  and  S_k = \Sum_i s_k,i                 (10)
 7068 *
 7069 * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on CPU i.
 7070 *
 7071 * The big problem is S_k, it's a global sum needed to compute a local (W_i)
 7072 * property.
 7073 *
 7074 * [XXX write more on how we solve this.. _after_ merging pjt's patches that
 7075 *      rewrite all of this once again.]
 7076 */
 7077
 7078static unsigned long __read_mostly max_load_balance_interval = HZ/10;
 7079
 7080enum fbq_type { regular, remote, all };
 7081
 7082enum group_type {
 7083	group_other = 0,
 7084	group_misfit_task,
 7085	group_imbalanced,
 7086	group_overloaded,
 7087};
 7088
 7089#define LBF_ALL_PINNED	0x01
 7090#define LBF_NEED_BREAK	0x02
 7091#define LBF_DST_PINNED  0x04
 7092#define LBF_SOME_PINNED	0x08
 7093#define LBF_NOHZ_STATS	0x10
 7094#define LBF_NOHZ_AGAIN	0x20
 7095
 7096struct lb_env {
 7097	struct sched_domain	*sd;
 7098
 7099	struct rq		*src_rq;
 7100	int			src_cpu;
 
 7101
 7102	int			dst_cpu;
 7103	struct rq		*dst_rq;
 7104
 7105	struct cpumask		*dst_grpmask;
 7106	int			new_dst_cpu;
 7107	enum cpu_idle_type	idle;
 7108	long			imbalance;
 7109	/* The set of CPUs under consideration for load-balancing */
 7110	struct cpumask		*cpus;
 7111
 7112	unsigned int		flags;
 7113
 7114	unsigned int		loop;
 7115	unsigned int		loop_break;
 7116	unsigned int		loop_max;
 7117
 7118	enum fbq_type		fbq_type;
 7119	enum group_type		src_grp_type;
 7120	struct list_head	tasks;
 7121};
 7122
 7123/*
 
 
 
 
 
 
 
 
 
 
 
 
 7124 * Is this task likely cache-hot:
 7125 */
 7126static int task_hot(struct task_struct *p, struct lb_env *env)
 
 7127{
 7128	s64 delta;
 7129
 7130	lockdep_assert_held(&env->src_rq->lock);
 7131
 7132	if (p->sched_class != &fair_sched_class)
 7133		return 0;
 7134
 7135	if (unlikely(task_has_idle_policy(p)))
 7136		return 0;
 7137
 7138	/*
 7139	 * Buddy candidates are cache hot:
 7140	 */
 7141	if (sched_feat(CACHE_HOT_BUDDY) && env->dst_rq->nr_running &&
 7142			(&p->se == cfs_rq_of(&p->se)->next ||
 7143			 &p->se == cfs_rq_of(&p->se)->last))
 7144		return 1;
 7145
 7146	if (sysctl_sched_migration_cost == -1)
 7147		return 1;
 7148	if (sysctl_sched_migration_cost == 0)
 7149		return 0;
 7150
 7151	delta = rq_clock_task(env->src_rq) - p->se.exec_start;
 7152
 7153	return delta < (s64)sysctl_sched_migration_cost;
 7154}
 7155
 7156#ifdef CONFIG_NUMA_BALANCING
 7157/*
 7158 * Returns 1, if task migration degrades locality
 7159 * Returns 0, if task migration improves locality i.e migration preferred.
 7160 * Returns -1, if task migration is not affected by locality.
 7161 */
 7162static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
 7163{
 7164	struct numa_group *numa_group = rcu_dereference(p->numa_group);
 7165	unsigned long src_weight, dst_weight;
 7166	int src_nid, dst_nid, dist;
 7167
 7168	if (!static_branch_likely(&sched_numa_balancing))
 7169		return -1;
 7170
 7171	if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
 7172		return -1;
 7173
 7174	src_nid = cpu_to_node(env->src_cpu);
 7175	dst_nid = cpu_to_node(env->dst_cpu);
 7176
 7177	if (src_nid == dst_nid)
 7178		return -1;
 7179
 7180	/* Migrating away from the preferred node is always bad. */
 7181	if (src_nid == p->numa_preferred_nid) {
 7182		if (env->src_rq->nr_running > env->src_rq->nr_preferred_running)
 7183			return 1;
 7184		else
 7185			return -1;
 7186	}
 7187
 7188	/* Encourage migration to the preferred node. */
 7189	if (dst_nid == p->numa_preferred_nid)
 7190		return 0;
 7191
 7192	/* Leaving a core idle is often worse than degrading locality. */
 7193	if (env->idle == CPU_IDLE)
 7194		return -1;
 7195
 7196	dist = node_distance(src_nid, dst_nid);
 7197	if (numa_group) {
 7198		src_weight = group_weight(p, src_nid, dist);
 7199		dst_weight = group_weight(p, dst_nid, dist);
 7200	} else {
 7201		src_weight = task_weight(p, src_nid, dist);
 7202		dst_weight = task_weight(p, dst_nid, dist);
 7203	}
 7204
 7205	return dst_weight < src_weight;
 7206}
 7207
 7208#else
 7209static inline int migrate_degrades_locality(struct task_struct *p,
 7210					     struct lb_env *env)
 7211{
 7212	return -1;
 7213}
 7214#endif
 7215
 7216/*
 7217 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
 7218 */
 7219static
 7220int can_migrate_task(struct task_struct *p, struct lb_env *env)
 7221{
 7222	int tsk_cache_hot;
 7223
 7224	lockdep_assert_held(&env->src_rq->lock);
 7225
 7226	/*
 7227	 * We do not migrate tasks that are:
 7228	 * 1) throttled_lb_pair, or
 7229	 * 2) cannot be migrated to this CPU due to cpus_ptr, or
 7230	 * 3) running (obviously), or
 7231	 * 4) are cache-hot on their current CPU.
 7232	 */
 7233	if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
 7234		return 0;
 7235
 7236	if (!cpumask_test_cpu(env->dst_cpu, p->cpus_ptr)) {
 7237		int cpu;
 7238
 7239		schedstat_inc(p->se.statistics.nr_failed_migrations_affine);
 7240
 7241		env->flags |= LBF_SOME_PINNED;
 7242
 7243		/*
 7244		 * Remember if this task can be migrated to any other CPU in
 7245		 * our sched_group. We may want to revisit it if we couldn't
 7246		 * meet load balance goals by pulling other tasks on src_cpu.
 7247		 *
 7248		 * Avoid computing new_dst_cpu for NEWLY_IDLE or if we have
 7249		 * already computed one in current iteration.
 7250		 */
 7251		if (env->idle == CPU_NEWLY_IDLE || (env->flags & LBF_DST_PINNED))
 7252			return 0;
 7253
 7254		/* Prevent to re-select dst_cpu via env's CPUs: */
 7255		for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
 7256			if (cpumask_test_cpu(cpu, p->cpus_ptr)) {
 7257				env->flags |= LBF_DST_PINNED;
 7258				env->new_dst_cpu = cpu;
 7259				break;
 7260			}
 7261		}
 7262
 7263		return 0;
 7264	}
 7265
 7266	/* Record that we found atleast one task that could run on dst_cpu */
 7267	env->flags &= ~LBF_ALL_PINNED;
 7268
 7269	if (task_running(env->src_rq, p)) {
 7270		schedstat_inc(p->se.statistics.nr_failed_migrations_running);
 7271		return 0;
 7272	}
 7273
 7274	/*
 7275	 * Aggressive migration if:
 7276	 * 1) destination numa is preferred
 7277	 * 2) task is cache cold, or
 7278	 * 3) too many balance attempts have failed.
 7279	 */
 7280	tsk_cache_hot = migrate_degrades_locality(p, env);
 7281	if (tsk_cache_hot == -1)
 7282		tsk_cache_hot = task_hot(p, env);
 7283
 7284	if (tsk_cache_hot <= 0 ||
 7285	    env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
 7286		if (tsk_cache_hot == 1) {
 7287			schedstat_inc(env->sd->lb_hot_gained[env->idle]);
 7288			schedstat_inc(p->se.statistics.nr_forced_migrations);
 7289		}
 
 7290		return 1;
 7291	}
 7292
 7293	schedstat_inc(p->se.statistics.nr_failed_migrations_hot);
 7294	return 0;
 7295}
 7296
 7297/*
 7298 * detach_task() -- detach the task for the migration specified in env
 7299 */
 7300static void detach_task(struct task_struct *p, struct lb_env *env)
 7301{
 7302	lockdep_assert_held(&env->src_rq->lock);
 7303
 7304	deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK);
 7305	set_task_cpu(p, env->dst_cpu);
 7306}
 7307
 7308/*
 7309 * detach_one_task() -- tries to dequeue exactly one task from env->src_rq, as
 7310 * part of active balancing operations within "domain".
 
 7311 *
 7312 * Returns a task if successful and NULL otherwise.
 7313 */
 7314static struct task_struct *detach_one_task(struct lb_env *env)
 7315{
 7316	struct task_struct *p;
 7317
 7318	lockdep_assert_held(&env->src_rq->lock);
 
 
 7319
 7320	list_for_each_entry_reverse(p,
 7321			&env->src_rq->cfs_tasks, se.group_node) {
 7322		if (!can_migrate_task(p, env))
 7323			continue;
 7324
 7325		detach_task(p, env);
 7326
 7327		/*
 7328		 * Right now, this is only the second place where
 7329		 * lb_gained[env->idle] is updated (other is detach_tasks)
 7330		 * so we can safely collect stats here rather than
 7331		 * inside detach_tasks().
 7332		 */
 7333		schedstat_inc(env->sd->lb_gained[env->idle]);
 7334		return p;
 7335	}
 7336	return NULL;
 7337}
 7338
 
 
 7339static const unsigned int sched_nr_migrate_break = 32;
 7340
 7341/*
 7342 * detach_tasks() -- tries to detach up to imbalance runnable load from
 7343 * busiest_rq, as part of a balancing operation within domain "sd".
 
 7344 *
 7345 * Returns number of detached tasks if successful and 0 otherwise.
 7346 */
 7347static int detach_tasks(struct lb_env *env)
 7348{
 7349	struct list_head *tasks = &env->src_rq->cfs_tasks;
 7350	struct task_struct *p;
 7351	unsigned long load;
 7352	int detached = 0;
 7353
 7354	lockdep_assert_held(&env->src_rq->lock);
 7355
 7356	if (env->imbalance <= 0)
 7357		return 0;
 7358
 7359	while (!list_empty(tasks)) {
 7360		/*
 7361		 * We don't want to steal all, otherwise we may be treated likewise,
 7362		 * which could at worst lead to a livelock crash.
 7363		 */
 7364		if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1)
 7365			break;
 7366
 7367		p = list_last_entry(tasks, struct task_struct, se.group_node);
 7368
 7369		env->loop++;
 7370		/* We've more or less seen every task there is, call it quits */
 7371		if (env->loop > env->loop_max)
 7372			break;
 7373
 7374		/* take a breather every nr_migrate tasks */
 7375		if (env->loop > env->loop_break) {
 7376			env->loop_break += sched_nr_migrate_break;
 7377			env->flags |= LBF_NEED_BREAK;
 7378			break;
 7379		}
 7380
 7381		if (!can_migrate_task(p, env))
 7382			goto next;
 7383
 7384		load = task_h_load(p);
 7385
 7386		if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
 7387			goto next;
 7388
 7389		if ((load / 2) > env->imbalance)
 7390			goto next;
 7391
 7392		detach_task(p, env);
 7393		list_add(&p->se.group_node, &env->tasks);
 7394
 7395		detached++;
 
 7396		env->imbalance -= load;
 7397
 7398#ifdef CONFIG_PREEMPTION
 7399		/*
 7400		 * NEWIDLE balancing is a source of latency, so preemptible
 7401		 * kernels will stop after the first task is detached to minimize
 7402		 * the critical section.
 7403		 */
 7404		if (env->idle == CPU_NEWLY_IDLE)
 7405			break;
 7406#endif
 7407
 7408		/*
 7409		 * We only want to steal up to the prescribed amount of
 7410		 * runnable load.
 7411		 */
 7412		if (env->imbalance <= 0)
 7413			break;
 7414
 7415		continue;
 7416next:
 7417		list_move(&p->se.group_node, tasks);
 7418	}
 7419
 7420	/*
 7421	 * Right now, this is one of only two places we collect this stat
 7422	 * so we can safely collect detach_one_task() stats here rather
 7423	 * than inside detach_one_task().
 7424	 */
 7425	schedstat_add(env->sd->lb_gained[env->idle], detached);
 7426
 7427	return detached;
 7428}
 7429
 7430/*
 7431 * attach_task() -- attach the task detached by detach_task() to its new rq.
 7432 */
 7433static void attach_task(struct rq *rq, struct task_struct *p)
 7434{
 7435	lockdep_assert_held(&rq->lock);
 7436
 7437	BUG_ON(task_rq(p) != rq);
 7438	activate_task(rq, p, ENQUEUE_NOCLOCK);
 7439	check_preempt_curr(rq, p, 0);
 7440}
 7441
 7442/*
 7443 * attach_one_task() -- attaches the task returned from detach_one_task() to
 7444 * its new rq.
 7445 */
 7446static void attach_one_task(struct rq *rq, struct task_struct *p)
 7447{
 7448	struct rq_flags rf;
 7449
 7450	rq_lock(rq, &rf);
 7451	update_rq_clock(rq);
 7452	attach_task(rq, p);
 7453	rq_unlock(rq, &rf);
 7454}
 7455
 
 7456/*
 7457 * attach_tasks() -- attaches all tasks detached by detach_tasks() to their
 7458 * new rq.
 7459 */
 7460static void attach_tasks(struct lb_env *env)
 7461{
 7462	struct list_head *tasks = &env->tasks;
 7463	struct task_struct *p;
 7464	struct rq_flags rf;
 7465
 7466	rq_lock(env->dst_rq, &rf);
 7467	update_rq_clock(env->dst_rq);
 7468
 7469	while (!list_empty(tasks)) {
 7470		p = list_first_entry(tasks, struct task_struct, se.group_node);
 7471		list_del_init(&p->se.group_node);
 7472
 7473		attach_task(env->dst_rq, p);
 7474	}
 7475
 7476	rq_unlock(env->dst_rq, &rf);
 7477}
 7478
 7479#ifdef CONFIG_NO_HZ_COMMON
 7480static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq)
 7481{
 7482	if (cfs_rq->avg.load_avg)
 7483		return true;
 7484
 7485	if (cfs_rq->avg.util_avg)
 7486		return true;
 7487
 7488	return false;
 7489}
 7490
 7491static inline bool others_have_blocked(struct rq *rq)
 7492{
 7493	if (READ_ONCE(rq->avg_rt.util_avg))
 7494		return true;
 7495
 7496	if (READ_ONCE(rq->avg_dl.util_avg))
 7497		return true;
 7498
 7499#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
 7500	if (READ_ONCE(rq->avg_irq.util_avg))
 7501		return true;
 7502#endif
 7503
 7504	return false;
 7505}
 7506
 7507static inline void update_blocked_load_status(struct rq *rq, bool has_blocked)
 7508{
 7509	rq->last_blocked_load_update_tick = jiffies;
 7510
 7511	if (!has_blocked)
 7512		rq->has_blocked_load = 0;
 7513}
 7514#else
 7515static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) { return false; }
 7516static inline bool others_have_blocked(struct rq *rq) { return false; }
 7517static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) {}
 7518#endif
 7519
 7520#ifdef CONFIG_FAIR_GROUP_SCHED
 
 7521
 7522static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
 7523{
 7524	if (cfs_rq->load.weight)
 7525		return false;
 7526
 7527	if (cfs_rq->avg.load_sum)
 7528		return false;
 7529
 7530	if (cfs_rq->avg.util_sum)
 7531		return false;
 
 
 
 7532
 7533	if (cfs_rq->avg.runnable_load_sum)
 7534		return false;
 7535
 7536	return true;
 7537}
 7538
 7539static void update_blocked_averages(int cpu)
 7540{
 
 7541	struct rq *rq = cpu_rq(cpu);
 7542	struct cfs_rq *cfs_rq, *pos;
 7543	const struct sched_class *curr_class;
 7544	struct rq_flags rf;
 7545	bool done = true;
 7546
 7547	rq_lock_irqsave(rq, &rf);
 7548	update_rq_clock(rq);
 7549
 7550	/*
 7551	 * update_cfs_rq_load_avg() can call cpufreq_update_util(). Make sure
 7552	 * that RT, DL and IRQ signals have been updated before updating CFS.
 7553	 */
 7554	curr_class = rq->curr->sched_class;
 7555	update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class);
 7556	update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class);
 7557	update_irq_load_avg(rq, 0);
 7558
 7559	/* Don't need periodic decay once load/util_avg are null */
 7560	if (others_have_blocked(rq))
 7561		done = false;
 7562
 
 7563	/*
 7564	 * Iterates the task_group tree in a bottom up fashion, see
 7565	 * list_add_leaf_cfs_rq() for details.
 7566	 */
 7567	for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) {
 7568		struct sched_entity *se;
 7569
 7570		if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq))
 7571			update_tg_load_avg(cfs_rq, 0);
 7572
 7573		/* Propagate pending load changes to the parent, if any: */
 7574		se = cfs_rq->tg->se[cpu];
 7575		if (se && !skip_blocked_update(se))
 7576			update_load_avg(cfs_rq_of(se), se, 0);
 7577
 7578		/*
 7579		 * There can be a lot of idle CPU cgroups.  Don't let fully
 7580		 * decayed cfs_rqs linger on the list.
 7581		 */
 7582		if (cfs_rq_is_decayed(cfs_rq))
 7583			list_del_leaf_cfs_rq(cfs_rq);
 7584
 7585		/* Don't need periodic decay once load/util_avg are null */
 7586		if (cfs_rq_has_blocked(cfs_rq))
 7587			done = false;
 7588	}
 7589
 7590	update_blocked_load_status(rq, !done);
 7591	rq_unlock_irqrestore(rq, &rf);
 7592}
 7593
 7594/*
 7595 * Compute the hierarchical load factor for cfs_rq and all its ascendants.
 7596 * This needs to be done in a top-down fashion because the load of a child
 7597 * group is a fraction of its parents load.
 7598 */
 7599static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
 7600{
 7601	struct rq *rq = rq_of(cfs_rq);
 7602	struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
 7603	unsigned long now = jiffies;
 7604	unsigned long load;
 
 7605
 7606	if (cfs_rq->last_h_load_update == now)
 7607		return;
 7608
 7609	WRITE_ONCE(cfs_rq->h_load_next, NULL);
 7610	for_each_sched_entity(se) {
 7611		cfs_rq = cfs_rq_of(se);
 7612		WRITE_ONCE(cfs_rq->h_load_next, se);
 7613		if (cfs_rq->last_h_load_update == now)
 7614			break;
 7615	}
 7616
 7617	if (!se) {
 7618		cfs_rq->h_load = cfs_rq_load_avg(cfs_rq);
 7619		cfs_rq->last_h_load_update = now;
 7620	}
 7621
 7622	while ((se = READ_ONCE(cfs_rq->h_load_next)) != NULL) {
 7623		load = cfs_rq->h_load;
 7624		load = div64_ul(load * se->avg.load_avg,
 7625			cfs_rq_load_avg(cfs_rq) + 1);
 7626		cfs_rq = group_cfs_rq(se);
 7627		cfs_rq->h_load = load;
 7628		cfs_rq->last_h_load_update = now;
 7629	}
 7630}
 7631
 7632static unsigned long task_h_load(struct task_struct *p)
 7633{
 7634	struct cfs_rq *cfs_rq = task_cfs_rq(p);
 
 
 
 
 7635
 7636	update_cfs_rq_h_load(cfs_rq);
 7637	return div64_ul(p->se.avg.load_avg * cfs_rq->h_load,
 7638			cfs_rq_load_avg(cfs_rq) + 1);
 7639}
 7640#else
 7641static inline void update_blocked_averages(int cpu)
 7642{
 7643	struct rq *rq = cpu_rq(cpu);
 7644	struct cfs_rq *cfs_rq = &rq->cfs;
 7645	const struct sched_class *curr_class;
 7646	struct rq_flags rf;
 7647
 7648	rq_lock_irqsave(rq, &rf);
 7649	update_rq_clock(rq);
 7650
 7651	/*
 7652	 * update_cfs_rq_load_avg() can call cpufreq_update_util(). Make sure
 7653	 * that RT, DL and IRQ signals have been updated before updating CFS.
 7654	 */
 7655	curr_class = rq->curr->sched_class;
 7656	update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class);
 7657	update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class);
 7658	update_irq_load_avg(rq, 0);
 7659
 7660	update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq);
 7661
 7662	update_blocked_load_status(rq, cfs_rq_has_blocked(cfs_rq) || others_have_blocked(rq));
 7663	rq_unlock_irqrestore(rq, &rf);
 7664}
 7665
 7666static unsigned long task_h_load(struct task_struct *p)
 7667{
 7668	return p->se.avg.load_avg;
 7669}
 7670#endif
 7671
 7672/********** Helpers for find_busiest_group ************************/
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 7673
 7674/*
 7675 * sg_lb_stats - stats of a sched_group required for load_balancing
 7676 */
 7677struct sg_lb_stats {
 7678	unsigned long avg_load; /*Avg load across the CPUs of the group */
 7679	unsigned long group_load; /* Total load over the CPUs of the group */
 7680	unsigned long load_per_task;
 
 7681	unsigned long group_capacity;
 7682	unsigned long group_util; /* Total utilization of the group */
 7683	unsigned int sum_nr_running; /* Nr tasks running in the group */
 7684	unsigned int idle_cpus;
 7685	unsigned int group_weight;
 7686	enum group_type group_type;
 7687	int group_no_capacity;
 7688	unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */
 7689#ifdef CONFIG_NUMA_BALANCING
 7690	unsigned int nr_numa_running;
 7691	unsigned int nr_preferred_running;
 7692#endif
 7693};
 7694
 7695/*
 7696 * sd_lb_stats - Structure to store the statistics of a sched_domain
 7697 *		 during load balancing.
 7698 */
 7699struct sd_lb_stats {
 7700	struct sched_group *busiest;	/* Busiest group in this sd */
 7701	struct sched_group *local;	/* Local group in this sd */
 7702	unsigned long total_running;
 7703	unsigned long total_load;	/* Total load of all groups in sd */
 7704	unsigned long total_capacity;	/* Total capacity of all groups in sd */
 7705	unsigned long avg_load;	/* Average load across all groups in sd */
 
 
 
 7706
 7707	struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
 7708	struct sg_lb_stats local_stat;	/* Statistics of the local group */
 7709};
 
 
 
 
 
 
 
 7710
 7711static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
 7712{
 7713	/*
 7714	 * Skimp on the clearing to avoid duplicate work. We can avoid clearing
 7715	 * local_stat because update_sg_lb_stats() does a full clear/assignment.
 7716	 * We must however clear busiest_stat::avg_load because
 7717	 * update_sd_pick_busiest() reads this before assignment.
 7718	 */
 7719	*sds = (struct sd_lb_stats){
 7720		.busiest = NULL,
 7721		.local = NULL,
 7722		.total_running = 0UL,
 7723		.total_load = 0UL,
 7724		.total_capacity = 0UL,
 7725		.busiest_stat = {
 7726			.avg_load = 0UL,
 7727			.sum_nr_running = 0,
 7728			.group_type = group_other,
 7729		},
 7730	};
 7731}
 7732
 7733static unsigned long scale_rt_capacity(struct sched_domain *sd, int cpu)
 
 
 
 
 
 7734{
 7735	struct rq *rq = cpu_rq(cpu);
 7736	unsigned long max = arch_scale_cpu_capacity(cpu);
 7737	unsigned long used, free;
 7738	unsigned long irq;
 7739
 7740	irq = cpu_util_irq(rq);
 
 
 
 
 
 7741
 7742	if (unlikely(irq >= max))
 7743		return 1;
 7744
 7745	used = READ_ONCE(rq->avg_rt.util_avg);
 7746	used += READ_ONCE(rq->avg_dl.util_avg);
 
 
 
 
 7747
 7748	if (unlikely(used >= max))
 7749		return 1;
 7750
 7751	free = max - used;
 7752
 7753	return scale_irq_capacity(free, irq, max);
 7754}
 7755
 7756static void update_cpu_capacity(struct sched_domain *sd, int cpu)
 7757{
 7758	unsigned long capacity = scale_rt_capacity(sd, cpu);
 
 7759	struct sched_group *sdg = sd->groups;
 7760
 7761	cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(cpu);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 7762
 7763	if (!capacity)
 7764		capacity = 1;
 7765
 7766	cpu_rq(cpu)->cpu_capacity = capacity;
 7767	sdg->sgc->capacity = capacity;
 7768	sdg->sgc->min_capacity = capacity;
 7769	sdg->sgc->max_capacity = capacity;
 7770}
 7771
 7772void update_group_capacity(struct sched_domain *sd, int cpu)
 7773{
 7774	struct sched_domain *child = sd->child;
 7775	struct sched_group *group, *sdg = sd->groups;
 7776	unsigned long capacity, min_capacity, max_capacity;
 7777	unsigned long interval;
 7778
 7779	interval = msecs_to_jiffies(sd->balance_interval);
 7780	interval = clamp(interval, 1UL, max_load_balance_interval);
 7781	sdg->sgc->next_update = jiffies + interval;
 7782
 7783	if (!child) {
 7784		update_cpu_capacity(sd, cpu);
 7785		return;
 7786	}
 7787
 7788	capacity = 0;
 7789	min_capacity = ULONG_MAX;
 7790	max_capacity = 0;
 7791
 7792	if (child->flags & SD_OVERLAP) {
 7793		/*
 7794		 * SD_OVERLAP domains cannot assume that child groups
 7795		 * span the current group.
 7796		 */
 7797
 7798		for_each_cpu(cpu, sched_group_span(sdg)) {
 7799			struct sched_group_capacity *sgc;
 7800			struct rq *rq = cpu_rq(cpu);
 7801
 7802			/*
 7803			 * build_sched_domains() -> init_sched_groups_capacity()
 7804			 * gets here before we've attached the domains to the
 7805			 * runqueues.
 7806			 *
 7807			 * Use capacity_of(), which is set irrespective of domains
 7808			 * in update_cpu_capacity().
 7809			 *
 7810			 * This avoids capacity from being 0 and
 7811			 * causing divide-by-zero issues on boot.
 7812			 */
 7813			if (unlikely(!rq->sd)) {
 7814				capacity += capacity_of(cpu);
 7815			} else {
 7816				sgc = rq->sd->groups->sgc;
 7817				capacity += sgc->capacity;
 7818			}
 7819
 7820			min_capacity = min(capacity, min_capacity);
 7821			max_capacity = max(capacity, max_capacity);
 7822		}
 7823	} else  {
 7824		/*
 7825		 * !SD_OVERLAP domains can assume that child groups
 7826		 * span the current group.
 7827		 */
 7828
 7829		group = child->groups;
 7830		do {
 7831			struct sched_group_capacity *sgc = group->sgc;
 7832
 7833			capacity += sgc->capacity;
 7834			min_capacity = min(sgc->min_capacity, min_capacity);
 7835			max_capacity = max(sgc->max_capacity, max_capacity);
 7836			group = group->next;
 7837		} while (group != child->groups);
 7838	}
 7839
 7840	sdg->sgc->capacity = capacity;
 7841	sdg->sgc->min_capacity = min_capacity;
 7842	sdg->sgc->max_capacity = max_capacity;
 7843}
 7844
 7845/*
 7846 * Check whether the capacity of the rq has been noticeably reduced by side
 7847 * activity. The imbalance_pct is used for the threshold.
 7848 * Return true is the capacity is reduced
 7849 */
 7850static inline int
 7851check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
 7852{
 7853	return ((rq->cpu_capacity * sd->imbalance_pct) <
 7854				(rq->cpu_capacity_orig * 100));
 7855}
 7856
 7857/*
 7858 * Check whether a rq has a misfit task and if it looks like we can actually
 7859 * help that task: we can migrate the task to a CPU of higher capacity, or
 7860 * the task's current CPU is heavily pressured.
 7861 */
 7862static inline int check_misfit_status(struct rq *rq, struct sched_domain *sd)
 7863{
 7864	return rq->misfit_task_load &&
 7865		(rq->cpu_capacity_orig < rq->rd->max_cpu_capacity ||
 7866		 check_cpu_capacity(rq, sd));
 7867}
 7868
 7869/*
 7870 * Group imbalance indicates (and tries to solve) the problem where balancing
 7871 * groups is inadequate due to ->cpus_ptr constraints.
 7872 *
 7873 * Imagine a situation of two groups of 4 CPUs each and 4 tasks each with a
 7874 * cpumask covering 1 CPU of the first group and 3 CPUs of the second group.
 7875 * Something like:
 7876 *
 7877 *	{ 0 1 2 3 } { 4 5 6 7 }
 7878 *	        *     * * *
 7879 *
 7880 * If we were to balance group-wise we'd place two tasks in the first group and
 7881 * two tasks in the second group. Clearly this is undesired as it will overload
 7882 * cpu 3 and leave one of the CPUs in the second group unused.
 7883 *
 7884 * The current solution to this issue is detecting the skew in the first group
 7885 * by noticing the lower domain failed to reach balance and had difficulty
 7886 * moving tasks due to affinity constraints.
 7887 *
 7888 * When this is so detected; this group becomes a candidate for busiest; see
 7889 * update_sd_pick_busiest(). And calculate_imbalance() and
 7890 * find_busiest_group() avoid some of the usual balance conditions to allow it
 7891 * to create an effective group imbalance.
 7892 *
 7893 * This is a somewhat tricky proposition since the next run might not find the
 7894 * group imbalance and decide the groups need to be balanced again. A most
 7895 * subtle and fragile situation.
 7896 */
 7897
 7898static inline int sg_imbalanced(struct sched_group *group)
 7899{
 7900	return group->sgc->imbalance;
 7901}
 7902
 7903/*
 7904 * group_has_capacity returns true if the group has spare capacity that could
 7905 * be used by some tasks.
 7906 * We consider that a group has spare capacity if the  * number of task is
 7907 * smaller than the number of CPUs or if the utilization is lower than the
 7908 * available capacity for CFS tasks.
 7909 * For the latter, we use a threshold to stabilize the state, to take into
 7910 * account the variance of the tasks' load and to return true if the available
 7911 * capacity in meaningful for the load balancer.
 7912 * As an example, an available capacity of 1% can appear but it doesn't make
 7913 * any benefit for the load balance.
 7914 */
 7915static inline bool
 7916group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs)
 7917{
 7918	if (sgs->sum_nr_running < sgs->group_weight)
 7919		return true;
 7920
 7921	if ((sgs->group_capacity * 100) >
 7922			(sgs->group_util * env->sd->imbalance_pct))
 7923		return true;
 7924
 7925	return false;
 7926}
 7927
 7928/*
 7929 *  group_is_overloaded returns true if the group has more tasks than it can
 7930 *  handle.
 7931 *  group_is_overloaded is not equals to !group_has_capacity because a group
 7932 *  with the exact right number of tasks, has no more spare capacity but is not
 7933 *  overloaded so both group_has_capacity and group_is_overloaded return
 7934 *  false.
 7935 */
 7936static inline bool
 7937group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs)
 7938{
 7939	if (sgs->sum_nr_running <= sgs->group_weight)
 7940		return false;
 7941
 7942	if ((sgs->group_capacity * 100) <
 7943			(sgs->group_util * env->sd->imbalance_pct))
 7944		return true;
 7945
 7946	return false;
 7947}
 7948
 7949/*
 7950 * group_smaller_min_cpu_capacity: Returns true if sched_group sg has smaller
 7951 * per-CPU capacity than sched_group ref.
 7952 */
 7953static inline bool
 7954group_smaller_min_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
 7955{
 7956	return fits_capacity(sg->sgc->min_capacity, ref->sgc->min_capacity);
 7957}
 7958
 7959/*
 7960 * group_smaller_max_cpu_capacity: Returns true if sched_group sg has smaller
 7961 * per-CPU capacity_orig than sched_group ref.
 7962 */
 7963static inline bool
 7964group_smaller_max_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
 7965{
 7966	return fits_capacity(sg->sgc->max_capacity, ref->sgc->max_capacity);
 7967}
 7968
 7969static inline enum
 7970group_type group_classify(struct sched_group *group,
 7971			  struct sg_lb_stats *sgs)
 7972{
 7973	if (sgs->group_no_capacity)
 7974		return group_overloaded;
 7975
 7976	if (sg_imbalanced(group))
 7977		return group_imbalanced;
 7978
 7979	if (sgs->group_misfit_task_load)
 7980		return group_misfit_task;
 7981
 7982	return group_other;
 7983}
 7984
 7985static bool update_nohz_stats(struct rq *rq, bool force)
 7986{
 7987#ifdef CONFIG_NO_HZ_COMMON
 7988	unsigned int cpu = rq->cpu;
 7989
 7990	if (!rq->has_blocked_load)
 7991		return false;
 7992
 7993	if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
 7994		return false;
 7995
 7996	if (!force && !time_after(jiffies, rq->last_blocked_load_update_tick))
 7997		return true;
 7998
 7999	update_blocked_averages(cpu);
 
 
 
 
 8000
 8001	return rq->has_blocked_load;
 8002#else
 8003	return false;
 8004#endif
 8005}
 8006
 8007/**
 8008 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
 8009 * @env: The load balancing environment.
 8010 * @group: sched_group whose statistics are to be updated.
 
 
 
 
 8011 * @sgs: variable to hold the statistics for this group.
 8012 * @sg_status: Holds flag indicating the status of the sched_group
 8013 */
 8014static inline void update_sg_lb_stats(struct lb_env *env,
 8015				      struct sched_group *group,
 8016				      struct sg_lb_stats *sgs,
 8017				      int *sg_status)
 8018{
 8019	int i, nr_running;
 8020
 8021	memset(sgs, 0, sizeof(*sgs));
 
 
 8022
 8023	for_each_cpu_and(i, sched_group_span(group), env->cpus) {
 8024		struct rq *rq = cpu_rq(i);
 8025
 8026		if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq, false))
 8027			env->flags |= LBF_NOHZ_AGAIN;
 
 
 
 8028
 8029		sgs->group_load += cpu_runnable_load(rq);
 8030		sgs->group_util += cpu_util(i);
 8031		sgs->sum_nr_running += rq->cfs.h_nr_running;
 8032
 8033		nr_running = rq->nr_running;
 8034		if (nr_running > 1)
 8035			*sg_status |= SG_OVERLOAD;
 8036
 8037		if (cpu_overutilized(i))
 8038			*sg_status |= SG_OVERUTILIZED;
 
 
 
 
 
 8039
 8040#ifdef CONFIG_NUMA_BALANCING
 8041		sgs->nr_numa_running += rq->nr_numa_running;
 8042		sgs->nr_preferred_running += rq->nr_preferred_running;
 8043#endif
 8044		/*
 8045		 * No need to call idle_cpu() if nr_running is not 0
 8046		 */
 8047		if (!nr_running && idle_cpu(i))
 
 
 
 
 
 
 
 
 
 
 8048			sgs->idle_cpus++;
 
 8049
 8050		if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
 8051		    sgs->group_misfit_task_load < rq->misfit_task_load) {
 8052			sgs->group_misfit_task_load = rq->misfit_task_load;
 8053			*sg_status |= SG_OVERLOAD;
 8054		}
 
 
 
 
 
 
 
 
 
 
 8055	}
 8056
 8057	/* Adjust by relative CPU capacity of the group */
 8058	sgs->group_capacity = group->sgc->capacity;
 8059	sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity;
 8060
 
 
 
 
 
 
 
 
 
 8061	if (sgs->sum_nr_running)
 8062		sgs->load_per_task = sgs->group_load / sgs->sum_nr_running;
 8063
 
 
 
 
 
 
 
 
 8064	sgs->group_weight = group->group_weight;
 8065
 8066	sgs->group_no_capacity = group_is_overloaded(env, sgs);
 8067	sgs->group_type = group_classify(group, sgs);
 8068}
 8069
 8070/**
 8071 * update_sd_pick_busiest - return 1 on busiest group
 8072 * @env: The load balancing environment.
 8073 * @sds: sched_domain statistics
 8074 * @sg: sched_group candidate to be checked for being the busiest
 8075 * @sgs: sched_group statistics
 8076 *
 8077 * Determine if @sg is a busier group than the previously selected
 8078 * busiest group.
 8079 *
 8080 * Return: %true if @sg is a busier group than the previously selected
 8081 * busiest group. %false otherwise.
 8082 */
 8083static bool update_sd_pick_busiest(struct lb_env *env,
 8084				   struct sd_lb_stats *sds,
 8085				   struct sched_group *sg,
 8086				   struct sg_lb_stats *sgs)
 8087{
 8088	struct sg_lb_stats *busiest = &sds->busiest_stat;
 8089
 8090	/*
 8091	 * Don't try to pull misfit tasks we can't help.
 8092	 * We can use max_capacity here as reduction in capacity on some
 8093	 * CPUs in the group should either be possible to resolve
 8094	 * internally or be covered by avg_load imbalance (eventually).
 8095	 */
 8096	if (sgs->group_type == group_misfit_task &&
 8097	    (!group_smaller_max_cpu_capacity(sg, sds->local) ||
 8098	     !group_has_capacity(env, &sds->local_stat)))
 8099		return false;
 8100
 8101	if (sgs->group_type > busiest->group_type)
 8102		return true;
 8103
 8104	if (sgs->group_type < busiest->group_type)
 8105		return false;
 8106
 8107	if (sgs->avg_load <= busiest->avg_load)
 8108		return false;
 8109
 8110	if (!(env->sd->flags & SD_ASYM_CPUCAPACITY))
 8111		goto asym_packing;
 8112
 8113	/*
 8114	 * Candidate sg has no more than one task per CPU and
 8115	 * has higher per-CPU capacity. Migrating tasks to less
 8116	 * capable CPUs may harm throughput. Maximize throughput,
 8117	 * power/energy consequences are not considered.
 8118	 */
 8119	if (sgs->sum_nr_running <= sgs->group_weight &&
 8120	    group_smaller_min_cpu_capacity(sds->local, sg))
 8121		return false;
 8122
 8123	/*
 8124	 * If we have more than one misfit sg go with the biggest misfit.
 8125	 */
 8126	if (sgs->group_type == group_misfit_task &&
 8127	    sgs->group_misfit_task_load < busiest->group_misfit_task_load)
 8128		return false;
 8129
 8130asym_packing:
 8131	/* This is the busiest node in its class. */
 8132	if (!(env->sd->flags & SD_ASYM_PACKING))
 8133		return true;
 8134
 8135	/* No ASYM_PACKING if target CPU is already busy */
 8136	if (env->idle == CPU_NOT_IDLE)
 8137		return true;
 8138	/*
 8139	 * ASYM_PACKING needs to move all the work to the highest
 8140	 * prority CPUs in the group, therefore mark all groups
 8141	 * of lower priority than ourself as busy.
 8142	 */
 8143	if (sgs->sum_nr_running &&
 8144	    sched_asym_prefer(env->dst_cpu, sg->asym_prefer_cpu)) {
 8145		if (!sds->busiest)
 8146			return true;
 8147
 8148		/* Prefer to move from lowest priority CPU's work */
 8149		if (sched_asym_prefer(sds->busiest->asym_prefer_cpu,
 8150				      sg->asym_prefer_cpu))
 8151			return true;
 8152	}
 8153
 8154	return false;
 8155}
 8156
 8157#ifdef CONFIG_NUMA_BALANCING
 8158static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
 8159{
 8160	if (sgs->sum_nr_running > sgs->nr_numa_running)
 8161		return regular;
 8162	if (sgs->sum_nr_running > sgs->nr_preferred_running)
 8163		return remote;
 8164	return all;
 8165}
 8166
 8167static inline enum fbq_type fbq_classify_rq(struct rq *rq)
 8168{
 8169	if (rq->nr_running > rq->nr_numa_running)
 8170		return regular;
 8171	if (rq->nr_running > rq->nr_preferred_running)
 8172		return remote;
 8173	return all;
 8174}
 8175#else
 8176static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
 8177{
 8178	return all;
 8179}
 8180
 8181static inline enum fbq_type fbq_classify_rq(struct rq *rq)
 8182{
 8183	return regular;
 8184}
 8185#endif /* CONFIG_NUMA_BALANCING */
 8186
 8187/**
 8188 * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
 8189 * @env: The load balancing environment.
 
 
 8190 * @sds: variable to hold the statistics for this sched_domain.
 8191 */
 8192static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
 
 
 8193{
 8194	struct sched_domain *child = env->sd->child;
 8195	struct sched_group *sg = env->sd->groups;
 8196	struct sg_lb_stats *local = &sds->local_stat;
 8197	struct sg_lb_stats tmp_sgs;
 8198	bool prefer_sibling = child && child->flags & SD_PREFER_SIBLING;
 8199	int sg_status = 0;
 8200
 8201#ifdef CONFIG_NO_HZ_COMMON
 8202	if (env->idle == CPU_NEWLY_IDLE && READ_ONCE(nohz.has_blocked))
 8203		env->flags |= LBF_NOHZ_STATS;
 8204#endif
 8205
 8206	do {
 8207		struct sg_lb_stats *sgs = &tmp_sgs;
 8208		int local_group;
 8209
 8210		local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(sg));
 8211		if (local_group) {
 8212			sds->local = sg;
 8213			sgs = local;
 8214
 8215			if (env->idle != CPU_NEWLY_IDLE ||
 8216			    time_after_eq(jiffies, sg->sgc->next_update))
 8217				update_group_capacity(env->sd, env->dst_cpu);
 8218		}
 8219
 8220		update_sg_lb_stats(env, sg, sgs, &sg_status);
 8221
 8222		if (local_group)
 8223			goto next_group;
 8224
 8225		/*
 8226		 * In case the child domain prefers tasks go to siblings
 8227		 * first, lower the sg capacity so that we'll try
 8228		 * and move all the excess tasks away. We lower the capacity
 8229		 * of a group only if the local group has the capacity to fit
 8230		 * these excess tasks. The extra check prevents the case where
 8231		 * you always pull from the heaviest group when it is already
 8232		 * under-utilized (possible with a large weight task outweighs
 8233		 * the tasks on the system).
 8234		 */
 8235		if (prefer_sibling && sds->local &&
 8236		    group_has_capacity(env, local) &&
 8237		    (sgs->sum_nr_running > local->sum_nr_running + 1)) {
 8238			sgs->group_no_capacity = 1;
 8239			sgs->group_type = group_classify(sg, sgs);
 8240		}
 8241
 8242		if (update_sd_pick_busiest(env, sds, sg, sgs)) {
 
 
 
 
 
 
 
 
 8243			sds->busiest = sg;
 8244			sds->busiest_stat = *sgs;
 
 
 
 
 
 
 8245		}
 8246
 8247next_group:
 8248		/* Now, start updating sd_lb_stats */
 8249		sds->total_running += sgs->sum_nr_running;
 8250		sds->total_load += sgs->group_load;
 8251		sds->total_capacity += sgs->group_capacity;
 8252
 8253		sg = sg->next;
 8254	} while (sg != env->sd->groups);
 8255
 8256#ifdef CONFIG_NO_HZ_COMMON
 8257	if ((env->flags & LBF_NOHZ_AGAIN) &&
 8258	    cpumask_subset(nohz.idle_cpus_mask, sched_domain_span(env->sd))) {
 8259
 8260		WRITE_ONCE(nohz.next_blocked,
 8261			   jiffies + msecs_to_jiffies(LOAD_AVG_PERIOD));
 8262	}
 8263#endif
 8264
 8265	if (env->sd->flags & SD_NUMA)
 8266		env->fbq_type = fbq_classify_group(&sds->busiest_stat);
 8267
 8268	if (!env->sd->parent) {
 8269		struct root_domain *rd = env->dst_rq->rd;
 8270
 8271		/* update overload indicator if we are at root domain */
 8272		WRITE_ONCE(rd->overload, sg_status & SG_OVERLOAD);
 8273
 8274		/* Update over-utilization (tipping point, U >= 0) indicator */
 8275		WRITE_ONCE(rd->overutilized, sg_status & SG_OVERUTILIZED);
 8276		trace_sched_overutilized_tp(rd, sg_status & SG_OVERUTILIZED);
 8277	} else if (sg_status & SG_OVERUTILIZED) {
 8278		struct root_domain *rd = env->dst_rq->rd;
 8279
 8280		WRITE_ONCE(rd->overutilized, SG_OVERUTILIZED);
 8281		trace_sched_overutilized_tp(rd, SG_OVERUTILIZED);
 8282	}
 8283}
 8284
 8285/**
 8286 * check_asym_packing - Check to see if the group is packed into the
 8287 *			sched domain.
 8288 *
 * This is primarily intended to be used at the sibling level.  Some
 8290 * cores like POWER7 prefer to use lower numbered SMT threads.  In the
 8291 * case of POWER7, it can move to lower SMT modes only when higher
 8292 * threads are idle.  When in lower SMT modes, the threads will
 8293 * perform better since they share less core resources.  Hence when we
 8294 * have idle threads, we want them to be the higher ones.
 8295 *
 8296 * This packing function is run on idle threads.  It checks to see if
 8297 * the busiest CPU in this domain (core in the P7 case) has a higher
 8298 * CPU number than the packing function is being run on.  Here we are
 8299 * assuming lower CPU number will be equivalent to lower a SMT thread
 8300 * number.
 8301 *
 8302 * Return: 1 when packing is required and a task should be moved to
 8303 * this CPU.  The amount of the imbalance is returned in env->imbalance.
 8304 *
 8305 * @env: The load balancing environment.
 8306 * @sds: Statistics of the sched_domain which is to be packed
 8307 */
 8308static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
 8309{
 8310	int busiest_cpu;
 8311
 8312	if (!(env->sd->flags & SD_ASYM_PACKING))
 8313		return 0;
 8314
 8315	if (env->idle == CPU_NOT_IDLE)
 8316		return 0;
 8317
 8318	if (!sds->busiest)
 8319		return 0;
 8320
 8321	busiest_cpu = sds->busiest->asym_prefer_cpu;
 8322	if (sched_asym_prefer(busiest_cpu, env->dst_cpu))
 8323		return 0;
 8324
 8325	env->imbalance = sds->busiest_stat.group_load;
 
 8326
 8327	return 1;
 8328}
 8329
 8330/**
 8331 * fix_small_imbalance - Calculate the minor imbalance that exists
 8332 *			amongst the groups of a sched_domain, during
 8333 *			load balancing.
 8334 * @env: The load balancing environment.
 8335 * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
 8336 */
 8337static inline
 8338void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
 8339{
 8340	unsigned long tmp, capa_now = 0, capa_move = 0;
 8341	unsigned int imbn = 2;
 8342	unsigned long scaled_busy_load_per_task;
 8343	struct sg_lb_stats *local, *busiest;
 8344
 8345	local = &sds->local_stat;
 8346	busiest = &sds->busiest_stat;
 
 
 
 
 
 
 
 8347
 8348	if (!local->sum_nr_running)
 8349		local->load_per_task = cpu_avg_load_per_task(env->dst_cpu);
 8350	else if (busiest->load_per_task > local->load_per_task)
 8351		imbn = 1;
 8352
 8353	scaled_busy_load_per_task =
 8354		(busiest->load_per_task * SCHED_CAPACITY_SCALE) /
 8355		busiest->group_capacity;
 8356
 8357	if (busiest->avg_load + scaled_busy_load_per_task >=
 8358	    local->avg_load + (scaled_busy_load_per_task * imbn)) {
 8359		env->imbalance = busiest->load_per_task;
 8360		return;
 8361	}
 8362
 8363	/*
 8364	 * OK, we don't have enough imbalance to justify moving tasks,
 8365	 * however we may be able to increase total CPU capacity used by
 8366	 * moving them.
 8367	 */
 8368
 8369	capa_now += busiest->group_capacity *
 8370			min(busiest->load_per_task, busiest->avg_load);
 8371	capa_now += local->group_capacity *
 8372			min(local->load_per_task, local->avg_load);
 8373	capa_now /= SCHED_CAPACITY_SCALE;
 8374
 8375	/* Amount of load we'd subtract */
 8376	if (busiest->avg_load > scaled_busy_load_per_task) {
 8377		capa_move += busiest->group_capacity *
 8378			    min(busiest->load_per_task,
 8379				busiest->avg_load - scaled_busy_load_per_task);
 8380	}
 8381
 8382	/* Amount of load we'd add */
 8383	if (busiest->avg_load * busiest->group_capacity <
 8384	    busiest->load_per_task * SCHED_CAPACITY_SCALE) {
 8385		tmp = (busiest->avg_load * busiest->group_capacity) /
 8386		      local->group_capacity;
 8387	} else {
 8388		tmp = (busiest->load_per_task * SCHED_CAPACITY_SCALE) /
 8389		      local->group_capacity;
 8390	}
 8391	capa_move += local->group_capacity *
 8392		    min(local->load_per_task, local->avg_load + tmp);
 8393	capa_move /= SCHED_CAPACITY_SCALE;
 8394
 8395	/* Move if we gain throughput */
 8396	if (capa_move > capa_now)
 8397		env->imbalance = busiest->load_per_task;
 8398}
 8399
 8400/**
 8401 * calculate_imbalance - Calculate the amount of imbalance present within the
 8402 *			 groups of a given sched_domain during load balance.
 8403 * @env: load balance environment
 8404 * @sds: statistics of the sched_domain whose imbalance is to be calculated.
 8405 */
 8406static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
 8407{
 8408	unsigned long max_pull, load_above_capacity = ~0UL;
 8409	struct sg_lb_stats *local, *busiest;
 8410
 8411	local = &sds->local_stat;
 8412	busiest = &sds->busiest_stat;
 8413
 8414	if (busiest->group_type == group_imbalanced) {
 8415		/*
 8416		 * In the group_imb case we cannot rely on group-wide averages
 8417		 * to ensure CPU-load equilibrium, look at wider averages. XXX
 8418		 */
 8419		busiest->load_per_task =
 8420			min(busiest->load_per_task, sds->avg_load);
 8421	}
 8422
 8423	/*
 8424	 * Avg load of busiest sg can be less and avg load of local sg can
 8425	 * be greater than avg load across all sgs of sd because avg load
 8426	 * factors in sg capacity and sgs with smaller group_type are
 8427	 * skipped when updating the busiest sg:
 8428	 */
 8429	if (busiest->group_type != group_misfit_task &&
 8430	    (busiest->avg_load <= sds->avg_load ||
 8431	     local->avg_load >= sds->avg_load)) {
 8432		env->imbalance = 0;
 8433		return fix_small_imbalance(env, sds);
 8434	}
 8435
 8436	/*
 8437	 * If there aren't any idle CPUs, avoid creating some.
 8438	 */
 8439	if (busiest->group_type == group_overloaded &&
 8440	    local->group_type   == group_overloaded) {
 8441		load_above_capacity = busiest->sum_nr_running * SCHED_CAPACITY_SCALE;
 8442		if (load_above_capacity > busiest->group_capacity) {
 8443			load_above_capacity -= busiest->group_capacity;
 8444			load_above_capacity *= scale_load_down(NICE_0_LOAD);
 8445			load_above_capacity /= busiest->group_capacity;
 8446		} else
 8447			load_above_capacity = ~0UL;
 8448	}
 8449
 8450	/*
 8451	 * We're trying to get all the CPUs to the average_load, so we don't
 8452	 * want to push ourselves above the average load, nor do we wish to
 8453	 * reduce the max loaded CPU below the average load. At the same time,
 8454	 * we also don't want to reduce the group load below the group
 8455	 * capacity. Thus we look for the minimum possible imbalance.
 
 
 
 8456	 */
 8457	max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity);
 8458
 8459	/* How much load to actually move to equalise the imbalance */
 8460	env->imbalance = min(
 8461		max_pull * busiest->group_capacity,
 8462		(sds->avg_load - local->avg_load) * local->group_capacity
 8463	) / SCHED_CAPACITY_SCALE;
 8464
 8465	/* Boost imbalance to allow misfit task to be balanced. */
 8466	if (busiest->group_type == group_misfit_task) {
 8467		env->imbalance = max_t(long, env->imbalance,
 8468				       busiest->group_misfit_task_load);
 8469	}
 8470
 8471	/*
 8472	 * if *imbalance is less than the average load per runnable task
 8473	 * there is no guarantee that any tasks will be moved so we'll have
 8474	 * a think about bumping its value to force at least one task to be
 8475	 * moved
 8476	 */
 8477	if (env->imbalance < busiest->load_per_task)
 8478		return fix_small_imbalance(env, sds);
 
 8479}
 8480
 8481/******* find_busiest_group() helpers end here *********************/
 8482
 8483/**
 8484 * find_busiest_group - Returns the busiest group within the sched_domain
 8485 * if there is an imbalance.
 
 
 
 8486 *
 8487 * Also calculates the amount of runnable load which should be moved
 8488 * to restore balance.
 8489 *
 8490 * @env: The load balancing environment.
 8491 *
 8492 * Return:	- The busiest group if imbalance exists.
 
 
 
 
 
 
 8493 */
 8494static struct sched_group *find_busiest_group(struct lb_env *env)
 
 8495{
 8496	struct sg_lb_stats *local, *busiest;
 8497	struct sd_lb_stats sds;
 8498
 8499	init_sd_lb_stats(&sds);
 8500
 8501	/*
 8502	 * Compute the various statistics relavent for load balancing at
 8503	 * this level.
 8504	 */
 8505	update_sd_lb_stats(env, &sds);
 8506
 8507	if (sched_energy_enabled()) {
 8508		struct root_domain *rd = env->dst_rq->rd;
 8509
 8510		if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized))
 8511			goto out_balanced;
 8512	}
 8513
 8514	local = &sds.local_stat;
 8515	busiest = &sds.busiest_stat;
 8516
 8517	/* ASYM feature bypasses nice load balance check */
 8518	if (check_asym_packing(env, &sds))
 8519		return sds.busiest;
 8520
 8521	/* There is no busy sibling group to pull tasks from */
 8522	if (!sds.busiest || busiest->sum_nr_running == 0)
 8523		goto out_balanced;
 8524
 8525	/* XXX broken for overlapping NUMA groups */
 8526	sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load)
 8527						/ sds.total_capacity;
 8528
 8529	/*
 8530	 * If the busiest group is imbalanced the below checks don't
 8531	 * work because they assume all things are equal, which typically
 8532	 * isn't true due to cpus_ptr constraints and the like.
 8533	 */
 8534	if (busiest->group_type == group_imbalanced)
 8535		goto force_balance;
 8536
 8537	/*
 8538	 * When dst_cpu is idle, prevent SMP nice and/or asymmetric group
 8539	 * capacities from resulting in underutilization due to avg_load.
 8540	 */
 8541	if (env->idle != CPU_NOT_IDLE && group_has_capacity(env, local) &&
 8542	    busiest->group_no_capacity)
 8543		goto force_balance;
 8544
 8545	/* Misfit tasks should be dealt with regardless of the avg load */
 8546	if (busiest->group_type == group_misfit_task)
 8547		goto force_balance;
 8548
 8549	/*
 8550	 * If the local group is busier than the selected busiest group
 8551	 * don't try and pull any tasks.
 8552	 */
 8553	if (local->avg_load >= busiest->avg_load)
 8554		goto out_balanced;
 8555
 8556	/*
 8557	 * Don't pull any tasks if this group is already above the domain
 8558	 * average load.
 8559	 */
 8560	if (local->avg_load >= sds.avg_load)
 8561		goto out_balanced;
 8562
 8563	if (env->idle == CPU_IDLE) {
 8564		/*
 8565		 * This CPU is idle. If the busiest group is not overloaded
 8566		 * and there is no imbalance between this and busiest group
 8567		 * wrt idle CPUs, it is balanced. The imbalance becomes
 8568		 * significant if the diff is greater than 1 otherwise we
 8569		 * might end up to just move the imbalance on another group
 8570		 */
 8571		if ((busiest->group_type != group_overloaded) &&
 8572				(local->idle_cpus <= (busiest->idle_cpus + 1)))
 8573			goto out_balanced;
 8574	} else {
 8575		/*
 8576		 * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
 8577		 * imbalance_pct to be conservative.
 8578		 */
 8579		if (100 * busiest->avg_load <=
 8580				env->sd->imbalance_pct * local->avg_load)
 8581			goto out_balanced;
 8582	}
 8583
 8584force_balance:
 8585	/* Looks like there is an imbalance. Compute it */
 8586	env->src_grp_type = busiest->group_type;
 8587	calculate_imbalance(env, &sds);
 8588	return env->imbalance ? sds.busiest : NULL;
 8589
 8590out_balanced:
 
 8591	env->imbalance = 0;
 8592	return NULL;
 8593}
 8594
 8595/*
 8596 * find_busiest_queue - find the busiest runqueue among the CPUs in the group.
 8597 */
 8598static struct rq *find_busiest_queue(struct lb_env *env,
 8599				     struct sched_group *group)
 
 8600{
 8601	struct rq *busiest = NULL, *rq;
 8602	unsigned long busiest_load = 0, busiest_capacity = 1;
 8603	int i;
 8604
 8605	for_each_cpu_and(i, sched_group_span(group), env->cpus) {
 8606		unsigned long capacity, load;
 8607		enum fbq_type rt;
 
 
 8608
 8609		rq = cpu_rq(i);
 8610		rt = fbq_classify_rq(rq);
 8611
 8612		/*
 8613		 * We classify groups/runqueues into three groups:
 8614		 *  - regular: there are !numa tasks
 8615		 *  - remote:  there are numa tasks that run on the 'wrong' node
 8616		 *  - all:     there is no distinction
 8617		 *
 8618		 * In order to avoid migrating ideally placed numa tasks,
 8619		 * ignore those when there's better options.
 8620		 *
 8621		 * If we ignore the actual busiest queue to migrate another
 8622		 * task, the next balance pass can still reduce the busiest
 8623		 * queue by moving tasks around inside the node.
 8624		 *
 8625		 * If we cannot move enough load due to this classification
 8626		 * the next pass will adjust the group classification and
 8627		 * allow migration of more tasks.
 8628		 *
 8629		 * Both cases only affect the total convergence complexity.
 8630		 */
 8631		if (rt > env->fbq_type)
 8632			continue;
 8633
 8634		/*
 8635		 * For ASYM_CPUCAPACITY domains with misfit tasks we simply
 8636		 * seek the "biggest" misfit task.
 8637		 */
 8638		if (env->src_grp_type == group_misfit_task) {
 8639			if (rq->misfit_task_load > busiest_load) {
 8640				busiest_load = rq->misfit_task_load;
 8641				busiest = rq;
 8642			}
 8643
 
 8644			continue;
 8645		}
 8646
 8647		capacity = capacity_of(i);
 
 8648
 8649		/*
 8650		 * For ASYM_CPUCAPACITY domains, don't pick a CPU that could
 8651		 * eventually lead to active_balancing high->low capacity.
 8652		 * Higher per-CPU capacity is considered better than balancing
 8653		 * average load.
 8654		 */
 8655		if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
 8656		    capacity_of(env->dst_cpu) < capacity &&
 8657		    rq->nr_running == 1)
 8658			continue;
 8659
 8660		load = cpu_runnable_load(rq);
 8661
 8662		/*
 8663		 * When comparing with imbalance, use cpu_runnable_load()
 8664		 * which is not scaled with the CPU capacity.
 
 
 8665		 */
 
 8666
 8667		if (rq->nr_running == 1 && load > env->imbalance &&
 8668		    !check_cpu_capacity(rq, env->sd))
 8669			continue;
 8670
 8671		/*
 8672		 * For the load comparisons with the other CPU's, consider
 8673		 * the cpu_runnable_load() scaled with the CPU capacity, so
 8674		 * that the load can be moved away from the CPU that is
 8675		 * potentially running at a lower capacity.
 8676		 *
 8677		 * Thus we're looking for max(load_i / capacity_i), crosswise
 8678		 * multiplication to rid ourselves of the division works out
 8679		 * to: load_i * capacity_j > load_j * capacity_i;  where j is
 8680		 * our previous maximum.
 8681		 */
 8682		if (load * busiest_capacity > busiest_load * capacity) {
 8683			busiest_load = load;
 8684			busiest_capacity = capacity;
 8685			busiest = rq;
 8686		}
 8687	}
 8688
 8689	return busiest;
 8690}
 8691
 8692/*
 * Max backoff if we encounter pinned tasks. The value is fairly arbitrary;
 * it only needs to be large enough.
 8695 */
 8696#define MAX_PINNED_INTERVAL	512
 8697
 8698static inline bool
 8699asym_active_balance(struct lb_env *env)
 8700{
 8701	/*
 8702	 * ASYM_PACKING needs to force migrate tasks from busy but
 8703	 * lower priority CPUs in order to pack all tasks in the
 8704	 * highest priority CPUs.
 8705	 */
 8706	return env->idle != CPU_NOT_IDLE && (env->sd->flags & SD_ASYM_PACKING) &&
 8707	       sched_asym_prefer(env->dst_cpu, env->src_cpu);
 8708}
 8709
 8710static inline bool
 8711voluntary_active_balance(struct lb_env *env)
 8712{
 8713	struct sched_domain *sd = env->sd;
 8714
 8715	if (asym_active_balance(env))
 8716		return 1;
 8717
 8718	/*
 8719	 * The dst_cpu is idle and the src_cpu CPU has only 1 CFS task.
 8720	 * It's worth migrating the task if the src_cpu's capacity is reduced
 8721	 * because of other sched_class or IRQs if more capacity stays
 8722	 * available on dst_cpu.
 8723	 */
 8724	if ((env->idle != CPU_NOT_IDLE) &&
 8725	    (env->src_rq->cfs.h_nr_running == 1)) {
 8726		if ((check_cpu_capacity(env->src_rq, sd)) &&
 8727		    (capacity_of(env->src_cpu)*sd->imbalance_pct < capacity_of(env->dst_cpu)*100))
 8728			return 1;
 8729	}
 8730
 8731	if (env->src_grp_type == group_misfit_task)
 8732		return 1;
 8733
 8734	return 0;
 8735}
 8736
 8737static int need_active_balance(struct lb_env *env)
 8738{
 8739	struct sched_domain *sd = env->sd;
 8740
 8741	if (voluntary_active_balance(env))
 8742		return 1;
 8743
 8744	return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
 8745}
 8746
 8747static int active_load_balance_cpu_stop(void *data);
 8748
 8749static int should_we_balance(struct lb_env *env)
 8750{
 8751	struct sched_group *sg = env->sd->groups;
 8752	int cpu, balance_cpu = -1;
 8753
 8754	/*
 8755	 * Ensure the balancing environment is consistent; can happen
 8756	 * when the softirq triggers 'during' hotplug.
 8757	 */
 8758	if (!cpumask_test_cpu(env->dst_cpu, env->cpus))
 8759		return 0;
 8760
 8761	/*
 8762	 * In the newly idle case, we will allow all the CPUs
 8763	 * to do the newly idle load balance.
 8764	 */
 8765	if (env->idle == CPU_NEWLY_IDLE)
 8766		return 1;
 8767
 8768	/* Try to find first idle CPU */
 8769	for_each_cpu_and(cpu, group_balance_mask(sg), env->cpus) {
 8770		if (!idle_cpu(cpu))
 8771			continue;
 8772
 8773		balance_cpu = cpu;
 8774		break;
 8775	}
 8776
 8777	if (balance_cpu == -1)
 8778		balance_cpu = group_balance_cpu(sg);
 8779
 8780	/*
 8781	 * First idle CPU or the first CPU(busiest) in this sched group
 8782	 * is eligible for doing load balancing at this and above domains.
 8783	 */
 8784	return balance_cpu == env->dst_cpu;
 8785}
 8786
 8787/*
 8788 * Check this_cpu to ensure it is balanced within domain. Attempt to move
 8789 * tasks if there is an imbalance.
 *
 * @this_cpu, @this_rq: initial destination CPU and its runqueue.
 * @sd: domain to balance; candidate CPUs are its span masked with
 *      cpu_active_mask.
 * @idle: idle state of the caller; indexes the schedstat counters and
 *        suppresses the failure/back-off accounting for CPU_NEWLY_IDLE.
 * @continue_balancing: set to 0 when should_we_balance() rejects this CPU.
 *
 * Returns the accumulated detach_tasks() result (ld_moved), which is forced
 * to 0 when leaving through out_balanced, out_all_pinned or out_one_pinned.
 8790 */
 8791static int load_balance(int this_cpu, struct rq *this_rq,
 8792			struct sched_domain *sd, enum cpu_idle_type idle,
 8793			int *continue_balancing)
 8794{
 8795	int ld_moved, cur_ld_moved, active_balance = 0;
 8796	struct sched_domain *sd_parent = sd->parent;
 8797	struct sched_group *group;
 8798	struct rq *busiest;
 8799	struct rq_flags rf;
 8800	struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);
 8801
 8802	struct lb_env env = {
 8803		.sd		= sd,
 8804		.dst_cpu	= this_cpu,
 8805		.dst_rq		= this_rq,
 8806		.dst_grpmask    = sched_group_span(sd->groups),
 8807		.idle		= idle,
 8808		.loop_break	= sched_nr_migrate_break,
 8809		.cpus		= cpus,
 8810		.fbq_type	= all,
 8811		.tasks		= LIST_HEAD_INIT(env.tasks),
 8812	};
 8813
	/* Only CPUs of this domain that are currently active are candidates. */
 8814	cpumask_and(cpus, sched_domain_span(sd), cpu_active_mask);
 8815
 8816	schedstat_inc(sd->lb_count[idle]);
 8817
	/*
	 * Re-entered from the LBF_ALL_PINNED handling below after the fully
	 * pinned busiest CPU has been removed from @cpus.
	 */
 8818redo:
 8819	if (!should_we_balance(&env)) {
 8820		*continue_balancing = 0;

 8821		goto out_balanced;
 8822	}
 8823
 8824	group = find_busiest_group(&env);
 8825	if (!group) {
 8826		schedstat_inc(sd->lb_nobusyg[idle]);
 8827		goto out_balanced;
 8828	}
 8829
 8830	busiest = find_busiest_queue(&env, group);
 8831	if (!busiest) {
 8832		schedstat_inc(sd->lb_nobusyq[idle]);
 8833		goto out_balanced;
 8834	}
 8835
 8836	BUG_ON(busiest == env.dst_rq);
 8837
 8838	schedstat_add(sd->lb_imbalance[idle], env.imbalance);
 8839
 8840	env.src_cpu = busiest->cpu;
 8841	env.src_rq = busiest;
 8842
 8843	ld_moved = 0;
 8844	if (busiest->nr_running > 1) {
 8845		/*
 8846		 * Attempt to move tasks. If find_busiest_group has found
 8847		 * an imbalance but busiest->nr_running <= 1, the group is
 8848		 * still unbalanced. ld_moved simply stays zero, so it is
 8849		 * correctly treated as an imbalance.
 8850		 */
 8851		env.flags |= LBF_ALL_PINNED;


 8852		env.loop_max  = min(sysctl_sched_nr_migrate, busiest->nr_running);
 8853
 8854more_balance:
 8855		rq_lock_irqsave(busiest, &rf);
 8856		update_rq_clock(busiest);
 8857
 8858		/*
 8859		 * cur_ld_moved - load moved in current iteration
 8860		 * ld_moved     - cumulative load moved across iterations
 8861		 */
 8862		cur_ld_moved = detach_tasks(&env);
 8863
 8864		/*
 8865		 * We've detached some tasks from busiest_rq. Every
 8866		 * task is masked "TASK_ON_RQ_MIGRATING", so we can safely
 8867		 * unlock busiest->lock, and we are able to be sure
 8868		 * that nobody can manipulate the tasks in parallel.
 8869		 * See task_rq_lock() family for the details.
 8870		 */
 8871
 8872		rq_unlock(busiest, &rf);
 8873
 8874		if (cur_ld_moved) {
 8875			attach_tasks(&env);
 8876			ld_moved += cur_ld_moved;
 8877		}
 8878
		/*
		 * rq_unlock() above did not restore the IRQ state saved by
		 * rq_lock_irqsave(); restore it now that the detached tasks
		 * have been attached.
		 */
 8879		local_irq_restore(rf.flags);
 8880
 8881		if (env.flags & LBF_NEED_BREAK) {
 8882			env.flags &= ~LBF_NEED_BREAK;
 8883			goto more_balance;
 8884		}
 8885
 8886		/*
 8887		 * Revisit (affine) tasks on src_cpu that couldn't be moved to
 8888		 * us and move them to an alternate dst_cpu in our sched_group
 8889		 * where they can run. The upper limit on how many times we
 8890		 * iterate on same src_cpu is dependent on number of CPUs in our
 8891		 * sched_group.
 8892		 *
 8893		 * This changes load balance semantics a bit on who can move
 8894		 * load to a given_cpu. In addition to the given_cpu itself
 8895		 * (or a ilb_cpu acting on its behalf where given_cpu is
 8896		 * nohz-idle), we now have balance_cpu in a position to move
 8897		 * load to given_cpu. In rare situations, this may cause
 8898		 * conflicts (balance_cpu and given_cpu/ilb_cpu deciding
 8899		 * _independently_ and at _same_ time to move some load to
 8900		 * given_cpu) causing excess load to be moved to given_cpu.
 8901		 * This however should not happen so much in practice and
 8902		 * moreover subsequent load balance cycles should correct the
 8903		 * excess load moved.
 8904		 */
 8905		if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
 8906
 8907			/* Prevent re-selecting dst_cpu via env's CPUs */
 8908			__cpumask_clear_cpu(env.dst_cpu, env.cpus);
 8909
 8910			env.dst_rq	 = cpu_rq(env.new_dst_cpu);
 8911			env.dst_cpu	 = env.new_dst_cpu;
 8912			env.flags	&= ~LBF_DST_PINNED;
 8913			env.loop	 = 0;
 8914			env.loop_break	 = sched_nr_migrate_break;
 8915
 8916			/*
 8917			 * Go back to "more_balance" rather than "redo" since we
 8918			 * need to continue with same src_cpu.
 8919			 */
 8920			goto more_balance;
 8921		}
 8922
 8923		/*
 8924		 * We failed to reach balance because of affinity.
 8925		 */
 8926		if (sd_parent) {
 8927			int *group_imbalance = &sd_parent->groups->sgc->imbalance;
 8928
 8929			if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0)
 8930				*group_imbalance = 1;
 8931		}
 8932
 8933		/* All tasks on this runqueue were pinned by CPU affinity */
 8934		if (unlikely(env.flags & LBF_ALL_PINNED)) {
 8935			__cpumask_clear_cpu(cpu_of(busiest), cpus);
 8936			/*
 8937			 * Attempting to continue load balancing at the current
 8938			 * sched_domain level only makes sense if there are
 8939			 * active CPUs remaining as possible busiest CPUs to
 8940			 * pull load from which are not contained within the
 8941			 * destination group that is receiving any migrated
 8942			 * load.
 8943			 */
 8944			if (!cpumask_subset(cpus, env.dst_grpmask)) {
 8945				env.loop = 0;
 8946				env.loop_break = sched_nr_migrate_break;
 8947				goto redo;
 8948			}
 8949			goto out_all_pinned;
 8950		}
 8951	}
 8952
 8953	if (!ld_moved) {
 8954		schedstat_inc(sd->lb_failed[idle]);
 8955		/*
 8956		 * Increment the failure counter only on periodic balance.
 8957		 * We do not want newidle balance, which can be very
 8958		 * frequent, pollute the failure counter causing
 8959		 * excessive cache_hot migrations and active balances.
 8960		 */
 8961		if (idle != CPU_NEWLY_IDLE)
 8962			sd->nr_balance_failed++;
 8963
 8964		if (need_active_balance(&env)) {
 8965			unsigned long flags;
 8966
 8967			raw_spin_lock_irqsave(&busiest->lock, flags);
 8968
 8969			/*
 8970			 * Don't kick the active_load_balance_cpu_stop,
 8971			 * if the curr task on busiest CPU can't be
 8972			 * moved to this_cpu:
 8973			 */
 8974			if (!cpumask_test_cpu(this_cpu, busiest->curr->cpus_ptr)) {

 8975				raw_spin_unlock_irqrestore(&busiest->lock,
 8976							    flags);
 8977				env.flags |= LBF_ALL_PINNED;
 8978				goto out_one_pinned;
 8979			}
 8980
 8981			/*
 8982			 * ->active_balance synchronizes accesses to
 8983			 * ->active_balance_work.  Once set, it's cleared
 8984			 * only after active load balance is finished.
 8985			 */
 8986			if (!busiest->active_balance) {
 8987				busiest->active_balance = 1;
 8988				busiest->push_cpu = this_cpu;
 8989				active_balance = 1;
 8990			}
 8991			raw_spin_unlock_irqrestore(&busiest->lock, flags);
 8992
			/* Queue the stopper work only after dropping busiest->lock. */
 8993			if (active_balance) {
 8994				stop_one_cpu_nowait(cpu_of(busiest),
 8995					active_load_balance_cpu_stop, busiest,
 8996					&busiest->active_balance_work);
 8997			}
 8998
 8999			/* We've kicked active balancing, force task migration. */



 9000			sd->nr_balance_failed = sd->cache_nice_tries+1;
 9001		}
 9002	} else
 9003		sd->nr_balance_failed = 0;
 9004
 9005	if (likely(!active_balance) || voluntary_active_balance(&env)) {
 9006		/* We were unbalanced, so reset the balancing interval */
 9007		sd->balance_interval = sd->min_interval;
 9008	} else {
 9009		/*
 9010		 * If we've begun active balancing, start to back off. This
 9011		 * case may not be covered by the all_pinned logic if there
 9012		 * is only 1 task on the busy runqueue (because we don't call
 9013		 * detach_tasks).
 9014		 */
 9015		if (sd->balance_interval < sd->max_interval)
 9016			sd->balance_interval *= 2;
 9017	}
 9018
 9019	goto out;
 9020
	/*
	 * The three labels below fall through into each other: out_balanced
	 * runs all of them, out_all_pinned skips the imbalance-flag clearing,
	 * and out_one_pinned additionally skips the lb_balanced statistic and
	 * the nr_balance_failed reset.
	 */
 9021out_balanced:
 9022	/*
 9023	 * We reach balance although we may have faced some affinity
 9024	 * constraints. Clear the imbalance flag only if other tasks got
 9025	 * a chance to move and fix the imbalance.
 9026	 */
 9027	if (sd_parent && !(env.flags & LBF_ALL_PINNED)) {
 9028		int *group_imbalance = &sd_parent->groups->sgc->imbalance;
 9029
 9030		if (*group_imbalance)
 9031			*group_imbalance = 0;
 9032	}
 9033
 9034out_all_pinned:
 9035	/*
 9036	 * We reach balance because all tasks are pinned at this level so
 9037	 * we can't migrate them. Let the imbalance flag set so parent level
 9038	 * can try to migrate them.
 9039	 */
 9040	schedstat_inc(sd->lb_balanced[idle]);
 9041
 9042	sd->nr_balance_failed = 0;
 9043
 9044out_one_pinned:
 9045	ld_moved = 0;
 9046
 9047	/*
 9048	 * newidle_balance() disregards balance intervals, so we could
 9049	 * repeatedly reach this code, which would lead to balance_interval
 9050	 * skyrocketing in a short amount of time. Skip the balance_interval
 9051	 * increase logic to avoid that.
 9052	 */
 9053	if (env.idle == CPU_NEWLY_IDLE)
 9054		goto out;
 9055
 9056	/* tune up the balancing interval */
 9057	if ((env.flags & LBF_ALL_PINNED &&
 9058	     sd->balance_interval < MAX_PINNED_INTERVAL) ||
 9059	    sd->balance_interval < sd->max_interval)
 9060		sd->balance_interval *= 2;


 9061out:
 9062	return ld_moved;
 9063}
 9064
 9065static inline unsigned long
 9066get_sd_balance_interval(struct sched_domain *sd, int cpu_busy)
 
 
 
 9067{
 9068	unsigned long interval = sd->balance_interval;
 
 
 9069
 9070	if (cpu_busy)
 9071		interval *= sd->busy_factor;
 9072
 9073	/* scale ms to jiffies */
 9074	interval = msecs_to_jiffies(interval);
 9075	interval = clamp(interval, 1UL, max_load_balance_interval);
 9076
 9077	return interval;
 9078}
 
 
 9079
 9080static inline void
 9081update_next_balance(struct sched_domain *sd, unsigned long *next_balance)
 9082{
 9083	unsigned long interval, next;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 9084
 9085	/* used by idle balance, so cpu_busy = 0 */
 9086	interval = get_sd_balance_interval(sd, 0);
 9087	next = sd->last_balance + interval;
 9088
 9089	if (time_after(*next_balance, next))
 9090		*next_balance = next;
 
 
 
 
 
 9091}
 9092
 9093/*
 9094 * active_load_balance_cpu_stop is run by the CPU stopper. It pushes
 9095 * running tasks off the busiest CPU onto idle CPUs. It requires at
 9096 * least 1 task to be running on each physical CPU where possible, and
 9097 * avoids physical / logical imbalances.
 *
 * @data: the busiest runqueue (struct rq *); the destination CPU is read
 *        from its ->push_cpu field.
 *
 * At most one task is detached from the busiest runqueue and attached to
 * the target runqueue. ->active_balance is cleared on every path, and the
 * function always returns 0.
 9098 */
 9099static int active_load_balance_cpu_stop(void *data)
 9100{
 9101	struct rq *busiest_rq = data;
 9102	int busiest_cpu = cpu_of(busiest_rq);
 9103	int target_cpu = busiest_rq->push_cpu;
 9104	struct rq *target_rq = cpu_rq(target_cpu);
 9105	struct sched_domain *sd;
 9106	struct task_struct *p = NULL;
 9107	struct rq_flags rf;
 9108
 9109	rq_lock_irq(busiest_rq, &rf);
 9110	/*
 9111	 * Between queueing the stop-work and running it is a hole in which
 9112	 * CPUs can become inactive. We should not move tasks from or to
 9113	 * inactive CPUs.
 9114	 */
 9115	if (!cpu_active(busiest_cpu) || !cpu_active(target_cpu))
 9116		goto out_unlock;
 9117
 9118	/* Make sure the requested CPU hasn't gone down in the meantime: */
 9119	if (unlikely(busiest_cpu != smp_processor_id() ||
 9120		     !busiest_rq->active_balance))
 9121		goto out_unlock;
 9122
 9123	/* Is there any task to move? */
 9124	if (busiest_rq->nr_running <= 1)
 9125		goto out_unlock;
 9126
 9127	/*
 9128	 * This condition is "impossible", if it occurs
 9129	 * we need to fix it. Originally reported by
 9130	 * Bjorn Helgaas on a 128-CPU setup.
 9131	 */
 9132	BUG_ON(busiest_rq == target_rq);
 9133



 9134	/* Search for an sd spanning us and the target CPU. */
 9135	rcu_read_lock();
 9136	for_each_domain(target_cpu, sd) {
 9137		if ((sd->flags & SD_LOAD_BALANCE) &&
 9138		    cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
 9139				break;
 9140	}
 9141
 9142	if (likely(sd)) {
 9143		struct lb_env env = {
 9144			.sd		= sd,
 9145			.dst_cpu	= target_cpu,
 9146			.dst_rq		= target_rq,
 9147			.src_cpu	= busiest_rq->cpu,
 9148			.src_rq		= busiest_rq,
 9149			.idle		= CPU_IDLE,
 9150			/*
 9151			 * can_migrate_task() doesn't need to compute new_dst_cpu
 9152			 * for active balancing. Since we have CPU_IDLE, but no
 9153			 * @dst_grpmask we need to make that test go away with lying
 9154			 * about DST_PINNED.
 9155			 */
 9156			.flags		= LBF_DST_PINNED,
 9157		};
 9158
 9159		schedstat_inc(sd->alb_count);
 9160		update_rq_clock(busiest_rq);
 9161
 9162		p = detach_one_task(&env);
 9163		if (p) {
 9164			schedstat_inc(sd->alb_pushed);
 9165			/* Active balancing done, reset the failure counter. */
 9166			sd->nr_balance_failed = 0;
 9167		} else {
 9168			schedstat_inc(sd->alb_failed);
 9169		}
 9170	}
 9171	rcu_read_unlock();

 9172out_unlock:
 9173	busiest_rq->active_balance = 0;
 9174	rq_unlock(busiest_rq, &rf);
 9175
	/* The detached task is attached only after busiest_rq's lock is dropped. */
 9176	if (p)
 9177		attach_one_task(target_rq, p);
 9178
	/* NOTE(review): presumably pairs with the rq_lock_irq() above - confirm. */
 9179	local_irq_enable();
 9180
 9181	return 0;
 9182}
 9183
 /* rebalance_domains() takes this with spin_trylock() for SD_SERIALIZE domains. */
 9184static DEFINE_SPINLOCK(balancing);
 9185
 9186/*
 9187 * Scale the max load_balance interval with the number of CPUs in the system.
 9188 * This trades load-balance latency on larger machines for less cross talk.
 9189 */
 9190void update_max_interval(void)
 9191{
 9192	max_load_balance_interval = HZ*num_online_cpus()/10;
 9193}
 9194
 9195/*
 9196 * It checks each scheduling domain to see if it is due to be balanced,
 9197 * and initiates a balancing operation if so.
 9198 *
 9199 * Balancing parameters are set up in init_sched_domains.
 *
 * @rq: runqueue whose domain hierarchy (for_each_domain(rq->cpu)) is walked.
 * @idle: idle state used for the interval computation and passed on to
 *        load_balance(); it is re-evaluated after a successful balance.
 *
 * Besides balancing, the walk decays each domain's max_newidle_lb_cost and
 * finally publishes the earliest next deadline in rq->next_balance (and in
 * nohz.next_balance when balancing as CPU_IDLE under CONFIG_NO_HZ_COMMON).
 9200 */
 9201static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
 9202{
 9203	int continue_balancing = 1;
 9204	int cpu = rq->cpu;
 9205	unsigned long interval;
 9206	struct sched_domain *sd;
 9207	/* Earliest time when we have to do rebalance again */
 9208	unsigned long next_balance = jiffies + 60*HZ;
 9209	int update_next_balance = 0;
 9210	int need_serialize, need_decay = 0;
 9211	u64 max_cost = 0;
 9212
 9213	rcu_read_lock();
 9214	for_each_domain(cpu, sd) {
 9215		/*
 9216		 * Decay the newidle max times here because this is a regular
 9217		 * visit to all the domains. Decay ~1% per second.
 9218		 */
 9219		if (time_after(jiffies, sd->next_decay_max_lb_cost)) {
 9220			sd->max_newidle_lb_cost =
 9221				(sd->max_newidle_lb_cost * 253) / 256;
 9222			sd->next_decay_max_lb_cost = jiffies + HZ;
 9223			need_decay = 1;
 9224		}
 9225		max_cost += sd->max_newidle_lb_cost;
 9226
 9227		if (!(sd->flags & SD_LOAD_BALANCE))
 9228			continue;
 9229
 9230		/*
 9231		 * Stop the load balance at this level. There is another
 9232		 * CPU in our sched group which is doing load balancing more
 9233		 * actively.
 9234		 */
 9235		if (!continue_balancing) {
			/*
			 * Keep walking (without balancing) when a decay is in
			 * progress so the remaining domains are decayed and
			 * summed into max_cost as well.
			 */
 9236			if (need_decay)
 9237				continue;
 9238			break;
 9239		}
 9240
 9241		interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
 9242
 9243		need_serialize = sd->flags & SD_SERIALIZE;
 9244		if (need_serialize) {
			/*
			 * The lock is contended: skip balancing this domain
			 * but still account its deadline at "out" below.
			 */
 9245			if (!spin_trylock(&balancing))
 9246				goto out;
 9247		}
 9248
 9249		if (time_after_eq(jiffies, sd->last_balance + interval)) {
 9250			if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {
 9251				/*
 9252				 * The LBF_DST_PINNED logic could have changed
 9253				 * env->dst_cpu, so we can't know our idle
 9254				 * state even if we migrated tasks. Update it.
 9255				 */
 9256				idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
 9257			}
 9258			sd->last_balance = jiffies;
			/* idle may have changed above, so recompute the interval. */
 9259			interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
 9260		}
 9261		if (need_serialize)
 9262			spin_unlock(&balancing);
 9263out:
 9264		if (time_after(next_balance, sd->last_balance + interval)) {
 9265			next_balance = sd->last_balance + interval;
 9266			update_next_balance = 1;
 9267		}
 9268	}
 9269	if (need_decay) {
 9270		/*
 9271		 * Ensure the rq-wide value also decays but keep it at a
 9272		 * reasonable floor to avoid funnies with rq->avg_idle.
 9273		 */
 9274		rq->max_idle_balance_cost =
 9275			max((u64)sysctl_sched_migration_cost, max_cost);
 9276	}
 9277	rcu_read_unlock();
 9278
 9279	/*
 9280	 * next_balance will be updated only when there is a need.
 9281	 * When the cpu is attached to null domain for ex, it will not be
 9282	 * updated.
 9283	 */
 9284	if (likely(update_next_balance)) {
 9285		rq->next_balance = next_balance;
 9286
 9287#ifdef CONFIG_NO_HZ_COMMON
 9288		/*
 9289		 * If this CPU has been elected to perform the nohz idle
 9290		 * balance. Other idle CPUs have already rebalanced with
 9291		 * nohz_idle_balance() and nohz.next_balance has been
 9292		 * updated accordingly. This CPU is now running the idle load
 9293		 * balance for itself and we need to update the
 9294		 * nohz.next_balance accordingly.
 9295		 */
 9296		if ((idle == CPU_IDLE) && time_after(nohz.next_balance, rq->next_balance))
 9297			nohz.next_balance = rq->next_balance;
 9298#endif
 9299	}
 9300}
 9301
 9302static inline int on_null_domain(struct rq *rq)
 9303{
 9304	return unlikely(!rcu_dereference_sched(rq->sd));
 9305}
 9306
 9307#ifdef CONFIG_NO_HZ_COMMON
 9308/*
 9309 * idle load balancing details
 9310 * - When one of the busy CPUs notice that there may be an idle rebalancing
 9311 *   needed, they will kick the idle load balancer, which then does idle
 9312 *   load balancing for all the idle CPUs.
 9313 * - HK_FLAG_MISC CPUs are used for this task, because HK_FLAG_SCHED is not
 9314 *   set anywhere yet.
 9315 */
 
 
 
 
 
 9316
 9317static inline int find_new_ilb(void)
 9318{
 9319	int ilb;
 9320
 9321	for_each_cpu_and(ilb, nohz.idle_cpus_mask,
 9322			      housekeeping_cpumask(HK_FLAG_MISC)) {
 9323		if (idle_cpu(ilb))
 9324			return ilb;
 9325	}
 9326
 9327	return nr_cpu_ids;
 9328}
 9329
 9330/*
 9331 * Kick a CPU to do the nohz balancing, if it is time for it. We pick any
 9332 * idle CPU in the HK_FLAG_MISC housekeeping set (if there is one).
 
 9333 */
 9334static void kick_ilb(unsigned int flags)
 9335{
 9336	int ilb_cpu;
 9337
 9338	nohz.next_balance++;
 9339
 9340	ilb_cpu = find_new_ilb();
 9341
 9342	if (ilb_cpu >= nr_cpu_ids)
 9343		return;
 9344
 9345	flags = atomic_fetch_or(flags, nohz_flags(ilb_cpu));
 9346	if (flags & NOHZ_KICK_MASK)
 9347		return;
 9348
 9349	/*
 9350	 * Use smp_send_reschedule() instead of resched_cpu().
 9351	 * This way we generate a sched IPI on the target CPU which
 9352	 * is idle. And the softirq performing nohz idle load balance
 9353	 * will be run before returning from the IPI.
 9354	 */
 9355	smp_send_reschedule(ilb_cpu);
 
 9356}
 9357
 9358/*
 9359 * Current decision point for kicking the idle load balancer in the presence
 9360 * of idle CPUs in the system.
 *
 * @rq: runqueue of the CPU making the decision.
 *
 * The function accumulates a kick request in the local 'flags': it stays 0
 * (no kick), becomes NOHZ_STATS_KICK when only the blocked-load statistics
 * are stale, or NOHZ_KICK_MASK when one of the checks below finds a reason
 * to balance. A non-zero value is handed to kick_ilb().
 9361 */
 9362static void nohz_balancer_kick(struct rq *rq)
 9363{
 9364	unsigned long now = jiffies;
 9365	struct sched_domain_shared *sds;
 9366	struct sched_domain *sd;
 9367	int nr_busy, i, cpu = rq->cpu;
 9368	unsigned int flags = 0;
 9369
	/* Nothing to decide when rq->idle_balance is set. */
 9370	if (unlikely(rq->idle_balance))
 9371		return;
 9372
 9373	/*
 9374	 * We may be recently in ticked or tickless idle mode. At the first
 9375	 * busy tick after returning from idle, we will update the busy stats.
 9376	 */
 9377	nohz_balance_exit_idle(rq);
 9378
 9379	/*
 9380	 * None are in tickless mode and hence no need for NOHZ idle load
 9381	 * balancing.
 9382	 */
 9383	if (likely(!atomic_read(&nohz.nr_cpus)))
 9384		return;
 9385
	/* Blocked load is pending and its refresh time has passed. */
 9386	if (READ_ONCE(nohz.has_blocked) &&
 9387	    time_after(now, READ_ONCE(nohz.next_blocked)))
 9388		flags = NOHZ_STATS_KICK;
 9389
	/* Too early for a balance; at most the stats kick set above is sent. */
 9390	if (time_before(now, nohz.next_balance))
 9391		goto out;
 9392
 9393	if (rq->nr_running >= 2) {
 9394		flags = NOHZ_KICK_MASK;
 9395		goto out;
 9396	}
 9397
 9398	rcu_read_lock();
 9399
 9400	sd = rcu_dereference(rq->sd);
 9401	if (sd) {
 9402		/*
 9403		 * If there's a CFS task and the current CPU has reduced
 9404		 * capacity; kick the ILB to see if there's a better CPU to run
 9405		 * on.
 9406		 */
 9407		if (rq->cfs.h_nr_running >= 1 && check_cpu_capacity(rq, sd)) {
 9408			flags = NOHZ_KICK_MASK;
 9409			goto unlock;
 9410		}
 9411	}
 9412
 9413	sd = rcu_dereference(per_cpu(sd_asym_packing, cpu));
 9414	if (sd) {
 9415		/*
 9416		 * When ASYM_PACKING; see if there's a more preferred CPU
 9417		 * currently idle; in which case, kick the ILB to move tasks
 9418		 * around.
 9419		 */
 9420		for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) {
 9421			if (sched_asym_prefer(i, cpu)) {
 9422				flags = NOHZ_KICK_MASK;
 9423				goto unlock;
 9424			}
 9425		}
 9426	}
 9427
 9428	sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, cpu));
 9429	if (sd) {
 9430		/*
 9431		 * When ASYM_CPUCAPACITY; see if there's a higher capacity CPU
 9432		 * to run the misfit task on.
 9433		 */
 9434		if (check_misfit_status(rq, sd)) {
 9435			flags = NOHZ_KICK_MASK;
 9436			goto unlock;
 9437		}
 9438
 9439		/*
 9440		 * For asymmetric systems, we do not want to nicely balance
 9441		 * cache use, instead we want to embrace asymmetry and only
 9442		 * ensure tasks have enough CPU capacity.
 9443		 *
 9444		 * Skip the LLC logic because it's not relevant in that case.
 9445		 */
 9446		goto unlock;
 9447	}
 9448
 9449	sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
 9450	if (sds) {
 9451		/*
 9452		 * If there is an imbalance between LLC domains (IOW we could
 9453		 * increase the overall cache use), we need some less-loaded LLC
 9454		 * domain to pull some load. Likewise, we may need to spread
 9455		 * load within the current LLC domain (e.g. packed SMT cores but
 9456		 * other CPUs are idle). We can't really know from here how busy
 9457		 * the others are - so just get a nohz balance going if it looks
 9458		 * like this LLC domain has tasks we could move.
 9459		 */
 9460		nr_busy = atomic_read(&sds->nr_busy_cpus);
 9461		if (nr_busy > 1) {
 9462			flags = NOHZ_KICK_MASK;
 9463			goto unlock;
 9464		}
 9465	}
 9466unlock:
 9467	rcu_read_unlock();
 9468out:
 9469	if (flags)
 9470		kick_ilb(flags);
 9471}
 9472
 9473static void set_cpu_sd_state_busy(int cpu)
 9474{
 9475	struct sched_domain *sd;
 
 9476
 9477	rcu_read_lock();
 9478	sd = rcu_dereference(per_cpu(sd_llc, cpu));
 9479
 9480	if (!sd || !sd->nohz_idle)
 9481		goto unlock;
 9482	sd->nohz_idle = 0;
 9483
 9484	atomic_inc(&sd->shared->nr_busy_cpus);
 9485unlock:
 
 9486	rcu_read_unlock();
 9487}
 9488
 9489void nohz_balance_exit_idle(struct rq *rq)
 9490{
 9491	SCHED_WARN_ON(rq != this_rq());
 
 9492
 9493	if (likely(!rq->nohz_tick_stopped))
 9494		return;
 9495
 9496	rq->nohz_tick_stopped = 0;
 9497	cpumask_clear_cpu(rq->cpu, nohz.idle_cpus_mask);
 9498	atomic_dec(&nohz.nr_cpus);
 9499
 9500	set_cpu_sd_state_busy(rq->cpu);
 9501}
 9502
 9503static void set_cpu_sd_state_idle(int cpu)
 9504{
 9505	struct sched_domain *sd;
 9506
 9507	rcu_read_lock();
 9508	sd = rcu_dereference(per_cpu(sd_llc, cpu));
 9509
 9510	if (!sd || sd->nohz_idle)
 9511		goto unlock;
 9512	sd->nohz_idle = 1;
 9513
 9514	atomic_dec(&sd->shared->nr_busy_cpus);
 9515unlock:
 9516	rcu_read_unlock();
 9517}
 9518
 9519/*
 9520 * This routine will record that the CPU is going idle with tick stopped.
 9521 * This info will be used in performing idle load balancing in the future.
 *
 * @cpu: the CPU going idle; a warning is raised (SCHED_WARN_ON) if it is
 *       not the CPU executing this function.
 *
 * Nothing is recorded for inactive CPUs, for CPUs outside the HK_FLAG_SCHED
 * housekeeping set, or for CPUs attached to the null domain. Note that
 * rq->has_blocked_load is already set when the null-domain check returns.
 9522 */
 9523void nohz_balance_enter_idle(int cpu)
 9524{
 9525	struct rq *rq = cpu_rq(cpu);
 9526
 9527	SCHED_WARN_ON(cpu != smp_processor_id());
 9528
 9529	/* If this CPU is going down, then nothing needs to be done: */
 9530	if (!cpu_active(cpu))
 9531		return;
 9532
 9533	/* Spare idle load balancing on CPUs that don't want to be disturbed: */
 9534	if (!housekeeping_cpu(cpu, HK_FLAG_SCHED))
 9535		return;
 9536
 9537	/*
 9538	 * Can be set safely without rq->lock held
 9539	 * If a clear happens, it will have evaluated last additions because
 9540	 * rq->lock is held during the check and the clear
 9541	 */
 9542	rq->has_blocked_load = 1;
 9543
 9544	/*
 9545	 * The tick is still stopped but load could have been added in the
 9546	 * meantime. We set the nohz.has_blocked flag to trigger a check of the
 9547	 * *_avg. The CPU is already part of nohz.idle_cpus_mask so the clear
 9548	 * of nohz.has_blocked can only happen after checking the new load
 9549	 */
 9550	if (rq->nohz_tick_stopped)
 9551		goto out;
 9552
 9553	/* If we're a completely isolated CPU, we don't play: */
 9554	if (on_null_domain(rq))
 9555		return;
 9556
 9557	rq->nohz_tick_stopped = 1;


 9558
	/* Publish this CPU as nohz-idle before the barrier below. */
 9559	cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
 9560	atomic_inc(&nohz.nr_cpus);




 9561
 9562	/*
 9563	 * Ensures that if nohz_idle_balance() fails to observe our
 9564	 * @idle_cpus_mask store, it must observe the @has_blocked
 9565	 * store.
 9566	 */
 9567	smp_mb__after_atomic();






 9568
 9569	set_cpu_sd_state_idle(cpu);
 9570
 9571out:
 9572	/*
 9573	 * Each time a cpu enter idle, we assume that it has blocked load and
 9574	 * enable the periodic update of the load of idle cpus
 9575	 */
 9576	WRITE_ONCE(nohz.has_blocked, 1);

 9577}
 9578
 9579/*
 9580 * Internal function that runs load balance for all idle cpus. The load balance
 9581 * can be a simple update of blocked load or a complete load balance with
 9582 * tasks movement depending of flags.
 9583 * The function returns false if the loop has stopped before running
 9584 * through all idle CPUs.
 9585 */
 9586static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
 9587			       enum cpu_idle_type idle)
 9588{
 
 
 
 
 9589	/* Earliest time when we have to do rebalance again */
 9590	unsigned long now = jiffies;
 9591	unsigned long next_balance = now + 60*HZ;
 9592	bool has_blocked_load = false;
 9593	int update_next_balance = 0;
 9594	int this_cpu = this_rq->cpu;
 9595	int balance_cpu;
 9596	int ret = false;
 9597	struct rq *rq;
 9598
 9599	SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
 9600
 9601	/*
 9602	 * We assume there will be no idle load after this update and clear
 9603	 * the has_blocked flag. If a cpu enters idle in the mean time, it will
 9604	 * set the has_blocked flag and trig another update of idle load.
 9605	 * Because a cpu that becomes idle, is added to idle_cpus_mask before
 9606	 * setting the flag, we are sure to not clear the state and not
 9607	 * check the load of an idle cpu.
 9608	 */
 9609	WRITE_ONCE(nohz.has_blocked, 0);
 9610
 9611	/*
 9612	 * Ensures that if we miss the CPU, we must see the has_blocked
 9613	 * store from nohz_balance_enter_idle().
 9614	 */
 9615	smp_mb();
 9616
 9617	for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
 9618		if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
 9619			continue;
 9620
 9621		/*
 9622		 * If this CPU gets work to do, stop the load balancing
 9623		 * work being done for other CPUs. Next load
 9624		 * balancing owner will pick it up.
 9625		 */
 9626		if (need_resched()) {
 9627			has_blocked_load = true;
 9628			goto abort;
 9629		}
 9630
 9631		rq = cpu_rq(balance_cpu);
 9632
 9633		has_blocked_load |= update_nohz_stats(rq, true);
 9634
 9635		/*
 9636		 * If time for next balance is due,
 9637		 * do the balance.
 9638		 */
 9639		if (time_after_eq(jiffies, rq->next_balance)) {
 9640			struct rq_flags rf;
 9641
 9642			rq_lock_irqsave(rq, &rf);
 9643			update_rq_clock(rq);
 9644			rq_unlock_irqrestore(rq, &rf);
 9645
 9646			if (flags & NOHZ_BALANCE_KICK)
 9647				rebalance_domains(rq, CPU_IDLE);
 
 9648		}
 9649
 9650		if (time_after(next_balance, rq->next_balance)) {
 9651			next_balance = rq->next_balance;
 
 
 
 
 
 
 
 
 
 
 
 
 
 9652			update_next_balance = 1;
 9653		}
 9654	}
 9655
 9656	/* Newly idle CPU doesn't need an update */
 9657	if (idle != CPU_NEWLY_IDLE) {
 9658		update_blocked_averages(this_cpu);
 9659		has_blocked_load |= this_rq->has_blocked_load;
 
 
 
 9660	}
 9661
 9662	if (flags & NOHZ_BALANCE_KICK)
 9663		rebalance_domains(this_rq, CPU_IDLE);
 9664
 9665	WRITE_ONCE(nohz.next_blocked,
 9666		now + msecs_to_jiffies(LOAD_AVG_PERIOD));
 9667
 9668	/* The full idle balance loop has been done */
 9669	ret = true;
 9670
 9671abort:
 9672	/* There is still blocked load, enable periodic update */
 9673	if (has_blocked_load)
 9674		WRITE_ONCE(nohz.has_blocked, 1);
 9675
 9676	/*
 9677	 * next_balance will be updated only when there is a need.
 9678	 * When the CPU is attached to null domain for ex, it will not be
 9679	 * updated.
 9680	 */
 9681	if (likely(update_next_balance))
 9682		nohz.next_balance = next_balance;
 9683
 9684	return ret;
 9685}
 9686
 
 9687/*
 9688 * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
 9689 * rebalancing for all the cpus for whom scheduler ticks are stopped.
 9690 */
 9691static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
 9692{
 9693	int this_cpu = this_rq->cpu;
 9694	unsigned int flags;
 9695
 9696	if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_KICK_MASK))
 9697		return false;
 9698
 9699	if (idle != CPU_IDLE) {
 9700		atomic_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
 9701		return false;
 9702	}
 9703
 9704	/* could be _relaxed() */
 9705	flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
 9706	if (!(flags & NOHZ_KICK_MASK))
 9707		return false;
 9708
 9709	_nohz_idle_balance(this_rq, flags, idle);
 9710
 9711	return true;
 9712}
 9713
 9714static void nohz_newidle_balance(struct rq *this_rq)
 9715{
 9716	int this_cpu = this_rq->cpu;
 
 
 9717
 9718	/*
 9719	 * This CPU doesn't want to be disturbed by scheduler
 9720	 * housekeeping
 9721	 */
 9722	if (!housekeeping_cpu(this_cpu, HK_FLAG_SCHED))
 9723		return;
 9724
 9725	/* Will wake up very soon. No time for doing anything else*/
 9726	if (this_rq->avg_idle < sysctl_sched_migration_cost)
 9727		return;
 9728
 9729	/* Don't need to update blocked load of idle CPUs*/
 9730	if (!READ_ONCE(nohz.has_blocked) ||
 9731	    time_before(jiffies, READ_ONCE(nohz.next_blocked)))
 9732		return;
 
 
 
 9733
 9734	raw_spin_unlock(&this_rq->lock);
 9735	/*
 9736	 * This CPU is going to be idle and blocked load of idle CPUs
 9737	 * need to be updated. Run the ilb locally as it is a good
 9738	 * candidate for ilb instead of waking up another idle CPU.
 9739	 * Kick an normal ilb if we failed to do the update.
 9740	 */
 9741	if (!_nohz_idle_balance(this_rq, NOHZ_STATS_KICK, CPU_NEWLY_IDLE))
 9742		kick_ilb(NOHZ_STATS_KICK);
 9743	raw_spin_lock(&this_rq->lock);
 9744}
 9745
 9746#else /* !CONFIG_NO_HZ_COMMON */
 /* !CONFIG_NO_HZ_COMMON: there is no idle load balancer to kick. */
static inline void nohz_balancer_kick(struct rq *rq)
{
}
 9748
 9749static inline bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
 9750{
 9751	return false;
 
 
 
 
 9752}
 9753
 /* !CONFIG_NO_HZ_COMMON: nothing to refresh on the newly-idle path. */
static inline void nohz_newidle_balance(struct rq *this_rq)
{
}
 9755#endif /* CONFIG_NO_HZ_COMMON */
 9756
 9757/*
 9758 * newidle_balance is called by schedule() if this_cpu is about to become
 9759 * idle. Attempts to pull tasks from other CPUs.
 *
 * @this_rq: runqueue of the CPU about to go idle.
 * @rf: lock pin cookie of @this_rq; unpinned while balancing and repinned
 *      before returning on every path past the cpu_active() check.
 *
 * Returns:
 *   < 0 - this_rq->nr_running differs from this_rq->cfs.h_nr_running on
 *         exit (see "task of a high priority class" below)
 *     0 - nothing was pulled, or the CPU is not active
 *   > 0 - load_balance() moved something here, or CFS tasks showed up on
 *         this runqueue in the meantime
 9760 */
 9761int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
 9762{
 9763	unsigned long next_balance = jiffies + HZ;
 9764	int this_cpu = this_rq->cpu;
 9765	struct sched_domain *sd;
 9766	int pulled_task = 0;
 9767	u64 curr_cost = 0;
 9768
 9769	update_misfit_status(NULL, this_rq);
 9770	/*
 9771	 * We must set idle_stamp _before_ calling newidle_balance(), such that
 9772	 * we measure the duration of newidle_balance() as idle time.
 9773	 */
 9774	this_rq->idle_stamp = rq_clock(this_rq);
 9775
 9776	/*
 9777	 * Do not pull tasks towards !active CPUs...
 9778	 */
 9779	if (!cpu_active(this_cpu))
 9780		return 0;
 9781







 9782	/*
 9783	 * This is OK, because current is on_cpu, which avoids it being picked
 9784	 * for load-balance and preemption/IRQs are still disabled avoiding
 9785	 * further scheduler activity on it and we're being very careful to
 9786	 * re-start the picking loop.
 9787	 */
 9788	rq_unpin_lock(this_rq, rf);
 9789
	/*
	 * Not worth balancing: the expected idle time is too short or the
	 * root domain is not overloaded. Only refresh the next balance
	 * deadline and the blocked load of idle CPUs.
	 */
 9790	if (this_rq->avg_idle < sysctl_sched_migration_cost ||
 9791	    !READ_ONCE(this_rq->rd->overload)) {
 9792
 9793		rcu_read_lock();
 9794		sd = rcu_dereference_check_sched_domain(this_rq->sd);
 9795		if (sd)
 9796			update_next_balance(sd, &next_balance);
 9797		rcu_read_unlock();
 9798
 9799		nohz_newidle_balance(this_rq);
 9800
 9801		goto out;
 9802	}
 9803
	/* The domain walk below runs without this_rq->lock held. */
 9804	raw_spin_unlock(&this_rq->lock);

 9805
 9806	update_blocked_averages(this_cpu);
 9807	rcu_read_lock();
 9808	for_each_domain(this_cpu, sd) {
 9809		int continue_balancing = 1;
 9810		u64 t0, domain_cost;
 9811
 9812		if (!(sd->flags & SD_LOAD_BALANCE))
 9813			continue;
 9814
		/*
		 * Stop once the balancing cost spent so far plus this domain's
		 * worst-case cost exceeds the expected idle time.
		 */
 9815		if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
 9816			update_next_balance(sd, &next_balance);
 9817			break;
 9818		}
 9819
 9820		if (sd->flags & SD_BALANCE_NEWIDLE) {
 9821			t0 = sched_clock_cpu(this_cpu);
 9822
 9823			pulled_task = load_balance(this_cpu, this_rq,
 9824						   sd, CPU_NEWLY_IDLE,
 9825						   &continue_balancing);
 9826
			/* Track the worst-case cost of balancing this domain. */
 9827			domain_cost = sched_clock_cpu(this_cpu) - t0;
 9828			if (domain_cost > sd->max_newidle_lb_cost)
 9829				sd->max_newidle_lb_cost = domain_cost;
 9830
 9831			curr_cost += domain_cost;
 9832		}
 9833
 9834		update_next_balance(sd, &next_balance);
 9835
 9836		/*
 9837		 * Stop searching for tasks to pull if there are
 9838		 * now runnable tasks on this rq.
 9839		 */
 9840		if (pulled_task || this_rq->nr_running > 0)
 9841			break;
 9842	}
 9843	rcu_read_unlock();

 9844
 9845	raw_spin_lock(&this_rq->lock);
 9846
 9847	if (curr_cost > this_rq->max_idle_balance_cost)
 9848		this_rq->max_idle_balance_cost = curr_cost;
 9849
 9850out:
 9851	/*
 9852	 * While browsing the domains, we released the rq lock, a task could
 9853	 * have been enqueued in the meantime. Since we're not going idle,
 9854	 * pretend we pulled a task.
 9855	 */
 9856	if (this_rq->cfs.h_nr_running && !pulled_task)
 9857		pulled_task = 1;
 9858
 9859	/* Move the next balance forward */
 9860	if (time_after(this_rq->next_balance, next_balance))
 9861		this_rq->next_balance = next_balance;
 9862
 9863	/* Is there a task of a high priority class? */
 9864	if (this_rq->nr_running != this_rq->cfs.h_nr_running)
 9865		pulled_task = -1;
 9866
	/* We are not going idle after all: discard the idle time stamp. */
 9867	if (pulled_task)
 9868		this_rq->idle_stamp = 0;
 9869
 9870	rq_repin_lock(this_rq, rf);
 9871
 9872	return pulled_task;
 9873}
 
 
 
 9874
 9875/*
 9876 * run_rebalance_domains is triggered when needed from the scheduler tick.
 9877 * Also triggered for nohz idle balancing (with nohz_balancing_kick set).
 9878 */
 9879static __latent_entropy void run_rebalance_domains(struct softirq_action *h)
 9880{
 9881	struct rq *this_rq = this_rq();
 
 9882	enum cpu_idle_type idle = this_rq->idle_balance ?
 9883						CPU_IDLE : CPU_NOT_IDLE;
 9884
 
 
 9885	/*
 9886	 * If this CPU has a pending nohz_balance_kick, then do the
 9887	 * balancing on behalf of the other idle CPUs whose ticks are
 9888	 * stopped. Do nohz_idle_balance *before* rebalance_domains to
 9889	 * give the idle CPUs a chance to load balance. Else we may
 9890	 * load balance only within the local sched_domain hierarchy
 9891	 * and abort nohz_idle_balance altogether if we pull some load.
 9892	 */
 9893	if (nohz_idle_balance(this_rq, idle))
 9894		return;
 9895
 9896	/* normal load balance */
 9897	update_blocked_averages(this_rq->cpu);
 9898	rebalance_domains(this_rq, idle);
 9899}
 9900
 9901/*
 9902 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
 9903 */
 9904void trigger_load_balance(struct rq *rq)
 9905{
 9906	/* Don't need to rebalance while attached to NULL domain */
 9907	if (unlikely(on_null_domain(rq)))
 9908		return;
 9909
 9910	if (time_after_eq(jiffies, rq->next_balance))
 9911		raise_softirq(SCHED_SOFTIRQ);
 9912
 9913	nohz_balancer_kick(rq);
 
 
 9914}
 9915
 9916static void rq_online_fair(struct rq *rq)
 9917{
 9918	update_sysctl();
 9919
 9920	update_runtime_enabled(rq);
 9921}
 9922
 9923static void rq_offline_fair(struct rq *rq)
 9924{
 9925	update_sysctl();
 9926
 9927	/* Ensure any throttled groups are reachable by pick_next_task */
 9928	unthrottle_offline_cfs_rqs(rq);
 9929}
 9930
 9931#endif /* CONFIG_SMP */
 9932
 9933/*
 9934 * scheduler tick hitting a task of our scheduling class.
 9935 *
 9936 * NOTE: This function can be called remotely by the tick offload that
 9937 * goes along full dynticks. Therefore no local assumption can be made
 9938 * and everything must be accessed through the @rq and @curr passed in
 9939 * parameters.
 9940 */
 9941static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
 9942{
 9943	struct cfs_rq *cfs_rq;
 9944	struct sched_entity *se = &curr->se;
 9945
 9946	for_each_sched_entity(se) {
 9947		cfs_rq = cfs_rq_of(se);
 9948		entity_tick(cfs_rq, se, queued);
 9949	}
 9950
 9951	if (static_branch_unlikely(&sched_numa_balancing))
 9952		task_tick_numa(rq, curr);
 9953
 9954	update_misfit_status(curr, rq);
 9955	update_overutilized_status(task_rq(curr));
 9956}
 9957
 9958/*
 9959 * called on fork with the child task as argument from the parent's context
 9960 *  - child not yet on the tasklist
 9961 *  - preemption disabled
 9962 */
 9963static void task_fork_fair(struct task_struct *p)
 9964{
 9965	struct cfs_rq *cfs_rq;
 9966	struct sched_entity *se = &p->se, *curr;
 
 9967	struct rq *rq = this_rq();
 9968	struct rq_flags rf;
 
 
 9969
 9970	rq_lock(rq, &rf);
 9971	update_rq_clock(rq);
 9972
 9973	cfs_rq = task_cfs_rq(current);
 9974	curr = cfs_rq->curr;
 9975	if (curr) {
 9976		update_curr(cfs_rq);
 9977		se->vruntime = curr->vruntime;
 
 
 9978	}
 
 
 
 
 
 9979	place_entity(cfs_rq, se, 1);
 9980
 9981	if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
 9982		/*
 9983		 * Upon rescheduling, sched_class::put_prev_task() will place
 9984		 * 'current' within the tree based on its new key value.
 9985		 */
 9986		swap(curr->vruntime, se->vruntime);
 9987		resched_curr(rq);
 9988	}
 9989
 9990	se->vruntime -= cfs_rq->min_vruntime;
 9991	rq_unlock(rq, &rf);
 
 9992}
 9993
 9994/*
 9995 * Priority of the task has changed. Check to see if we preempt
 9996 * the current task.
 9997 */
 9998static void
 9999prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
10000{
10001	if (!task_on_rq_queued(p))
10002		return;
10003
10004	/*
10005	 * Reschedule if we are currently running on this runqueue and
10006	 * our priority decreased, or if we are not currently running on
10007	 * this runqueue and our priority is higher than the current's
10008	 */
10009	if (rq->curr == p) {
10010		if (p->prio > oldprio)
10011			resched_curr(rq);
10012	} else
10013		check_preempt_curr(rq, p, 0);
10014}
10015
10016static inline bool vruntime_normalized(struct task_struct *p)
10017{
10018	struct sched_entity *se = &p->se;
10019
10020	/*
10021	 * In both the TASK_ON_RQ_QUEUED and TASK_ON_RQ_MIGRATING cases,
10022	 * the dequeue_entity(.flags=0) will already have normalized the
10023	 * vruntime.
10024	 */
10025	if (p->on_rq)
10026		return true;
10027
10028	/*
10029	 * When !on_rq, vruntime of the task has usually NOT been normalized.
10030	 * But there are some cases where it has already been normalized:
10031	 *
10032	 * - A forked child which is waiting for being woken up by
10033	 *   wake_up_new_task().
10034	 * - A task which has been woken up by try_to_wake_up() and
10035	 *   waiting for actually being woken up by sched_ttwu_pending().
10036	 */
10037	if (!se->sum_exec_runtime ||
10038	    (p->state == TASK_WAKING && p->sched_remote_wakeup))
10039		return true;
10040
10041	return false;
10042}
10043
#ifdef CONFIG_FAIR_GROUP_SCHED
/*
 * Propagate the changes of the sched_entity across the tg tree so that
 * they become visible to the root. The walk stops at the first throttled
 * cfs_rq.
 */
static void propagate_entity_cfs_rq(struct sched_entity *se)
{
	/* Propagation starts at the parent, not at @se itself */
	se = se->parent;

	for_each_sched_entity(se) {
		struct cfs_rq *level_rq = cfs_rq_of(se);

		if (cfs_rq_throttled(level_rq))
			break;

		update_load_avg(level_rq, se, UPDATE_TG);
	}
}
#else
/* Without group scheduling there is no hierarchy to propagate through. */
static void propagate_entity_cfs_rq(struct sched_entity *se)
{
}
#endif
10068
10069static void detach_entity_cfs_rq(struct sched_entity *se)
10070{
10071	struct cfs_rq *cfs_rq = cfs_rq_of(se);
10072
10073	/* Catch up with the cfs_rq and remove our load when we leave */
10074	update_load_avg(cfs_rq, se, 0);
10075	detach_entity_load_avg(cfs_rq, se);
10076	update_tg_load_avg(cfs_rq, false);
10077	propagate_entity_cfs_rq(se);
10078}
10079
10080static void attach_entity_cfs_rq(struct sched_entity *se)
10081{
10082	struct cfs_rq *cfs_rq = cfs_rq_of(se);
10083
10084#ifdef CONFIG_FAIR_GROUP_SCHED
10085	/*
10086	 * Since the real-depth could have been changed (only FAIR
10087	 * class maintain depth value), reset depth properly.
 
 
 
 
 
10088	 */
10089	se->depth = se->parent ? se->parent->depth + 1 : 0;
10090#endif
10091
10092	/* Synchronize entity with its cfs_rq */
10093	update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
10094	attach_entity_load_avg(cfs_rq, se, 0);
10095	update_tg_load_avg(cfs_rq, false);
10096	propagate_entity_cfs_rq(se);
10097}
10098
10099static void detach_task_cfs_rq(struct task_struct *p)
10100{
10101	struct sched_entity *se = &p->se;
10102	struct cfs_rq *cfs_rq = cfs_rq_of(se);
10103
10104	if (!vruntime_normalized(p)) {
10105		/*
10106		 * Fix up our vruntime so that the current sleep doesn't
10107		 * cause 'unlimited' sleep bonus.
10108		 */
10109		place_entity(cfs_rq, se, 0);
10110		se->vruntime -= cfs_rq->min_vruntime;
10111	}
10112
10113	detach_entity_cfs_rq(se);
10114}
10115
10116static void attach_task_cfs_rq(struct task_struct *p)
10117{
10118	struct sched_entity *se = &p->se;
10119	struct cfs_rq *cfs_rq = cfs_rq_of(se);
10120
10121	attach_entity_cfs_rq(se);
10122
10123	if (!vruntime_normalized(p))
10124		se->vruntime += cfs_rq->min_vruntime;
10125}
10126
/* sched_class::switched_from hook: detach @p from its cfs_rq. */
static void switched_from_fair(struct rq *rq, struct task_struct *p)
{
	detach_task_cfs_rq(p);
}
10131
 
 
 
10132static void switched_to_fair(struct rq *rq, struct task_struct *p)
10133{
10134	attach_task_cfs_rq(p);
 
10135
10136	if (task_on_rq_queued(p)) {
10137		/*
10138		 * We were most likely switched from sched_rt, so
10139		 * kick off the schedule if running, otherwise just see
10140		 * if we can still preempt the current task.
10141		 */
10142		if (rq->curr == p)
10143			resched_curr(rq);
10144		else
10145			check_preempt_curr(rq, p, 0);
10146	}
10147}
10148
10149/* Account for a task changing its policy or group.
10150 *
10151 * This routine is mostly called to set cfs_rq->curr field when a task
10152 * migrates between groups/classes.
10153 */
10154static void set_next_task_fair(struct rq *rq, struct task_struct *p)
10155{
10156	struct sched_entity *se = &p->se;
10157
10158#ifdef CONFIG_SMP
10159	if (task_on_rq_queued(p)) {
10160		/*
10161		 * Move the next running task to the front of the list, so our
10162		 * cfs_tasks list becomes MRU one.
10163		 */
10164		list_move(&se->group_node, &rq->cfs_tasks);
10165	}
10166#endif
10167
10168	for_each_sched_entity(se) {
10169		struct cfs_rq *cfs_rq = cfs_rq_of(se);
10170
10171		set_next_entity(cfs_rq, se);
10172		/* ensure bandwidth has been allocated on our new cfs_rq */
10173		account_cfs_rq_runtime(cfs_rq, 0);
10174	}
10175}
10176
10177void init_cfs_rq(struct cfs_rq *cfs_rq)
10178{
10179	cfs_rq->tasks_timeline = RB_ROOT_CACHED;
10180	cfs_rq->min_vruntime = (u64)(-(1LL << 20));
10181#ifndef CONFIG_64BIT
10182	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
10183#endif
10184#ifdef CONFIG_SMP
10185	raw_spin_lock_init(&cfs_rq->removed.lock);
10186#endif
10187}
10188
10189#ifdef CONFIG_FAIR_GROUP_SCHED
10190static void task_set_group_fair(struct task_struct *p)
10191{
10192	struct sched_entity *se = &p->se;
10193
10194	set_task_rq(p, task_cpu(p));
10195	se->depth = se->parent ? se->parent->depth + 1 : 0;
10196}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10197
/*
 * TASK_MOVE_GROUP handler: detach @p from its old cfs_rq, rebind it, and
 * attach it to the new one.
 */
static void task_move_group_fair(struct task_struct *p)
{
	detach_task_cfs_rq(p);
	set_task_rq(p, task_cpu(p));

#ifdef CONFIG_SMP
	/* A zero last_update_time marks se's cfs_rq as changed -- migrated */
	p->se.avg.last_update_time = 0;
#endif

	attach_task_cfs_rq(p);
}
10209
10210static void task_change_group_fair(struct task_struct *p, int type)
10211{
10212	switch (type) {
10213	case TASK_SET_GROUP:
10214		task_set_group_fair(p);
10215		break;
10216
10217	case TASK_MOVE_GROUP:
10218		task_move_group_fair(p);
10219		break;
10220	}
10221}
10222
10223void free_fair_sched_group(struct task_group *tg)
10224{
10225	int i;
10226
10227	destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
10228
10229	for_each_possible_cpu(i) {
10230		if (tg->cfs_rq)
10231			kfree(tg->cfs_rq[i]);
10232		if (tg->se)
10233			kfree(tg->se[i]);
10234	}
10235
10236	kfree(tg->cfs_rq);
10237	kfree(tg->se);
10238}
10239
10240int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
10241{
10242	struct sched_entity *se;
10243	struct cfs_rq *cfs_rq;
 
10244	int i;
10245
10246	tg->cfs_rq = kcalloc(nr_cpu_ids, sizeof(cfs_rq), GFP_KERNEL);
10247	if (!tg->cfs_rq)
10248		goto err;
10249	tg->se = kcalloc(nr_cpu_ids, sizeof(se), GFP_KERNEL);
10250	if (!tg->se)
10251		goto err;
10252
10253	tg->shares = NICE_0_LOAD;
10254
10255	init_cfs_bandwidth(tg_cfs_bandwidth(tg));
10256
10257	for_each_possible_cpu(i) {
10258		cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
10259				      GFP_KERNEL, cpu_to_node(i));
10260		if (!cfs_rq)
10261			goto err;
10262
10263		se = kzalloc_node(sizeof(struct sched_entity),
10264				  GFP_KERNEL, cpu_to_node(i));
10265		if (!se)
10266			goto err_free_rq;
10267
10268		init_cfs_rq(cfs_rq);
10269		init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
10270		init_entity_runnable_average(se);
10271	}
10272
10273	return 1;
10274
10275err_free_rq:
10276	kfree(cfs_rq);
10277err:
10278	return 0;
10279}
10280
10281void online_fair_sched_group(struct task_group *tg)
10282{
10283	struct sched_entity *se;
10284	struct rq_flags rf;
10285	struct rq *rq;
10286	int i;
10287
10288	for_each_possible_cpu(i) {
10289		rq = cpu_rq(i);
10290		se = tg->se[i];
10291		rq_lock_irq(rq, &rf);
10292		update_rq_clock(rq);
10293		attach_entity_cfs_rq(se);
10294		sync_throttle(tg, i);
10295		rq_unlock_irq(rq, &rf);
10296	}
10297}
10298
10299void unregister_fair_sched_group(struct task_group *tg)
10300{
 
10301	unsigned long flags;
10302	struct rq *rq;
10303	int cpu;
10304
10305	for_each_possible_cpu(cpu) {
10306		if (tg->se[cpu])
10307			remove_entity_load_avg(tg->se[cpu]);
10308
10309		/*
10310		 * Only empty task groups can be destroyed; so we can speculatively
10311		 * check on_list without danger of it being re-added.
10312		 */
10313		if (!tg->cfs_rq[cpu]->on_list)
10314			continue;
10315
10316		rq = cpu_rq(cpu);
10317
10318		raw_spin_lock_irqsave(&rq->lock, flags);
10319		list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
10320		raw_spin_unlock_irqrestore(&rq->lock, flags);
10321	}
10322}
10323
10324void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
10325			struct sched_entity *se, int cpu,
10326			struct sched_entity *parent)
10327{
10328	struct rq *rq = cpu_rq(cpu);
10329
10330	cfs_rq->tg = tg;
10331	cfs_rq->rq = rq;
 
 
 
 
10332	init_cfs_rq_runtime(cfs_rq);
10333
10334	tg->cfs_rq[cpu] = cfs_rq;
10335	tg->se[cpu] = se;
10336
10337	/* se could be NULL for root_task_group */
10338	if (!se)
10339		return;
10340
10341	if (!parent) {
10342		se->cfs_rq = &rq->cfs;
10343		se->depth = 0;
10344	} else {
10345		se->cfs_rq = parent->my_q;
10346		se->depth = parent->depth + 1;
10347	}
10348
10349	se->my_q = cfs_rq;
10350	/* guarantee group entities always have weight */
10351	update_load_set(&se->load, NICE_0_LOAD);
10352	se->parent = parent;
10353}
10354
10355static DEFINE_MUTEX(shares_mutex);
10356
10357int sched_group_set_shares(struct task_group *tg, unsigned long shares)
10358{
10359	int i;
 
10360
10361	/*
10362	 * We can't change the weight of the root cgroup.
10363	 */
10364	if (!tg->se[0])
10365		return -EINVAL;
10366
10367	shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
10368
10369	mutex_lock(&shares_mutex);
10370	if (tg->shares == shares)
10371		goto done;
10372
10373	tg->shares = shares;
10374	for_each_possible_cpu(i) {
10375		struct rq *rq = cpu_rq(i);
10376		struct sched_entity *se = tg->se[i];
10377		struct rq_flags rf;
10378
 
10379		/* Propagate contribution to hierarchy */
10380		rq_lock_irqsave(rq, &rf);
10381		update_rq_clock(rq);
10382		for_each_sched_entity(se) {
10383			update_load_avg(cfs_rq_of(se), se, UPDATE_TG);
10384			update_cfs_group(se);
10385		}
10386		rq_unlock_irqrestore(rq, &rf);
10387	}
10388
10389done:
10390	mutex_unlock(&shares_mutex);
10391	return 0;
10392}
10393#else /* CONFIG_FAIR_GROUP_SCHED */
10394
/* !CONFIG_FAIR_GROUP_SCHED: there is no per-group CFS state to free. */
void free_fair_sched_group(struct task_group *tg)
{
}

/* !CONFIG_FAIR_GROUP_SCHED: nothing to allocate, always reports success. */
int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
{
	return 1;
}

/* !CONFIG_FAIR_GROUP_SCHED: nothing to bring online. */
void online_fair_sched_group(struct task_group *tg)
{
}

/* !CONFIG_FAIR_GROUP_SCHED: nothing to unregister. */
void unregister_fair_sched_group(struct task_group *tg)
{
}
10405
10406#endif /* CONFIG_FAIR_GROUP_SCHED */
10407
10408
10409static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
10410{
10411	struct sched_entity *se = &task->se;
10412	unsigned int rr_interval = 0;
10413
10414	/*
10415	 * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
10416	 * idle runqueue:
10417	 */
10418	if (rq->cfs.load.weight)
10419		rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));
10420
10421	return rr_interval;
10422}
10423
10424/*
10425 * All the scheduling class methods:
10426 */
10427const struct sched_class fair_sched_class = {
10428	.next			= &idle_sched_class,
10429	.enqueue_task		= enqueue_task_fair,
10430	.dequeue_task		= dequeue_task_fair,
10431	.yield_task		= yield_task_fair,
10432	.yield_to_task		= yield_to_task_fair,
10433
10434	.check_preempt_curr	= check_preempt_wakeup,
10435
10436	.pick_next_task		= pick_next_task_fair,
10437	.put_prev_task		= put_prev_task_fair,
10438	.set_next_task          = set_next_task_fair,
10439
10440#ifdef CONFIG_SMP
10441	.balance		= balance_fair,
10442	.select_task_rq		= select_task_rq_fair,
10443	.migrate_task_rq	= migrate_task_rq_fair,
10444
10445	.rq_online		= rq_online_fair,
10446	.rq_offline		= rq_offline_fair,
10447
10448	.task_dead		= task_dead_fair,
10449	.set_cpus_allowed	= set_cpus_allowed_common,
10450#endif
10451
 
10452	.task_tick		= task_tick_fair,
10453	.task_fork		= task_fork_fair,
10454
10455	.prio_changed		= prio_changed_fair,
10456	.switched_from		= switched_from_fair,
10457	.switched_to		= switched_to_fair,
10458
10459	.get_rr_interval	= get_rr_interval_fair,
10460
10461	.update_curr		= update_curr_fair,
10462
10463#ifdef CONFIG_FAIR_GROUP_SCHED
10464	.task_change_group	= task_change_group_fair,
10465#endif
10466
10467#ifdef CONFIG_UCLAMP_TASK
10468	.uclamp_enabled		= 1,
10469#endif
10470};
10471
10472#ifdef CONFIG_SCHED_DEBUG
10473void print_cfs_stats(struct seq_file *m, int cpu)
10474{
10475	struct cfs_rq *cfs_rq, *pos;
10476
10477	rcu_read_lock();
10478	for_each_leaf_cfs_rq_safe(cpu_rq(cpu), cfs_rq, pos)
10479		print_cfs_rq(m, cpu, cfs_rq);
10480	rcu_read_unlock();
10481}
10482
10483#ifdef CONFIG_NUMA_BALANCING
10484void show_numa_stats(struct task_struct *p, struct seq_file *m)
10485{
10486	int node;
10487	unsigned long tsf = 0, tpf = 0, gsf = 0, gpf = 0;
10488	struct numa_group *ng;
10489
10490	rcu_read_lock();
10491	ng = rcu_dereference(p->numa_group);
10492	for_each_online_node(node) {
10493		if (p->numa_faults) {
10494			tsf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 0)];
10495			tpf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 1)];
10496		}
10497		if (ng) {
10498			gsf = ng->faults[task_faults_idx(NUMA_MEM, node, 0)],
10499			gpf = ng->faults[task_faults_idx(NUMA_MEM, node, 1)];
10500		}
10501		print_numa_stats(m, node, tsf, tpf, gsf, gpf);
10502	}
10503	rcu_read_unlock();
10504}
10505#endif /* CONFIG_NUMA_BALANCING */
10506#endif /* CONFIG_SCHED_DEBUG */
10507
10508__init void init_sched_fair_class(void)
10509{
10510#ifdef CONFIG_SMP
10511	open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
10512
10513#ifdef CONFIG_NO_HZ_COMMON
10514	nohz.next_balance = jiffies;
10515	nohz.next_blocked = jiffies;
10516	zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
 
10517#endif
10518#endif /* SMP */
10519
10520}
10521
10522/*
10523 * Helper functions to facilitate extracting info from tracepoints.
10524 */
10525
10526const struct sched_avg *sched_trace_cfs_rq_avg(struct cfs_rq *cfs_rq)
10527{
10528#ifdef CONFIG_SMP
10529	return cfs_rq ? &cfs_rq->avg : NULL;
10530#else
10531	return NULL;
10532#endif
10533}
10534EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_avg);
10535
10536char *sched_trace_cfs_rq_path(struct cfs_rq *cfs_rq, char *str, int len)
10537{
10538	if (!cfs_rq) {
10539		if (str)
10540			strlcpy(str, "(null)", len);
10541		else
10542			return NULL;
10543	}
10544
10545	cfs_rq_tg_path(cfs_rq, str, len);
10546	return str;
10547}
10548EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_path);
10549
/* Tracepoint helper: CPU of the runqueue owning @cfs_rq, or -1 for NULL. */
int sched_trace_cfs_rq_cpu(struct cfs_rq *cfs_rq)
{
	if (!cfs_rq)
		return -1;

	return cpu_of(rq_of(cfs_rq));
}
10554EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_cpu);
10555
10556const struct sched_avg *sched_trace_rq_avg_rt(struct rq *rq)
10557{
10558#ifdef CONFIG_SMP
10559	return rq ? &rq->avg_rt : NULL;
10560#else
10561	return NULL;
10562#endif
10563}
10564EXPORT_SYMBOL_GPL(sched_trace_rq_avg_rt);
10565
10566const struct sched_avg *sched_trace_rq_avg_dl(struct rq *rq)
10567{
10568#ifdef CONFIG_SMP
10569	return rq ? &rq->avg_dl : NULL;
10570#else
10571	return NULL;
10572#endif
10573}
10574EXPORT_SYMBOL_GPL(sched_trace_rq_avg_dl);
10575
10576const struct sched_avg *sched_trace_rq_avg_irq(struct rq *rq)
10577{
10578#if defined(CONFIG_SMP) && defined(CONFIG_HAVE_SCHED_AVG_IRQ)
10579	return rq ? &rq->avg_irq : NULL;
10580#else
10581	return NULL;
10582#endif
10583}
10584EXPORT_SYMBOL_GPL(sched_trace_rq_avg_irq);
10585
/* Tracepoint helper: CPU of @rq, or -1 when @rq is NULL. */
int sched_trace_rq_cpu(struct rq *rq)
{
	if (!rq)
		return -1;

	return cpu_of(rq);
}
10590EXPORT_SYMBOL_GPL(sched_trace_rq_cpu);
10591
10592const struct cpumask *sched_trace_rd_span(struct root_domain *rd)
10593{
10594#ifdef CONFIG_SMP
10595	return rd ? rd->span : NULL;
10596#else
10597	return NULL;
10598#endif
10599}
10600EXPORT_SYMBOL_GPL(sched_trace_rd_span);

 
   1/*
   2 * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
   3 *
   4 *  Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
   5 *
   6 *  Interactivity improvements by Mike Galbraith
   7 *  (C) 2007 Mike Galbraith <efault@gmx.de>
   8 *
   9 *  Various enhancements by Dmitry Adamushko.
  10 *  (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
  11 *
  12 *  Group scheduling enhancements by Srivatsa Vaddagiri
  13 *  Copyright IBM Corporation, 2007
  14 *  Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
  15 *
  16 *  Scaled math optimizations by Thomas Gleixner
  17 *  Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
  18 *
  19 *  Adaptive scheduling granularity, math enhancements by Peter Zijlstra
  20 *  Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
  21 */
  22
  23#include <linux/latencytop.h>
  24#include <linux/sched.h>
  25#include <linux/cpumask.h>
  26#include <linux/slab.h>
  27#include <linux/profile.h>
  28#include <linux/interrupt.h>
  29
  30#include <trace/events/sched.h>
  31
  32#include "sched.h"
  33
  34/*
  35 * Targeted preemption latency for CPU-bound tasks:
  36 * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
  37 *
  38 * NOTE: this latency value is not the same as the concept of
  39 * 'timeslice length' - timeslices in CFS are of variable length
  40 * and have no persistent notion like in traditional, time-slice
  41 * based scheduling concepts.
  42 *
  43 * (to see the precise effective timeslice length of your workload,
  44 *  run vmstat and monitor the context-switches (cs) field)
 
 
  45 */
  46unsigned int sysctl_sched_latency = 6000000ULL;
  47unsigned int normalized_sysctl_sched_latency = 6000000ULL;
  48
  49/*
  50 * The initial- and re-scaling of tunables is configurable
  51 * (default: SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)))
  52 *
  53 * Options are:
  54 * SCHED_TUNABLESCALING_NONE - unscaled, always *1
  55 * SCHED_TUNABLESCALING_LOG - scaled logarithmically, *1+ilog(ncpus)
  56 * SCHED_TUNABLESCALING_LINEAR - scaled linearly, *ncpus
 
 
 
  57 */
  58enum sched_tunable_scaling sysctl_sched_tunable_scaling
  59	= SCHED_TUNABLESCALING_LOG;
  60
  61/*
  62 * Minimal preemption granularity for CPU-bound tasks:
 
  63 * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
  64 */
  65unsigned int sysctl_sched_min_granularity = 750000ULL;
  66unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
  67
  68/*
  69 * This value is kept at sysctl_sched_latency / sysctl_sched_min_granularity
  70 */
  71static unsigned int sched_nr_latency = 8;
  72
  73/*
  74 * After fork, child runs first. If set to 0 (default) then
  75 * parent will (try to) run first.
  76 */
  77unsigned int sysctl_sched_child_runs_first __read_mostly;
  78
  79/*
  80 * SCHED_OTHER wake-up granularity.
  81 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
  82 *
  83 * This option delays the preemption effects of decoupled workloads
  84 * and reduces their over-scheduling. Synchronous workloads will still
  85 * have immediate wakeup/sleep latencies.
 
 
  86 */
  87unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
  88unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
  89
  90const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
  91
 
  92/*
  93 * The exponential sliding window over which load is averaged for shares
  94 * distribution.
  95 * (default: 10msec)
  96 */
  97unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
 
 
 
 
 
 
 
 
 
 
 
 
  98
  99#ifdef CONFIG_CFS_BANDWIDTH
 100/*
 101 * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
 102 * each time a cfs_rq requests quota.
 103 *
 104 * Note: in the case that the slice exceeds the runtime remaining (either due
 105 * to consumption or the quota being specified to be smaller than the slice)
 106 * we will always only issue the remaining available time.
 107 *
 108 * default: 5 msec, units: microseconds
 109  */
 110unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
 111#endif
 112
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 113/*
 114 * Increase the granularity value when there are more CPUs,
 115 * because with more CPUs the 'effective latency' as visible
 116 * to users decreases. But the relationship is not linear,
 117 * so pick a second-best guess by going with the log2 of the
 118 * number of CPUs.
 119 *
 120 * This idea comes from the SD scheduler of Con Kolivas:
 121 */
 122static int get_update_sysctl_factor(void)
 123{
 124	unsigned int cpus = min_t(int, num_online_cpus(), 8);
 125	unsigned int factor;
 126
 127	switch (sysctl_sched_tunable_scaling) {
 128	case SCHED_TUNABLESCALING_NONE:
 129		factor = 1;
 130		break;
 131	case SCHED_TUNABLESCALING_LINEAR:
 132		factor = cpus;
 133		break;
 134	case SCHED_TUNABLESCALING_LOG:
 135	default:
 136		factor = 1 + ilog2(cpus);
 137		break;
 138	}
 139
 140	return factor;
 141}
 142
 143static void update_sysctl(void)
 144{
 145	unsigned int factor = get_update_sysctl_factor();
 146
 147#define SET_SYSCTL(name) \
 148	(sysctl_##name = (factor) * normalized_sysctl_##name)
 149	SET_SYSCTL(sched_min_granularity);
 150	SET_SYSCTL(sched_latency);
 151	SET_SYSCTL(sched_wakeup_granularity);
 152#undef SET_SYSCTL
 153}
 154
 155void sched_init_granularity(void)
 156{
 157	update_sysctl();
 158}
 159
 160#if BITS_PER_LONG == 32
 161# define WMULT_CONST	(~0UL)
 162#else
 163# define WMULT_CONST	(1UL << 32)
 164#endif
 
 
 
 
 165
 166#define WMULT_SHIFT	32
 167
 168/*
 169 * Shift right and round:
 170 */
 171#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
 
 
 
 172
 173/*
 174 * delta *= weight / lw
 
 
 
 
 
 
 
 
 
 175 */
 176static unsigned long
 177calc_delta_mine(unsigned long delta_exec, unsigned long weight,
 178		struct load_weight *lw)
 179{
 180	u64 tmp;
 
 181
 182	/*
 183	 * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched
 184	 * entities since MIN_SHARES = 2. Treat weight as 1 if less than
 185	 * 2^SCHED_LOAD_RESOLUTION.
 186	 */
 187	if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
 188		tmp = (u64)delta_exec * scale_load_down(weight);
 189	else
 190		tmp = (u64)delta_exec;
 191
 192	if (!lw->inv_weight) {
 193		unsigned long w = scale_load_down(lw->weight);
 194
 195		if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
 196			lw->inv_weight = 1;
 197		else if (unlikely(!w))
 198			lw->inv_weight = WMULT_CONST;
 199		else
 200			lw->inv_weight = WMULT_CONST / w;
 201	}
 202
 203	/*
 204	 * Check whether we'd overflow the 64-bit multiplication:
 205	 */
 206	if (unlikely(tmp > WMULT_CONST))
 207		tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
 208			WMULT_SHIFT/2);
 209	else
 210		tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
 211
 212	return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
 213}
 214
 215
 216const struct sched_class fair_sched_class;
 217
 218/**************************************************************
 219 * CFS operations on generic schedulable entities:
 220 */
 221
 222#ifdef CONFIG_FAIR_GROUP_SCHED
 223
 224/* cpu runqueue to which this cfs_rq is attached */
 225static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
 226{
 227	return cfs_rq->rq;
 228}
 229
 230/* An entity is a task if it doesn't "own" a runqueue */
 231#define entity_is_task(se)	(!se->my_q)
 232
 233static inline struct task_struct *task_of(struct sched_entity *se)
 234{
 235#ifdef CONFIG_SCHED_DEBUG
 236	WARN_ON_ONCE(!entity_is_task(se));
 237#endif
 238	return container_of(se, struct task_struct, se);
 239}
 240
 241/* Walk up scheduling entities hierarchy */
 242#define for_each_sched_entity(se) \
 243		for (; se; se = se->parent)
 244
 245static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
 246{
 247	return p->se.cfs_rq;
 248}
 249
 250/* runqueue on which this entity is (to be) queued */
 251static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
 252{
 253	return se->cfs_rq;
 254}
 255
 256/* runqueue "owned" by this group */
 257static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
 258{
 259	return grp->my_q;
 260}
 261
 262static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 263{
 264	if (!cfs_rq->on_list) {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 265		/*
 266		 * Ensure we either appear before our parent (if already
 267		 * enqueued) or force our parent to appear after us when it is
 268		 * enqueued.  The fact that we always enqueue bottom-up
 269		 * reduces this to two cases.
 270		 */
 271		if (cfs_rq->tg->parent &&
 272		    cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
 273			list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
 274				&rq_of(cfs_rq)->leaf_cfs_rq_list);
 275		} else {
 276			list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
 277				&rq_of(cfs_rq)->leaf_cfs_rq_list);
 278		}
 
 
 279
 280		cfs_rq->on_list = 1;
 
 
 
 
 
 
 
 
 
 
 
 
 281	}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 282}
 283
 284static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 285{
 286	if (cfs_rq->on_list) {
 
 
 
 
 
 
 
 
 
 
 
 
 287		list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
 288		cfs_rq->on_list = 0;
 289	}
 290}
 291
 
 
 
 
 
 292/* Iterate thr' all leaf cfs_rq's on a runqueue */
 293#define for_each_leaf_cfs_rq(rq, cfs_rq) \
 294	list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
 
 295
 296/* Do the two (enqueued) entities belong to the same group ? */
 297static inline int
 298is_same_group(struct sched_entity *se, struct sched_entity *pse)
 299{
 300	if (se->cfs_rq == pse->cfs_rq)
 301		return 1;
 302
 303	return 0;
 304}
 305
 306static inline struct sched_entity *parent_entity(struct sched_entity *se)
 307{
 308	return se->parent;
 309}
 310
 311/* return depth at which a sched entity is present in the hierarchy */
 312static inline int depth_se(struct sched_entity *se)
 313{
 314	int depth = 0;
 315
 316	for_each_sched_entity(se)
 317		depth++;
 318
 319	return depth;
 320}
 321
 322static void
 323find_matching_se(struct sched_entity **se, struct sched_entity **pse)
 324{
 325	int se_depth, pse_depth;
 326
 327	/*
 328	 * preemption test can be made between sibling entities who are in the
 329	 * same cfs_rq i.e who have a common parent. Walk up the hierarchy of
 330	 * both tasks until we find their ancestors who are siblings of common
 331	 * parent.
 332	 */
 333
 334	/* First walk up until both entities are at same depth */
 335	se_depth = depth_se(*se);
 336	pse_depth = depth_se(*pse);
 337
 338	while (se_depth > pse_depth) {
 339		se_depth--;
 340		*se = parent_entity(*se);
 341	}
 342
 343	while (pse_depth > se_depth) {
 344		pse_depth--;
 345		*pse = parent_entity(*pse);
 346	}
 347
 348	while (!is_same_group(*se, *pse)) {
 349		*se = parent_entity(*se);
 350		*pse = parent_entity(*pse);
 351	}
 352}
 353
 354#else	/* !CONFIG_FAIR_GROUP_SCHED */
 355
 356static inline struct task_struct *task_of(struct sched_entity *se)
 357{
 358	return container_of(se, struct task_struct, se);
 359}
 360
 361static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
 362{
 363	return container_of(cfs_rq, struct rq, cfs);
 364}
 365
 366#define entity_is_task(se)	1
 367
 368#define for_each_sched_entity(se) \
 369		for (; se; se = NULL)
 370
 371static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
 372{
 373	return &task_rq(p)->cfs;
 374}
 375
 376static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
 377{
 378	struct task_struct *p = task_of(se);
 379	struct rq *rq = task_rq(p);
 380
 381	return &rq->cfs;
 382}
 383
 384/* runqueue "owned" by this group */
 385static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
 386{
 387	return NULL;
 388}
 389
 390static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 391{
 
 
 
 
 
 
 
 392}
 393
 394static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 395{
 396}
 397
 398#define for_each_leaf_cfs_rq(rq, cfs_rq) \
 399		for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
 400
 401static inline int
 402is_same_group(struct sched_entity *se, struct sched_entity *pse)
 403{
 404	return 1;
 405}
 406
 
 
 
 407static inline struct sched_entity *parent_entity(struct sched_entity *se)
 408{
 409	return NULL;
 410}
 411
 412static inline void
 413find_matching_se(struct sched_entity **se, struct sched_entity **pse)
 414{
 415}
 416
 417#endif	/* CONFIG_FAIR_GROUP_SCHED */
 418
 419static __always_inline
 420void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec);
 421
 422/**************************************************************
 423 * Scheduling class tree data structure manipulation methods:
 424 */
 425
 426static inline u64 max_vruntime(u64 min_vruntime, u64 vruntime)
 427{
 428	s64 delta = (s64)(vruntime - min_vruntime);
 429	if (delta > 0)
 430		min_vruntime = vruntime;
 431
 432	return min_vruntime;
 433}
 434
 435static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
 436{
 437	s64 delta = (s64)(vruntime - min_vruntime);
 438	if (delta < 0)
 439		min_vruntime = vruntime;
 440
 441	return min_vruntime;
 442}
 443
 444static inline int entity_before(struct sched_entity *a,
 445				struct sched_entity *b)
 446{
 447	return (s64)(a->vruntime - b->vruntime) < 0;
 448}
 449
 450static void update_min_vruntime(struct cfs_rq *cfs_rq)
 451{
 
 
 
 452	u64 vruntime = cfs_rq->min_vruntime;
 453
 454	if (cfs_rq->curr)
 455		vruntime = cfs_rq->curr->vruntime;
 
 
 
 
 456
 457	if (cfs_rq->rb_leftmost) {
 458		struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
 459						   struct sched_entity,
 460						   run_node);
 461
 462		if (!cfs_rq->curr)
 463			vruntime = se->vruntime;
 464		else
 465			vruntime = min_vruntime(vruntime, se->vruntime);
 466	}
 467
 
 468	cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
 469#ifndef CONFIG_64BIT
 470	smp_wmb();
 471	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
 472#endif
 473}
 474
 475/*
 476 * Enqueue an entity into the rb-tree:
 477 */
 478static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 479{
 480	struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
 481	struct rb_node *parent = NULL;
 482	struct sched_entity *entry;
 483	int leftmost = 1;
 484
 485	/*
 486	 * Find the right place in the rbtree:
 487	 */
 488	while (*link) {
 489		parent = *link;
 490		entry = rb_entry(parent, struct sched_entity, run_node);
 491		/*
 492		 * We dont care about collisions. Nodes with
 493		 * the same key stay together.
 494		 */
 495		if (entity_before(se, entry)) {
 496			link = &parent->rb_left;
 497		} else {
 498			link = &parent->rb_right;
 499			leftmost = 0;
 500		}
 501	}
 502
 503	/*
 504	 * Maintain a cache of leftmost tree entries (it is frequently
 505	 * used):
 506	 */
 507	if (leftmost)
 508		cfs_rq->rb_leftmost = &se->run_node;
 509
 510	rb_link_node(&se->run_node, parent, link);
 511	rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
 
 512}
 513
 514static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 515{
 516	if (cfs_rq->rb_leftmost == &se->run_node) {
 517		struct rb_node *next_node;
 518
 519		next_node = rb_next(&se->run_node);
 520		cfs_rq->rb_leftmost = next_node;
 521	}
 522
 523	rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
 524}
 525
 526struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
 527{
 528	struct rb_node *left = cfs_rq->rb_leftmost;
 529
 530	if (!left)
 531		return NULL;
 532
 533	return rb_entry(left, struct sched_entity, run_node);
 534}
 535
 536static struct sched_entity *__pick_next_entity(struct sched_entity *se)
 537{
 538	struct rb_node *next = rb_next(&se->run_node);
 539
 540	if (!next)
 541		return NULL;
 542
 543	return rb_entry(next, struct sched_entity, run_node);
 544}
 545
 546#ifdef CONFIG_SCHED_DEBUG
 547struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
 548{
 549	struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
 550
 551	if (!last)
 552		return NULL;
 553
 554	return rb_entry(last, struct sched_entity, run_node);
 555}
 556
 557/**************************************************************
 558 * Scheduling class statistics methods:
 559 */
 560
 561int sched_proc_update_handler(struct ctl_table *table, int write,
 562		void __user *buffer, size_t *lenp,
 563		loff_t *ppos)
 564{
 565	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
 566	int factor = get_update_sysctl_factor();
 567
 568	if (ret || !write)
 569		return ret;
 570
 571	sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
 572					sysctl_sched_min_granularity);
 573
 574#define WRT_SYSCTL(name) \
 575	(normalized_sysctl_##name = sysctl_##name / (factor))
 576	WRT_SYSCTL(sched_min_granularity);
 577	WRT_SYSCTL(sched_latency);
 578	WRT_SYSCTL(sched_wakeup_granularity);
 579#undef WRT_SYSCTL
 580
 581	return 0;
 582}
 583#endif
 584
 585/*
 586 * delta /= w
 587 */
 588static inline unsigned long
 589calc_delta_fair(unsigned long delta, struct sched_entity *se)
 590{
 591	if (unlikely(se->load.weight != NICE_0_LOAD))
 592		delta = calc_delta_mine(delta, NICE_0_LOAD, &se->load);
 593
 594	return delta;
 595}
 596
 597/*
 598 * The idea is to set a period in which each task runs once.
 599 *
 600 * When there are too many tasks (sysctl_sched_nr_latency) we have to stretch
 601 * this period because otherwise the slices get too small.
 602 *
 603 * p = (nr <= nl) ? l : l*nr/nl
 604 */
 605static u64 __sched_period(unsigned long nr_running)
 606{
 607	u64 period = sysctl_sched_latency;
 608	unsigned long nr_latency = sched_nr_latency;
 609
 610	if (unlikely(nr_running > nr_latency)) {
 611		period = sysctl_sched_min_granularity;
 612		period *= nr_running;
 613	}
 614
 615	return period;
 616}
 617
 618/*
 619 * We calculate the wall-time slice from the period by taking a part
 620 * proportional to the weight.
 621 *
 622 * s = p*P[w/rw]
 623 */
 624static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 625{
 626	u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);
 627
 628	for_each_sched_entity(se) {
 629		struct load_weight *load;
 630		struct load_weight lw;
 631
 632		cfs_rq = cfs_rq_of(se);
 633		load = &cfs_rq->load;
 634
 635		if (unlikely(!se->on_rq)) {
 636			lw = cfs_rq->load;
 637
 638			update_load_add(&lw, se->load.weight);
 639			load = &lw;
 640		}
 641		slice = calc_delta_mine(slice, se->load.weight, load);
 642	}
 643	return slice;
 644}
 645
 646/*
 647 * We calculate the vruntime slice of a to be inserted task
 648 *
 649 * vs = s/w
 650 */
 651static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 652{
 653	return calc_delta_fair(sched_slice(cfs_rq, se), se);
 654}
 655
 656static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update);
 657static void update_cfs_shares(struct cfs_rq *cfs_rq);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 658
 659/*
 660 * Update the current task's runtime statistics. Skip current tasks that
 661 * are not in our scheduling class.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 662 */
 663static inline void
 664__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
 665	      unsigned long delta_exec)
 666{
 667	unsigned long delta_exec_weighted;
 
 
 
 
 
 
 
 
 
 668
 669	schedstat_set(curr->statistics.exec_max,
 670		      max((u64)delta_exec, curr->statistics.exec_max));
 
 
 
 
 671
 672	curr->sum_exec_runtime += delta_exec;
 673	schedstat_add(cfs_rq, exec_clock, delta_exec);
 674	delta_exec_weighted = calc_delta_fair(delta_exec, curr);
 
 
 
 
 
 
 
 
 
 
 
 675
 676	curr->vruntime += delta_exec_weighted;
 677	update_min_vruntime(cfs_rq);
 678
 679#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
 680	cfs_rq->load_unacc_exec_time += delta_exec;
 681#endif
 
 
 
 
 
 
 682}
 
 683
 
 
 
 684static void update_curr(struct cfs_rq *cfs_rq)
 685{
 686	struct sched_entity *curr = cfs_rq->curr;
 687	u64 now = rq_of(cfs_rq)->clock_task;
 688	unsigned long delta_exec;
 689
 690	if (unlikely(!curr))
 691		return;
 692
 693	/*
 694	 * Get the amount of time the current task was running
 695	 * since the last time we changed load (this cannot
 696	 * overflow on 32 bits):
 697	 */
 698	delta_exec = (unsigned long)(now - curr->exec_start);
 699	if (!delta_exec)
 700		return;
 701
 702	__update_curr(cfs_rq, curr, delta_exec);
 703	curr->exec_start = now;
 704
 
 
 
 
 
 
 
 
 
 705	if (entity_is_task(curr)) {
 706		struct task_struct *curtask = task_of(curr);
 707
 708		trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
 709		cpuacct_charge(curtask, delta_exec);
 710		account_group_exec_runtime(curtask, delta_exec);
 711	}
 712
 713	account_cfs_rq_runtime(cfs_rq, delta_exec);
 714}
 715
 
 
 
 
 
 716static inline void
 717update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
 718{
 719	schedstat_set(se->statistics.wait_start, rq_of(cfs_rq)->clock);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 720}
 721
 722/*
 723 * Task is being enqueued - update stats:
 724 */
 725static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 
 726{
 
 
 
 727	/*
 728	 * Are we enqueueing a waiting task? (for current tasks
 729	 * a dequeue/enqueue event is a NOP)
 730	 */
 731	if (se != cfs_rq->curr)
 732		update_stats_wait_start(cfs_rq, se);
 733}
 734
 735static void
 736update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
 737{
 738	schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
 739			rq_of(cfs_rq)->clock - se->statistics.wait_start));
 740	schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
 741	schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
 742			rq_of(cfs_rq)->clock - se->statistics.wait_start);
 743#ifdef CONFIG_SCHEDSTATS
 744	if (entity_is_task(se)) {
 745		trace_sched_stat_wait(task_of(se),
 746			rq_of(cfs_rq)->clock - se->statistics.wait_start);
 747	}
 748#endif
 749	schedstat_set(se->statistics.wait_start, 0);
 750}
 751
 752static inline void
 753update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 754{
 
 
 
 
 755	/*
 756	 * Mark the end of the wait period if dequeueing a
 757	 * waiting task:
 758	 */
 759	if (se != cfs_rq->curr)
 760		update_stats_wait_end(cfs_rq, se);
 
 
 
 
 
 
 
 
 
 
 
 761}
 762
 763/*
 764 * We are picking a new current task - update its stats:
 765 */
 766static inline void
 767update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
 768{
 769	/*
 770	 * We are starting a new run period:
 771	 */
 772	se->exec_start = rq_of(cfs_rq)->clock_task;
 773}
 774
 775/**************************************************
 776 * Scheduling class queueing methods:
 777 */
 778
 779static void
 780account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 781{
 782	update_load_add(&cfs_rq->load, se->load.weight);
 783	if (!parent_entity(se))
 784		update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
 785#ifdef CONFIG_SMP
 786	if (entity_is_task(se))
 787		list_add(&se->group_node, &rq_of(cfs_rq)->cfs_tasks);
 788#endif
 789	cfs_rq->nr_running++;
 790}
 791
 792static void
 793account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 794{
 795	update_load_sub(&cfs_rq->load, se->load.weight);
 796	if (!parent_entity(se))
 797		update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
 798	if (entity_is_task(se))
 799		list_del_init(&se->group_node);
 800	cfs_rq->nr_running--;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 801}
 802
 803#ifdef CONFIG_FAIR_GROUP_SCHED
 804/* we need this in update_cfs_load and load-balance functions below */
 805static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
 806# ifdef CONFIG_SMP
 807static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq,
 808					    int global_update)
 809{
 810	struct task_group *tg = cfs_rq->tg;
 811	long load_avg;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 812
 813	load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1);
 814	load_avg -= cfs_rq->load_contribution;
 
 
 
 
 
 
 
 
 
 815
 816	if (global_update || abs(load_avg) > cfs_rq->load_contribution / 8) {
 817		atomic_add(load_avg, &tg->load_weight);
 818		cfs_rq->load_contribution += load_avg;
 
 
 
 
 819	}
 
 
 
 
 
 
 
 
 
 820}
 821
 822static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
 823{
 824	u64 period = sysctl_sched_shares_window;
 825	u64 now, delta;
 826	unsigned long load = cfs_rq->load.weight;
 
 
 
 
 
 
 
 
 827
 828	if (cfs_rq->tg == &root_task_group || throttled_hierarchy(cfs_rq))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 829		return;
 830
 831	now = rq_of(cfs_rq)->clock_task;
 832	delta = now - cfs_rq->load_stamp;
 
 
 
 
 
 
 
 
 
 833
 834	/* truncate load history at 4 idle periods */
 835	if (cfs_rq->load_stamp > cfs_rq->load_last &&
 836	    now - cfs_rq->load_last > 4 * period) {
 837		cfs_rq->load_period = 0;
 838		cfs_rq->load_avg = 0;
 839		delta = period - 1;
 840	}
 841
 842	cfs_rq->load_stamp = now;
 843	cfs_rq->load_unacc_exec_time = 0;
 844	cfs_rq->load_period += delta;
 845	if (load) {
 846		cfs_rq->load_last = now;
 847		cfs_rq->load_avg += delta * load;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 848	}
 
 
 
 
 
 849
 850	/* consider updating load contribution on each fold or truncate */
 851	if (global_update || cfs_rq->load_period > period
 852	    || !cfs_rq->load_period)
 853		update_cfs_rq_load_contribution(cfs_rq, global_update);
 
 
 
 
 
 854
 855	while (cfs_rq->load_period > period) {
 856		/*
 857		 * Inline assembly required to prevent the compiler
 858		 * optimising this loop into a divmod call.
 859		 * See __iter_div_u64_rem() for another example of this.
 860		 */
 861		asm("" : "+rm" (cfs_rq->load_period));
 862		cfs_rq->load_period /= 2;
 863		cfs_rq->load_avg /= 2;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 864	}
 865
 866	if (!cfs_rq->curr && !cfs_rq->nr_running && !cfs_rq->load_avg)
 867		list_del_leaf_cfs_rq(cfs_rq);
 
 
 
 
 
 
 
 
 
 
 868}
 869
 870static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
 
 
 
 871{
 872	long tg_weight;
 
 
 
 
 
 
 
 873
 874	/*
 875	 * Use this CPU's actual weight instead of the last load_contribution
 876	 * to gain a more accurate current total weight. See
 877	 * update_cfs_rq_load_contribution().
 
 878	 */
 879	tg_weight = atomic_read(&tg->load_weight);
 880	tg_weight -= cfs_rq->load_contribution;
 881	tg_weight += cfs_rq->load.weight;
 882
 883	return tg_weight;
 
 
 
 
 
 
 
 884}
 885
 886static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
 887{
 888	long tg_weight, load, shares;
 
 
 
 
 889
 890	tg_weight = calc_tg_weight(tg, cfs_rq);
 891	load = cfs_rq->load.weight;
 
 
 
 892
 893	shares = (tg->shares * load);
 894	if (tg_weight)
 895		shares /= tg_weight;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 896
 897	if (shares < MIN_SHARES)
 898		shares = MIN_SHARES;
 899	if (shares > tg->shares)
 900		shares = tg->shares;
 901
 902	return shares;
 
 903}
 904
 905static void update_entity_shares_tick(struct cfs_rq *cfs_rq)
 906{
 907	if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) {
 908		update_cfs_load(cfs_rq, 0);
 909		update_cfs_shares(cfs_rq);
 
 
 
 
 
 
 
 
 
 
 
 910	}
 
 
 911}
 912# else /* CONFIG_SMP */
 913static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
 
 914{
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 915}
 916
 917static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
 
 918{
 919	return tg->shares;
 
 
 
 
 920}
 921
 922static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq)
 
 923{
 
 
 924}
 925# endif /* CONFIG_SMP */
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 926static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
 927			    unsigned long weight)
 928{
 929	if (se->on_rq) {
 930		/* commit outstanding execution time */
 931		if (cfs_rq->curr == se)
 932			update_curr(cfs_rq);
 933		account_entity_dequeue(cfs_rq, se);
 
 934	}
 
 935
 
 936	update_load_set(&se->load, weight);
 937
 938	if (se->on_rq)
 
 
 
 
 
 
 
 
 
 
 
 939		account_entity_enqueue(cfs_rq, se);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 940}
 941
 942static void update_cfs_shares(struct cfs_rq *cfs_rq)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 943{
 944	struct task_group *tg;
 945	struct sched_entity *se;
 946	long shares;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 947
 948	tg = cfs_rq->tg;
 949	se = tg->se[cpu_of(rq_of(cfs_rq))];
 950	if (!se || throttled_hierarchy(cfs_rq))
 951		return;
 
 952#ifndef CONFIG_SMP
 953	if (likely(se->load.weight == tg->shares))
 
 
 954		return;
 
 
 
 955#endif
 956	shares = calc_cfs_shares(cfs_rq, tg);
 957
 958	reweight_entity(cfs_rq_of(se), se, shares);
 959}
 
 960#else /* CONFIG_FAIR_GROUP_SCHED */
 961static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 962{
 
 
 
 
 
 
 
 
 
 
 
 
 963}
 964
 965static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
 
 
 
 
 
 
 966{
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 967}
 968
 969static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 970{
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 971}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 972#endif /* CONFIG_FAIR_GROUP_SCHED */
 973
 974static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 975{
 976#ifdef CONFIG_SCHEDSTATS
 977	struct task_struct *tsk = NULL;
 
 978
 979	if (entity_is_task(se))
 980		tsk = task_of(se);
 
 981
 982	if (se->statistics.sleep_start) {
 983		u64 delta = rq_of(cfs_rq)->clock - se->statistics.sleep_start;
 
 
 
 
 984
 985		if ((s64)delta < 0)
 986			delta = 0;
 
 987
 988		if (unlikely(delta > se->statistics.sleep_max))
 989			se->statistics.sleep_max = delta;
 
 990
 991		se->statistics.sleep_start = 0;
 992		se->statistics.sum_sleep_runtime += delta;
 993
 994		if (tsk) {
 995			account_scheduler_latency(tsk, delta >> 10, 1);
 996			trace_sched_stat_sleep(tsk, delta);
 997		}
 998	}
 999	if (se->statistics.block_start) {
1000		u64 delta = rq_of(cfs_rq)->clock - se->statistics.block_start;
1001
1002		if ((s64)delta < 0)
1003			delta = 0;
 
 
 
 
 
 
 
1004
1005		if (unlikely(delta > se->statistics.block_max))
1006			se->statistics.block_max = delta;
1007
1008		se->statistics.block_start = 0;
1009		se->statistics.sum_sleep_runtime += delta;
 
 
 
 
 
 
 
 
 
 
1010
1011		if (tsk) {
1012			if (tsk->in_iowait) {
1013				se->statistics.iowait_sum += delta;
1014				se->statistics.iowait_count++;
1015				trace_sched_stat_iowait(tsk, delta);
1016			}
 
 
 
1017
1018			trace_sched_stat_blocked(tsk, delta);
 
 
 
 
 
 
1019
1020			/*
1021			 * Blocking time is in units of nanosecs, so shift by
1022			 * 20 to get a milliseconds-range estimation of the
1023			 * amount of time that the task spent sleeping:
1024			 */
1025			if (unlikely(prof_on == SLEEP_PROFILING)) {
1026				profile_hits(SLEEP_PROFILING,
1027						(void *)get_wchan(tsk),
1028						delta >> 20);
1029			}
1030			account_scheduler_latency(tsk, delta >> 10, 0);
1031		}
1032	}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1033#endif
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1034}
1035
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
/*
 * Debug aid: bump the nr_spread_over schedstat when @se's vruntime lies
 * more than three scheduling latencies away from the queue's min_vruntime,
 * in either direction. Compiled out without CONFIG_SCHED_DEBUG.
 */
static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
#ifdef CONFIG_SCHED_DEBUG
	s64 spread = se->vruntime - cfs_rq->min_vruntime;

	if (spread < 0)
		spread = -spread;

	if (spread <= 3*sysctl_sched_latency)
		return;

	schedstat_inc(cfs_rq, nr_spread_over);
#endif
}
1048
1049static void
1050place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
1051{
1052	u64 vruntime = cfs_rq->min_vruntime;
1053
1054	/*
1055	 * The 'current' period is already promised to the current tasks,
1056	 * however the extra weight of the new task will slow them down a
1057	 * little, place the new task so that it fits in the slot that
1058	 * stays open at the end.
1059	 */
1060	if (initial && sched_feat(START_DEBIT))
1061		vruntime += sched_vslice(cfs_rq, se);
1062
1063	/* sleeps up to a single latency don't count. */
1064	if (!initial) {
1065		unsigned long thresh = sysctl_sched_latency;
1066
1067		/*
1068		 * Halve their sleep time's effect, to allow
1069		 * for a gentler effect of sleepers:
1070		 */
1071		if (sched_feat(GENTLE_FAIR_SLEEPERS))
1072			thresh >>= 1;
1073
1074		vruntime -= thresh;
1075	}
1076
1077	/* ensure we never gain time by being placed backwards. */
1078	vruntime = max_vruntime(se->vruntime, vruntime);
 
1079
1080	se->vruntime = vruntime;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1081}
1082
1083static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1084
1085static void
1086enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1087{
 
 
 
1088	/*
1089	 * Update the normalized vruntime before updating min_vruntime
1090	 * through callig update_curr().
1091	 */
1092	if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
1093		se->vruntime += cfs_rq->min_vruntime;
1094
 
 
1095	/*
1096	 * Update run-time statistics of the 'current'.
 
 
 
1097	 */
1098	update_curr(cfs_rq);
1099	update_cfs_load(cfs_rq, 0);
 
 
 
 
 
 
 
 
 
 
 
 
1100	account_entity_enqueue(cfs_rq, se);
1101	update_cfs_shares(cfs_rq);
1102
1103	if (flags & ENQUEUE_WAKEUP) {
1104		place_entity(cfs_rq, se, 0);
1105		enqueue_sleeper(cfs_rq, se);
1106	}
1107
1108	update_stats_enqueue(cfs_rq, se);
 
1109	check_spread(cfs_rq, se);
1110	if (se != cfs_rq->curr)
1111		__enqueue_entity(cfs_rq, se);
1112	se->on_rq = 1;
1113
1114	if (cfs_rq->nr_running == 1) {
1115		list_add_leaf_cfs_rq(cfs_rq);
1116		check_enqueue_throttle(cfs_rq);
1117	}
1118}
1119
1120static void __clear_buddies_last(struct sched_entity *se)
1121{
1122	for_each_sched_entity(se) {
1123		struct cfs_rq *cfs_rq = cfs_rq_of(se);
1124		if (cfs_rq->last == se)
1125			cfs_rq->last = NULL;
1126		else
1127			break;
 
 
1128	}
1129}
1130
1131static void __clear_buddies_next(struct sched_entity *se)
1132{
1133	for_each_sched_entity(se) {
1134		struct cfs_rq *cfs_rq = cfs_rq_of(se);
1135		if (cfs_rq->next == se)
1136			cfs_rq->next = NULL;
1137		else
1138			break;
 
 
1139	}
1140}
1141
1142static void __clear_buddies_skip(struct sched_entity *se)
1143{
1144	for_each_sched_entity(se) {
1145		struct cfs_rq *cfs_rq = cfs_rq_of(se);
1146		if (cfs_rq->skip == se)
1147			cfs_rq->skip = NULL;
1148		else
1149			break;
 
 
1150	}
1151}
1152
1153static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
1154{
1155	if (cfs_rq->last == se)
1156		__clear_buddies_last(se);
1157
1158	if (cfs_rq->next == se)
1159		__clear_buddies_next(se);
1160
1161	if (cfs_rq->skip == se)
1162		__clear_buddies_skip(se);
1163}
1164
1165static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
1166
1167static void
1168dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1169{
1170	/*
1171	 * Update run-time statistics of the 'current'.
1172	 */
1173	update_curr(cfs_rq);
1174
1175	update_stats_dequeue(cfs_rq, se);
1176	if (flags & DEQUEUE_SLEEP) {
1177#ifdef CONFIG_SCHEDSTATS
1178		if (entity_is_task(se)) {
1179			struct task_struct *tsk = task_of(se);
 
 
 
 
 
1180
1181			if (tsk->state & TASK_INTERRUPTIBLE)
1182				se->statistics.sleep_start = rq_of(cfs_rq)->clock;
1183			if (tsk->state & TASK_UNINTERRUPTIBLE)
1184				se->statistics.block_start = rq_of(cfs_rq)->clock;
1185		}
1186#endif
1187	}
1188
1189	clear_buddies(cfs_rq, se);
1190
1191	if (se != cfs_rq->curr)
1192		__dequeue_entity(cfs_rq, se);
1193	se->on_rq = 0;
1194	update_cfs_load(cfs_rq, 0);
1195	account_entity_dequeue(cfs_rq, se);
1196
1197	/*
1198	 * Normalize the entity after updating the min_vruntime because the
1199	 * update can refer to the ->curr item and we need to reflect this
1200	 * movement in our normalized position.
 
1201	 */
1202	if (!(flags & DEQUEUE_SLEEP))
1203		se->vruntime -= cfs_rq->min_vruntime;
1204
1205	/* return excess runtime on last dequeue */
1206	return_cfs_rq_runtime(cfs_rq);
1207
1208	update_min_vruntime(cfs_rq);
1209	update_cfs_shares(cfs_rq);
 
 
 
 
 
 
 
 
1210}
1211
1212/*
1213 * Preempt the current task with a newly woken task if needed:
1214 */
1215static void
1216check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
1217{
1218	unsigned long ideal_runtime, delta_exec;
1219	struct sched_entity *se;
1220	s64 delta;
1221
1222	ideal_runtime = sched_slice(cfs_rq, curr);
1223	delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
1224	if (delta_exec > ideal_runtime) {
1225		resched_task(rq_of(cfs_rq)->curr);
1226		/*
1227		 * The current task ran long enough, ensure it doesn't get
1228		 * re-elected due to buddy favours.
1229		 */
1230		clear_buddies(cfs_rq, curr);
1231		return;
1232	}
1233
1234	/*
1235	 * Ensure that a task that missed wakeup preemption by a
1236	 * narrow margin doesn't have to wait for a full slice.
1237	 * This also mitigates buddy induced latencies under load.
1238	 */
1239	if (delta_exec < sysctl_sched_min_granularity)
1240		return;
1241
1242	se = __pick_first_entity(cfs_rq);
1243	delta = curr->vruntime - se->vruntime;
1244
1245	if (delta < 0)
1246		return;
1247
1248	if (delta > ideal_runtime)
1249		resched_task(rq_of(cfs_rq)->curr);
1250}
1251
1252static void
1253set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
1254{
1255	/* 'current' is not kept within the tree. */
1256	if (se->on_rq) {
1257		/*
1258		 * Any task has to be enqueued before it get to execute on
1259		 * a CPU. So account for the time it spent waiting on the
1260		 * runqueue.
1261		 */
1262		update_stats_wait_end(cfs_rq, se);
1263		__dequeue_entity(cfs_rq, se);
 
1264	}
1265
1266	update_stats_curr_start(cfs_rq, se);
1267	cfs_rq->curr = se;
1268#ifdef CONFIG_SCHEDSTATS
1269	/*
1270	 * Track our maximum slice length, if the CPU's load is at
1271	 * least twice that of our own weight (i.e. dont track it
1272	 * when there are only lesser-weight tasks around):
1273	 */
1274	if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
1275		se->statistics.slice_max = max(se->statistics.slice_max,
1276			se->sum_exec_runtime - se->prev_sum_exec_runtime);
 
 
1277	}
1278#endif
1279	se->prev_sum_exec_runtime = se->sum_exec_runtime;
1280}
1281
1282static int
1283wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
1284
1285/*
1286 * Pick the next process, keeping these things in mind, in this order:
1287 * 1) keep things fair between processes/task groups
1288 * 2) pick the "next" process, since someone really wants that to run
1289 * 3) pick the "last" process, for cache locality
1290 * 4) do not run the "skip" process, if something else is available
1291 */
1292static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
 
1293{
1294	struct sched_entity *se = __pick_first_entity(cfs_rq);
1295	struct sched_entity *left = se;
 
 
 
 
 
 
 
 
 
1296
1297	/*
1298	 * Avoid running the skip buddy, if running something else can
1299	 * be done without getting too unfair.
1300	 */
1301	if (cfs_rq->skip == se) {
1302		struct sched_entity *second = __pick_next_entity(se);
 
 
 
 
 
 
 
 
 
1303		if (second && wakeup_preempt_entity(second, left) < 1)
1304			se = second;
1305	}
1306
1307	/*
1308	 * Prefer last buddy, try to return the CPU to a preempted task.
1309	 */
1310	if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
1311		se = cfs_rq->last;
1312
1313	/*
1314	 * Someone really wants this to run. If it's not unfair, run it.
1315	 */
1316	if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
1317		se = cfs_rq->next;
1318
1319	clear_buddies(cfs_rq, se);
1320
1321	return se;
1322}
1323
1324static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
1325
1326static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
1327{
1328	/*
1329	 * If still on the runqueue then deactivate_task()
1330	 * was not called and update_curr() has to be done:
1331	 */
1332	if (prev->on_rq)
1333		update_curr(cfs_rq);
1334
1335	/* throttle cfs_rqs exceeding runtime */
1336	check_cfs_rq_runtime(cfs_rq);
1337
1338	check_spread(cfs_rq, prev);
 
1339	if (prev->on_rq) {
1340		update_stats_wait_start(cfs_rq, prev);
1341		/* Put 'current' back into the tree. */
1342		__enqueue_entity(cfs_rq, prev);
 
 
1343	}
1344	cfs_rq->curr = NULL;
1345}
1346
1347static void
1348entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
1349{
1350	/*
1351	 * Update run-time statistics of the 'current'.
1352	 */
1353	update_curr(cfs_rq);
1354
1355	/*
1356	 * Update share accounting for long-running entities.
1357	 */
1358	update_entity_shares_tick(cfs_rq);
 
1359
1360#ifdef CONFIG_SCHED_HRTICK
1361	/*
1362	 * queued ticks are scheduled to match the slice, so don't bother
1363	 * validating it and just reschedule.
1364	 */
1365	if (queued) {
1366		resched_task(rq_of(cfs_rq)->curr);
1367		return;
1368	}
1369	/*
1370	 * don't let the period tick interfere with the hrtick preemption
1371	 */
1372	if (!sched_feat(DOUBLE_TICK) &&
1373			hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
1374		return;
1375#endif
1376
1377	if (cfs_rq->nr_running > 1)
1378		check_preempt_tick(cfs_rq, curr);
1379}
1380
1381
1382/**************************************************
1383 * CFS bandwidth control machinery
1384 */
1385
1386#ifdef CONFIG_CFS_BANDWIDTH
1387
1388#ifdef HAVE_JUMP_LABEL
1389static struct static_key __cfs_bandwidth_used;
1390
1391static inline bool cfs_bandwidth_used(void)
1392{
1393	return static_key_false(&__cfs_bandwidth_used);
1394}
1395
1396void account_cfs_bandwidth_used(int enabled, int was_enabled)
1397{
1398	/* only need to count groups transitioning between enabled/!enabled */
1399	if (enabled && !was_enabled)
1400		static_key_slow_inc(&__cfs_bandwidth_used);
1401	else if (!enabled && was_enabled)
1402		static_key_slow_dec(&__cfs_bandwidth_used);
1403}
1404#else /* HAVE_JUMP_LABEL */
 
 
 
 
 
1405static bool cfs_bandwidth_used(void)
1406{
1407	return true;
1408}
1409
1410void account_cfs_bandwidth_used(int enabled, int was_enabled) {}
1411#endif /* HAVE_JUMP_LABEL */
 
1412
1413/*
1414 * default period for cfs group bandwidth.
1415 * default: 0.1s, units: nanoseconds
1416 */
1417static inline u64 default_cfs_period(void)
1418{
1419	return 100000000ULL;
1420}
1421
1422static inline u64 sched_cfs_bandwidth_slice(void)
1423{
1424	return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
1425}
1426
1427/*
1428 * Replenish runtime according to assigned quota and update expiration time.
1429 * We use sched_clock_cpu directly instead of rq->clock to avoid adding
1430 * additional synchronization around rq->lock.
1431 *
1432 * requires cfs_b->lock
1433 */
1434void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
1435{
1436	u64 now;
1437
1438	if (cfs_b->quota == RUNTIME_INF)
1439		return;
1440
1441	now = sched_clock_cpu(smp_processor_id());
1442	cfs_b->runtime = cfs_b->quota;
1443	cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
1444}
1445
1446static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
1447{
1448	return &tg->cfs_bandwidth;
1449}
1450
1451/* returns 0 on failure to allocate runtime */
1452static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1453{
1454	struct task_group *tg = cfs_rq->tg;
1455	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
1456	u64 amount = 0, min_amount, expires;
1457
1458	/* note: this is a positive sum as runtime_remaining <= 0 */
1459	min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
1460
1461	raw_spin_lock(&cfs_b->lock);
1462	if (cfs_b->quota == RUNTIME_INF)
1463		amount = min_amount;
1464	else {
1465		/*
1466		 * If the bandwidth pool has become inactive, then at least one
1467		 * period must have elapsed since the last consumption.
1468		 * Refresh the global state and ensure bandwidth timer becomes
1469		 * active.
1470		 */
1471		if (!cfs_b->timer_active) {
1472			__refill_cfs_bandwidth_runtime(cfs_b);
1473			__start_cfs_bandwidth(cfs_b);
1474		}
1475
1476		if (cfs_b->runtime > 0) {
1477			amount = min(cfs_b->runtime, min_amount);
1478			cfs_b->runtime -= amount;
1479			cfs_b->idle = 0;
1480		}
1481	}
1482	expires = cfs_b->runtime_expires;
1483	raw_spin_unlock(&cfs_b->lock);
1484
1485	cfs_rq->runtime_remaining += amount;
1486	/*
1487	 * we may have advanced our local expiration to account for allowed
1488	 * spread between our sched_clock and the one on which runtime was
1489	 * issued.
1490	 */
1491	if ((s64)(expires - cfs_rq->runtime_expires) > 0)
1492		cfs_rq->runtime_expires = expires;
1493
1494	return cfs_rq->runtime_remaining > 0;
1495}
1496
1497/*
1498 * Note: This depends on the synchronization provided by sched_clock and the
1499 * fact that rq->clock snapshots this value.
1500 */
1501static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1502{
1503	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
1504	struct rq *rq = rq_of(cfs_rq);
1505
1506	/* if the deadline is ahead of our clock, nothing to do */
1507	if (likely((s64)(rq->clock - cfs_rq->runtime_expires) < 0))
1508		return;
1509
1510	if (cfs_rq->runtime_remaining < 0)
1511		return;
1512
1513	/*
1514	 * If the local deadline has passed we have to consider the
1515	 * possibility that our sched_clock is 'fast' and the global deadline
1516	 * has not truly expired.
1517	 *
1518	 * Fortunately we can check determine whether this the case by checking
1519	 * whether the global deadline has advanced.
1520	 */
1521
1522	if ((s64)(cfs_rq->runtime_expires - cfs_b->runtime_expires) >= 0) {
1523		/* extend local deadline, drift is bounded above by 2 ticks */
1524		cfs_rq->runtime_expires += TICK_NSEC;
1525	} else {
1526		/* global deadline is ahead, expiration has passed */
1527		cfs_rq->runtime_remaining = 0;
1528	}
1529}
1530
1531static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
1532				     unsigned long delta_exec)
1533{
1534	/* dock delta_exec before expiring quota (as it could span periods) */
1535	cfs_rq->runtime_remaining -= delta_exec;
1536	expire_cfs_rq_runtime(cfs_rq);
1537
1538	if (likely(cfs_rq->runtime_remaining > 0))
1539		return;
1540
 
 
1541	/*
1542	 * if we're unable to extend our runtime we resched so that the active
1543	 * hierarchy can be throttled
1544	 */
1545	if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
1546		resched_task(rq_of(cfs_rq)->curr);
1547}
1548
1549static __always_inline
1550void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec)
1551{
1552	if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
1553		return;
1554
1555	__account_cfs_rq_runtime(cfs_rq, delta_exec);
1556}
1557
1558static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
1559{
1560	return cfs_bandwidth_used() && cfs_rq->throttled;
1561}
1562
1563/* check whether cfs_rq, or any parent, is throttled */
1564static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
1565{
1566	return cfs_bandwidth_used() && cfs_rq->throttle_count;
1567}
1568
1569/*
1570 * Ensure that neither of the group entities corresponding to src_cpu or
1571 * dest_cpu are members of a throttled hierarchy when performing group
1572 * load-balance operations.
1573 */
1574static inline int throttled_lb_pair(struct task_group *tg,
1575				    int src_cpu, int dest_cpu)
1576{
1577	struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
1578
1579	src_cfs_rq = tg->cfs_rq[src_cpu];
1580	dest_cfs_rq = tg->cfs_rq[dest_cpu];
1581
1582	return throttled_hierarchy(src_cfs_rq) ||
1583	       throttled_hierarchy(dest_cfs_rq);
1584}
1585
1586/* updated child weight may affect parent so we have to do this bottom up */
1587static int tg_unthrottle_up(struct task_group *tg, void *data)
1588{
1589	struct rq *rq = data;
1590	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
1591
1592	cfs_rq->throttle_count--;
1593#ifdef CONFIG_SMP
1594	if (!cfs_rq->throttle_count) {
1595		u64 delta = rq->clock_task - cfs_rq->load_stamp;
 
1596
1597		/* leaving throttled state, advance shares averaging windows */
1598		cfs_rq->load_stamp += delta;
1599		cfs_rq->load_last += delta;
1600
1601		/* update entity weight now that we are on_rq again */
1602		update_cfs_shares(cfs_rq);
1603	}
1604#endif
1605
1606	return 0;
1607}
1608
1609static int tg_throttle_down(struct task_group *tg, void *data)
1610{
1611	struct rq *rq = data;
1612	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
1613
1614	/* group is entering throttled state, record last load */
1615	if (!cfs_rq->throttle_count)
1616		update_cfs_load(cfs_rq, 0);
 
 
1617	cfs_rq->throttle_count++;
1618
1619	return 0;
1620}
1621
1622static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
1623{
1624	struct rq *rq = rq_of(cfs_rq);
1625	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
1626	struct sched_entity *se;
1627	long task_delta, dequeue = 1;
 
1628
1629	se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
1630
1631	/* account load preceding throttle */
1632	rcu_read_lock();
1633	walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
1634	rcu_read_unlock();
1635
1636	task_delta = cfs_rq->h_nr_running;
 
1637	for_each_sched_entity(se) {
1638		struct cfs_rq *qcfs_rq = cfs_rq_of(se);
1639		/* throttled entity or throttle-on-deactivate */
1640		if (!se->on_rq)
1641			break;
1642
1643		if (dequeue)
1644			dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
1645		qcfs_rq->h_nr_running -= task_delta;
 
1646
1647		if (qcfs_rq->load.weight)
1648			dequeue = 0;
1649	}
1650
1651	if (!se)
1652		rq->nr_running -= task_delta;
1653
1654	cfs_rq->throttled = 1;
1655	cfs_rq->throttled_timestamp = rq->clock;
1656	raw_spin_lock(&cfs_b->lock);
1657	list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1658	raw_spin_unlock(&cfs_b->lock);
1659}
1660
1661void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
1662{
1663	struct rq *rq = rq_of(cfs_rq);
1664	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
1665	struct sched_entity *se;
1666	int enqueue = 1;
1667	long task_delta;
1668
1669	se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
1670
1671	cfs_rq->throttled = 0;
 
 
 
1672	raw_spin_lock(&cfs_b->lock);
1673	cfs_b->throttled_time += rq->clock - cfs_rq->throttled_timestamp;
1674	list_del_rcu(&cfs_rq->throttled_list);
1675	raw_spin_unlock(&cfs_b->lock);
1676	cfs_rq->throttled_timestamp = 0;
1677
1678	update_rq_clock(rq);
1679	/* update hierarchical throttle state */
1680	walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
1681
1682	if (!cfs_rq->load.weight)
1683		return;
1684
1685	task_delta = cfs_rq->h_nr_running;
 
1686	for_each_sched_entity(se) {
1687		if (se->on_rq)
1688			enqueue = 0;
1689
1690		cfs_rq = cfs_rq_of(se);
1691		if (enqueue)
1692			enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
1693		cfs_rq->h_nr_running += task_delta;
 
1694
1695		if (cfs_rq_throttled(cfs_rq))
1696			break;
1697	}
1698
 
 
1699	if (!se)
1700		rq->nr_running += task_delta;
1701
1702	/* determine whether we need to wake up potentially idle cpu */
1703	if (rq->curr == rq->idle && rq->cfs.nr_running)
1704		resched_task(rq->curr);
1705}
1706
1707static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
1708		u64 remaining, u64 expires)
1709{
1710	struct cfs_rq *cfs_rq;
1711	u64 runtime = remaining;
 
1712
1713	rcu_read_lock();
1714	list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
1715				throttled_list) {
1716		struct rq *rq = rq_of(cfs_rq);
 
1717
1718		raw_spin_lock(&rq->lock);
1719		if (!cfs_rq_throttled(cfs_rq))
1720			goto next;
1721
 
 
 
1722		runtime = -cfs_rq->runtime_remaining + 1;
1723		if (runtime > remaining)
1724			runtime = remaining;
1725		remaining -= runtime;
1726
1727		cfs_rq->runtime_remaining += runtime;
1728		cfs_rq->runtime_expires = expires;
1729
1730		/* we check whether we're throttled above */
1731		if (cfs_rq->runtime_remaining > 0)
1732			unthrottle_cfs_rq(cfs_rq);
1733
1734next:
1735		raw_spin_unlock(&rq->lock);
1736
1737		if (!remaining)
1738			break;
1739	}
1740	rcu_read_unlock();
1741
1742	return remaining;
1743}
1744
1745/*
1746 * Responsible for refilling a task_group's bandwidth and unthrottling its
1747 * cfs_rqs as appropriate. If there has been no activity within the last
1748 * period the timer is deactivated until scheduling resumes; cfs_b->idle is
1749 * used to track this state.
1750 */
1751static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
1752{
1753	u64 runtime, runtime_expires;
1754	int idle = 1, throttled;
1755
1756	raw_spin_lock(&cfs_b->lock);
1757	/* no need to continue the timer with no bandwidth constraint */
1758	if (cfs_b->quota == RUNTIME_INF)
1759		goto out_unlock;
1760
1761	throttled = !list_empty(&cfs_b->throttled_cfs_rq);
1762	/* idle depends on !throttled (for the case of a large deficit) */
1763	idle = cfs_b->idle && !throttled;
1764	cfs_b->nr_periods += overrun;
1765
1766	/* if we're going inactive then everything else can be deferred */
1767	if (idle)
1768		goto out_unlock;
 
 
 
1769
1770	__refill_cfs_bandwidth_runtime(cfs_b);
1771
1772	if (!throttled) {
1773		/* mark as potentially idle for the upcoming period */
1774		cfs_b->idle = 1;
1775		goto out_unlock;
1776	}
1777
1778	/* account preceding periods in which throttling occurred */
1779	cfs_b->nr_throttled += overrun;
1780
1781	/*
1782	 * There are throttled entities so we must first use the new bandwidth
1783	 * to unthrottle them before making it generally available.  This
1784	 * ensures that all existing debts will be paid before a new cfs_rq is
1785	 * allowed to run.
 
1786	 */
1787	runtime = cfs_b->runtime;
1788	runtime_expires = cfs_b->runtime_expires;
1789	cfs_b->runtime = 0;
1790
1791	/*
1792	 * This check is repeated as we are holding onto the new bandwidth
1793	 * while we unthrottle.  This can potentially race with an unthrottled
1794	 * group trying to acquire new bandwidth from the global pool.
1795	 */
1796	while (throttled && runtime > 0) {
1797		raw_spin_unlock(&cfs_b->lock);
1798		/* we can't nest cfs_b->lock while distributing bandwidth */
1799		runtime = distribute_cfs_runtime(cfs_b, runtime,
1800						 runtime_expires);
1801		raw_spin_lock(&cfs_b->lock);
1802
 
1803		throttled = !list_empty(&cfs_b->throttled_cfs_rq);
 
 
1804	}
1805
1806	/* return (any) remaining runtime */
1807	cfs_b->runtime = runtime;
1808	/*
1809	 * While we are ensured activity in the period following an
1810	 * unthrottle, this also covers the case in which the new bandwidth is
1811	 * insufficient to cover the existing bandwidth deficit.  (Forcing the
1812	 * timer to remain active while there are any throttled entities.)
1813	 */
1814	cfs_b->idle = 0;
1815out_unlock:
1816	if (idle)
1817		cfs_b->timer_active = 0;
1818	raw_spin_unlock(&cfs_b->lock);
1819
1820	return idle;
 
 
 
1821}
1822
1823/* a cfs_rq won't donate quota below this amount */
1824static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
1825/* minimum remaining period time to redistribute slack quota */
1826static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
1827/* how long we wait to gather additional slack before distributing */
1828static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
1829
1830/* are we near the end of the current quota period? */
 
 
 
 
 
 
1831static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
1832{
1833	struct hrtimer *refresh_timer = &cfs_b->period_timer;
1834	u64 remaining;
1835
1836	/* if the call-back is running a quota refresh is already occurring */
1837	if (hrtimer_callback_running(refresh_timer))
1838		return 1;
1839
1840	/* is a quota refresh about to occur? */
1841	remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
1842	if (remaining < min_expire)
1843		return 1;
1844
1845	return 0;
1846}
1847
1848static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
1849{
1850	u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
1851
1852	/* if there's a quota refresh soon don't bother with slack */
1853	if (runtime_refresh_within(cfs_b, min_left))
1854		return;
1855
1856	start_bandwidth_timer(&cfs_b->slack_timer,
1857				ns_to_ktime(cfs_bandwidth_slack_period));
 
 
 
 
 
 
1858}
1859
1860/* we know any runtime found here is valid as update_curr() precedes return */
1861static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1862{
1863	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
1864	s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
1865
1866	if (slack_runtime <= 0)
1867		return;
1868
1869	raw_spin_lock(&cfs_b->lock);
1870	if (cfs_b->quota != RUNTIME_INF &&
1871	    cfs_rq->runtime_expires == cfs_b->runtime_expires) {
1872		cfs_b->runtime += slack_runtime;
1873
1874		/* we are under rq->lock, defer unthrottling using a timer */
1875		if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
1876		    !list_empty(&cfs_b->throttled_cfs_rq))
1877			start_cfs_slack_bandwidth(cfs_b);
1878	}
1879	raw_spin_unlock(&cfs_b->lock);
1880
1881	/* even if it's not valid for return we don't want to try again */
1882	cfs_rq->runtime_remaining -= slack_runtime;
1883}
1884
1885static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1886{
1887	if (!cfs_bandwidth_used())
1888		return;
1889
1890	if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
1891		return;
1892
1893	__return_cfs_rq_runtime(cfs_rq);
1894}
1895
1896/*
1897 * This is done with a timer (instead of inline with bandwidth return) since
1898 * it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.
1899 */
1900static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
1901{
1902	u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
1903	u64 expires;
1904
1905	/* confirm we're still not at a refresh boundary */
1906	if (runtime_refresh_within(cfs_b, min_bandwidth_expiration))
 
 
 
1907		return;
 
1908
1909	raw_spin_lock(&cfs_b->lock);
1910	if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) {
 
 
 
 
1911		runtime = cfs_b->runtime;
1912		cfs_b->runtime = 0;
1913	}
1914	expires = cfs_b->runtime_expires;
1915	raw_spin_unlock(&cfs_b->lock);
 
1916
1917	if (!runtime)
1918		return;
1919
1920	runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
1921
1922	raw_spin_lock(&cfs_b->lock);
1923	if (expires == cfs_b->runtime_expires)
1924		cfs_b->runtime = runtime;
1925	raw_spin_unlock(&cfs_b->lock);
1926}
1927
1928/*
1929 * When a group wakes up we want to make sure that its quota is not already
1930 * expired/exceeded, otherwise it may be allowed to steal additional ticks of
1931 * runtime as update_curr() throttling can not not trigger until it's on-rq.
1932 */
1933static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
1934{
1935	if (!cfs_bandwidth_used())
1936		return;
1937
1938	/* an active group must be handled by the update_curr()->put() path */
1939	if (!cfs_rq->runtime_enabled || cfs_rq->curr)
1940		return;
1941
1942	/* ensure the group is not already throttled */
1943	if (cfs_rq_throttled(cfs_rq))
1944		return;
1945
1946	/* update runtime allocation */
1947	account_cfs_rq_runtime(cfs_rq, 0);
1948	if (cfs_rq->runtime_remaining <= 0)
1949		throttle_cfs_rq(cfs_rq);
1950}
1951
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1952/* conditionally throttle active cfs_rq's from put_prev_entity() */
1953static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1954{
1955	if (!cfs_bandwidth_used())
1956		return;
1957
1958	if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
1959		return;
1960
1961	/*
1962	 * it's possible for a throttled entity to be forced into a running
1963	 * state (e.g. set_curr_task), in this case we're finished.
1964	 */
1965	if (cfs_rq_throttled(cfs_rq))
1966		return;
1967
1968	throttle_cfs_rq(cfs_rq);
 
1969}
1970
1971static inline u64 default_cfs_period(void);
1972static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun);
1973static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b);
1974
1975static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
1976{
1977	struct cfs_bandwidth *cfs_b =
1978		container_of(timer, struct cfs_bandwidth, slack_timer);
 
1979	do_sched_cfs_slack_timer(cfs_b);
1980
1981	return HRTIMER_NORESTART;
1982}
1983
 
 
1984static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
1985{
1986	struct cfs_bandwidth *cfs_b =
1987		container_of(timer, struct cfs_bandwidth, period_timer);
1988	ktime_t now;
1989	int overrun;
1990	int idle = 0;
 
1991
 
1992	for (;;) {
1993		now = hrtimer_cb_get_time(timer);
1994		overrun = hrtimer_forward(timer, now, cfs_b->period);
1995
1996		if (!overrun)
1997			break;
1998
1999		idle = do_sched_cfs_period_timer(cfs_b, overrun);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2000	}
 
 
 
2001
2002	return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
2003}
2004
2005void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
2006{
2007	raw_spin_lock_init(&cfs_b->lock);
2008	cfs_b->runtime = 0;
2009	cfs_b->quota = RUNTIME_INF;
2010	cfs_b->period = ns_to_ktime(default_cfs_period());
2011
2012	INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
2013	hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
2014	cfs_b->period_timer.function = sched_cfs_period_timer;
2015	hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
2016	cfs_b->slack_timer.function = sched_cfs_slack_timer;
 
 
2017}
2018
2019static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2020{
2021	cfs_rq->runtime_enabled = 0;
2022	INIT_LIST_HEAD(&cfs_rq->throttled_list);
2023}
2024
2025/* requires cfs_b->lock, may release to reprogram timer */
2026void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
2027{
2028	/*
2029	 * The timer may be active because we're trying to set a new bandwidth
2030	 * period or because we're racing with the tear-down path
2031	 * (timer_active==0 becomes visible before the hrtimer call-back
2032	 * terminates).  In either case we ensure that it's re-programmed
2033	 */
2034	while (unlikely(hrtimer_active(&cfs_b->period_timer))) {
2035		raw_spin_unlock(&cfs_b->lock);
2036		/* ensure cfs_b->lock is available while we wait */
2037		hrtimer_cancel(&cfs_b->period_timer);
2038
2039		raw_spin_lock(&cfs_b->lock);
2040		/* if someone else restarted the timer then we're done */
2041		if (cfs_b->timer_active)
2042			return;
2043	}
2044
2045	cfs_b->timer_active = 1;
2046	start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
 
2047}
2048
2049static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
2050{
 
 
 
 
2051	hrtimer_cancel(&cfs_b->period_timer);
2052	hrtimer_cancel(&cfs_b->slack_timer);
2053}
2054
2055void unthrottle_offline_cfs_rqs(struct rq *rq)
 
 
 
 
 
 
 
 
2056{
2057	struct cfs_rq *cfs_rq;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2058
2059	for_each_leaf_cfs_rq(rq, cfs_rq) {
2060		struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
 
2061
2062		if (!cfs_rq->runtime_enabled)
2063			continue;
2064
2065		/*
2066		 * clock_task is not advancing so we just need to make sure
2067		 * there's some valid quota amount
2068		 */
2069		cfs_rq->runtime_remaining = cfs_b->quota;
 
 
 
 
 
 
2070		if (cfs_rq_throttled(cfs_rq))
2071			unthrottle_cfs_rq(cfs_rq);
2072	}
 
2073}
2074
2075#else /* CONFIG_CFS_BANDWIDTH */
2076static __always_inline
2077void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec) {}
2078static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
 
 
 
 
 
2079static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
 
2080static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
2081
2082static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
2083{
2084	return 0;
2085}
2086
2087static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
2088{
2089	return 0;
2090}
2091
2092static inline int throttled_lb_pair(struct task_group *tg,
2093				    int src_cpu, int dest_cpu)
2094{
2095	return 0;
2096}
2097
2098void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
2099
2100#ifdef CONFIG_FAIR_GROUP_SCHED
2101static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
2102#endif
2103
2104static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
2105{
2106	return NULL;
2107}
2108static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
2109void unthrottle_offline_cfs_rqs(struct rq *rq) {}
 
2110
2111#endif /* CONFIG_CFS_BANDWIDTH */
2112
2113/**************************************************
2114 * CFS operations on tasks:
2115 */
2116
2117#ifdef CONFIG_SCHED_HRTICK
2118static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
2119{
2120	struct sched_entity *se = &p->se;
2121	struct cfs_rq *cfs_rq = cfs_rq_of(se);
2122
2123	WARN_ON(task_rq(p) != rq);
2124
2125	if (cfs_rq->nr_running > 1) {
2126		u64 slice = sched_slice(cfs_rq, se);
2127		u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
2128		s64 delta = slice - ran;
2129
2130		if (delta < 0) {
2131			if (rq->curr == p)
2132				resched_task(p);
2133			return;
2134		}
2135
2136		/*
2137		 * Don't schedule slices shorter than 10000ns, that just
2138		 * doesn't make sense. Rely on vruntime for fairness.
2139		 */
2140		if (rq->curr != p)
2141			delta = max_t(s64, 10000LL, delta);
2142
2143		hrtick_start(rq, delta);
2144	}
2145}
2146
2147/*
2148 * called from enqueue/dequeue and updates the hrtick when the
2149 * current task is from our class and nr_running is low enough
2150 * to matter.
2151 */
2152static void hrtick_update(struct rq *rq)
2153{
2154	struct task_struct *curr = rq->curr;
2155
2156	if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class)
2157		return;
2158
2159	if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
2160		hrtick_start_fair(rq, curr);
2161}
2162#else /* !CONFIG_SCHED_HRTICK */
/* Without CONFIG_SCHED_HRTICK both hrtick helpers are no-ops. */
static inline void hrtick_start_fair(struct rq *rq, struct task_struct *p) { }

static inline void hrtick_update(struct rq *rq) { }
2171#endif
2172
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2173/*
2174 * The enqueue_task method is called before nr_running is
2175 * increased. Here we update the fair scheduling stats and
2176 * then put the task into the rbtree:
2177 */
2178static void
2179enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
2180{
2181	struct cfs_rq *cfs_rq;
2182	struct sched_entity *se = &p->se;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2183
2184	for_each_sched_entity(se) {
2185		if (se->on_rq)
2186			break;
2187		cfs_rq = cfs_rq_of(se);
2188		enqueue_entity(cfs_rq, se, flags);
2189
2190		/*
2191		 * end evaluation on encountering a throttled cfs_rq
2192		 *
2193		 * note: in the case of encountering a throttled cfs_rq we will
2194		 * post the final h_nr_running increment below.
2195		*/
2196		if (cfs_rq_throttled(cfs_rq))
2197			break;
2198		cfs_rq->h_nr_running++;
 
2199
2200		flags = ENQUEUE_WAKEUP;
2201	}
2202
2203	for_each_sched_entity(se) {
2204		cfs_rq = cfs_rq_of(se);
2205		cfs_rq->h_nr_running++;
 
2206
2207		if (cfs_rq_throttled(cfs_rq))
2208			break;
2209
2210		update_cfs_load(cfs_rq, 0);
2211		update_cfs_shares(cfs_rq);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2212	}
2213
2214	if (!se)
2215		inc_nr_running(rq);
2216	hrtick_update(rq);
2217}
2218
2219static void set_next_buddy(struct sched_entity *se);
2220
2221/*
2222 * The dequeue_task method is called before nr_running is
2223 * decreased. We remove the task from the rbtree and
2224 * update the fair scheduling stats:
2225 */
2226static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
2227{
2228	struct cfs_rq *cfs_rq;
2229	struct sched_entity *se = &p->se;
2230	int task_sleep = flags & DEQUEUE_SLEEP;
 
2231
2232	for_each_sched_entity(se) {
2233		cfs_rq = cfs_rq_of(se);
2234		dequeue_entity(cfs_rq, se, flags);
2235
2236		/*
2237		 * end evaluation on encountering a throttled cfs_rq
2238		 *
2239		 * note: in the case of encountering a throttled cfs_rq we will
2240		 * post the final h_nr_running decrement below.
2241		*/
2242		if (cfs_rq_throttled(cfs_rq))
2243			break;
2244		cfs_rq->h_nr_running--;
 
2245
2246		/* Don't dequeue parent if it has other entities besides us */
2247		if (cfs_rq->load.weight) {
 
 
2248			/*
2249			 * Bias pick_next to pick a task from this cfs_rq, as
2250			 * p is sleeping when it is within its sched_slice.
2251			 */
2252			if (task_sleep && parent_entity(se))
2253				set_next_buddy(parent_entity(se));
2254
2255			/* avoid re-evaluating load for this entity */
2256			se = parent_entity(se);
2257			break;
2258		}
2259		flags |= DEQUEUE_SLEEP;
2260	}
2261
2262	for_each_sched_entity(se) {
2263		cfs_rq = cfs_rq_of(se);
2264		cfs_rq->h_nr_running--;
 
2265
2266		if (cfs_rq_throttled(cfs_rq))
2267			break;
2268
2269		update_cfs_load(cfs_rq, 0);
2270		update_cfs_shares(cfs_rq);
2271	}
2272
2273	if (!se)
2274		dec_nr_running(rq);
 
 
2275	hrtick_update(rq);
2276}
2277
2278#ifdef CONFIG_SMP
2279/* Used instead of source_load when we know the type == 0 */
2280static unsigned long weighted_cpuload(const int cpu)
2281{
2282	return cpu_rq(cpu)->load.weight;
2283}
2284
2285/*
2286 * Return a low guess at the load of a migration-source cpu weighted
2287 * according to the scheduling class and "nice" value.
2288 *
2289 * We want to under-estimate the load of migration sources, to
2290 * balance conservatively.
2291 */
2292static unsigned long source_load(int cpu, int type)
2293{
2294	struct rq *rq = cpu_rq(cpu);
2295	unsigned long total = weighted_cpuload(cpu);
2296
2297	if (type == 0 || !sched_feat(LB_BIAS))
2298		return total;
2299
2300	return min(rq->cpu_load[type-1], total);
2301}
 
 
 
 
 
 
 
2302
2303/*
2304 * Return a high guess at the load of a migration-target cpu weighted
2305 * according to the scheduling class and "nice" value.
2306 */
2307static unsigned long target_load(int cpu, int type)
2308{
2309	struct rq *rq = cpu_rq(cpu);
2310	unsigned long total = weighted_cpuload(cpu);
2311
2312	if (type == 0 || !sched_feat(LB_BIAS))
2313		return total;
 
2314
2315	return max(rq->cpu_load[type-1], total);
 
 
2316}
2317
2318static unsigned long power_of(int cpu)
2319{
2320	return cpu_rq(cpu)->cpu_power;
2321}
2322
2323static unsigned long cpu_avg_load_per_task(int cpu)
2324{
2325	struct rq *rq = cpu_rq(cpu);
2326	unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
 
2327
2328	if (nr_running)
2329		return rq->load.weight / nr_running;
2330
2331	return 0;
2332}
2333
2334
2335static void task_waking_fair(struct task_struct *p)
2336{
2337	struct sched_entity *se = &p->se;
2338	struct cfs_rq *cfs_rq = cfs_rq_of(se);
2339	u64 min_vruntime;
 
 
 
 
 
2340
2341#ifndef CONFIG_64BIT
2342	u64 min_vruntime_copy;
2343
2344	do {
2345		min_vruntime_copy = cfs_rq->min_vruntime_copy;
2346		smp_rmb();
2347		min_vruntime = cfs_rq->min_vruntime;
2348	} while (min_vruntime != min_vruntime_copy);
2349#else
2350	min_vruntime = cfs_rq->min_vruntime;
2351#endif
2352
2353	se->vruntime -= min_vruntime;
2354}
2355
2356#ifdef CONFIG_FAIR_GROUP_SCHED
2357/*
2358 * effective_load() calculates the load change as seen from the root_task_group
2359 *
2360 * Adding load to a group doesn't make a group heavier, but can cause movement
2361 * of group shares between cpus. Assuming the shares were perfectly aligned one
2362 * can calculate the shift in shares.
2363 *
2364 * Calculate the effective load difference if @wl is added (subtracted) to @tg
2365 * on this @cpu and results in a total addition (subtraction) of @wg to the
2366 * total group weight.
2367 *
2368 * Given a runqueue weight distribution (rw_i) we can compute a shares
2369 * distribution (s_i) using:
2370 *
2371 *   s_i = rw_i / \Sum rw_j						(1)
 
2372 *
2373 * Suppose we have 4 CPUs and our @tg is a direct child of the root group and
2374 * has 7 equal weight tasks, distributed as below (rw_i), with the resulting
2375 * shares distribution (s_i):
2376 *
2377 *   rw_i = {   2,   4,   1,   0 }
2378 *   s_i  = { 2/7, 4/7, 1/7,   0 }
2379 *
2380 * As per wake_affine() we're interested in the load of two CPUs (the CPU the
2381 * task used to run on and the CPU the waker is running on), we need to
2382 * compute the effect of waking a task on either CPU and, in case of a sync
2383 * wakeup, compute the effect of the current task going to sleep.
2384 *
2385 * So for a change of @wl to the local @cpu with an overall group weight change
2386 * of @wl we can compute the new shares distribution (s'_i) using:
2387 *
2388 *   s'_i = (rw_i + @wl) / (@wg + \Sum rw_j)				(2)
2389 *
2390 * Suppose we're interested in CPUs 0 and 1, and want to compute the load
2391 * differences in waking a task to CPU 0. The additional task changes the
2392 * weight and shares distributions like:
2393 *
2394 *   rw'_i = {   3,   4,   1,   0 }
2395 *   s'_i  = { 3/8, 4/8, 1/8,   0 }
2396 *
2397 * We can then compute the difference in effective weight by using:
2398 *
2399 *   dw_i = S * (s'_i - s_i)						(3)
2400 *
2401 * Where 'S' is the group weight as seen by its parent.
2402 *
2403 * Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7)
2404 * times the weight of the group. The effect on CPU 1 would be -4/56 (4/8 -
2405 * 4/7) times the weight of the group.
2406 */
2407static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
2408{
2409	struct sched_entity *se = tg->se[cpu];
 
 
2410
2411	if (!tg->parent)	/* the trivial, non-cgroup case */
2412		return wl;
 
 
 
 
2413
2414	for_each_sched_entity(se) {
2415		long w, W;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2416
2417		tg = se->my_q->tg;
 
2418
2419		/*
2420		 * W = @wg + \Sum rw_j
2421		 */
2422		W = wg + calc_tg_weight(tg, se->my_q);
2423
2424		/*
2425		 * w = rw_i + @wl
2426		 */
2427		w = se->my_q->load.weight + wl;
 
 
2428
2429		/*
2430		 * wl = S * s'_i; see (2)
2431		 */
2432		if (W > 0 && w < W)
2433			wl = (w * tg->shares) / W;
2434		else
2435			wl = tg->shares;
2436
2437		/*
2438		 * Per the above, wl is the new se->load.weight value; since
2439		 * those are clipped to [MIN_SHARES, ...) do so now. See
2440		 * calc_cfs_shares().
2441		 */
2442		if (wl < MIN_SHARES)
2443			wl = MIN_SHARES;
2444
2445		/*
2446		 * wl = dw_i = S * (s'_i - s_i); see (3)
2447		 */
2448		wl -= se->load.weight;
2449
2450		/*
2451		 * Recursively apply this logic to all parent groups to compute
2452		 * the final effective load change on the root group. Since
2453		 * only the @tg group gets extra weight, all parent groups can
2454		 * only redistribute existing shares. @wl is the shift in shares
2455		 * resulting from this level per the above.
2456		 */
2457		wg = 0;
2458	}
2459
2460	return wl;
2461}
2462#else
2463
/*
 * Variant used when the group-scheduling implementation above is compiled
 * out: the effective load change is simply @wl.
 *
 * The arguments and the result are signed, matching the group-scheduling
 * variant. wake_affine() passes negated weights and adds the result to an
 * s64; with an unsigned long interface a negative delta turns into a large
 * positive number wherever unsigned long is narrower than 64 bits.
 */
static inline long effective_load(struct task_group *tg, int cpu,
		long wl, long wg)
{
	return wl;
}
2469
2470#endif
2471
2472static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
2473{
2474	s64 this_load, load;
2475	int idx, this_cpu, prev_cpu;
2476	unsigned long tl_per_task;
2477	struct task_group *tg;
2478	unsigned long weight;
2479	int balanced;
2480
2481	idx	  = sd->wake_idx;
2482	this_cpu  = smp_processor_id();
2483	prev_cpu  = task_cpu(p);
2484	load	  = source_load(prev_cpu, idx);
2485	this_load = target_load(this_cpu, idx);
2486
2487	/*
2488	 * If sync wakeup then subtract the (maximum possible)
2489	 * effect of the currently running task from the load
2490	 * of the current CPU:
 
2491	 */
2492	if (sync) {
2493		tg = task_group(current);
2494		weight = current->se.load.weight;
2495
2496		this_load += effective_load(tg, this_cpu, -weight, -weight);
2497		load += effective_load(tg, prev_cpu, 0, -weight);
2498	}
2499
2500	tg = task_group(p);
2501	weight = p->se.load.weight;
 
 
2502
2503	/*
2504	 * In low-load situations, where prev_cpu is idle and this_cpu is idle
2505	 * due to the sync cause above having dropped this_load to 0, we'll
2506	 * always have an imbalance, but there's really nothing you can do
2507	 * about that, so that's good too.
2508	 *
2509	 * Otherwise check if either cpus are near enough in load to allow this
2510	 * task to be woken on this_cpu.
2511	 */
2512	if (this_load > 0) {
2513		s64 this_eff_load, prev_eff_load;
2514
2515		this_eff_load = 100;
2516		this_eff_load *= power_of(prev_cpu);
2517		this_eff_load *= this_load +
2518			effective_load(tg, this_cpu, weight, weight);
2519
2520		prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
2521		prev_eff_load *= power_of(this_cpu);
2522		prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
2523
2524		balanced = this_eff_load <= prev_eff_load;
2525	} else
2526		balanced = true;
2527
2528	/*
2529	 * If the currently running task will sleep within
2530	 * a reasonable amount of time then attract this newly
2531	 * woken task:
2532	 */
2533	if (sync && balanced)
2534		return 1;
2535
2536	schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
2537	tl_per_task = cpu_avg_load_per_task(this_cpu);
2538
2539	if (balanced ||
2540	    (this_load <= load &&
2541	     this_load + target_load(prev_cpu, idx) <= tl_per_task)) {
2542		/*
2543		 * This domain has SD_WAKE_AFFINE and
2544		 * p is cache cold in this domain, and
2545		 * there is no bad imbalance.
2546		 */
2547		schedstat_inc(sd, ttwu_move_affine);
2548		schedstat_inc(p, se.statistics.nr_wakeups_affine);
2549
2550		return 1;
2551	}
2552	return 0;
2553}
2554
2555/*
2556 * find_idlest_group finds and returns the least busy CPU group within the
2557 * domain.
 
 
2558 */
2559static struct sched_group *
2560find_idlest_group(struct sched_domain *sd, struct task_struct *p,
2561		  int this_cpu, int load_idx)
2562{
2563	struct sched_group *idlest = NULL, *group = sd->groups;
2564	unsigned long min_load = ULONG_MAX, this_load = 0;
2565	int imbalance = 100 + (sd->imbalance_pct-100)/2;
 
 
 
 
 
 
2566
2567	do {
2568		unsigned long load, avg_load;
 
2569		int local_group;
2570		int i;
2571
2572		/* Skip over this group if it has no CPUs allowed */
2573		if (!cpumask_intersects(sched_group_cpus(group),
2574					tsk_cpus_allowed(p)))
2575			continue;
2576
2577		local_group = cpumask_test_cpu(this_cpu,
2578					       sched_group_cpus(group));
2579
2580		/* Tally up the load of all CPUs in the group */
 
 
 
2581		avg_load = 0;
 
 
2582
2583		for_each_cpu(i, sched_group_cpus(group)) {
2584			/* Bias balancing toward cpus of our domain */
2585			if (local_group)
2586				load = source_load(i, load_idx);
2587			else
2588				load = target_load(i, load_idx);
 
2589
2590			avg_load += load;
 
2591		}
2592
2593		/* Adjust by relative CPU power of the group */
2594		avg_load = (avg_load * SCHED_POWER_SCALE) / group->sgp->power;
 
 
 
2595
2596		if (local_group) {
2597			this_load = avg_load;
2598		} else if (avg_load < min_load) {
2599			min_load = avg_load;
2600			idlest = group;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2601		}
2602	} while (group = group->next, group != sd->groups);
2603
2604	if (!idlest || 100*this_load < imbalance*min_load)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2605		return NULL;
 
2606	return idlest;
2607}
2608
2609/*
2610 * find_idlest_cpu - find the idlest cpu among the cpus in group.
2611 */
2612static int
2613find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
2614{
2615	unsigned long load, min_load = ULONG_MAX;
2616	int idlest = -1;
 
 
 
2617	int i;
2618
 
 
 
 
2619	/* Traverse only the allowed CPUs */
2620	for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
2621		load = weighted_cpuload(i);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2622
2623		if (load < min_load || (load == min_load && i == this_cpu)) {
2624			min_load = load;
2625			idlest = i;
 
 
2626		}
2627	}
2628
2629	return idlest;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2630}
2631
2632/*
2633 * Try and locate an idle CPU in the sched_domain.
2634 */
2635static int select_idle_sibling(struct task_struct *p, int target)
2636{
2637	int cpu = smp_processor_id();
2638	int prev_cpu = task_cpu(p);
2639	struct sched_domain *sd;
2640	struct sched_group *sg;
2641	int i;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2642
2643	/*
2644	 * If the task is going to be woken-up on this cpu and if it is
2645	 * already idle, then it is the right target.
 
2646	 */
2647	if (target == cpu && idle_cpu(cpu))
2648		return cpu;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2649
2650	/*
2651	 * If the task is going to be woken-up on the cpu where it previously
2652	 * ran and if it is currently idle, then it the right target.
 
 
2653	 */
2654	if (target == prev_cpu && idle_cpu(prev_cpu))
2655		return prev_cpu;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2656
2657	/*
2658	 * Otherwise, iterate the domains and find an elegible idle cpu.
 
 
 
 
 
 
2659	 */
2660	sd = rcu_dereference(per_cpu(sd_llc, target));
2661	for_each_lower_domain(sd) {
2662		sg = sd->groups;
2663		do {
2664			if (!cpumask_intersects(sched_group_cpus(sg),
2665						tsk_cpus_allowed(p)))
2666				goto next;
2667
2668			for_each_cpu(i, sched_group_cpus(sg)) {
2669				if (!idle_cpu(i))
2670					goto next;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2671			}
 
2672
2673			target = cpumask_first_and(sched_group_cpus(sg),
2674					tsk_cpus_allowed(p));
2675			goto done;
2676next:
2677			sg = sg->next;
2678		} while (sg != sd->groups);
 
 
 
2679	}
2680done:
2681	return target;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2682}
2683
2684/*
2685 * sched_balance_self: balance the current task (running on cpu) in domains
2686 * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
2687 * SD_BALANCE_EXEC.
2688 *
2689 * Balance, ie. select the least loaded group.
 
2690 *
2691 * Returns the target CPU number, or the same CPU if no balancing is needed.
2692 *
2693 * preempt must be disabled.
2694 */
2695static int
2696select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
2697{
2698	struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
2699	int cpu = smp_processor_id();
2700	int prev_cpu = task_cpu(p);
2701	int new_cpu = cpu;
2702	int want_affine = 0;
2703	int want_sd = 1;
2704	int sync = wake_flags & WF_SYNC;
 
 
2705
2706	if (p->nr_cpus_allowed == 1)
2707		return prev_cpu;
 
 
 
 
2708
2709	if (sd_flag & SD_BALANCE_WAKE) {
2710		if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
2711			want_affine = 1;
2712		new_cpu = prev_cpu;
2713	}
2714
2715	rcu_read_lock();
2716	for_each_domain(cpu, tmp) {
2717		if (!(tmp->flags & SD_LOAD_BALANCE))
2718			continue;
2719
2720		/*
2721		 * If power savings logic is enabled for a domain, see if we
2722		 * are not overloaded, if so, don't balance wider.
2723		 */
2724		if (tmp->flags & (SD_PREFER_LOCAL)) {
2725			unsigned long power = 0;
2726			unsigned long nr_running = 0;
2727			unsigned long capacity;
2728			int i;
2729
2730			for_each_cpu(i, sched_domain_span(tmp)) {
2731				power += power_of(i);
2732				nr_running += cpu_rq(i)->cfs.nr_running;
2733			}
2734
2735			capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE);
2736
2737			if (nr_running < capacity)
2738				want_sd = 0;
2739		}
2740
2741		/*
2742		 * If both cpu and prev_cpu are part of this domain,
2743		 * cpu is a valid SD_WAKE_AFFINE target.
2744		 */
2745		if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
2746		    cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
2747			affine_sd = tmp;
2748			want_affine = 0;
 
 
 
2749		}
2750
2751		if (!want_sd && !want_affine)
 
 
2752			break;
 
2753
2754		if (!(tmp->flags & sd_flag))
2755			continue;
 
 
 
 
 
2756
2757		if (want_sd)
2758			sd = tmp;
2759	}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2760
2761	if (affine_sd) {
2762		if (cpu == prev_cpu || wake_affine(affine_sd, p, sync))
2763			prev_cpu = cpu;
 
 
 
 
 
2764
2765		new_cpu = select_idle_sibling(p, prev_cpu);
2766		goto unlock;
2767	}
2768
2769	while (sd) {
2770		int load_idx = sd->forkexec_idx;
2771		struct sched_group *group;
2772		int weight;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2773
2774		if (!(sd->flags & sd_flag)) {
2775			sd = sd->child;
2776			continue;
2777		}
2778
2779		if (sd_flag & SD_BALANCE_WAKE)
2780			load_idx = sd->wake_idx;
2781
2782		group = find_idlest_group(sd, p, cpu, load_idx);
2783		if (!group) {
2784			sd = sd->child;
2785			continue;
2786		}
2787
2788		new_cpu = find_idlest_cpu(group, p, cpu);
2789		if (new_cpu == -1 || new_cpu == cpu) {
2790			/* Now try balancing at a lower domain level of cpu */
2791			sd = sd->child;
2792			continue;
2793		}
2794
2795		/* Now try balancing at a lower domain level of new_cpu */
2796		cpu = new_cpu;
2797		weight = sd->span_weight;
2798		sd = NULL;
2799		for_each_domain(cpu, tmp) {
2800			if (weight <= tmp->span_weight)
2801				break;
2802			if (tmp->flags & sd_flag)
2803				sd = tmp;
2804		}
2805		/* while loop will break here if sd == NULL */
2806	}
2807unlock:
2808	rcu_read_unlock();
2809
2810	return new_cpu;
2811}
2812#endif /* CONFIG_SMP */
2813
2814static unsigned long
2815wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
2816{
2817	unsigned long gran = sysctl_sched_wakeup_granularity;
2818
2819	/*
2820	 * Since its curr running now, convert the gran from real-time
2821	 * to virtual-time in his units.
2822	 *
2823	 * By using 'se' instead of 'curr' we penalize light tasks, so
2824	 * they get preempted easier. That is, if 'se' < 'curr' then
2825	 * the resulting gran will be larger, therefore penalizing the
2826	 * lighter, if otoh 'se' > 'curr' then the resulting gran will
2827	 * be smaller, again penalizing the lighter task.
2828	 *
2829	 * This is especially important for buddies when the leftmost
2830	 * task is higher priority than the buddy.
2831	 */
2832	return calc_delta_fair(gran, se);
2833}
2834
2835/*
2836 * Should 'se' preempt 'curr'.
2837 *
2838 *             |s1
2839 *        |s2
2840 *   |s3
2841 *         g
2842 *      |<--->|c
2843 *
2844 *  w(c, s1) = -1
2845 *  w(c, s2) =  0
2846 *  w(c, s3) =  1
2847 *
2848 */
2849static int
2850wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
2851{
2852	s64 gran, vdiff = curr->vruntime - se->vruntime;
2853
2854	if (vdiff <= 0)
2855		return -1;
2856
2857	gran = wakeup_gran(curr, se);
2858	if (vdiff > gran)
2859		return 1;
2860
2861	return 0;
2862}
2863
2864static void set_last_buddy(struct sched_entity *se)
2865{
2866	if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
2867		return;
2868
2869	for_each_sched_entity(se)
 
 
2870		cfs_rq_of(se)->last = se;
 
2871}
2872
2873static void set_next_buddy(struct sched_entity *se)
2874{
2875	if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
2876		return;
2877
2878	for_each_sched_entity(se)
 
 
2879		cfs_rq_of(se)->next = se;
 
2880}
2881
2882static void set_skip_buddy(struct sched_entity *se)
2883{
2884	for_each_sched_entity(se)
2885		cfs_rq_of(se)->skip = se;
2886}
2887
2888/*
2889 * Preempt the current task with a newly woken task if needed:
2890 */
2891static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
2892{
2893	struct task_struct *curr = rq->curr;
2894	struct sched_entity *se = &curr->se, *pse = &p->se;
2895	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
2896	int scale = cfs_rq->nr_running >= sched_nr_latency;
2897	int next_buddy_marked = 0;
2898
2899	if (unlikely(se == pse))
2900		return;
2901
2902	/*
2903	 * This is possible from callers such as move_task(), in which we
2904	 * unconditionally check_prempt_curr() after an enqueue (which may have
2905	 * lead to a throttle).  This both saves work and prevents false
2906	 * next-buddy nomination below.
2907	 */
2908	if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
2909		return;
2910
2911	if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
2912		set_next_buddy(pse);
2913		next_buddy_marked = 1;
2914	}
2915
2916	/*
2917	 * We can come here with TIF_NEED_RESCHED already set from new task
2918	 * wake up path.
2919	 *
2920	 * Note: this also catches the edge-case of curr being in a throttled
2921	 * group (e.g. via set_curr_task), since update_curr() (in the
2922	 * enqueue of curr) will have resulted in resched being set.  This
2923	 * prevents us from potentially nominating it as a false LAST_BUDDY
2924	 * below.
2925	 */
2926	if (test_tsk_need_resched(curr))
2927		return;
2928
2929	/* Idle tasks are by definition preempted by non-idle tasks. */
2930	if (unlikely(curr->policy == SCHED_IDLE) &&
2931	    likely(p->policy != SCHED_IDLE))
2932		goto preempt;
2933
2934	/*
2935	 * Batch and idle tasks do not preempt non-idle tasks (their preemption
2936	 * is driven by the tick):
2937	 */
2938	if (unlikely(p->policy != SCHED_NORMAL))
2939		return;
2940
2941	find_matching_se(&se, &pse);
2942	update_curr(cfs_rq_of(se));
2943	BUG_ON(!pse);
2944	if (wakeup_preempt_entity(se, pse) == 1) {
2945		/*
2946		 * Bias pick_next to pick the sched entity that is
2947		 * triggering this preemption.
2948		 */
2949		if (!next_buddy_marked)
2950			set_next_buddy(pse);
2951		goto preempt;
2952	}
2953
2954	return;
2955
2956preempt:
2957	resched_task(curr);
2958	/*
2959	 * Only set the backward buddy when the current task is still
2960	 * on the rq. This can happen when a wakeup gets interleaved
2961	 * with schedule on the ->pre_schedule() or idle_balance()
2962	 * point, either of which can * drop the rq lock.
2963	 *
2964	 * Also, during early boot the idle thread is in the fair class,
2965	 * for obvious reasons its a bad idea to schedule back to it.
2966	 */
2967	if (unlikely(!se->on_rq || curr == rq->idle))
2968		return;
2969
2970	if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
2971		set_last_buddy(se);
2972}
2973
2974static struct task_struct *pick_next_task_fair(struct rq *rq)
 
2975{
2976	struct task_struct *p;
2977	struct cfs_rq *cfs_rq = &rq->cfs;
2978	struct sched_entity *se;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2979
2980	if (!cfs_rq->nr_running)
2981		return NULL;
 
 
 
 
 
 
 
2982
2983	do {
2984		se = pick_next_entity(cfs_rq);
2985		set_next_entity(cfs_rq, se);
2986		cfs_rq = group_cfs_rq(se);
2987	} while (cfs_rq);
2988
2989	p = task_of(se);
 
 
 
 
 
 
 
 
 
 
 
2990	if (hrtick_enabled(rq))
2991		hrtick_start_fair(rq, p);
2992
 
 
2993	return p;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2994}
2995
2996/*
2997 * Account for a descheduled task:
2998 */
2999static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
3000{
3001	struct sched_entity *se = &prev->se;
3002	struct cfs_rq *cfs_rq;
3003
3004	for_each_sched_entity(se) {
3005		cfs_rq = cfs_rq_of(se);
3006		put_prev_entity(cfs_rq, se);
3007	}
3008}
3009
3010/*
3011 * sched_yield() is very simple
3012 *
3013 * The magic of dealing with the ->skip buddy is in pick_next_entity.
3014 */
3015static void yield_task_fair(struct rq *rq)
3016{
3017	struct task_struct *curr = rq->curr;
3018	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
3019	struct sched_entity *se = &curr->se;
3020
3021	/*
3022	 * Are we the only task in the tree?
3023	 */
3024	if (unlikely(rq->nr_running == 1))
3025		return;
3026
3027	clear_buddies(cfs_rq, se);
3028
3029	if (curr->policy != SCHED_BATCH) {
3030		update_rq_clock(rq);
3031		/*
3032		 * Update run-time statistics of the 'current'.
3033		 */
3034		update_curr(cfs_rq);
3035		/*
3036		 * Tell update_rq_clock() that we've just updated,
3037		 * so we don't do microscopic update in schedule()
3038		 * and double the fastpath cost.
3039		 */
3040		 rq->skip_clock_update = 1;
3041	}
3042
3043	set_skip_buddy(se);
3044}
3045
3046static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
3047{
3048	struct sched_entity *se = &p->se;
3049
3050	/* throttled hierarchies are not runnable */
3051	if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
3052		return false;
3053
3054	/* Tell the scheduler that we'd really like pse to run next. */
3055	set_next_buddy(se);
3056
3057	yield_task_fair(rq);
3058
3059	return true;
3060}
3061
3062#ifdef CONFIG_SMP
3063/**************************************************
3064 * Fair scheduling class load-balancing methods:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3065 */
3066
3067static unsigned long __read_mostly max_load_balance_interval = HZ/10;
3068
 
 
 
 
 
 
 
 
 
3069#define LBF_ALL_PINNED	0x01
3070#define LBF_NEED_BREAK	0x02
 
 
 
 
3071
3072struct lb_env {
3073	struct sched_domain	*sd;
3074
 
3075	int			src_cpu;
3076	struct rq		*src_rq;
3077
3078	int			dst_cpu;
3079	struct rq		*dst_rq;
3080
 
 
3081	enum cpu_idle_type	idle;
3082	long			imbalance;
 
 
 
3083	unsigned int		flags;
3084
3085	unsigned int		loop;
3086	unsigned int		loop_break;
3087	unsigned int		loop_max;
 
 
 
 
3088};
3089
3090/*
3091 * move_task - move a task from one runqueue to another runqueue.
3092 * Both runqueues must be locked.
3093 */
3094static void move_task(struct task_struct *p, struct lb_env *env)
3095{
3096	deactivate_task(env->src_rq, p, 0);
3097	set_task_cpu(p, env->dst_cpu);
3098	activate_task(env->dst_rq, p, 0);
3099	check_preempt_curr(env->dst_rq, p, 0);
3100}
3101
3102/*
3103 * Is this task likely cache-hot:
3104 */
3105static int
3106task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
3107{
3108	s64 delta;
3109
 
 
3110	if (p->sched_class != &fair_sched_class)
3111		return 0;
3112
3113	if (unlikely(p->policy == SCHED_IDLE))
3114		return 0;
3115
3116	/*
3117	 * Buddy candidates are cache hot:
3118	 */
3119	if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running &&
3120			(&p->se == cfs_rq_of(&p->se)->next ||
3121			 &p->se == cfs_rq_of(&p->se)->last))
3122		return 1;
3123
3124	if (sysctl_sched_migration_cost == -1)
3125		return 1;
3126	if (sysctl_sched_migration_cost == 0)
3127		return 0;
3128
3129	delta = now - p->se.exec_start;
3130
3131	return delta < (s64)sysctl_sched_migration_cost;
3132}
3133
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3134/*
3135 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
3136 */
3137static
3138int can_migrate_task(struct task_struct *p, struct lb_env *env)
3139{
3140	int tsk_cache_hot = 0;
 
 
 
3141	/*
3142	 * We do not migrate tasks that are:
3143	 * 1) running (obviously), or
3144	 * 2) cannot be migrated to this CPU due to cpus_allowed, or
3145	 * 3) are cache-hot on their current CPU.
 
3146	 */
3147	if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
3148		schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3149		return 0;
3150	}
 
 
3151	env->flags &= ~LBF_ALL_PINNED;
3152
3153	if (task_running(env->src_rq, p)) {
3154		schedstat_inc(p, se.statistics.nr_failed_migrations_running);
3155		return 0;
3156	}
3157
3158	/*
3159	 * Aggressive migration if:
3160	 * 1) task is cache cold, or
3161	 * 2) too many balance attempts have failed.
3162	 */
3163
3164	tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd);
3165	if (!tsk_cache_hot ||
3166		env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
3167#ifdef CONFIG_SCHEDSTATS
3168		if (tsk_cache_hot) {
3169			schedstat_inc(env->sd, lb_hot_gained[env->idle]);
3170			schedstat_inc(p, se.statistics.nr_forced_migrations);
 
 
3171		}
3172#endif
3173		return 1;
3174	}
3175
3176	if (tsk_cache_hot) {
3177		schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
3178		return 0;
3179	}
3180	return 1;
 
 
 
 
 
 
 
 
3181}
3182
3183/*
3184 * move_one_task tries to move exactly one task from busiest to this_rq, as
3185 * part of active balancing operations within "domain".
3186 * Returns 1 if successful and 0 otherwise.
3187 *
3188 * Called with both runqueues locked.
3189 */
3190static int move_one_task(struct lb_env *env)
3191{
3192	struct task_struct *p, *n;
3193
3194	list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
3195		if (throttled_lb_pair(task_group(p), env->src_rq->cpu, env->dst_cpu))
3196			continue;
3197
 
 
3198		if (!can_migrate_task(p, env))
3199			continue;
3200
3201		move_task(p, env);
 
3202		/*
3203		 * Right now, this is only the second place move_task()
3204		 * is called, so we can safely collect move_task()
3205		 * stats here rather than inside move_task().
 
3206		 */
3207		schedstat_inc(env->sd, lb_gained[env->idle]);
3208		return 1;
3209	}
3210	return 0;
3211}
3212
3213static unsigned long task_h_load(struct task_struct *p);
3214
3215static const unsigned int sched_nr_migrate_break = 32;
3216
3217/*
3218 * move_tasks tries to move up to imbalance weighted load from busiest to
3219 * this_rq, as part of a balancing operation within domain "sd".
3220 * Returns 1 if successful and 0 otherwise.
3221 *
3222 * Called with both runqueues locked.
3223 */
3224static int move_tasks(struct lb_env *env)
3225{
3226	struct list_head *tasks = &env->src_rq->cfs_tasks;
3227	struct task_struct *p;
3228	unsigned long load;
3229	int pulled = 0;
 
 
3230
3231	if (env->imbalance <= 0)
3232		return 0;
3233
3234	while (!list_empty(tasks)) {
3235		p = list_first_entry(tasks, struct task_struct, se.group_node);
 
 
 
 
 
 
 
3236
3237		env->loop++;
3238		/* We've more or less seen every task there is, call it quits */
3239		if (env->loop > env->loop_max)
3240			break;
3241
3242		/* take a breather every nr_migrate tasks */
3243		if (env->loop > env->loop_break) {
3244			env->loop_break += sched_nr_migrate_break;
3245			env->flags |= LBF_NEED_BREAK;
3246			break;
3247		}
3248
3249		if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
3250			goto next;
3251
3252		load = task_h_load(p);
3253
3254		if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
3255			goto next;
3256
3257		if ((load / 2) > env->imbalance)
3258			goto next;
3259
3260		if (!can_migrate_task(p, env))
3261			goto next;
3262
3263		move_task(p, env);
3264		pulled++;
3265		env->imbalance -= load;
3266
3267#ifdef CONFIG_PREEMPT
3268		/*
3269		 * NEWIDLE balancing is a source of latency, so preemptible
3270		 * kernels will stop after the first task is pulled to minimize
3271		 * the critical section.
3272		 */
3273		if (env->idle == CPU_NEWLY_IDLE)
3274			break;
3275#endif
3276
3277		/*
3278		 * We only want to steal up to the prescribed amount of
3279		 * weighted load.
3280		 */
3281		if (env->imbalance <= 0)
3282			break;
3283
3284		continue;
3285next:
3286		list_move_tail(&p->se.group_node, tasks);
3287	}
3288
3289	/*
3290	 * Right now, this is one of only two places move_task() is called,
3291	 * so we can safely collect move_task() stats here rather than
3292	 * inside move_task().
3293	 */
3294	schedstat_add(env->sd, lb_gained[env->idle], pulled);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3295
3296	return pulled;
 
 
 
 
 
 
 
 
 
 
 
3297}
3298
3299#ifdef CONFIG_FAIR_GROUP_SCHED
3300/*
3301 * update tg->load_weight by folding this cpu's load_avg
 
3302 */
3303static int update_shares_cpu(struct task_group *tg, int cpu)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3304{
3305	struct cfs_rq *cfs_rq;
3306	unsigned long flags;
3307	struct rq *rq;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3308
3309	if (!tg->se[cpu])
3310		return 0;
 
 
 
 
 
 
3311
3312	rq = cpu_rq(cpu);
3313	cfs_rq = tg->cfs_rq[cpu];
3314
3315	raw_spin_lock_irqsave(&rq->lock, flags);
 
 
 
3316
3317	update_rq_clock(rq);
3318	update_cfs_load(cfs_rq, 1);
3319
3320	/*
3321	 * We need to update shares after updating tg->load_weight in
3322	 * order to adjust the weight of groups with long running tasks.
3323	 */
3324	update_cfs_shares(cfs_rq);
3325
3326	raw_spin_unlock_irqrestore(&rq->lock, flags);
 
3327
3328	return 0;
3329}
3330
3331static void update_shares(int cpu)
3332{
3333	struct cfs_rq *cfs_rq;
3334	struct rq *rq = cpu_rq(cpu);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3335
3336	rcu_read_lock();
3337	/*
3338	 * Iterates the task_group tree in a bottom up fashion, see
3339	 * list_add_leaf_cfs_rq() for details.
3340	 */
3341	for_each_leaf_cfs_rq(rq, cfs_rq) {
3342		/* throttled entities do not contribute to load */
3343		if (throttled_hierarchy(cfs_rq))
3344			continue;
 
 
 
 
 
 
 
 
 
 
 
 
 
3345
3346		update_shares_cpu(cfs_rq->tg, cpu);
 
 
3347	}
3348	rcu_read_unlock();
 
 
3349}
3350
3351/*
3352 * Compute the cpu's hierarchical load factor for each task group.
3353 * This needs to be done in a top-down fashion because the load of a child
3354 * group is a fraction of its parents load.
3355 */
3356static int tg_load_down(struct task_group *tg, void *data)
3357{
 
 
 
3358	unsigned long load;
3359	long cpu = (long)data;
3360
3361	if (!tg->parent) {
3362		load = cpu_rq(cpu)->load.weight;
3363	} else {
3364		load = tg->parent->cfs_rq[cpu]->h_load;
3365		load *= tg->se[cpu]->load.weight;
3366		load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
 
 
 
3367	}
3368
3369	tg->cfs_rq[cpu]->h_load = load;
 
 
 
3370
3371	return 0;
3372}
3373
3374static void update_h_load(long cpu)
3375{
3376	rcu_read_lock();
3377	walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
3378	rcu_read_unlock();
3379}
3380
3381static unsigned long task_h_load(struct task_struct *p)
3382{
3383	struct cfs_rq *cfs_rq = task_cfs_rq(p);
3384	unsigned long load;
3385
3386	load = p->se.load.weight;
3387	load = div_u64(load * cfs_rq->h_load, cfs_rq->load.weight + 1);
3388
3389	return load;
 
 
3390}
3391#else
/* No group scheduling: there are no shares to update. */
static inline void update_shares(int cpu)
{
}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3395
/* No group scheduling: there is no hierarchical load to compute. */
static inline void update_h_load(long cpu)
{
}
3399
3400static unsigned long task_h_load(struct task_struct *p)
3401{
3402	return p->se.load.weight;
3403}
3404#endif
3405
3406/********** Helpers for find_busiest_group ************************/
/*
 * sd_lb_stats - statistics of a sched_domain gathered while load
 *		 balancing.
 */
struct sd_lb_stats {
	struct sched_group *busiest;	/* Busiest group in this sd */
	struct sched_group *this;	/* Local group in this sd */
	unsigned long total_load;	/* Total load of all groups in sd */
	unsigned long total_pwr;	/* Total power of all groups in sd */
	unsigned long avg_load;		/* Average load across all groups in sd */

	/* Statistics of the local group */
	unsigned long this_load;
	unsigned long this_load_per_task;
	unsigned long this_nr_running;
	unsigned long this_has_capacity;
	unsigned int  this_idle_cpus;

	/* Statistics of the busiest group */
	unsigned int  busiest_idle_cpus;
	unsigned long max_load;
	unsigned long busiest_load_per_task;
	unsigned long busiest_nr_running;
	unsigned long busiest_group_capacity;
	unsigned long busiest_has_capacity;
	unsigned int  busiest_group_weight;

	int group_imb;			/* Is there imbalance in this sd */
};
3436
/*
 * sg_lb_stats - per-sched_group statistics needed for load balancing.
 *
 * Zeroed by update_sd_lb_stats() before each group is examined and
 * filled in by update_sg_lb_stats().
 */
struct sg_lb_stats {
	/* Avg load across the CPUs of the group */
	unsigned long avg_load;
	/* Total load over the CPUs of the group */
	unsigned long group_load;
	/* Nr tasks running in the group */
	unsigned long sum_nr_running;
	/* Weighted load of group's tasks */
	unsigned long sum_weighted_load;
	/* Group power expressed in units of SCHED_POWER_SCALE, rounded */
	unsigned long group_capacity;
	/* Number of CPUs in the group found idle */
	unsigned long idle_cpus;
	/* Copied from the group's group_weight */
	unsigned long group_weight;
	/* Is there an imbalance in the group ? */
	int group_imb;
	/* Is there extra capacity in the group? */
	int group_has_capacity;
};
3451
3452/**
3453 * get_sd_load_idx - Obtain the load index for a given sched domain.
3454 * @sd: The sched_domain whose load_idx is to be obtained.
3455 * @idle: The Idle status of the CPU for whose sd load_icx is obtained.
3456 */
3457static inline int get_sd_load_idx(struct sched_domain *sd,
3458					enum cpu_idle_type idle)
3459{
3460	int load_idx;
3461
3462	switch (idle) {
3463	case CPU_NOT_IDLE:
3464		load_idx = sd->busy_idx;
3465		break;
3466
3467	case CPU_NEWLY_IDLE:
3468		load_idx = sd->newidle_idx;
3469		break;
3470	default:
3471		load_idx = sd->idle_idx;
3472		break;
3473	}
3474
3475	return load_idx;
3476}
3477
3478unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
3479{
3480	return SCHED_POWER_SCALE;
3481}
3482
3483unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
3484{
3485	return default_scale_freq_power(sd, cpu);
3486}
3487
3488unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
3489{
3490	unsigned long weight = sd->span_weight;
3491	unsigned long smt_gain = sd->smt_gain;
3492
3493	smt_gain /= weight;
3494
3495	return smt_gain;
 
 
3496}
3497
3498unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
3499{
3500	return default_scale_smt_power(sd, cpu);
3501}
3502
3503unsigned long scale_rt_power(int cpu)
3504{
3505	struct rq *rq = cpu_rq(cpu);
3506	u64 total, available, age_stamp, avg;
 
 
3507
3508	/*
3509	 * Since we're reading these variables without serialization make sure
3510	 * we read them once before doing sanity checks on them.
3511	 */
3512	age_stamp = ACCESS_ONCE(rq->age_stamp);
3513	avg = ACCESS_ONCE(rq->rt_avg);
3514
3515	total = sched_avg_period() + (rq->clock - age_stamp);
 
3516
3517	if (unlikely(total < avg)) {
3518		/* Ensures that power won't end up being negative */
3519		available = 0;
3520	} else {
3521		available = total - avg;
3522	}
3523
3524	if (unlikely((s64)total < SCHED_POWER_SCALE))
3525		total = SCHED_POWER_SCALE;
3526
3527	total >>= SCHED_POWER_SHIFT;
3528
3529	return div_u64(available, total);
3530}
3531
3532static void update_cpu_power(struct sched_domain *sd, int cpu)
3533{
3534	unsigned long weight = sd->span_weight;
3535	unsigned long power = SCHED_POWER_SCALE;
3536	struct sched_group *sdg = sd->groups;
3537
3538	if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
3539		if (sched_feat(ARCH_POWER))
3540			power *= arch_scale_smt_power(sd, cpu);
3541		else
3542			power *= default_scale_smt_power(sd, cpu);
3543
3544		power >>= SCHED_POWER_SHIFT;
3545	}
3546
3547	sdg->sgp->power_orig = power;
3548
3549	if (sched_feat(ARCH_POWER))
3550		power *= arch_scale_freq_power(sd, cpu);
3551	else
3552		power *= default_scale_freq_power(sd, cpu);
3553
3554	power >>= SCHED_POWER_SHIFT;
3555
3556	power *= scale_rt_power(cpu);
3557	power >>= SCHED_POWER_SHIFT;
3558
3559	if (!power)
3560		power = 1;
3561
3562	cpu_rq(cpu)->cpu_power = power;
3563	sdg->sgp->power = power;
 
 
3564}
3565
3566void update_group_power(struct sched_domain *sd, int cpu)
3567{
3568	struct sched_domain *child = sd->child;
3569	struct sched_group *group, *sdg = sd->groups;
3570	unsigned long power;
3571	unsigned long interval;
3572
3573	interval = msecs_to_jiffies(sd->balance_interval);
3574	interval = clamp(interval, 1UL, max_load_balance_interval);
3575	sdg->sgp->next_update = jiffies + interval;
3576
3577	if (!child) {
3578		update_cpu_power(sd, cpu);
3579		return;
3580	}
3581
3582	power = 0;
 
 
3583
3584	if (child->flags & SD_OVERLAP) {
3585		/*
3586		 * SD_OVERLAP domains cannot assume that child groups
3587		 * span the current group.
3588		 */
3589
3590		for_each_cpu(cpu, sched_group_cpus(sdg))
3591			power += power_of(cpu);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3592	} else  {
3593		/*
3594		 * !SD_OVERLAP domains can assume that child groups
3595		 * span the current group.
3596		 */ 
3597
3598		group = child->groups;
3599		do {
3600			power += group->sgp->power;
 
 
 
 
3601			group = group->next;
3602		} while (group != child->groups);
3603	}
3604
3605	sdg->sgp->power_orig = sdg->sgp->power = power;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3606}
3607
3608/*
3609 * Try and fix up capacity for tiny siblings, this is needed when
3610 * things like SD_ASYM_PACKING need f_b_g to select another sibling
3611 * which on its own isn't powerful enough.
 
 
 
3612 *
3613 * See update_sd_pick_busiest() and check_asym_packing().
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3614 */
3615static inline int
3616fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3617{
3618	/*
3619	 * Only siblings can have significantly less than SCHED_POWER_SCALE
3620	 */
3621	if (!(sd->flags & SD_SHARE_CPUPOWER))
3622		return 0;
 
 
 
 
 
 
3623
3624	/*
3625	 * If ~90% of the cpu_power is still there, we're good.
3626	 */
3627	if (group->sgp->power * 32 > group->sgp->power_orig * 29)
3628		return 1;
3629
3630	return 0;
 
 
 
3631}
3632
3633/**
3634 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
3635 * @env: The load balancing environment.
3636 * @group: sched_group whose statistics are to be updated.
3637 * @load_idx: Load index of sched_domain of this_cpu for load calc.
3638 * @local_group: Does group contain this_cpu.
3639 * @cpus: Set of cpus considered for load balancing.
3640 * @balance: Should we balance.
3641 * @sgs: variable to hold the statistics for this group.
 
3642 */
3643static inline void update_sg_lb_stats(struct lb_env *env,
3644			struct sched_group *group, int load_idx,
3645			int local_group, const struct cpumask *cpus,
3646			int *balance, struct sg_lb_stats *sgs)
3647{
3648	unsigned long nr_running, max_nr_running, min_nr_running;
3649	unsigned long load, max_cpu_load, min_cpu_load;
3650	unsigned int balance_cpu = -1, first_idle_cpu = 0;
3651	unsigned long avg_load_per_task = 0;
3652	int i;
3653
3654	if (local_group)
3655		balance_cpu = group_balance_cpu(group);
3656
3657	/* Tally up the load of all CPUs in the group */
3658	max_cpu_load = 0;
3659	min_cpu_load = ~0UL;
3660	max_nr_running = 0;
3661	min_nr_running = ~0UL;
3662
3663	for_each_cpu_and(i, sched_group_cpus(group), cpus) {
3664		struct rq *rq = cpu_rq(i);
 
3665
3666		nr_running = rq->nr_running;
 
 
3667
3668		/* Bias balancing toward cpus of our domain */
3669		if (local_group) {
3670			if (idle_cpu(i) && !first_idle_cpu &&
3671					cpumask_test_cpu(i, sched_group_mask(group))) {
3672				first_idle_cpu = 1;
3673				balance_cpu = i;
3674			}
3675
3676			load = target_load(i, load_idx);
3677		} else {
3678			load = source_load(i, load_idx);
3679			if (load > max_cpu_load)
3680				max_cpu_load = load;
3681			if (min_cpu_load > load)
3682				min_cpu_load = load;
3683
3684			if (nr_running > max_nr_running)
3685				max_nr_running = nr_running;
3686			if (min_nr_running > nr_running)
3687				min_nr_running = nr_running;
3688		}
3689
3690		sgs->group_load += load;
3691		sgs->sum_nr_running += nr_running;
3692		sgs->sum_weighted_load += weighted_cpuload(i);
3693		if (idle_cpu(i))
3694			sgs->idle_cpus++;
3695	}
3696
3697	/*
3698	 * First idle cpu or the first cpu(busiest) in this sched group
3699	 * is eligible for doing load balancing at this and above
3700	 * domains. In the newly idle case, we will allow all the cpu's
3701	 * to do the newly idle load balance.
3702	 */
3703	if (local_group) {
3704		if (env->idle != CPU_NEWLY_IDLE) {
3705			if (balance_cpu != env->dst_cpu) {
3706				*balance = 0;
3707				return;
3708			}
3709			update_group_power(env->sd, env->dst_cpu);
3710		} else if (time_after_eq(jiffies, group->sgp->next_update))
3711			update_group_power(env->sd, env->dst_cpu);
3712	}
3713
3714	/* Adjust by relative CPU power of the group */
3715	sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->sgp->power;
 
3716
3717	/*
3718	 * Consider the group unbalanced when the imbalance is larger
3719	 * than the average weight of a task.
3720	 *
3721	 * APZ: with cgroup the avg task weight can vary wildly and
3722	 *      might not be a suitable number - should we keep a
3723	 *      normalized nr_running number somewhere that negates
3724	 *      the hierarchy?
3725	 */
3726	if (sgs->sum_nr_running)
3727		avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
3728
3729	if ((max_cpu_load - min_cpu_load) >= avg_load_per_task &&
3730	    (max_nr_running - min_nr_running) > 1)
3731		sgs->group_imb = 1;
3732
3733	sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power,
3734						SCHED_POWER_SCALE);
3735	if (!sgs->group_capacity)
3736		sgs->group_capacity = fix_small_capacity(env->sd, group);
3737	sgs->group_weight = group->group_weight;
3738
3739	if (sgs->group_capacity > sgs->sum_nr_running)
3740		sgs->group_has_capacity = 1;
3741}
3742
3743/**
3744 * update_sd_pick_busiest - return 1 on busiest group
3745 * @env: The load balancing environment.
3746 * @sds: sched_domain statistics
3747 * @sg: sched_group candidate to be checked for being the busiest
3748 * @sgs: sched_group statistics
3749 *
3750 * Determine if @sg is a busier group than the previously selected
3751 * busiest group.
 
 
 
3752 */
3753static bool update_sd_pick_busiest(struct lb_env *env,
3754				   struct sd_lb_stats *sds,
3755				   struct sched_group *sg,
3756				   struct sg_lb_stats *sgs)
3757{
3758	if (sgs->avg_load <= sds->max_load)
 
 
 
 
 
 
 
 
 
 
3759		return false;
3760
3761	if (sgs->sum_nr_running > sgs->group_capacity)
3762		return true;
3763
3764	if (sgs->group_imb)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3765		return true;
3766
 
 
 
3767	/*
3768	 * ASYM_PACKING needs to move all the work to the lowest
3769	 * numbered CPUs in the group, therefore mark all groups
3770	 * higher than ourself as busy.
3771	 */
3772	if ((env->sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running &&
3773	    env->dst_cpu < group_first_cpu(sg)) {
3774		if (!sds->busiest)
3775			return true;
3776
3777		if (group_first_cpu(sds->busiest) > group_first_cpu(sg))
 
 
3778			return true;
3779	}
3780
3781	return false;
3782}
3783
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3784/**
3785 * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
3786 * @env: The load balancing environment.
3787 * @cpus: Set of cpus considered for load balancing.
3788 * @balance: Should we balance.
3789 * @sds: variable to hold the statistics for this sched_domain.
3790 */
3791static inline void update_sd_lb_stats(struct lb_env *env,
3792				      const struct cpumask *cpus,
3793				      int *balance, struct sd_lb_stats *sds)
3794{
3795	struct sched_domain *child = env->sd->child;
3796	struct sched_group *sg = env->sd->groups;
3797	struct sg_lb_stats sgs;
3798	int load_idx, prefer_sibling = 0;
3799
3800	if (child && child->flags & SD_PREFER_SIBLING)
3801		prefer_sibling = 1;
3802
3803	load_idx = get_sd_load_idx(env->sd, env->idle);
 
 
3804
3805	do {
 
3806		int local_group;
3807
3808		local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));
3809		memset(&sgs, 0, sizeof(sgs));
3810		update_sg_lb_stats(env, sg, load_idx, local_group,
3811				   cpus, balance, &sgs);
3812
3813		if (local_group && !(*balance))
3814			return;
 
 
 
 
3815
3816		sds->total_load += sgs.group_load;
3817		sds->total_pwr += sg->sgp->power;
3818
3819		/*
3820		 * In case the child domain prefers tasks go to siblings
3821		 * first, lower the sg capacity to one so that we'll try
3822		 * and move all the excess tasks away. We lower the capacity
3823		 * of a group only if the local group has the capacity to fit
3824		 * these excess tasks, i.e. nr_running < group_capacity. The
3825		 * extra check prevents the case where you always pull from the
3826		 * heaviest group when it is already under-utilized (possible
3827		 * with a large weight task outweighs the tasks on the system).
3828		 */
3829		if (prefer_sibling && !local_group && sds->this_has_capacity)
3830			sgs.group_capacity = min(sgs.group_capacity, 1UL);
 
 
 
 
3831
3832		if (local_group) {
3833			sds->this_load = sgs.avg_load;
3834			sds->this = sg;
3835			sds->this_nr_running = sgs.sum_nr_running;
3836			sds->this_load_per_task = sgs.sum_weighted_load;
3837			sds->this_has_capacity = sgs.group_has_capacity;
3838			sds->this_idle_cpus = sgs.idle_cpus;
3839		} else if (update_sd_pick_busiest(env, sds, sg, &sgs)) {
3840			sds->max_load = sgs.avg_load;
3841			sds->busiest = sg;
3842			sds->busiest_nr_running = sgs.sum_nr_running;
3843			sds->busiest_idle_cpus = sgs.idle_cpus;
3844			sds->busiest_group_capacity = sgs.group_capacity;
3845			sds->busiest_load_per_task = sgs.sum_weighted_load;
3846			sds->busiest_has_capacity = sgs.group_has_capacity;
3847			sds->busiest_group_weight = sgs.group_weight;
3848			sds->group_imb = sgs.group_imb;
3849		}
3850
 
 
 
 
 
 
3851		sg = sg->next;
3852	} while (sg != env->sd->groups);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3853}
3854
3855/**
3856 * check_asym_packing - Check to see if the group is packed into the
3857 *			sched doman.
3858 *
3859 * This is primarily intended to used at the sibling level.  Some
3860 * cores like POWER7 prefer to use lower numbered SMT threads.  In the
3861 * case of POWER7, it can move to lower SMT modes only when higher
3862 * threads are idle.  When in lower SMT modes, the threads will
3863 * perform better since they share less core resources.  Hence when we
3864 * have idle threads, we want them to be the higher ones.
3865 *
3866 * This packing function is run on idle threads.  It checks to see if
3867 * the busiest CPU in this domain (core in the P7 case) has a higher
3868 * CPU number than the packing function is being run on.  Here we are
3869 * assuming lower CPU number will be equivalent to lower a SMT thread
3870 * number.
3871 *
3872 * Returns 1 when packing is required and a task should be moved to
3873 * this CPU.  The amount of the imbalance is returned in *imbalance.
3874 *
3875 * @env: The load balancing environment.
3876 * @sds: Statistics of the sched_domain which is to be packed
3877 */
3878static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
3879{
3880	int busiest_cpu;
3881
3882	if (!(env->sd->flags & SD_ASYM_PACKING))
3883		return 0;
3884
 
 
 
3885	if (!sds->busiest)
3886		return 0;
3887
3888	busiest_cpu = group_first_cpu(sds->busiest);
3889	if (env->dst_cpu > busiest_cpu)
3890		return 0;
3891
3892	env->imbalance = DIV_ROUND_CLOSEST(
3893		sds->max_load * sds->busiest->sgp->power, SCHED_POWER_SCALE);
3894
3895	return 1;
3896}
3897
3898/**
3899 * fix_small_imbalance - Calculate the minor imbalance that exists
3900 *			amongst the groups of a sched_domain, during
3901 *			load balancing.
3902 * @env: The load balancing environment.
3903 * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
3904 */
3905static inline
3906void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
3907{
3908	unsigned long tmp, pwr_now = 0, pwr_move = 0;
3909	unsigned int imbn = 2;
3910	unsigned long scaled_busy_load_per_task;
 
3911
3912	if (sds->this_nr_running) {
3913		sds->this_load_per_task /= sds->this_nr_running;
3914		if (sds->busiest_load_per_task >
3915				sds->this_load_per_task)
3916			imbn = 1;
3917	} else {
3918		sds->this_load_per_task =
3919			cpu_avg_load_per_task(env->dst_cpu);
3920	}
3921
3922	scaled_busy_load_per_task = sds->busiest_load_per_task
3923					 * SCHED_POWER_SCALE;
3924	scaled_busy_load_per_task /= sds->busiest->sgp->power;
3925
3926	if (sds->max_load - sds->this_load + scaled_busy_load_per_task >=
3927			(scaled_busy_load_per_task * imbn)) {
3928		env->imbalance = sds->busiest_load_per_task;
 
 
 
 
 
3929		return;
3930	}
3931
3932	/*
3933	 * OK, we don't have enough imbalance to justify moving tasks,
3934	 * however we may be able to increase total CPU power used by
3935	 * moving them.
3936	 */
3937
3938	pwr_now += sds->busiest->sgp->power *
3939			min(sds->busiest_load_per_task, sds->max_load);
3940	pwr_now += sds->this->sgp->power *
3941			min(sds->this_load_per_task, sds->this_load);
3942	pwr_now /= SCHED_POWER_SCALE;
3943
3944	/* Amount of load we'd subtract */
3945	tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
3946		sds->busiest->sgp->power;
3947	if (sds->max_load > tmp)
3948		pwr_move += sds->busiest->sgp->power *
3949			min(sds->busiest_load_per_task, sds->max_load - tmp);
3950
3951	/* Amount of load we'd add */
3952	if (sds->max_load * sds->busiest->sgp->power <
3953		sds->busiest_load_per_task * SCHED_POWER_SCALE)
3954		tmp = (sds->max_load * sds->busiest->sgp->power) /
3955			sds->this->sgp->power;
3956	else
3957		tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
3958			sds->this->sgp->power;
3959	pwr_move += sds->this->sgp->power *
3960			min(sds->this_load_per_task, sds->this_load + tmp);
3961	pwr_move /= SCHED_POWER_SCALE;
 
3962
3963	/* Move if we gain throughput */
3964	if (pwr_move > pwr_now)
3965		env->imbalance = sds->busiest_load_per_task;
3966}
3967
3968/**
3969 * calculate_imbalance - Calculate the amount of imbalance present within the
3970 *			 groups of a given sched_domain during load balance.
3971 * @env: load balance environment
3972 * @sds: statistics of the sched_domain whose imbalance is to be calculated.
3973 */
3974static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
3975{
3976	unsigned long max_pull, load_above_capacity = ~0UL;
 
 
 
 
3977
3978	sds->busiest_load_per_task /= sds->busiest_nr_running;
3979	if (sds->group_imb) {
3980		sds->busiest_load_per_task =
3981			min(sds->busiest_load_per_task, sds->avg_load);
 
 
 
3982	}
3983
3984	/*
3985	 * In the presence of smp nice balancing, certain scenarios can have
3986	 * max load less than avg load(as we skip the groups at or below
3987	 * its cpu_power, while calculating max_load..)
 
3988	 */
3989	if (sds->max_load < sds->avg_load) {
 
 
3990		env->imbalance = 0;
3991		return fix_small_imbalance(env, sds);
3992	}
3993
3994	if (!sds->group_imb) {
3995		/*
3996		 * Don't want to pull so many tasks that a group would go idle.
3997		 */
3998		load_above_capacity = (sds->busiest_nr_running -
3999						sds->busiest_group_capacity);
4000
4001		load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE);
4002
4003		load_above_capacity /= sds->busiest->sgp->power;
 
 
4004	}
4005
4006	/*
4007	 * We're trying to get all the cpus to the average_load, so we don't
4008	 * want to push ourselves above the average load, nor do we wish to
4009	 * reduce the max loaded cpu below the average load. At the same time,
4010	 * we also don't want to reduce the group load below the group capacity
4011	 * (so that we can implement power-savings policies etc). Thus we look
4012	 * for the minimum possible imbalance.
4013	 * Be careful of negative numbers as they'll appear as very large values
4014	 * with unsigned longs.
4015	 */
4016	max_pull = min(sds->max_load - sds->avg_load, load_above_capacity);
4017
4018	/* How much load to actually move to equalise the imbalance */
4019	env->imbalance = min(max_pull * sds->busiest->sgp->power,
4020		(sds->avg_load - sds->this_load) * sds->this->sgp->power)
4021			/ SCHED_POWER_SCALE;
 
 
 
 
 
 
 
4022
4023	/*
4024	 * if *imbalance is less than the average load per runnable task
4025	 * there is no guarantee that any tasks will be moved so we'll have
4026	 * a think about bumping its value to force at least one task to be
4027	 * moved
4028	 */
4029	if (env->imbalance < sds->busiest_load_per_task)
4030		return fix_small_imbalance(env, sds);
4031
4032}
4033
4034/******* find_busiest_group() helpers end here *********************/
4035
4036/**
4037 * find_busiest_group - Returns the busiest group within the sched_domain
4038 * if there is an imbalance. If there isn't an imbalance, and
4039 * the user has opted for power-savings, it returns a group whose
4040 * CPUs can be put to idle by rebalancing those tasks elsewhere, if
4041 * such a group exists.
4042 *
4043 * Also calculates the amount of weighted load which should be moved
4044 * to restore balance.
4045 *
4046 * @env: The load balancing environment.
4047 * @cpus: The set of CPUs under consideration for load-balancing.
4048 * @balance: Pointer to a variable indicating if this_cpu
4049 *	is the appropriate cpu to perform load balancing at this_level.
4050 *
4051 * Returns:	- the busiest group if imbalance exists.
4052 *		- If no imbalance and user has opted for power-savings balance,
4053 *		   return the least loaded group whose CPUs can be
4054 *		   put to idle by rebalancing its tasks onto our group.
4055 */
4056static struct sched_group *
4057find_busiest_group(struct lb_env *env, const struct cpumask *cpus, int *balance)
4058{
 
4059	struct sd_lb_stats sds;
4060
4061	memset(&sds, 0, sizeof(sds));
4062
4063	/*
4064	 * Compute the various statistics relavent for load balancing at
4065	 * this level.
4066	 */
4067	update_sd_lb_stats(env, cpus, balance, &sds);
 
 
 
4068
4069	/*
4070	 * this_cpu is not the appropriate cpu to perform load balancing at
4071	 * this level.
4072	 */
4073	if (!(*balance))
4074		goto ret;
4075
4076	if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) &&
4077	    check_asym_packing(env, &sds))
4078		return sds.busiest;
4079
4080	/* There is no busy sibling group to pull tasks from */
4081	if (!sds.busiest || sds.busiest_nr_running == 0)
4082		goto out_balanced;
4083
4084	sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr;
 
 
4085
4086	/*
4087	 * If the busiest group is imbalanced the below checks don't
4088	 * work because they assumes all things are equal, which typically
4089	 * isn't true due to cpus_allowed constraints and the like.
4090	 */
4091	if (sds.group_imb)
4092		goto force_balance;
4093
4094	/* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
4095	if (env->idle == CPU_NEWLY_IDLE && sds.this_has_capacity &&
4096			!sds.busiest_has_capacity)
 
 
 
 
 
 
 
4097		goto force_balance;
4098
4099	/*
4100	 * If the local group is more busy than the selected busiest group
4101	 * don't try and pull any tasks.
4102	 */
4103	if (sds.this_load >= sds.max_load)
4104		goto out_balanced;
4105
4106	/*
4107	 * Don't pull any tasks if this group is already above the domain
4108	 * average load.
4109	 */
4110	if (sds.this_load >= sds.avg_load)
4111		goto out_balanced;
4112
4113	if (env->idle == CPU_IDLE) {
4114		/*
4115		 * This cpu is idle. If the busiest group load doesn't
4116		 * have more tasks than the number of available cpu's and
4117		 * there is no imbalance between this and busiest group
4118		 * wrt to idle cpu's, it is balanced.
 
4119		 */
4120		if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) &&
4121		    sds.busiest_nr_running <= sds.busiest_group_weight)
4122			goto out_balanced;
4123	} else {
4124		/*
4125		 * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
4126		 * imbalance_pct to be conservative.
4127		 */
4128		if (100 * sds.max_load <= env->sd->imbalance_pct * sds.this_load)
 
4129			goto out_balanced;
4130	}
4131
4132force_balance:
4133	/* Looks like there is an imbalance. Compute it */
 
4134	calculate_imbalance(env, &sds);
4135	return sds.busiest;
4136
4137out_balanced:
4138ret:
4139	env->imbalance = 0;
4140	return NULL;
4141}
4142
4143/*
4144 * find_busiest_queue - find the busiest runqueue among the cpus in group.
4145 */
4146static struct rq *find_busiest_queue(struct lb_env *env,
4147				     struct sched_group *group,
4148				     const struct cpumask *cpus)
4149{
4150	struct rq *busiest = NULL, *rq;
4151	unsigned long max_load = 0;
4152	int i;
4153
4154	for_each_cpu(i, sched_group_cpus(group)) {
4155		unsigned long power = power_of(i);
4156		unsigned long capacity = DIV_ROUND_CLOSEST(power,
4157							   SCHED_POWER_SCALE);
4158		unsigned long wl;
4159
4160		if (!capacity)
4161			capacity = fix_small_capacity(env->sd, group);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4162
4163		if (!cpumask_test_cpu(i, cpus))
4164			continue;
 
4165
4166		rq = cpu_rq(i);
4167		wl = weighted_cpuload(i);
4168
4169		/*
4170		 * When comparing with imbalance, use weighted_cpuload()
4171		 * which is not scaled with the cpu power.
 
 
4172		 */
4173		if (capacity && rq->nr_running == 1 && wl > env->imbalance)
 
 
4174			continue;
4175
 
 
4176		/*
4177		 * For the load comparisons with the other cpu's, consider
4178		 * the weighted_cpuload() scaled with the cpu power, so that
4179		 * the load can be moved away from the cpu that is potentially
4180		 * running at a lower capacity.
4181		 */
4182		wl = (wl * SCHED_POWER_SCALE) / power;
4183
4184		if (wl > max_load) {
4185			max_load = wl;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4186			busiest = rq;
4187		}
4188	}
4189
4190	return busiest;
4191}
4192
/*
 * Max backoff if we encounter pinned tasks. The value is fairly arbitrary;
 * it only needs to be large enough.
 */
4197#define MAX_PINNED_INTERVAL	512
4198
4199/* Working cpumask for load_balance and load_balance_newidle. */
4200DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
 
 
 
 
 
 
 
 
 
4201
4202static int need_active_balance(struct lb_env *env)
 
4203{
4204	struct sched_domain *sd = env->sd;
4205
4206	if (env->idle == CPU_NEWLY_IDLE) {
 
4207
4208		/*
4209		 * ASYM_PACKING needs to force migrate tasks from busy but
4210		 * higher numbered CPUs in order to pack all tasks in the
4211		 * lowest numbered CPUs.
4212		 */
4213		if ((sd->flags & SD_ASYM_PACKING) && env->src_cpu > env->dst_cpu)
 
 
 
 
4214			return 1;
4215	}
4216
 
 
 
 
 
 
 
 
 
 
 
 
 
4217	return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
4218}
4219
4220static int active_load_balance_cpu_stop(void *data);
4221
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4222/*
4223 * Check this_cpu to ensure it is balanced within domain. Attempt to move
4224 * tasks if there is an imbalance.
4225 */
4226static int load_balance(int this_cpu, struct rq *this_rq,
4227			struct sched_domain *sd, enum cpu_idle_type idle,
4228			int *balance)
4229{
4230	int ld_moved, active_balance = 0;
 
4231	struct sched_group *group;
4232	struct rq *busiest;
4233	unsigned long flags;
4234	struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
4235
4236	struct lb_env env = {
4237		.sd		= sd,
4238		.dst_cpu	= this_cpu,
4239		.dst_rq		= this_rq,
 
4240		.idle		= idle,
4241		.loop_break	= sched_nr_migrate_break,
 
 
 
4242	};
4243
4244	cpumask_copy(cpus, cpu_active_mask);
4245
4246	schedstat_inc(sd, lb_count[idle]);
4247
4248redo:
4249	group = find_busiest_group(&env, cpus, balance);
4250
4251	if (*balance == 0)
4252		goto out_balanced;
 
4253
 
4254	if (!group) {
4255		schedstat_inc(sd, lb_nobusyg[idle]);
4256		goto out_balanced;
4257	}
4258
4259	busiest = find_busiest_queue(&env, group, cpus);
4260	if (!busiest) {
4261		schedstat_inc(sd, lb_nobusyq[idle]);
4262		goto out_balanced;
4263	}
4264
4265	BUG_ON(busiest == this_rq);
4266
4267	schedstat_add(sd, lb_imbalance[idle], env.imbalance);
 
 
 
4268
4269	ld_moved = 0;
4270	if (busiest->nr_running > 1) {
4271		/*
4272		 * Attempt to move tasks. If find_busiest_group has found
4273		 * an imbalance but busiest->nr_running <= 1, the group is
4274		 * still unbalanced. ld_moved simply stays zero, so it is
4275		 * correctly treated as an imbalance.
4276		 */
4277		env.flags |= LBF_ALL_PINNED;
4278		env.src_cpu   = busiest->cpu;
4279		env.src_rq    = busiest;
4280		env.loop_max  = min(sysctl_sched_nr_migrate, busiest->nr_running);
4281
4282more_balance:
4283		local_irq_save(flags);
4284		double_rq_lock(this_rq, busiest);
4285		if (!env.loop)
4286			update_h_load(env.src_cpu);
4287		ld_moved += move_tasks(&env);
4288		double_rq_unlock(this_rq, busiest);
4289		local_irq_restore(flags);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4290
4291		if (env.flags & LBF_NEED_BREAK) {
4292			env.flags &= ~LBF_NEED_BREAK;
4293			goto more_balance;
4294		}
4295
4296		/*
4297		 * some other cpu did the load balance for us.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4298		 */
4299		if (ld_moved && this_cpu != smp_processor_id())
4300			resched_cpu(this_cpu);
 
 
 
 
4301
4302		/* All tasks on this runqueue were pinned by CPU affinity */
4303		if (unlikely(env.flags & LBF_ALL_PINNED)) {
4304			cpumask_clear_cpu(cpu_of(busiest), cpus);
4305			if (!cpumask_empty(cpus))
 
 
 
 
 
 
 
 
 
 
4306				goto redo;
4307			goto out_balanced;
 
4308		}
4309	}
4310
4311	if (!ld_moved) {
4312		schedstat_inc(sd, lb_failed[idle]);
4313		/*
4314		 * Increment the failure counter only on periodic balance.
4315		 * We do not want newidle balance, which can be very
4316		 * frequent, pollute the failure counter causing
4317		 * excessive cache_hot migrations and active balances.
4318		 */
4319		if (idle != CPU_NEWLY_IDLE)
4320			sd->nr_balance_failed++;
4321
4322		if (need_active_balance(&env)) {
 
 
4323			raw_spin_lock_irqsave(&busiest->lock, flags);
4324
4325			/* don't kick the active_load_balance_cpu_stop,
4326			 * if the curr task on busiest cpu can't be
4327			 * moved to this_cpu
 
4328			 */
4329			if (!cpumask_test_cpu(this_cpu,
4330					tsk_cpus_allowed(busiest->curr))) {
4331				raw_spin_unlock_irqrestore(&busiest->lock,
4332							    flags);
4333				env.flags |= LBF_ALL_PINNED;
4334				goto out_one_pinned;
4335			}
4336
4337			/*
4338			 * ->active_balance synchronizes accesses to
4339			 * ->active_balance_work.  Once set, it's cleared
4340			 * only after active load balance is finished.
4341			 */
4342			if (!busiest->active_balance) {
4343				busiest->active_balance = 1;
4344				busiest->push_cpu = this_cpu;
4345				active_balance = 1;
4346			}
4347			raw_spin_unlock_irqrestore(&busiest->lock, flags);
4348
4349			if (active_balance) {
4350				stop_one_cpu_nowait(cpu_of(busiest),
4351					active_load_balance_cpu_stop, busiest,
4352					&busiest->active_balance_work);
4353			}
4354
4355			/*
4356			 * We've kicked active balancing, reset the failure
4357			 * counter.
4358			 */
4359			sd->nr_balance_failed = sd->cache_nice_tries+1;
4360		}
4361	} else
4362		sd->nr_balance_failed = 0;
4363
4364	if (likely(!active_balance)) {
4365		/* We were unbalanced, so reset the balancing interval */
4366		sd->balance_interval = sd->min_interval;
4367	} else {
4368		/*
4369		 * If we've begun active balancing, start to back off. This
4370		 * case may not be covered by the all_pinned logic if there
4371		 * is only 1 task on the busy runqueue (because we don't call
4372		 * move_tasks).
4373		 */
4374		if (sd->balance_interval < sd->max_interval)
4375			sd->balance_interval *= 2;
4376	}
4377
4378	goto out;
4379
4380out_balanced:
4381	schedstat_inc(sd, lb_balanced[idle]);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4382
4383	sd->nr_balance_failed = 0;
4384
4385out_one_pinned:
 
 
 
 
 
 
 
 
 
 
 
4386	/* tune up the balancing interval */
4387	if (((env.flags & LBF_ALL_PINNED) &&
4388			sd->balance_interval < MAX_PINNED_INTERVAL) ||
4389			(sd->balance_interval < sd->max_interval))
4390		sd->balance_interval *= 2;
4391
4392	ld_moved = 0;
4393out:
4394	return ld_moved;
4395}
4396
4397/*
4398 * idle_balance is called by schedule() if this_cpu is about to become
4399 * idle. Attempts to pull tasks from other CPUs.
4400 */
4401void idle_balance(int this_cpu, struct rq *this_rq)
4402{
4403	struct sched_domain *sd;
4404	int pulled_task = 0;
4405	unsigned long next_balance = jiffies + HZ;
4406
4407	this_rq->idle_stamp = this_rq->clock;
 
4408
4409	if (this_rq->avg_idle < sysctl_sched_migration_cost)
4410		return;
 
4411
4412	/*
4413	 * Drop the rq->lock, but keep IRQ/preempt disabled.
4414	 */
4415	raw_spin_unlock(&this_rq->lock);
4416
4417	update_shares(this_cpu);
4418	rcu_read_lock();
4419	for_each_domain(this_cpu, sd) {
4420		unsigned long interval;
4421		int balance = 1;
4422
4423		if (!(sd->flags & SD_LOAD_BALANCE))
4424			continue;
4425
4426		if (sd->flags & SD_BALANCE_NEWIDLE) {
4427			/* If we've pulled tasks over stop searching: */
4428			pulled_task = load_balance(this_cpu, this_rq,
4429						   sd, CPU_NEWLY_IDLE, &balance);
4430		}
4431
4432		interval = msecs_to_jiffies(sd->balance_interval);
4433		if (time_after(next_balance, sd->last_balance + interval))
4434			next_balance = sd->last_balance + interval;
4435		if (pulled_task) {
4436			this_rq->idle_stamp = 0;
4437			break;
4438		}
4439	}
4440	rcu_read_unlock();
4441
4442	raw_spin_lock(&this_rq->lock);
 
 
4443
4444	if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
4445		/*
4446		 * We are going idle. next_balance may be set based on
4447		 * a busy processor. So reset next_balance.
4448		 */
4449		this_rq->next_balance = next_balance;
4450	}
4451}
4452
4453/*
4454 * active_load_balance_cpu_stop is run by cpu stopper. It pushes
4455 * running tasks off the busiest CPU onto idle CPUs. It requires at
4456 * least 1 task to be running on each physical CPU where possible, and
4457 * avoids physical / logical imbalances.
4458 */
4459static int active_load_balance_cpu_stop(void *data)
4460{
4461	struct rq *busiest_rq = data;
4462	int busiest_cpu = cpu_of(busiest_rq);
4463	int target_cpu = busiest_rq->push_cpu;
4464	struct rq *target_rq = cpu_rq(target_cpu);
4465	struct sched_domain *sd;
 
 
4466
4467	raw_spin_lock_irq(&busiest_rq->lock);
 
 
 
 
 
 
 
4468
4469	/* make sure the requested cpu hasn't gone down in the meantime */
4470	if (unlikely(busiest_cpu != smp_processor_id() ||
4471		     !busiest_rq->active_balance))
4472		goto out_unlock;
4473
4474	/* Is there any task to move? */
4475	if (busiest_rq->nr_running <= 1)
4476		goto out_unlock;
4477
4478	/*
4479	 * This condition is "impossible", if it occurs
4480	 * we need to fix it. Originally reported by
4481	 * Bjorn Helgaas on a 128-cpu setup.
4482	 */
4483	BUG_ON(busiest_rq == target_rq);
4484
4485	/* move a task from busiest_rq to target_rq */
4486	double_lock_balance(busiest_rq, target_rq);
4487
4488	/* Search for an sd spanning us and the target CPU. */
4489	rcu_read_lock();
4490	for_each_domain(target_cpu, sd) {
4491		if ((sd->flags & SD_LOAD_BALANCE) &&
4492		    cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
4493				break;
4494	}
4495
4496	if (likely(sd)) {
4497		struct lb_env env = {
4498			.sd		= sd,
4499			.dst_cpu	= target_cpu,
4500			.dst_rq		= target_rq,
4501			.src_cpu	= busiest_rq->cpu,
4502			.src_rq		= busiest_rq,
4503			.idle		= CPU_IDLE,
 
 
 
 
 
 
 
4504		};
4505
4506		schedstat_inc(sd, alb_count);
 
4507
4508		if (move_one_task(&env))
4509			schedstat_inc(sd, alb_pushed);
4510		else
4511			schedstat_inc(sd, alb_failed);
 
 
 
 
4512	}
4513	rcu_read_unlock();
4514	double_unlock_balance(busiest_rq, target_rq);
4515out_unlock:
4516	busiest_rq->active_balance = 0;
4517	raw_spin_unlock_irq(&busiest_rq->lock);
 
 
 
 
 
 
4518	return 0;
4519}
4520
4521#ifdef CONFIG_NO_HZ
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4522/*
4523 * idle load balancing details
4524 * - When one of the busy CPUs notice that there may be an idle rebalancing
4525 *   needed, they will kick the idle load balancer, which then does idle
4526 *   load balancing for all the idle CPUs.
 
 
4527 */
4528static struct {
4529	cpumask_var_t idle_cpus_mask;
4530	atomic_t nr_cpus;
4531	unsigned long next_balance;     /* in jiffy units */
4532} nohz ____cacheline_aligned;
4533
4534static inline int find_new_ilb(int call_cpu)
4535{
4536	int ilb = cpumask_first(nohz.idle_cpus_mask);
4537
4538	if (ilb < nr_cpu_ids && idle_cpu(ilb))
4539		return ilb;
 
 
 
4540
4541	return nr_cpu_ids;
4542}
4543
4544/*
4545 * Kick a CPU to do the nohz balancing, if it is time for it. We pick the
4546 * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
4547 * CPU (if there is one).
4548 */
4549static void nohz_balancer_kick(int cpu)
4550{
4551	int ilb_cpu;
4552
4553	nohz.next_balance++;
4554
4555	ilb_cpu = find_new_ilb(cpu);
4556
4557	if (ilb_cpu >= nr_cpu_ids)
4558		return;
4559
4560	if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu)))
 
4561		return;
 
4562	/*
4563	 * Use smp_send_reschedule() instead of resched_cpu().
4564	 * This way we generate a sched IPI on the target cpu which
4565	 * is idle. And the softirq performing nohz idle load balance
4566	 * will be run before returning from the IPI.
4567	 */
4568	smp_send_reschedule(ilb_cpu);
4569	return;
4570}
4571
4572static inline void clear_nohz_tick_stopped(int cpu)
 
 
 
 
4573{
4574	if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
4575		cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
4576		atomic_dec(&nohz.nr_cpus);
4577		clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4578	}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4579}
4580
4581static inline void set_cpu_sd_state_busy(void)
4582{
4583	struct sched_domain *sd;
4584	int cpu = smp_processor_id();
4585
4586	if (!test_bit(NOHZ_IDLE, nohz_flags(cpu)))
4587		return;
4588	clear_bit(NOHZ_IDLE, nohz_flags(cpu));
 
 
 
4589
4590	rcu_read_lock();
4591	for_each_domain(cpu, sd)
4592		atomic_inc(&sd->groups->sgp->nr_busy_cpus);
4593	rcu_read_unlock();
4594}
4595
4596void set_cpu_sd_state_idle(void)
4597{
4598	struct sched_domain *sd;
4599	int cpu = smp_processor_id();
4600
4601	if (test_bit(NOHZ_IDLE, nohz_flags(cpu)))
4602		return;
4603	set_bit(NOHZ_IDLE, nohz_flags(cpu));
 
 
 
 
 
 
 
 
 
 
4604
4605	rcu_read_lock();
4606	for_each_domain(cpu, sd)
4607		atomic_dec(&sd->groups->sgp->nr_busy_cpus);
 
 
 
 
 
 
4608	rcu_read_unlock();
4609}
4610
4611/*
4612 * This routine will record that this cpu is going idle with tick stopped.
4613 * This info will be used in performing idle load balancing in the future.
4614 */
4615void select_nohz_load_balancer(int stop_tick)
4616{
4617	int cpu = smp_processor_id();
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4618
4619	/*
4620	 * If this cpu is going down, then nothing needs to be done.
 
 
 
4621	 */
4622	if (!cpu_active(cpu))
 
 
 
 
4623		return;
4624
4625	if (stop_tick) {
4626		if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
4627			return;
4628
4629		cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
4630		atomic_inc(&nohz.nr_cpus);
4631		set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
4632	}
4633	return;
4634}
4635
4636static int __cpuinit sched_ilb_notifier(struct notifier_block *nfb,
4637					unsigned long action, void *hcpu)
4638{
4639	switch (action & ~CPU_TASKS_FROZEN) {
4640	case CPU_DYING:
4641		clear_nohz_tick_stopped(smp_processor_id());
4642		return NOTIFY_OK;
4643	default:
4644		return NOTIFY_DONE;
4645	}
4646}
4647#endif
4648
4649static DEFINE_SPINLOCK(balancing);
4650
4651/*
4652 * Scale the max load_balance interval with the number of CPUs in the system.
4653 * This trades load-balance latency on larger machines for less cross talk.
4654 */
4655void update_max_interval(void)
4656{
4657	max_load_balance_interval = HZ*num_online_cpus()/10;
4658}
4659
4660/*
4661 * It checks each scheduling domain to see if it is due to be balanced,
4662 * and initiates a balancing operation if so.
4663 *
4664 * Balancing parameters are set up in arch_init_sched_domains.
 
4665 */
4666static void rebalance_domains(int cpu, enum cpu_idle_type idle)
 
4667{
4668	int balance = 1;
4669	struct rq *rq = cpu_rq(cpu);
4670	unsigned long interval;
4671	struct sched_domain *sd;
4672	/* Earliest time when we have to do rebalance again */
4673	unsigned long next_balance = jiffies + 60*HZ;
 
 
4674	int update_next_balance = 0;
4675	int need_serialize;
 
 
 
4676
4677	update_shares(cpu);
4678
4679	rcu_read_lock();
4680	for_each_domain(cpu, sd) {
4681		if (!(sd->flags & SD_LOAD_BALANCE))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4682			continue;
4683
4684		interval = sd->balance_interval;
4685		if (idle != CPU_IDLE)
4686			interval *= sd->busy_factor;
4687
4688		/* scale ms to jiffies */
4689		interval = msecs_to_jiffies(interval);
4690		interval = clamp(interval, 1UL, max_load_balance_interval);
 
 
 
 
4691
4692		need_serialize = sd->flags & SD_SERIALIZE;
 
 
 
 
 
 
 
 
 
 
 
4693
4694		if (need_serialize) {
4695			if (!spin_trylock(&balancing))
4696				goto out;
4697		}
4698
4699		if (time_after_eq(jiffies, sd->last_balance + interval)) {
4700			if (load_balance(cpu, rq, sd, idle, &balance)) {
4701				/*
4702				 * We've pulled tasks over so either we're no
4703				 * longer idle.
4704				 */
4705				idle = CPU_NOT_IDLE;
4706			}
4707			sd->last_balance = jiffies;
4708		}
4709		if (need_serialize)
4710			spin_unlock(&balancing);
4711out:
4712		if (time_after(next_balance, sd->last_balance + interval)) {
4713			next_balance = sd->last_balance + interval;
4714			update_next_balance = 1;
4715		}
 
4716
4717		/*
4718		 * Stop the load balance at this level. There is another
4719		 * CPU in our sched group which is doing load balancing more
4720		 * actively.
4721		 */
4722		if (!balance)
4723			break;
4724	}
4725	rcu_read_unlock();
 
 
 
 
 
 
 
 
 
 
 
 
 
4726
4727	/*
4728	 * next_balance will be updated only when there is a need.
4729	 * When the cpu is attached to null domain for ex, it will not be
4730	 * updated.
4731	 */
4732	if (likely(update_next_balance))
4733		rq->next_balance = next_balance;
 
 
4734}
4735
4736#ifdef CONFIG_NO_HZ
4737/*
4738 * In CONFIG_NO_HZ case, the idle balance kickee will do the
4739 * rebalancing for all the cpus for whom scheduler ticks are stopped.
4740 */
4741static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4742{
4743	struct rq *this_rq = cpu_rq(this_cpu);
4744	struct rq *rq;
4745	int balance_cpu;
4746
4747	if (idle != CPU_IDLE ||
4748	    !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)))
4749		goto end;
 
 
 
4750
4751	for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
4752		if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
4753			continue;
4754
4755		/*
4756		 * If this cpu gets work to do, stop the load balancing
4757		 * work being done for other cpus. Next load
4758		 * balancing owner will pick it up.
4759		 */
4760		if (need_resched())
4761			break;
4762
4763		raw_spin_lock_irq(&this_rq->lock);
4764		update_rq_clock(this_rq);
4765		update_idle_cpu_load(this_rq);
4766		raw_spin_unlock_irq(&this_rq->lock);
 
 
 
 
 
 
 
4767
4768		rebalance_domains(balance_cpu, CPU_IDLE);
 
4769
4770		rq = cpu_rq(balance_cpu);
4771		if (time_after(this_rq->next_balance, rq->next_balance))
4772			this_rq->next_balance = rq->next_balance;
4773	}
4774	nohz.next_balance = this_rq->next_balance;
4775end:
4776	clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
4777}
4778
 
 
 
4779/*
4780 * Current heuristic for kicking the idle load balancer in the presence
4781 * of an idle cpu is the system.
4782 *   - This rq has more than one task.
4783 *   - At any scheduler domain level, this cpu's scheduler group has multiple
4784 *     busy cpu's exceeding the group's power.
4785 *   - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
4786 *     domain span are idle.
4787 */
4788static inline int nohz_kick_needed(struct rq *rq, int cpu)
4789{
4790	unsigned long now = jiffies;
 
4791	struct sched_domain *sd;
 
 
 
 
 
 
 
 
 
4792
4793	if (unlikely(idle_cpu(cpu)))
 
 
 
4794		return 0;
4795
4796       /*
4797	* We may be recently in ticked or tickless idle mode. At the first
4798	* busy tick after returning from idle, we will update the busy stats.
4799	*/
4800	set_cpu_sd_state_busy();
4801	clear_nohz_tick_stopped(cpu);
4802
4803	/*
4804	 * None are in tickless mode and hence no need for NOHZ idle load
4805	 * balancing.
 
 
4806	 */
4807	if (likely(!atomic_read(&nohz.nr_cpus)))
4808		return 0;
 
 
 
 
 
 
 
 
 
 
4809
4810	if (time_before(now, nohz.next_balance))
4811		return 0;
4812
4813	if (rq->nr_running >= 2)
4814		goto need_kick;
4815
 
4816	rcu_read_lock();
4817	for_each_domain(cpu, sd) {
4818		struct sched_group *sg = sd->groups;
4819		struct sched_group_power *sgp = sg->sgp;
4820		int nr_busy = atomic_read(&sgp->nr_busy_cpus);
4821
4822		if (sd->flags & SD_SHARE_PKG_RESOURCES && nr_busy > 1)
4823			goto need_kick_unlock;
4824
4825		if (sd->flags & SD_ASYM_PACKING && nr_busy != sg->group_weight
4826		    && (cpumask_first_and(nohz.idle_cpus_mask,
4827					  sched_domain_span(sd)) < cpu))
4828			goto need_kick_unlock;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4829
4830		if (!(sd->flags & (SD_SHARE_PKG_RESOURCES | SD_ASYM_PACKING)))
 
 
 
 
4831			break;
4832	}
4833	rcu_read_unlock();
4834	return 0;
4835
4836need_kick_unlock:
4837	rcu_read_unlock();
4838need_kick:
4839	return 1;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4840}
4841#else
4842static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { }
4843#endif
4844
4845/*
4846 * run_rebalance_domains is triggered when needed from the scheduler tick.
4847 * Also triggered for nohz idle balancing (with nohz_balancing_kick set).
4848 */
4849static void run_rebalance_domains(struct softirq_action *h)
4850{
4851	int this_cpu = smp_processor_id();
4852	struct rq *this_rq = cpu_rq(this_cpu);
4853	enum cpu_idle_type idle = this_rq->idle_balance ?
4854						CPU_IDLE : CPU_NOT_IDLE;
4855
4856	rebalance_domains(this_cpu, idle);
4857
4858	/*
4859	 * If this cpu has a pending nohz_balance_kick, then do the
4860	 * balancing on behalf of the other idle cpus whose ticks are
4861	 * stopped.
4862	 */
4863	nohz_idle_balance(this_cpu, idle);
4864}
 
 
 
4865
4866static inline int on_null_domain(int cpu)
4867{
4868	return !rcu_dereference_sched(cpu_rq(cpu)->sd);
4869}
4870
4871/*
4872 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
4873 */
4874void trigger_load_balance(struct rq *rq, int cpu)
4875{
4876	/* Don't need to rebalance while attached to NULL domain */
4877	if (time_after_eq(jiffies, rq->next_balance) &&
4878	    likely(!on_null_domain(cpu)))
 
 
4879		raise_softirq(SCHED_SOFTIRQ);
4880#ifdef CONFIG_NO_HZ
4881	if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu)))
4882		nohz_balancer_kick(cpu);
4883#endif
4884}
4885
/* Runqueue came online: refresh the scheduler tunables via update_sysctl(). */
static void rq_online_fair(struct rq *rq)
{
	update_sysctl();
}
4890
/* Runqueue went offline: refresh the scheduler tunables via update_sysctl(). */
static void rq_offline_fair(struct rq *rq)
{
	update_sysctl();
}
4895
4896#endif /* CONFIG_SMP */
4897
4898/*
4899 * scheduler tick hitting a task of our scheduling class:
 
 
 
 
 
4900 */
4901static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
4902{
4903	struct cfs_rq *cfs_rq;
4904	struct sched_entity *se = &curr->se;
4905
4906	for_each_sched_entity(se) {
4907		cfs_rq = cfs_rq_of(se);
4908		entity_tick(cfs_rq, se, queued);
4909	}
 
 
 
 
 
 
4910}
4911
4912/*
4913 * called on fork with the child task as argument from the parent's context
4914 *  - child not yet on the tasklist
4915 *  - preemption disabled
4916 */
4917static void task_fork_fair(struct task_struct *p)
4918{
4919	struct cfs_rq *cfs_rq;
4920	struct sched_entity *se = &p->se, *curr;
4921	int this_cpu = smp_processor_id();
4922	struct rq *rq = this_rq();
4923	unsigned long flags;
4924
4925	raw_spin_lock_irqsave(&rq->lock, flags);
4926
 
4927	update_rq_clock(rq);
4928
4929	cfs_rq = task_cfs_rq(current);
4930	curr = cfs_rq->curr;
4931
4932	if (unlikely(task_cpu(p) != this_cpu)) {
4933		rcu_read_lock();
4934		__set_task_cpu(p, this_cpu);
4935		rcu_read_unlock();
4936	}
4937
4938	update_curr(cfs_rq);
4939
4940	if (curr)
4941		se->vruntime = curr->vruntime;
4942	place_entity(cfs_rq, se, 1);
4943
4944	if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
4945		/*
4946		 * Upon rescheduling, sched_class::put_prev_task() will place
4947		 * 'current' within the tree based on its new key value.
4948		 */
4949		swap(curr->vruntime, se->vruntime);
4950		resched_task(rq->curr);
4951	}
4952
4953	se->vruntime -= cfs_rq->min_vruntime;
4954
4955	raw_spin_unlock_irqrestore(&rq->lock, flags);
4956}
4957
4958/*
4959 * Priority of the task has changed. Check to see if we preempt
4960 * the current task.
4961 */
4962static void
4963prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
4964{
4965	if (!p->se.on_rq)
4966		return;
4967
4968	/*
4969	 * Reschedule if we are currently running on this runqueue and
4970	 * our priority decreased, or if we are not currently running on
4971	 * this runqueue and our priority is higher than the current's
4972	 */
4973	if (rq->curr == p) {
4974		if (p->prio > oldprio)
4975			resched_task(rq->curr);
4976	} else
4977		check_preempt_curr(rq, p, 0);
4978}
4979
4980static void switched_from_fair(struct rq *rq, struct task_struct *p)
4981{
4982	struct sched_entity *se = &p->se;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4983	struct cfs_rq *cfs_rq = cfs_rq_of(se);
4984
 
4985	/*
4986	 * Ensure the task's vruntime is normalized, so that when its
4987	 * switched back to the fair class the enqueue_entity(.flags=0) will
4988	 * do the right thing.
4989	 *
4990	 * If it was on_rq, then the dequeue_entity(.flags=0) will already
4991	 * have normalized the vruntime, if it was !on_rq, then only when
4992	 * the task is sleeping will it still have non-normalized vruntime.
4993	 */
4994	if (!se->on_rq && p->state != TASK_RUNNING) {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4995		/*
4996		 * Fix up our vruntime so that the current sleep doesn't
4997		 * cause 'unlimited' sleep bonus.
4998		 */
4999		place_entity(cfs_rq, se, 0);
5000		se->vruntime -= cfs_rq->min_vruntime;
5001	}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5002}
5003
5004/*
5005 * We switched to the sched_fair class.
5006 */
5007static void switched_to_fair(struct rq *rq, struct task_struct *p)
5008{
5009	if (!p->se.on_rq)
5010		return;
5011
5012	/*
5013	 * We were most likely switched from sched_rt, so
5014	 * kick off the schedule if running, otherwise just see
5015	 * if we can still preempt the current task.
5016	 */
5017	if (rq->curr == p)
5018		resched_task(rq->curr);
5019	else
5020		check_preempt_curr(rq, p, 0);
 
 
5021}
5022
5023/* Account for a task changing its policy or group.
5024 *
5025 * This routine is mostly called to set cfs_rq->curr field when a task
5026 * migrates between groups/classes.
5027 */
5028static void set_curr_task_fair(struct rq *rq)
5029{
5030	struct sched_entity *se = &rq->curr->se;
 
 
 
 
 
 
 
 
 
 
5031
5032	for_each_sched_entity(se) {
5033		struct cfs_rq *cfs_rq = cfs_rq_of(se);
5034
5035		set_next_entity(cfs_rq, se);
5036		/* ensure bandwidth has been allocated on our new cfs_rq */
5037		account_cfs_rq_runtime(cfs_rq, 0);
5038	}
5039}
5040
5041void init_cfs_rq(struct cfs_rq *cfs_rq)
5042{
5043	cfs_rq->tasks_timeline = RB_ROOT;
5044	cfs_rq->min_vruntime = (u64)(-(1LL << 20));
5045#ifndef CONFIG_64BIT
5046	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
5047#endif
 
 
 
5048}
5049
5050#ifdef CONFIG_FAIR_GROUP_SCHED
5051static void task_move_group_fair(struct task_struct *p, int on_rq)
5052{
5053	/*
5054	 * If the task was not on the rq at the time of this cgroup movement
5055	 * it must have been asleep, sleeping tasks keep their ->vruntime
5056	 * absolute on their old rq until wakeup (needed for the fair sleeper
5057	 * bonus in place_entity()).
5058	 *
5059	 * If it was on the rq, we've just 'preempted' it, which does convert
5060	 * ->vruntime to a relative base.
5061	 *
5062	 * Make sure both cases convert their relative position when migrating
5063	 * to another cgroup's rq. This does somewhat interfere with the
5064	 * fair sleeper stuff for the first placement, but who cares.
5065	 */
5066	/*
5067	 * When !on_rq, vruntime of the task has usually NOT been normalized.
5068	 * But there are some cases where it has already been normalized:
5069	 *
5070	 * - Moving a forked child which is waiting for being woken up by
5071	 *   wake_up_new_task().
5072	 * - Moving a task which has been woken up by try_to_wake_up() and
5073	 *   waiting for actually being woken up by sched_ttwu_pending().
5074	 *
5075	 * To prevent boost or penalty in the new cfs_rq caused by delta
5076	 * min_vruntime between the two cfs_rqs, we skip vruntime adjustment.
5077	 */
5078	if (!on_rq && (!p->se.sum_exec_runtime || p->state == TASK_WAKING))
5079		on_rq = 1;
5080
5081	if (!on_rq)
5082		p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime;
 
5083	set_task_rq(p, task_cpu(p));
5084	if (!on_rq)
5085		p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5086}
5087
5088void free_fair_sched_group(struct task_group *tg)
5089{
5090	int i;
5091
5092	destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
5093
5094	for_each_possible_cpu(i) {
5095		if (tg->cfs_rq)
5096			kfree(tg->cfs_rq[i]);
5097		if (tg->se)
5098			kfree(tg->se[i]);
5099	}
5100
5101	kfree(tg->cfs_rq);
5102	kfree(tg->se);
5103}
5104
5105int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
5106{
 
5107	struct cfs_rq *cfs_rq;
5108	struct sched_entity *se;
5109	int i;
5110
5111	tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
5112	if (!tg->cfs_rq)
5113		goto err;
5114	tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
5115	if (!tg->se)
5116		goto err;
5117
5118	tg->shares = NICE_0_LOAD;
5119
5120	init_cfs_bandwidth(tg_cfs_bandwidth(tg));
5121
5122	for_each_possible_cpu(i) {
5123		cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
5124				      GFP_KERNEL, cpu_to_node(i));
5125		if (!cfs_rq)
5126			goto err;
5127
5128		se = kzalloc_node(sizeof(struct sched_entity),
5129				  GFP_KERNEL, cpu_to_node(i));
5130		if (!se)
5131			goto err_free_rq;
5132
5133		init_cfs_rq(cfs_rq);
5134		init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
 
5135	}
5136
5137	return 1;
5138
5139err_free_rq:
5140	kfree(cfs_rq);
5141err:
5142	return 0;
5143}
5144
5145void unregister_fair_sched_group(struct task_group *tg, int cpu)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5146{
5147	struct rq *rq = cpu_rq(cpu);
5148	unsigned long flags;
 
 
 
 
 
 
5149
5150	/*
5151	* Only empty task groups can be destroyed; so we can speculatively
5152	* check on_list without danger of it being re-added.
5153	*/
5154	if (!tg->cfs_rq[cpu]->on_list)
5155		return;
 
 
5156
5157	raw_spin_lock_irqsave(&rq->lock, flags);
5158	list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
5159	raw_spin_unlock_irqrestore(&rq->lock, flags);
 
5160}
5161
5162void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
5163			struct sched_entity *se, int cpu,
5164			struct sched_entity *parent)
5165{
5166	struct rq *rq = cpu_rq(cpu);
5167
5168	cfs_rq->tg = tg;
5169	cfs_rq->rq = rq;
5170#ifdef CONFIG_SMP
5171	/* allow initial update_cfs_load() to truncate */
5172	cfs_rq->load_stamp = 1;
5173#endif
5174	init_cfs_rq_runtime(cfs_rq);
5175
5176	tg->cfs_rq[cpu] = cfs_rq;
5177	tg->se[cpu] = se;
5178
5179	/* se could be NULL for root_task_group */
5180	if (!se)
5181		return;
5182
5183	if (!parent)
5184		se->cfs_rq = &rq->cfs;
5185	else
 
5186		se->cfs_rq = parent->my_q;
 
 
5187
5188	se->my_q = cfs_rq;
5189	update_load_set(&se->load, 0);
 
5190	se->parent = parent;
5191}
5192
5193static DEFINE_MUTEX(shares_mutex);
5194
5195int sched_group_set_shares(struct task_group *tg, unsigned long shares)
5196{
5197	int i;
5198	unsigned long flags;
5199
5200	/*
5201	 * We can't change the weight of the root cgroup.
5202	 */
5203	if (!tg->se[0])
5204		return -EINVAL;
5205
5206	shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
5207
5208	mutex_lock(&shares_mutex);
5209	if (tg->shares == shares)
5210		goto done;
5211
5212	tg->shares = shares;
5213	for_each_possible_cpu(i) {
5214		struct rq *rq = cpu_rq(i);
5215		struct sched_entity *se;
 
5216
5217		se = tg->se[i];
5218		/* Propagate contribution to hierarchy */
5219		raw_spin_lock_irqsave(&rq->lock, flags);
5220		for_each_sched_entity(se)
5221			update_cfs_shares(group_cfs_rq(se));
5222		raw_spin_unlock_irqrestore(&rq->lock, flags);
 
 
 
5223	}
5224
5225done:
5226	mutex_unlock(&shares_mutex);
5227	return 0;
5228}
5229#else /* CONFIG_FAIR_GROUP_SCHED */
5230
/* !CONFIG_FAIR_GROUP_SCHED: nothing was allocated, nothing to free. */
void free_fair_sched_group(struct task_group *tg)
{
}

/* !CONFIG_FAIR_GROUP_SCHED: nothing to allocate; always reports success. */
int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
{
	return 1;
}

/* !CONFIG_FAIR_GROUP_SCHED: nothing to unregister. */
void unregister_fair_sched_group(struct task_group *tg, int cpu)
{
}
 
 
5239
5240#endif /* CONFIG_FAIR_GROUP_SCHED */
5241
5242
5243static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
5244{
5245	struct sched_entity *se = &task->se;
5246	unsigned int rr_interval = 0;
5247
5248	/*
5249	 * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
5250	 * idle runqueue:
5251	 */
5252	if (rq->cfs.load.weight)
5253		rr_interval = NS_TO_JIFFIES(sched_slice(&rq->cfs, se));
5254
5255	return rr_interval;
5256}
5257
5258/*
5259 * All the scheduling class methods:
5260 */
5261const struct sched_class fair_sched_class = {
5262	.next			= &idle_sched_class,
5263	.enqueue_task		= enqueue_task_fair,
5264	.dequeue_task		= dequeue_task_fair,
5265	.yield_task		= yield_task_fair,
5266	.yield_to_task		= yield_to_task_fair,
5267
5268	.check_preempt_curr	= check_preempt_wakeup,
5269
5270	.pick_next_task		= pick_next_task_fair,
5271	.put_prev_task		= put_prev_task_fair,
 
5272
5273#ifdef CONFIG_SMP
 
5274	.select_task_rq		= select_task_rq_fair,
 
5275
5276	.rq_online		= rq_online_fair,
5277	.rq_offline		= rq_offline_fair,
5278
5279	.task_waking		= task_waking_fair,
 
5280#endif
5281
5282	.set_curr_task          = set_curr_task_fair,
5283	.task_tick		= task_tick_fair,
5284	.task_fork		= task_fork_fair,
5285
5286	.prio_changed		= prio_changed_fair,
5287	.switched_from		= switched_from_fair,
5288	.switched_to		= switched_to_fair,
5289
5290	.get_rr_interval	= get_rr_interval_fair,
5291
 
 
5292#ifdef CONFIG_FAIR_GROUP_SCHED
5293	.task_move_group	= task_move_group_fair,
 
 
 
 
5294#endif
5295};
5296
5297#ifdef CONFIG_SCHED_DEBUG
5298void print_cfs_stats(struct seq_file *m, int cpu)
5299{
5300	struct cfs_rq *cfs_rq;
5301
5302	rcu_read_lock();
5303	for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
5304		print_cfs_rq(m, cpu, cfs_rq);
5305	rcu_read_unlock();
5306}
5307#endif
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5308
5309__init void init_sched_fair_class(void)
5310{
5311#ifdef CONFIG_SMP
5312	open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
5313
5314#ifdef CONFIG_NO_HZ
5315	nohz.next_balance = jiffies;
 
5316	zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
5317	cpu_notifier(sched_ilb_notifier, 0);
5318#endif
5319#endif /* SMP */
5320
5321}