   1/*
   2 * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR
   3 * policies)
   4 */
   5
   6#ifdef CONFIG_RT_GROUP_SCHED
   7
   8#define rt_entity_is_task(rt_se) (!(rt_se)->my_q)
   9
  10static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
  11{
  12#ifdef CONFIG_SCHED_DEBUG
  13	WARN_ON_ONCE(!rt_entity_is_task(rt_se));
  14#endif
  15	return container_of(rt_se, struct task_struct, rt);
  16}
  17
  18static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
  19{
  20	return rt_rq->rq;
  21}
  22
  23static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
  24{
  25	return rt_se->rt_rq;
  26}
  27
  28#else /* CONFIG_RT_GROUP_SCHED */
  29
  30#define rt_entity_is_task(rt_se) (1)
  31
  32static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
  33{
  34	return container_of(rt_se, struct task_struct, rt);
  35}
  36
  37static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
  38{
  39	return container_of(rt_rq, struct rq, rt);
  40}
  41
  42static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
  43{
  44	struct task_struct *p = rt_task_of(rt_se);
  45	struct rq *rq = task_rq(p);
  46
  47	return &rq->rt;
  48}
  49
  50#endif /* CONFIG_RT_GROUP_SCHED */
  51
  52#ifdef CONFIG_SMP
  53
  54static inline int rt_overloaded(struct rq *rq)
  55{
  56	return atomic_read(&rq->rd->rto_count);
  57}
  58
  59static inline void rt_set_overload(struct rq *rq)
  60{
  61	if (!rq->online)
  62		return;
  63
  64	cpumask_set_cpu(rq->cpu, rq->rd->rto_mask);
  65	/*
  66	 * Make sure the mask is visible before we set
  67	 * the overload count. That is checked to determine
  68	 * if we should look at the mask. It would be a shame
  69	 * if we looked at the mask, but the mask was not
  70	 * updated yet.
  71	 */
  72	wmb();
  73	atomic_inc(&rq->rd->rto_count);
  74}
  75
  76static inline void rt_clear_overload(struct rq *rq)
  77{
  78	if (!rq->online)
  79		return;
  80
  81	/* the order here really doesn't matter */
  82	atomic_dec(&rq->rd->rto_count);
  83	cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask);
  84}
  85
  86static void update_rt_migration(struct rt_rq *rt_rq)
  87{
  88	if (rt_rq->rt_nr_migratory && rt_rq->rt_nr_total > 1) {
  89		if (!rt_rq->overloaded) {
  90			rt_set_overload(rq_of_rt_rq(rt_rq));
  91			rt_rq->overloaded = 1;
  92		}
  93	} else if (rt_rq->overloaded) {
  94		rt_clear_overload(rq_of_rt_rq(rt_rq));
  95		rt_rq->overloaded = 0;
  96	}
  97}
  98
  99static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 100{
 101	if (!rt_entity_is_task(rt_se))
 102		return;
 103
 104	rt_rq = &rq_of_rt_rq(rt_rq)->rt;
 105
 106	rt_rq->rt_nr_total++;
 107	if (rt_se->nr_cpus_allowed > 1)
 108		rt_rq->rt_nr_migratory++;
 109
 110	update_rt_migration(rt_rq);
 111}
 112
 113static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 114{
 115	if (!rt_entity_is_task(rt_se))
 116		return;
 117
 118	rt_rq = &rq_of_rt_rq(rt_rq)->rt;
 119
 120	rt_rq->rt_nr_total--;
 121	if (rt_se->nr_cpus_allowed > 1)
 122		rt_rq->rt_nr_migratory--;
 123
 124	update_rt_migration(rt_rq);
 125}
 126
 127static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
 128{
 129	plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
 130	plist_node_init(&p->pushable_tasks, p->prio);
 131	plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks);
 132}
 133
 134static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
 135{
 136	plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
 137}
 138
 139static inline int has_pushable_tasks(struct rq *rq)
 140{
 141	return !plist_head_empty(&rq->rt.pushable_tasks);
 142}
 143
 144#else
 145
 146static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
 147{
 148}
 149
 150static inline void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
 151{
 152}
 153
 154static inline
 155void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 156{
 157}
 158
 159static inline
 160void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 161{
 162}
 163
 164#endif /* CONFIG_SMP */
 165
 166static inline int on_rt_rq(struct sched_rt_entity *rt_se)
 167{
 168	return !list_empty(&rt_se->run_list);
 169}
 170
 171#ifdef CONFIG_RT_GROUP_SCHED
 172
 173static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
 174{
 175	if (!rt_rq->tg)
 176		return RUNTIME_INF;
 177
 178	return rt_rq->rt_runtime;
 179}
 180
 181static inline u64 sched_rt_period(struct rt_rq *rt_rq)
 182{
 183	return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period);
 184}
 185
 186typedef struct task_group *rt_rq_iter_t;
 187
 188static inline struct task_group *next_task_group(struct task_group *tg)
 189{
 190	do {
 191		tg = list_entry_rcu(tg->list.next,
 192			typeof(struct task_group), list);
 193	} while (&tg->list != &task_groups && task_group_is_autogroup(tg));
 194
 195	if (&tg->list == &task_groups)
 196		tg = NULL;
 197
 198	return tg;
 199}
 200
 201#define for_each_rt_rq(rt_rq, iter, rq)					\
 202	for (iter = container_of(&task_groups, typeof(*iter), list);	\
 203		(iter = next_task_group(iter)) &&			\
 204		(rt_rq = iter->rt_rq[cpu_of(rq)]);)
 205
 206static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq)
 207{
 208	list_add_rcu(&rt_rq->leaf_rt_rq_list,
 209			&rq_of_rt_rq(rt_rq)->leaf_rt_rq_list);
 210}
 211
 212static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq)
 213{
 214	list_del_rcu(&rt_rq->leaf_rt_rq_list);
 215}
 216
 217#define for_each_leaf_rt_rq(rt_rq, rq) \
 218	list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list)
 219
 220#define for_each_sched_rt_entity(rt_se) \
 221	for (; rt_se; rt_se = rt_se->parent)
 222
 223static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
 224{
 225	return rt_se->my_q;
 226}
 227
 228static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head);
 229static void dequeue_rt_entity(struct sched_rt_entity *rt_se);
 230
 231static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
 232{
 233	struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
 234	struct sched_rt_entity *rt_se;
 235
 236	int cpu = cpu_of(rq_of_rt_rq(rt_rq));
 237
 238	rt_se = rt_rq->tg->rt_se[cpu];
 239
 240	if (rt_rq->rt_nr_running) {
 241		if (rt_se && !on_rt_rq(rt_se))
 242			enqueue_rt_entity(rt_se, false);
 243		if (rt_rq->highest_prio.curr < curr->prio)
 244			resched_task(curr);
 245	}
 246}
 247
 248static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
 249{
 250	struct sched_rt_entity *rt_se;
 251	int cpu = cpu_of(rq_of_rt_rq(rt_rq));
 252
 253	rt_se = rt_rq->tg->rt_se[cpu];
 254
 255	if (rt_se && on_rt_rq(rt_se))
 256		dequeue_rt_entity(rt_se);
 257}
 258
 259static inline int rt_rq_throttled(struct rt_rq *rt_rq)
 260{
 261	return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
 262}
 263
 264static int rt_se_boosted(struct sched_rt_entity *rt_se)
 265{
 266	struct rt_rq *rt_rq = group_rt_rq(rt_se);
 267	struct task_struct *p;
 268
 269	if (rt_rq)
 270		return !!rt_rq->rt_nr_boosted;
 271
 272	p = rt_task_of(rt_se);
 273	return p->prio != p->normal_prio;
 274}
 275
 276#ifdef CONFIG_SMP
 277static inline const struct cpumask *sched_rt_period_mask(void)
 278{
 279	return cpu_rq(smp_processor_id())->rd->span;
 280}
 281#else
 282static inline const struct cpumask *sched_rt_period_mask(void)
 283{
 284	return cpu_online_mask;
 285}
 286#endif
 287
 288static inline
 289struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
 290{
 291	return container_of(rt_b, struct task_group, rt_bandwidth)->rt_rq[cpu];
 292}
 293
 294static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
 295{
 296	return &rt_rq->tg->rt_bandwidth;
 297}
 298
 299#else /* !CONFIG_RT_GROUP_SCHED */
 300
 301static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
 302{
 303	return rt_rq->rt_runtime;
 304}
 305
 306static inline u64 sched_rt_period(struct rt_rq *rt_rq)
 307{
 308	return ktime_to_ns(def_rt_bandwidth.rt_period);
 309}
 310
 311typedef struct rt_rq *rt_rq_iter_t;
 312
 313#define for_each_rt_rq(rt_rq, iter, rq) \
 314	for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
 315
 316static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq)
 317{
 318}
 319
 320static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq)
 321{
 322}
 323
 324#define for_each_leaf_rt_rq(rt_rq, rq) \
 325	for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
 326
 327#define for_each_sched_rt_entity(rt_se) \
 328	for (; rt_se; rt_se = NULL)
 329
 330static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
 331{
 332	return NULL;
 333}
 334
 335static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
 336{
 337	if (rt_rq->rt_nr_running)
 338		resched_task(rq_of_rt_rq(rt_rq)->curr);
 339}
 340
 341static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
 342{
 343}
 344
 345static inline int rt_rq_throttled(struct rt_rq *rt_rq)
 346{
 347	return rt_rq->rt_throttled;
 348}
 349
 350static inline const struct cpumask *sched_rt_period_mask(void)
 351{
 352	return cpu_online_mask;
 353}
 354
 355static inline
 356struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
 357{
 358	return &cpu_rq(cpu)->rt;
 359}
 360
 361static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
 362{
 363	return &def_rt_bandwidth;
 364}
 365
 366#endif /* CONFIG_RT_GROUP_SCHED */
 367
 368#ifdef CONFIG_SMP
 369/*
  370 * We ran out of runtime; see if we can borrow some from our neighbours.
 371 */
 372static int do_balance_runtime(struct rt_rq *rt_rq)
 373{
 374	struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
 375	struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
 376	int i, weight, more = 0;
 377	u64 rt_period;
 378
 379	weight = cpumask_weight(rd->span);
 380
 381	raw_spin_lock(&rt_b->rt_runtime_lock);
 382	rt_period = ktime_to_ns(rt_b->rt_period);
 383	for_each_cpu(i, rd->span) {
 384		struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
 385		s64 diff;
 386
 387		if (iter == rt_rq)
 388			continue;
 389
 390		raw_spin_lock(&iter->rt_runtime_lock);
 391		/*
 392		 * Either all rqs have inf runtime and there's nothing to steal
 393		 * or __disable_runtime() below sets a specific rq to inf to
  394		 * indicate it's been disabled and disallow stealing.
 395		 */
 396		if (iter->rt_runtime == RUNTIME_INF)
 397			goto next;
 398
 399		/*
 400		 * From runqueues with spare time, take 1/n part of their
 401		 * spare time, but no more than our period.
 402		 */
 403		diff = iter->rt_runtime - iter->rt_time;
 404		if (diff > 0) {
 405			diff = div_u64((u64)diff, weight);
 406			if (rt_rq->rt_runtime + diff > rt_period)
 407				diff = rt_period - rt_rq->rt_runtime;
 408			iter->rt_runtime -= diff;
 409			rt_rq->rt_runtime += diff;
 410			more = 1;
 411			if (rt_rq->rt_runtime == rt_period) {
 412				raw_spin_unlock(&iter->rt_runtime_lock);
 413				break;
 414			}
 415		}
 416next:
 417		raw_spin_unlock(&iter->rt_runtime_lock);
 418	}
 419	raw_spin_unlock(&rt_b->rt_runtime_lock);
 420
 421	return more;
 422}
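/*
 * Worked example (illustrative numbers only, not taken from any particular
 * configuration): on a 4-CPU root domain, weight == 4.  A neighbour that
 * was granted rt_runtime = 950000000 ns and has consumed rt_time =
 * 650000000 ns has diff = 300000000 ns to spare, so we take
 * 300000000 / 4 = 75000000 ns from it, clamped so that our own
 * rt_runtime never exceeds rt_period.
 */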
 423
 424/*
  425 * Ensure this RQ takes back all the runtime it lent to its neighbours.
 426 */
 427static void __disable_runtime(struct rq *rq)
 428{
 429	struct root_domain *rd = rq->rd;
 430	rt_rq_iter_t iter;
 431	struct rt_rq *rt_rq;
 432
 433	if (unlikely(!scheduler_running))
 434		return;
 435
 436	for_each_rt_rq(rt_rq, iter, rq) {
 437		struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
 438		s64 want;
 439		int i;
 440
 441		raw_spin_lock(&rt_b->rt_runtime_lock);
 442		raw_spin_lock(&rt_rq->rt_runtime_lock);
 443		/*
 444		 * Either we're all inf and nobody needs to borrow, or we're
 445		 * already disabled and thus have nothing to do, or we have
 446		 * exactly the right amount of runtime to take out.
 447		 */
 448		if (rt_rq->rt_runtime == RUNTIME_INF ||
 449				rt_rq->rt_runtime == rt_b->rt_runtime)
 450			goto balanced;
 451		raw_spin_unlock(&rt_rq->rt_runtime_lock);
 452
 453		/*
 454		 * Calculate the difference between what we started out with
  455		 * and what we currently have; that's the amount of runtime
  456		 * we lent and now have to reclaim.
 457		 */
 458		want = rt_b->rt_runtime - rt_rq->rt_runtime;
 459
 460		/*
 461		 * Greedy reclaim, take back as much as we can.
 462		 */
 463		for_each_cpu(i, rd->span) {
 464			struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
 465			s64 diff;
 466
 467			/*
 468			 * Can't reclaim from ourselves or disabled runqueues.
 469			 */
 470			if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF)
 471				continue;
 472
 473			raw_spin_lock(&iter->rt_runtime_lock);
 474			if (want > 0) {
 475				diff = min_t(s64, iter->rt_runtime, want);
 476				iter->rt_runtime -= diff;
 477				want -= diff;
 478			} else {
 479				iter->rt_runtime -= want;
 480				want -= want;
 481			}
 482			raw_spin_unlock(&iter->rt_runtime_lock);
 483
 484			if (!want)
 485				break;
 486		}
 487
 488		raw_spin_lock(&rt_rq->rt_runtime_lock);
 489		/*
 490		 * We cannot be left wanting - that would mean some runtime
 491		 * leaked out of the system.
 492		 */
 493		BUG_ON(want);
 494balanced:
 495		/*
 496		 * Disable all the borrow logic by pretending we have inf
 497		 * runtime - in which case borrowing doesn't make sense.
 498		 */
 499		rt_rq->rt_runtime = RUNTIME_INF;
 500		raw_spin_unlock(&rt_rq->rt_runtime_lock);
 501		raw_spin_unlock(&rt_b->rt_runtime_lock);
 502	}
 503}
 504
 505static void disable_runtime(struct rq *rq)
 506{
 507	unsigned long flags;
 508
 509	raw_spin_lock_irqsave(&rq->lock, flags);
 510	__disable_runtime(rq);
 511	raw_spin_unlock_irqrestore(&rq->lock, flags);
 512}
 513
 514static void __enable_runtime(struct rq *rq)
 515{
 516	rt_rq_iter_t iter;
 517	struct rt_rq *rt_rq;
 518
 519	if (unlikely(!scheduler_running))
 520		return;
 521
 522	/*
 523	 * Reset each runqueue's bandwidth settings
 524	 */
 525	for_each_rt_rq(rt_rq, iter, rq) {
 526		struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
 527
 528		raw_spin_lock(&rt_b->rt_runtime_lock);
 529		raw_spin_lock(&rt_rq->rt_runtime_lock);
 530		rt_rq->rt_runtime = rt_b->rt_runtime;
 531		rt_rq->rt_time = 0;
 532		rt_rq->rt_throttled = 0;
 533		raw_spin_unlock(&rt_rq->rt_runtime_lock);
 534		raw_spin_unlock(&rt_b->rt_runtime_lock);
 535	}
 536}
 537
 538static void enable_runtime(struct rq *rq)
 539{
 540	unsigned long flags;
 541
 542	raw_spin_lock_irqsave(&rq->lock, flags);
 543	__enable_runtime(rq);
 544	raw_spin_unlock_irqrestore(&rq->lock, flags);
 545}
 546
 547static int balance_runtime(struct rt_rq *rt_rq)
 548{
 549	int more = 0;
 550
 551	if (rt_rq->rt_time > rt_rq->rt_runtime) {
 552		raw_spin_unlock(&rt_rq->rt_runtime_lock);
 553		more = do_balance_runtime(rt_rq);
 554		raw_spin_lock(&rt_rq->rt_runtime_lock);
 555	}
 556
 557	return more;
 558}
 559#else /* !CONFIG_SMP */
 560static inline int balance_runtime(struct rt_rq *rt_rq)
 561{
 562	return 0;
 563}
 564#endif /* CONFIG_SMP */
 565
 566static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
 567{
 568	int i, idle = 1;
 569	const struct cpumask *span;
 570
 571	if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
 572		return 1;
 573
 574	span = sched_rt_period_mask();
 575	for_each_cpu(i, span) {
 576		int enqueue = 0;
 577		struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
 578		struct rq *rq = rq_of_rt_rq(rt_rq);
 579
 580		raw_spin_lock(&rq->lock);
 581		if (rt_rq->rt_time) {
 582			u64 runtime;
 583
 584			raw_spin_lock(&rt_rq->rt_runtime_lock);
 585			if (rt_rq->rt_throttled)
 586				balance_runtime(rt_rq);
 587			runtime = rt_rq->rt_runtime;
 588			rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime);
 589			if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
 590				rt_rq->rt_throttled = 0;
 591				enqueue = 1;
 592
 593				/*
 594				 * Force a clock update if the CPU was idle,
 595				 * lest wakeup -> unthrottle time accumulate.
 596				 */
 597				if (rt_rq->rt_nr_running && rq->curr == rq->idle)
 598					rq->skip_clock_update = -1;
 599			}
 600			if (rt_rq->rt_time || rt_rq->rt_nr_running)
 601				idle = 0;
 602			raw_spin_unlock(&rt_rq->rt_runtime_lock);
 603		} else if (rt_rq->rt_nr_running) {
 604			idle = 0;
 605			if (!rt_rq_throttled(rt_rq))
 606				enqueue = 1;
 607		}
 608
 609		if (enqueue)
 610			sched_rt_rq_enqueue(rt_rq);
 611		raw_spin_unlock(&rq->lock);
 612	}
 613
 614	return idle;
 615}
 616
 617static inline int rt_se_prio(struct sched_rt_entity *rt_se)
 618{
 619#ifdef CONFIG_RT_GROUP_SCHED
 620	struct rt_rq *rt_rq = group_rt_rq(rt_se);
 621
 622	if (rt_rq)
 623		return rt_rq->highest_prio.curr;
 624#endif
 625
 626	return rt_task_of(rt_se)->prio;
 627}
 628
 629static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
 630{
 631	u64 runtime = sched_rt_runtime(rt_rq);
 632
 633	if (rt_rq->rt_throttled)
 634		return rt_rq_throttled(rt_rq);
 635
 636	if (sched_rt_runtime(rt_rq) >= sched_rt_period(rt_rq))
 637		return 0;
 638
 639	balance_runtime(rt_rq);
 640	runtime = sched_rt_runtime(rt_rq);
 641	if (runtime == RUNTIME_INF)
 642		return 0;
 643
 644	if (rt_rq->rt_time > runtime) {
 645		rt_rq->rt_throttled = 1;
 646		if (rt_rq_throttled(rt_rq)) {
 647			sched_rt_rq_dequeue(rt_rq);
 648			return 1;
 649		}
 650	}
 651
 652	return 0;
 653}
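/*
 * Note: with the usual defaults the limits behind this check correspond
 * to /proc/sys/kernel/sched_rt_period_us = 1000000 and
 * /proc/sys/kernel/sched_rt_runtime_us = 950000, i.e. the RT classes may
 * run for at most 0.95 s out of every 1 s before being throttled,
 * reserving roughly 5% of CPU time for non-RT tasks.
 */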
 654
 655/*
 656 * Update the current task's runtime statistics. Skip current tasks that
 657 * are not in our scheduling class.
 658 */
 659static void update_curr_rt(struct rq *rq)
 660{
 661	struct task_struct *curr = rq->curr;
 662	struct sched_rt_entity *rt_se = &curr->rt;
 663	struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
 664	u64 delta_exec;
 665
 666	if (curr->sched_class != &rt_sched_class)
 667		return;
 668
 669	delta_exec = rq->clock_task - curr->se.exec_start;
 670	if (unlikely((s64)delta_exec < 0))
 671		delta_exec = 0;
 672
 673	schedstat_set(curr->se.statistics.exec_max, max(curr->se.statistics.exec_max, delta_exec));
 674
 675	curr->se.sum_exec_runtime += delta_exec;
 676	account_group_exec_runtime(curr, delta_exec);
 677
 678	curr->se.exec_start = rq->clock_task;
 679	cpuacct_charge(curr, delta_exec);
 680
 681	sched_rt_avg_update(rq, delta_exec);
 682
 683	if (!rt_bandwidth_enabled())
 684		return;
 685
 686	for_each_sched_rt_entity(rt_se) {
 687		rt_rq = rt_rq_of_se(rt_se);
 688
 689		if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
 690			raw_spin_lock(&rt_rq->rt_runtime_lock);
 691			rt_rq->rt_time += delta_exec;
 692			if (sched_rt_runtime_exceeded(rt_rq))
 693				resched_task(curr);
 694			raw_spin_unlock(&rt_rq->rt_runtime_lock);
 695		}
 696	}
 697}
 698
 699#if defined CONFIG_SMP
 700
 701static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu);
 702
 703static inline int next_prio(struct rq *rq)
 704{
 705	struct task_struct *next = pick_next_highest_task_rt(rq, rq->cpu);
 706
 707	if (next && rt_prio(next->prio))
 708		return next->prio;
 709	else
 710		return MAX_RT_PRIO;
 711}
 712
 713static void
 714inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
 715{
 716	struct rq *rq = rq_of_rt_rq(rt_rq);
 717
 718	if (prio < prev_prio) {
 719
 720		/*
 721		 * If the new task is higher in priority than anything on the
 722		 * run-queue, we know that the previous high becomes our
 723		 * next-highest.
 724		 */
 725		rt_rq->highest_prio.next = prev_prio;
 726
 727		if (rq->online)
 728			cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
 729
 730	} else if (prio == rt_rq->highest_prio.curr)
 731		/*
 732		 * If the next task is equal in priority to the highest on
 733		 * the run-queue, then we implicitly know that the next highest
 734		 * task cannot be any lower than current
 735		 */
 736		rt_rq->highest_prio.next = prio;
 737	else if (prio < rt_rq->highest_prio.next)
 738		/*
 739		 * Otherwise, we need to recompute next-highest
 740		 */
 741		rt_rq->highest_prio.next = next_prio(rq);
 742}
 743
 744static void
 745dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
 746{
 747	struct rq *rq = rq_of_rt_rq(rt_rq);
 748
 749	if (rt_rq->rt_nr_running && (prio <= rt_rq->highest_prio.next))
 750		rt_rq->highest_prio.next = next_prio(rq);
 751
 752	if (rq->online && rt_rq->highest_prio.curr != prev_prio)
 753		cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr);
 754}
 755
 756#else /* CONFIG_SMP */
 757
 758static inline
 759void inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
 760static inline
 761void dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
 762
 763#endif /* CONFIG_SMP */
 764
 765#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
 766static void
 767inc_rt_prio(struct rt_rq *rt_rq, int prio)
 768{
 769	int prev_prio = rt_rq->highest_prio.curr;
 770
 771	if (prio < prev_prio)
 772		rt_rq->highest_prio.curr = prio;
 773
 774	inc_rt_prio_smp(rt_rq, prio, prev_prio);
 775}
 776
 777static void
 778dec_rt_prio(struct rt_rq *rt_rq, int prio)
 779{
 780	int prev_prio = rt_rq->highest_prio.curr;
 781
 782	if (rt_rq->rt_nr_running) {
 783
 784		WARN_ON(prio < prev_prio);
 785
 786		/*
 787		 * This may have been our highest task, and therefore
 788		 * we may have some recomputation to do
 789		 */
 790		if (prio == prev_prio) {
 791			struct rt_prio_array *array = &rt_rq->active;
 792
 793			rt_rq->highest_prio.curr =
 794				sched_find_first_bit(array->bitmap);
 795		}
 796
 797	} else
 798		rt_rq->highest_prio.curr = MAX_RT_PRIO;
 799
 800	dec_rt_prio_smp(rt_rq, prio, prev_prio);
 801}
 802
 803#else
 804
 805static inline void inc_rt_prio(struct rt_rq *rt_rq, int prio) {}
 806static inline void dec_rt_prio(struct rt_rq *rt_rq, int prio) {}
 807
 808#endif /* CONFIG_SMP || CONFIG_RT_GROUP_SCHED */
 809
 810#ifdef CONFIG_RT_GROUP_SCHED
 811
 812static void
 813inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 814{
 815	if (rt_se_boosted(rt_se))
 816		rt_rq->rt_nr_boosted++;
 817
 818	if (rt_rq->tg)
 819		start_rt_bandwidth(&rt_rq->tg->rt_bandwidth);
 820}
 821
 822static void
 823dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 824{
 825	if (rt_se_boosted(rt_se))
 826		rt_rq->rt_nr_boosted--;
 827
 828	WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted);
 829}
 830
 831#else /* CONFIG_RT_GROUP_SCHED */
 832
 833static void
 834inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 835{
 836	start_rt_bandwidth(&def_rt_bandwidth);
 837}
 838
 839static inline
 840void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {}
 841
 842#endif /* CONFIG_RT_GROUP_SCHED */
 843
 844static inline
 845void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 846{
 847	int prio = rt_se_prio(rt_se);
 848
 849	WARN_ON(!rt_prio(prio));
 850	rt_rq->rt_nr_running++;
 851
 852	inc_rt_prio(rt_rq, prio);
 853	inc_rt_migration(rt_se, rt_rq);
 854	inc_rt_group(rt_se, rt_rq);
 855}
 856
 857static inline
 858void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 859{
 860	WARN_ON(!rt_prio(rt_se_prio(rt_se)));
 861	WARN_ON(!rt_rq->rt_nr_running);
 862	rt_rq->rt_nr_running--;
 863
 864	dec_rt_prio(rt_rq, rt_se_prio(rt_se));
 865	dec_rt_migration(rt_se, rt_rq);
 866	dec_rt_group(rt_se, rt_rq);
 867}
 868
 869static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
 870{
 871	struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
 872	struct rt_prio_array *array = &rt_rq->active;
 873	struct rt_rq *group_rq = group_rt_rq(rt_se);
 874	struct list_head *queue = array->queue + rt_se_prio(rt_se);
 875
 876	/*
  877	 * Don't enqueue the group if it's throttled, or when empty.
  878	 * The latter is a consequence of the former when a child group
  879	 * gets throttled and the current group doesn't have any other
 880	 * active members.
 881	 */
 882	if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running))
 883		return;
 884
 885	if (!rt_rq->rt_nr_running)
 886		list_add_leaf_rt_rq(rt_rq);
 887
 888	if (head)
 889		list_add(&rt_se->run_list, queue);
 890	else
 891		list_add_tail(&rt_se->run_list, queue);
 892	__set_bit(rt_se_prio(rt_se), array->bitmap);
 893
 894	inc_rt_tasks(rt_se, rt_rq);
 895}
 896
 897static void __dequeue_rt_entity(struct sched_rt_entity *rt_se)
 898{
 899	struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
 900	struct rt_prio_array *array = &rt_rq->active;
 901
 902	list_del_init(&rt_se->run_list);
 903	if (list_empty(array->queue + rt_se_prio(rt_se)))
 904		__clear_bit(rt_se_prio(rt_se), array->bitmap);
 905
 906	dec_rt_tasks(rt_se, rt_rq);
 907	if (!rt_rq->rt_nr_running)
 908		list_del_leaf_rt_rq(rt_rq);
 909}
 910
 911/*
 912 * Because the prio of an upper entry depends on the lower
  913 * entries, we must remove entries top-down.
 914 */
 915static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
 916{
 917	struct sched_rt_entity *back = NULL;
 918
 919	for_each_sched_rt_entity(rt_se) {
 920		rt_se->back = back;
 921		back = rt_se;
 922	}
 923
 924	for (rt_se = back; rt_se; rt_se = rt_se->back) {
 925		if (on_rt_rq(rt_se))
 926			__dequeue_rt_entity(rt_se);
 927	}
 928}
 929
 930static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
 931{
 932	dequeue_rt_stack(rt_se);
 933	for_each_sched_rt_entity(rt_se)
 934		__enqueue_rt_entity(rt_se, head);
 935}
 936
 937static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
 938{
 939	dequeue_rt_stack(rt_se);
 940
 941	for_each_sched_rt_entity(rt_se) {
 942		struct rt_rq *rt_rq = group_rt_rq(rt_se);
 943
 944		if (rt_rq && rt_rq->rt_nr_running)
 945			__enqueue_rt_entity(rt_se, false);
 946	}
 947}
 948
 949/*
 950 * Adding/removing a task to/from a priority array:
 951 */
 952static void
 953enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
 954{
 955	struct sched_rt_entity *rt_se = &p->rt;
 956
 957	if (flags & ENQUEUE_WAKEUP)
 958		rt_se->timeout = 0;
 959
 960	enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD);
 961
 962	if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1)
 963		enqueue_pushable_task(rq, p);
 964}
 965
 966static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
 967{
 968	struct sched_rt_entity *rt_se = &p->rt;
 969
 970	update_curr_rt(rq);
 971	dequeue_rt_entity(rt_se);
 972
 973	dequeue_pushable_task(rq, p);
 974}
 975
 976/*
  977 * Put the task at the end of the run list without the overhead of dequeue
 978 * followed by enqueue.
 979 */
 980static void
 981requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int head)
 982{
 983	if (on_rt_rq(rt_se)) {
 984		struct rt_prio_array *array = &rt_rq->active;
 985		struct list_head *queue = array->queue + rt_se_prio(rt_se);
 986
 987		if (head)
 988			list_move(&rt_se->run_list, queue);
 989		else
 990			list_move_tail(&rt_se->run_list, queue);
 991	}
 992}
 993
 994static void requeue_task_rt(struct rq *rq, struct task_struct *p, int head)
 995{
 996	struct sched_rt_entity *rt_se = &p->rt;
 997	struct rt_rq *rt_rq;
 998
 999	for_each_sched_rt_entity(rt_se) {
1000		rt_rq = rt_rq_of_se(rt_se);
1001		requeue_rt_entity(rt_rq, rt_se, head);
1002	}
1003}
1004
1005static void yield_task_rt(struct rq *rq)
1006{
1007	requeue_task_rt(rq, rq->curr, 0);
1008}
1009
1010#ifdef CONFIG_SMP
1011static int find_lowest_rq(struct task_struct *task);
1012
1013static int
1014select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
1015{
1016	struct task_struct *curr;
1017	struct rq *rq;
1018	int cpu;
1019
1020	if (sd_flag != SD_BALANCE_WAKE)
1021		return smp_processor_id();
1022
1023	cpu = task_cpu(p);
1024	rq = cpu_rq(cpu);
1025
1026	rcu_read_lock();
1027	curr = ACCESS_ONCE(rq->curr); /* unlocked access */
1028
1029	/*
1030	 * If the current task on @p's runqueue is an RT task, then
1031	 * try to see if we can wake this RT task up on another
1032	 * runqueue. Otherwise simply start this RT task
1033	 * on its current runqueue.
1034	 *
1035	 * We want to avoid overloading runqueues. If the woken
1036	 * task is a higher priority, then it will stay on this CPU
1037	 * and the lower prio task should be moved to another CPU.
1038	 * Even though this will probably make the lower prio task
1039	 * lose its cache, we do not want to bounce a higher task
1040	 * around just because it gave up its CPU, perhaps for a
1041	 * lock?
1042	 *
1043	 * For equal prio tasks, we just let the scheduler sort it out.
1044	 *
1045	 * Otherwise, just let it ride on the affined RQ and the
1046	 * post-schedule router will push the preempted task away
1047	 *
1048	 * This test is optimistic, if we get it wrong the load-balancer
1049	 * will have to sort it out.
1050	 */
1051	if (curr && unlikely(rt_task(curr)) &&
1052	    (curr->rt.nr_cpus_allowed < 2 ||
1053	     curr->prio <= p->prio) &&
1054	    (p->rt.nr_cpus_allowed > 1)) {
1055		int target = find_lowest_rq(p);
1056
1057		if (target != -1)
1058			cpu = target;
1059	}
1060	rcu_read_unlock();
1061
1062	return cpu;
1063}
1064
1065static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
1066{
1067	if (rq->curr->rt.nr_cpus_allowed == 1)
1068		return;
1069
1070	if (p->rt.nr_cpus_allowed != 1
1071	    && cpupri_find(&rq->rd->cpupri, p, NULL))
1072		return;
1073
1074	if (!cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
1075		return;
1076
1077	/*
 1078	 * There appear to be other CPUs that can accept
 1079	 * current and none to run 'p', so let's reschedule
1080	 * to try and push current away:
1081	 */
1082	requeue_task_rt(rq, p, 1);
1083	resched_task(rq->curr);
1084}
1085
1086#endif /* CONFIG_SMP */
1087
1088/*
1089 * Preempt the current task with a newly woken task if needed:
1090 */
1091static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags)
1092{
1093	if (p->prio < rq->curr->prio) {
1094		resched_task(rq->curr);
1095		return;
1096	}
1097
1098#ifdef CONFIG_SMP
1099	/*
1100	 * If:
1101	 *
1102	 * - the newly woken task is of equal priority to the current task
1103	 * - the newly woken task is non-migratable while current is migratable
1104	 * - current will be preempted on the next reschedule
1105	 *
1106	 * we should check to see if current can readily move to a different
1107	 * cpu.  If so, we will reschedule to allow the push logic to try
1108	 * to move current somewhere else, making room for our non-migratable
1109	 * task.
1110	 */
1111	if (p->prio == rq->curr->prio && !test_tsk_need_resched(rq->curr))
1112		check_preempt_equal_prio(rq, p);
1113#endif
1114}
1115
1116static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
1117						   struct rt_rq *rt_rq)
1118{
1119	struct rt_prio_array *array = &rt_rq->active;
1120	struct sched_rt_entity *next = NULL;
1121	struct list_head *queue;
1122	int idx;
1123
1124	idx = sched_find_first_bit(array->bitmap);
1125	BUG_ON(idx >= MAX_RT_PRIO);
1126
1127	queue = array->queue + idx;
1128	next = list_entry(queue->next, struct sched_rt_entity, run_list);
1129
1130	return next;
1131}
1132
1133static struct task_struct *_pick_next_task_rt(struct rq *rq)
1134{
1135	struct sched_rt_entity *rt_se;
1136	struct task_struct *p;
1137	struct rt_rq *rt_rq;
1138
1139	rt_rq = &rq->rt;
1140
1141	if (!rt_rq->rt_nr_running)
1142		return NULL;
1143
1144	if (rt_rq_throttled(rt_rq))
1145		return NULL;
1146
1147	do {
1148		rt_se = pick_next_rt_entity(rq, rt_rq);
1149		BUG_ON(!rt_se);
1150		rt_rq = group_rt_rq(rt_se);
1151	} while (rt_rq);
1152
1153	p = rt_task_of(rt_se);
1154	p->se.exec_start = rq->clock_task;
1155
1156	return p;
1157}
1158
1159static struct task_struct *pick_next_task_rt(struct rq *rq)
1160{
1161	struct task_struct *p = _pick_next_task_rt(rq);
1162
1163	/* The running task is never eligible for pushing */
1164	if (p)
1165		dequeue_pushable_task(rq, p);
1166
1167#ifdef CONFIG_SMP
1168	/*
1169	 * We detect this state here so that we can avoid taking the RQ
1170	 * lock again later if there is no need to push
1171	 */
1172	rq->post_schedule = has_pushable_tasks(rq);
1173#endif
1174
1175	return p;
1176}
1177
1178static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
1179{
1180	update_curr_rt(rq);
1181	p->se.exec_start = 0;
1182
1183	/*
1184	 * The previous task needs to be made eligible for pushing
1185	 * if it is still active
1186	 */
1187	if (on_rt_rq(&p->rt) && p->rt.nr_cpus_allowed > 1)
1188		enqueue_pushable_task(rq, p);
1189}
1190
1191#ifdef CONFIG_SMP
1192
1193/* Only try algorithms three times */
1194#define RT_MAX_TRIES 3
1195
1196static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep);
1197
1198static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
1199{
1200	if (!task_running(rq, p) &&
1201	    (cpu < 0 || cpumask_test_cpu(cpu, &p->cpus_allowed)) &&
1202	    (p->rt.nr_cpus_allowed > 1))
1203		return 1;
1204	return 0;
1205}
1206
1207/* Return the second highest RT task, NULL otherwise */
1208static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
1209{
1210	struct task_struct *next = NULL;
1211	struct sched_rt_entity *rt_se;
1212	struct rt_prio_array *array;
1213	struct rt_rq *rt_rq;
1214	int idx;
1215
1216	for_each_leaf_rt_rq(rt_rq, rq) {
1217		array = &rt_rq->active;
1218		idx = sched_find_first_bit(array->bitmap);
1219next_idx:
1220		if (idx >= MAX_RT_PRIO)
1221			continue;
1222		if (next && next->prio < idx)
1223			continue;
1224		list_for_each_entry(rt_se, array->queue + idx, run_list) {
1225			struct task_struct *p;
1226
1227			if (!rt_entity_is_task(rt_se))
1228				continue;
1229
1230			p = rt_task_of(rt_se);
1231			if (pick_rt_task(rq, p, cpu)) {
1232				next = p;
1233				break;
1234			}
1235		}
1236		if (!next) {
1237			idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1);
1238			goto next_idx;
1239		}
1240	}
1241
1242	return next;
1243}
1244
1245static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);
1246
1247static int find_lowest_rq(struct task_struct *task)
1248{
1249	struct sched_domain *sd;
1250	struct cpumask *lowest_mask = __get_cpu_var(local_cpu_mask);
1251	int this_cpu = smp_processor_id();
1252	int cpu      = task_cpu(task);
1253
1254	/* Make sure the mask is initialized first */
1255	if (unlikely(!lowest_mask))
1256		return -1;
1257
1258	if (task->rt.nr_cpus_allowed == 1)
1259		return -1; /* No other targets possible */
1260
1261	if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
1262		return -1; /* No targets found */
1263
1264	/*
1265	 * At this point we have built a mask of cpus representing the
1266	 * lowest priority tasks in the system.  Now we want to elect
1267	 * the best one based on our affinity and topology.
1268	 *
1269	 * We prioritize the last cpu that the task executed on since
1270	 * it is most likely cache-hot in that location.
1271	 */
1272	if (cpumask_test_cpu(cpu, lowest_mask))
1273		return cpu;
1274
1275	/*
1276	 * Otherwise, we consult the sched_domains span maps to figure
1277	 * out which cpu is logically closest to our hot cache data.
1278	 */
1279	if (!cpumask_test_cpu(this_cpu, lowest_mask))
1280		this_cpu = -1; /* Skip this_cpu opt if not among lowest */
1281
1282	rcu_read_lock();
1283	for_each_domain(cpu, sd) {
1284		if (sd->flags & SD_WAKE_AFFINE) {
1285			int best_cpu;
1286
1287			/*
1288			 * "this_cpu" is cheaper to preempt than a
1289			 * remote processor.
1290			 */
1291			if (this_cpu != -1 &&
1292			    cpumask_test_cpu(this_cpu, sched_domain_span(sd))) {
1293				rcu_read_unlock();
1294				return this_cpu;
1295			}
1296
1297			best_cpu = cpumask_first_and(lowest_mask,
1298						     sched_domain_span(sd));
1299			if (best_cpu < nr_cpu_ids) {
1300				rcu_read_unlock();
1301				return best_cpu;
1302			}
1303		}
1304	}
1305	rcu_read_unlock();
1306
1307	/*
1308	 * And finally, if there were no matches within the domains
1309	 * just give the caller *something* to work with from the compatible
1310	 * locations.
1311	 */
1312	if (this_cpu != -1)
1313		return this_cpu;
1314
1315	cpu = cpumask_any(lowest_mask);
1316	if (cpu < nr_cpu_ids)
1317		return cpu;
1318	return -1;
1319}
1320
1321/* Will lock the rq it finds */
1322static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
1323{
1324	struct rq *lowest_rq = NULL;
1325	int tries;
1326	int cpu;
1327
1328	for (tries = 0; tries < RT_MAX_TRIES; tries++) {
1329		cpu = find_lowest_rq(task);
1330
1331		if ((cpu == -1) || (cpu == rq->cpu))
1332			break;
1333
1334		lowest_rq = cpu_rq(cpu);
1335
1336		/* if the prio of this runqueue changed, try again */
1337		if (double_lock_balance(rq, lowest_rq)) {
1338			/*
1339			 * We had to unlock the run queue. In
 1340			 * the meantime, the task could have
1341			 * migrated already or had its affinity changed.
1342			 * Also make sure that it wasn't scheduled on its rq.
1343			 */
1344			if (unlikely(task_rq(task) != rq ||
1345				     !cpumask_test_cpu(lowest_rq->cpu,
1346						       &task->cpus_allowed) ||
1347				     task_running(rq, task) ||
1348				     !task->on_rq)) {
1349
1350				raw_spin_unlock(&lowest_rq->lock);
1351				lowest_rq = NULL;
1352				break;
1353			}
1354		}
1355
1356		/* If this rq is still suitable use it. */
1357		if (lowest_rq->rt.highest_prio.curr > task->prio)
1358			break;
1359
1360		/* try again */
1361		double_unlock_balance(rq, lowest_rq);
1362		lowest_rq = NULL;
1363	}
1364
1365	return lowest_rq;
1366}
1367
1368static struct task_struct *pick_next_pushable_task(struct rq *rq)
1369{
1370	struct task_struct *p;
1371
1372	if (!has_pushable_tasks(rq))
1373		return NULL;
1374
1375	p = plist_first_entry(&rq->rt.pushable_tasks,
1376			      struct task_struct, pushable_tasks);
1377
1378	BUG_ON(rq->cpu != task_cpu(p));
1379	BUG_ON(task_current(rq, p));
1380	BUG_ON(p->rt.nr_cpus_allowed <= 1);
1381
1382	BUG_ON(!p->on_rq);
1383	BUG_ON(!rt_task(p));
1384
1385	return p;
1386}
1387
1388/*
 1389 * If the current CPU has more than one RT task, see if the
 1390 * non-running task can migrate over to a CPU that is running a task
1391 * of lesser priority.
1392 */
1393static int push_rt_task(struct rq *rq)
1394{
1395	struct task_struct *next_task;
1396	struct rq *lowest_rq;
1397
1398	if (!rq->rt.overloaded)
1399		return 0;
1400
1401	next_task = pick_next_pushable_task(rq);
1402	if (!next_task)
1403		return 0;
1404
1405retry:
1406	if (unlikely(next_task == rq->curr)) {
1407		WARN_ON(1);
1408		return 0;
1409	}
1410
1411	/*
 1412	 * It's possible that the next_task slipped in with a
 1413	 * higher priority than current. If that's the case,
1414	 * just reschedule current.
1415	 */
1416	if (unlikely(next_task->prio < rq->curr->prio)) {
1417		resched_task(rq->curr);
1418		return 0;
1419	}
1420
1421	/* We might release rq lock */
1422	get_task_struct(next_task);
1423
1424	/* find_lock_lowest_rq locks the rq if found */
1425	lowest_rq = find_lock_lowest_rq(next_task, rq);
1426	if (!lowest_rq) {
1427		struct task_struct *task;
1428		/*
 1429		 * find_lock_lowest_rq releases rq->lock,
1430		 * so it is possible that next_task has migrated.
1431		 *
1432		 * We need to make sure that the task is still on the same
1433		 * run-queue and is also still the next task eligible for
1434		 * pushing.
1435		 */
1436		task = pick_next_pushable_task(rq);
1437		if (task_cpu(next_task) == rq->cpu && task == next_task) {
1438			/*
1439			 * If we get here, the task hasn't moved at all, but
1440			 * it has failed to push.  We will not try again,
1441			 * since the other cpus will pull from us when they
1442			 * are ready.
1443			 */
1444			dequeue_pushable_task(rq, next_task);
1445			goto out;
1446		}
1447
1448		if (!task)
1449			/* No more tasks, just exit */
1450			goto out;
1451
1452		/*
1453		 * Something has shifted, try again.
1454		 */
1455		put_task_struct(next_task);
1456		next_task = task;
1457		goto retry;
1458	}
1459
1460	deactivate_task(rq, next_task, 0);
1461	set_task_cpu(next_task, lowest_rq->cpu);
1462	activate_task(lowest_rq, next_task, 0);
1463
1464	resched_task(lowest_rq->curr);
1465
1466	double_unlock_balance(rq, lowest_rq);
1467
1468out:
1469	put_task_struct(next_task);
1470
1471	return 1;
1472}
1473
1474static void push_rt_tasks(struct rq *rq)
1475{
1476	/* push_rt_task will return true if it moved an RT */
1477	while (push_rt_task(rq))
1478		;
1479}
1480
1481static int pull_rt_task(struct rq *this_rq)
1482{
1483	int this_cpu = this_rq->cpu, ret = 0, cpu;
1484	struct task_struct *p;
1485	struct rq *src_rq;
1486
1487	if (likely(!rt_overloaded(this_rq)))
1488		return 0;
1489
1490	for_each_cpu(cpu, this_rq->rd->rto_mask) {
1491		if (this_cpu == cpu)
1492			continue;
1493
1494		src_rq = cpu_rq(cpu);
1495
1496		/*
1497		 * Don't bother taking the src_rq->lock if the next highest
1498		 * task is known to be lower-priority than our current task.
1499		 * This may look racy, but if this value is about to go
1500		 * logically higher, the src_rq will push this task away.
 1501		 * And if it's going logically lower, we do not care.
1502		 */
1503		if (src_rq->rt.highest_prio.next >=
1504		    this_rq->rt.highest_prio.curr)
1505			continue;
1506
1507		/*
1508		 * We can potentially drop this_rq's lock in
1509		 * double_lock_balance, and another CPU could
1510		 * alter this_rq
1511		 */
1512		double_lock_balance(this_rq, src_rq);
1513
1514		/*
1515		 * Are there still pullable RT tasks?
1516		 */
1517		if (src_rq->rt.rt_nr_running <= 1)
1518			goto skip;
1519
1520		p = pick_next_highest_task_rt(src_rq, this_cpu);
1521
1522		/*
1523		 * Do we have an RT task that preempts
1524		 * the to-be-scheduled task?
1525		 */
1526		if (p && (p->prio < this_rq->rt.highest_prio.curr)) {
1527			WARN_ON(p == src_rq->curr);
1528			WARN_ON(!p->on_rq);
1529
1530			/*
1531			 * There's a chance that p is higher in priority
1532			 * than what's currently running on its cpu.
 1533			 * This is just because p is waking up and hasn't
1534			 * had a chance to schedule. We only pull
1535			 * p if it is lower in priority than the
1536			 * current task on the run queue
1537			 */
1538			if (p->prio < src_rq->curr->prio)
1539				goto skip;
1540
1541			ret = 1;
1542
1543			deactivate_task(src_rq, p, 0);
1544			set_task_cpu(p, this_cpu);
1545			activate_task(this_rq, p, 0);
1546			/*
1547			 * We continue with the search, just in
1548			 * case there's an even higher prio task
1549			 * in another runqueue. (low likelihood
1550			 * but possible)
1551			 */
1552		}
1553skip:
1554		double_unlock_balance(this_rq, src_rq);
1555	}
1556
1557	return ret;
1558}
1559
1560static void pre_schedule_rt(struct rq *rq, struct task_struct *prev)
1561{
1562	/* Try to pull RT tasks here if we lower this rq's prio */
1563	if (rq->rt.highest_prio.curr > prev->prio)
1564		pull_rt_task(rq);
1565}
1566
1567static void post_schedule_rt(struct rq *rq)
1568{
1569	push_rt_tasks(rq);
1570}
1571
1572/*
1573 * If we are not running and we are not going to reschedule soon, we should
1574 * try to push tasks away now
1575 */
1576static void task_woken_rt(struct rq *rq, struct task_struct *p)
1577{
1578	if (!task_running(rq, p) &&
1579	    !test_tsk_need_resched(rq->curr) &&
1580	    has_pushable_tasks(rq) &&
1581	    p->rt.nr_cpus_allowed > 1 &&
1582	    rt_task(rq->curr) &&
1583	    (rq->curr->rt.nr_cpus_allowed < 2 ||
1584	     rq->curr->prio <= p->prio))
1585		push_rt_tasks(rq);
1586}
1587
1588static void set_cpus_allowed_rt(struct task_struct *p,
1589				const struct cpumask *new_mask)
1590{
1591	int weight = cpumask_weight(new_mask);
1592
1593	BUG_ON(!rt_task(p));
1594
1595	/*
1596	 * Update the migration status of the RQ if we have an RT task
1597	 * which is running AND changing its weight value.
1598	 */
1599	if (p->on_rq && (weight != p->rt.nr_cpus_allowed)) {
1600		struct rq *rq = task_rq(p);
1601
1602		if (!task_current(rq, p)) {
1603			/*
1604			 * Make sure we dequeue this task from the pushable list
1605			 * before going further.  It will either remain off of
1606			 * the list because we are no longer pushable, or it
1607			 * will be requeued.
1608			 */
1609			if (p->rt.nr_cpus_allowed > 1)
1610				dequeue_pushable_task(rq, p);
1611
1612			/*
1613			 * Requeue if our weight is changing and still > 1
1614			 */
1615			if (weight > 1)
1616				enqueue_pushable_task(rq, p);
1617
1618		}
1619
1620		if ((p->rt.nr_cpus_allowed <= 1) && (weight > 1)) {
1621			rq->rt.rt_nr_migratory++;
1622		} else if ((p->rt.nr_cpus_allowed > 1) && (weight <= 1)) {
1623			BUG_ON(!rq->rt.rt_nr_migratory);
1624			rq->rt.rt_nr_migratory--;
1625		}
1626
1627		update_rt_migration(&rq->rt);
1628	}
1629
1630	cpumask_copy(&p->cpus_allowed, new_mask);
1631	p->rt.nr_cpus_allowed = weight;
1632}
1633
1634/* Assumes rq->lock is held */
1635static void rq_online_rt(struct rq *rq)
1636{
1637	if (rq->rt.overloaded)
1638		rt_set_overload(rq);
1639
1640	__enable_runtime(rq);
1641
1642	cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr);
1643}
1644
1645/* Assumes rq->lock is held */
1646static void rq_offline_rt(struct rq *rq)
1647{
1648	if (rq->rt.overloaded)
1649		rt_clear_overload(rq);
1650
1651	__disable_runtime(rq);
1652
1653	cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_INVALID);
1654}
1655
1656/*
 1657 * When switching from the rt queue, we may end up in a position
 1658 * where we want to pull RT tasks from other runqueues.
1659 */
1660static void switched_from_rt(struct rq *rq, struct task_struct *p)
1661{
1662	/*
1663	 * If there are other RT tasks then we will reschedule
1664	 * and the scheduling of the other RT tasks will handle
1665	 * the balancing. But if we are the last RT task
1666	 * we may need to handle the pulling of RT tasks
1667	 * now.
1668	 */
1669	if (p->on_rq && !rq->rt.rt_nr_running)
1670		pull_rt_task(rq);
1671}
1672
1673static inline void init_sched_rt_class(void)
1674{
1675	unsigned int i;
1676
1677	for_each_possible_cpu(i)
1678		zalloc_cpumask_var_node(&per_cpu(local_cpu_mask, i),
1679					GFP_KERNEL, cpu_to_node(i));
1680}
1681#endif /* CONFIG_SMP */
1682
1683/*
1684 * When switching a task to RT, we may overload the runqueue
1685 * with RT tasks. In this case we try to push them off to
1686 * other runqueues.
1687 */
1688static void switched_to_rt(struct rq *rq, struct task_struct *p)
1689{
1690	int check_resched = 1;
1691
1692	/*
1693	 * If we are already running, then there's nothing
1694	 * that needs to be done. But if we are not running
1695	 * we may need to preempt the current running task.
1696	 * If that current running task is also an RT task
1697	 * then see if we can move to another run queue.
1698	 */
1699	if (p->on_rq && rq->curr != p) {
1700#ifdef CONFIG_SMP
1701		if (rq->rt.overloaded && push_rt_task(rq) &&
1702		    /* Don't resched if we changed runqueues */
1703		    rq != task_rq(p))
1704			check_resched = 0;
1705#endif /* CONFIG_SMP */
1706		if (check_resched && p->prio < rq->curr->prio)
1707			resched_task(rq->curr);
1708	}
1709}
1710
1711/*
1712 * Priority of the task has changed. This may cause
1713 * us to initiate a push or pull.
1714 */
1715static void
1716prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
1717{
1718	if (!p->on_rq)
1719		return;
1720
1721	if (rq->curr == p) {
1722#ifdef CONFIG_SMP
1723		/*
1724		 * If our priority decreases while running, we
1725		 * may need to pull tasks to this runqueue.
1726		 */
1727		if (oldprio < p->prio)
1728			pull_rt_task(rq);
1729		/*
1730		 * If there's a higher priority task waiting to run
1731		 * then reschedule. Note, the above pull_rt_task
1732		 * can release the rq lock and p could migrate.
1733		 * Only reschedule if p is still on the same runqueue.
1734		 */
1735		if (p->prio > rq->rt.highest_prio.curr && rq->curr == p)
1736			resched_task(p);
1737#else
1738		/* For UP simply resched on drop of prio */
1739		if (oldprio < p->prio)
1740			resched_task(p);
1741#endif /* CONFIG_SMP */
1742	} else {
1743		/*
 1744		 * This task is not running, but if its priority
 1745		 * is higher than that of the currently running task,
 1746		 * then reschedule.
1747		 */
1748		if (p->prio < rq->curr->prio)
1749			resched_task(rq->curr);
1750	}
1751}
1752
1753static void watchdog(struct rq *rq, struct task_struct *p)
1754{
1755	unsigned long soft, hard;
1756
1757	/* max may change after cur was read, this will be fixed next tick */
1758	soft = task_rlimit(p, RLIMIT_RTTIME);
1759	hard = task_rlimit_max(p, RLIMIT_RTTIME);
1760
1761	if (soft != RLIM_INFINITY) {
1762		unsigned long next;
1763
1764		p->rt.timeout++;
1765		next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ);
1766		if (p->rt.timeout > next)
1767			p->cputime_expires.sched_exp = p->se.sum_exec_runtime;
1768	}
1769}
1770
1771static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
1772{
1773	update_curr_rt(rq);
1774
1775	watchdog(rq, p);
1776
1777	/*
1778	 * RR tasks need a special form of timeslice management.
1779	 * FIFO tasks have no timeslices.
1780	 */
1781	if (p->policy != SCHED_RR)
1782		return;
1783
1784	if (--p->rt.time_slice)
1785		return;
1786
1787	p->rt.time_slice = DEF_TIMESLICE;
1788
1789	/*
1790	 * Requeue to the end of queue if we are not the only element
1791	 * on the queue:
1792	 */
1793	if (p->rt.run_list.prev != p->rt.run_list.next) {
1794		requeue_task_rt(rq, p, 0);
1795		set_tsk_need_resched(p);
1796	}
1797}
1798
1799static void set_curr_task_rt(struct rq *rq)
1800{
1801	struct task_struct *p = rq->curr;
1802
1803	p->se.exec_start = rq->clock_task;
1804
1805	/* The running task is never eligible for pushing */
1806	dequeue_pushable_task(rq, p);
1807}
1808
1809static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
1810{
1811	/*
1812	 * Time slice is 0 for SCHED_FIFO tasks
1813	 */
1814	if (task->policy == SCHED_RR)
1815		return DEF_TIMESLICE;
1816	else
1817		return 0;
1818}
1819
1820static const struct sched_class rt_sched_class = {
1821	.next			= &fair_sched_class,
1822	.enqueue_task		= enqueue_task_rt,
1823	.dequeue_task		= dequeue_task_rt,
1824	.yield_task		= yield_task_rt,
1825
1826	.check_preempt_curr	= check_preempt_curr_rt,
1827
1828	.pick_next_task		= pick_next_task_rt,
1829	.put_prev_task		= put_prev_task_rt,
1830
1831#ifdef CONFIG_SMP
1832	.select_task_rq		= select_task_rq_rt,
1833
1834	.set_cpus_allowed       = set_cpus_allowed_rt,
1835	.rq_online              = rq_online_rt,
1836	.rq_offline             = rq_offline_rt,
1837	.pre_schedule		= pre_schedule_rt,
1838	.post_schedule		= post_schedule_rt,
1839	.task_woken		= task_woken_rt,
1840	.switched_from		= switched_from_rt,
1841#endif
1842
1843	.set_curr_task          = set_curr_task_rt,
1844	.task_tick		= task_tick_rt,
1845
1846	.get_rr_interval	= get_rr_interval_rt,
1847
1848	.prio_changed		= prio_changed_rt,
1849	.switched_to		= switched_to_rt,
1850};
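/*
 * The .next pointer above chains this class ahead of fair_sched_class:
 * the core scheduler walks the sched classes in priority order, so any
 * runnable SCHED_FIFO/SCHED_RR task is considered before CFS
 * (SCHED_NORMAL/SCHED_BATCH) tasks on the same runqueue.
 */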
1851
1852#ifdef CONFIG_SCHED_DEBUG
1853extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq);
1854
1855static void print_rt_stats(struct seq_file *m, int cpu)
1856{
1857	rt_rq_iter_t iter;
1858	struct rt_rq *rt_rq;
1859
1860	rcu_read_lock();
1861	for_each_rt_rq(rt_rq, iter, cpu_rq(cpu))
1862		print_rt_rq(m, cpu, rt_rq);
1863	rcu_read_unlock();
1864}
1865#endif /* CONFIG_SCHED_DEBUG */
1866
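For reference, a minimal userspace sketch of how a task ends up in this scheduling class (not part of the file above; the priority value is an arbitrary illustration, and real code should query sched_get_priority_min()/sched_get_priority_max()):

#include <sched.h>
#include <stdio.h>

int main(void)
{
	/* Arbitrary RT priority chosen for illustration only. */
	struct sched_param sp = { .sched_priority = 10 };

	/* Needs CAP_SYS_NICE or an adequate RLIMIT_RTPRIO. */
	if (sched_setscheduler(0, SCHED_FIFO, &sp) == -1) {
		perror("sched_setscheduler");
		return 1;
	}

	/*
	 * The task is now handled by the class above: enqueue_task_rt()
	 * queues it and pick_next_task_rt() will select it ahead of any
	 * SCHED_NORMAL task on the same runqueue.
	 */
	return 0;
}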