   1/*
   2 * Read-Copy Update mechanism for mutual exclusion (tree-based version)
   3 * Internal non-public definitions that provide either classic
   4 * or preemptible semantics.
   5 *
   6 * This program is free software; you can redistribute it and/or modify
   7 * it under the terms of the GNU General Public License as published by
   8 * the Free Software Foundation; either version 2 of the License, or
   9 * (at your option) any later version.
  10 *
  11 * This program is distributed in the hope that it will be useful,
  12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 * GNU General Public License for more details.
  15 *
  16 * You should have received a copy of the GNU General Public License
  17 * along with this program; if not, write to the Free Software
  18 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
  19 *
  20 * Copyright Red Hat, 2009
  21 * Copyright IBM Corporation, 2009
  22 *
  23 * Author: Ingo Molnar <mingo@elte.hu>
  24 *	   Paul E. McKenney <paulmck@linux.vnet.ibm.com>
  25 */
  26
  27#include <linux/delay.h>
  28#include <linux/stop_machine.h>
  29
  30/*
  31 * Check the RCU kernel configuration parameters and print informative
  32 * messages about anything out of the ordinary.  If you like #ifdef, you
  33 * will love this function.
  34 */
  35static void __init rcu_bootup_announce_oddness(void)
  36{
  37#ifdef CONFIG_RCU_TRACE
  38	printk(KERN_INFO "\tRCU debugfs-based tracing is enabled.\n");
  39#endif
  40#if (defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 64) || (!defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 32)
  41	printk(KERN_INFO "\tCONFIG_RCU_FANOUT set to non-default value of %d\n",
  42	       CONFIG_RCU_FANOUT);
  43#endif
  44#ifdef CONFIG_RCU_FANOUT_EXACT
  45	printk(KERN_INFO "\tHierarchical RCU autobalancing is disabled.\n");
  46#endif
  47#ifdef CONFIG_RCU_FAST_NO_HZ
  48	printk(KERN_INFO
  49	       "\tRCU dyntick-idle grace-period acceleration is enabled.\n");
  50#endif
  51#ifdef CONFIG_PROVE_RCU
  52	printk(KERN_INFO "\tRCU lockdep checking is enabled.\n");
  53#endif
  54#ifdef CONFIG_RCU_TORTURE_TEST_RUNNABLE
  55	printk(KERN_INFO "\tRCU torture testing starts during boot.\n");
  56#endif
  57#if defined(CONFIG_TREE_PREEMPT_RCU) && !defined(CONFIG_RCU_CPU_STALL_VERBOSE)
  58	printk(KERN_INFO "\tVerbose stalled-CPUs detection is disabled.\n");
  59#endif
  60#if NUM_RCU_LVL_4 != 0
  61	printk(KERN_INFO "\tExperimental four-level hierarchy is enabled.\n");
  62#endif
  63}
  64
  65#ifdef CONFIG_TREE_PREEMPT_RCU
  66
  67struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state);
  68DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data);
  69static struct rcu_state *rcu_state = &rcu_preempt_state;
  70
  71static void rcu_read_unlock_special(struct task_struct *t);
  72static int rcu_preempted_readers_exp(struct rcu_node *rnp);
  73
  74/*
  75 * Tell them what RCU they are running.
  76 */
  77static void __init rcu_bootup_announce(void)
  78{
  79	printk(KERN_INFO "Preemptible hierarchical RCU implementation.\n");
  80	rcu_bootup_announce_oddness();
  81}
  82
  83/*
  84 * Return the number of RCU-preempt batches processed thus far
  85 * for debug and statistics.
  86 */
  87long rcu_batches_completed_preempt(void)
  88{
  89	return rcu_preempt_state.completed;
  90}
  91EXPORT_SYMBOL_GPL(rcu_batches_completed_preempt);
  92
  93/*
  94 * Return the number of RCU batches processed thus far for debug & stats.
  95 */
  96long rcu_batches_completed(void)
  97{
  98	return rcu_batches_completed_preempt();
  99}
 100EXPORT_SYMBOL_GPL(rcu_batches_completed);
 101
 102/*
 103 * Force a quiescent state for preemptible RCU.
 104 */
 105void rcu_force_quiescent_state(void)
 106{
 107	force_quiescent_state(&rcu_preempt_state, 0);
 108}
 109EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
 110
 111/*
 112 * Record a preemptible-RCU quiescent state for the specified CPU.  Note
 113 * that this just means that the task currently running on the CPU is
  114 * not in an RCU read-side critical section.  There might be any number
  115 * of tasks blocked while in an RCU read-side critical section.
 116 *
 117 * Unlike the other rcu_*_qs() functions, callers to this function
 118 * must disable irqs in order to protect the assignment to
 119 * ->rcu_read_unlock_special.
 120 */
 121static void rcu_preempt_qs(int cpu)
 122{
 123	struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu);
 124
 125	rdp->passed_quiesc_completed = rdp->gpnum - 1;
 126	barrier();
 127	rdp->passed_quiesc = 1;
 128	current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
 129}
 130
 131/*
 132 * We have entered the scheduler, and the current task might soon be
 133 * context-switched away from.  If this task is in an RCU read-side
 134 * critical section, we will no longer be able to rely on the CPU to
 135 * record that fact, so we enqueue the task on the blkd_tasks list.
 136 * The task will dequeue itself when it exits the outermost enclosing
 137 * RCU read-side critical section.  Therefore, the current grace period
 138 * cannot be permitted to complete until the blkd_tasks list entries
 139 * predating the current grace period drain, in other words, until
 140 * rnp->gp_tasks becomes NULL.
 141 *
 142 * Caller must disable preemption.
 143 */
 144static void rcu_preempt_note_context_switch(int cpu)
 145{
 146	struct task_struct *t = current;
 147	unsigned long flags;
 148	struct rcu_data *rdp;
 149	struct rcu_node *rnp;
 150
 151	if (t->rcu_read_lock_nesting > 0 &&
 152	    (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) {
 153
 154		/* Possibly blocking in an RCU read-side critical section. */
 155		rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu);
 156		rnp = rdp->mynode;
 157		raw_spin_lock_irqsave(&rnp->lock, flags);
 158		t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED;
 159		t->rcu_blocked_node = rnp;
 160
 161		/*
 162		 * If this CPU has already checked in, then this task
 163		 * will hold up the next grace period rather than the
 164		 * current grace period.  Queue the task accordingly.
 165		 * If the task is queued for the current grace period
 166		 * (i.e., this CPU has not yet passed through a quiescent
 167		 * state for the current grace period), then as long
 168		 * as that task remains queued, the current grace period
 169		 * cannot end.  Note that there is some uncertainty as
 170		 * to exactly when the current grace period started.
 171		 * We take a conservative approach, which can result
 172		 * in unnecessarily waiting on tasks that started very
 173		 * slightly after the current grace period began.  C'est
 174		 * la vie!!!
 175		 *
 176		 * But first, note that the current CPU must still be
 177		 * on line!
 178		 */
 179		WARN_ON_ONCE((rdp->grpmask & rnp->qsmaskinit) == 0);
 180		WARN_ON_ONCE(!list_empty(&t->rcu_node_entry));
 181		if ((rnp->qsmask & rdp->grpmask) && rnp->gp_tasks != NULL) {
 182			list_add(&t->rcu_node_entry, rnp->gp_tasks->prev);
 183			rnp->gp_tasks = &t->rcu_node_entry;
 184#ifdef CONFIG_RCU_BOOST
 185			if (rnp->boost_tasks != NULL)
 186				rnp->boost_tasks = rnp->gp_tasks;
 187#endif /* #ifdef CONFIG_RCU_BOOST */
 188		} else {
 189			list_add(&t->rcu_node_entry, &rnp->blkd_tasks);
 190			if (rnp->qsmask & rdp->grpmask)
 191				rnp->gp_tasks = &t->rcu_node_entry;
 192		}
 193		raw_spin_unlock_irqrestore(&rnp->lock, flags);
 194	} else if (t->rcu_read_lock_nesting < 0 &&
 195		   t->rcu_read_unlock_special) {
 196
 197		/*
 198		 * Complete exit from RCU read-side critical section on
 199		 * behalf of preempted instance of __rcu_read_unlock().
 200		 */
 201		rcu_read_unlock_special(t);
 202	}
 203
 204	/*
 205	 * Either we were not in an RCU read-side critical section to
 206	 * begin with, or we have now recorded that critical section
 207	 * globally.  Either way, we can now note a quiescent state
 208	 * for this CPU.  Again, if we were in an RCU read-side critical
 209	 * section, and if that critical section was blocking the current
 210	 * grace period, then the fact that the task has been enqueued
 211	 * means that we continue to block the current grace period.
 212	 */
 213	local_irq_save(flags);
 214	rcu_preempt_qs(cpu);
 215	local_irq_restore(flags);
 216}
 217
 218/*
 219 * Tree-preemptible RCU implementation for rcu_read_lock().
 220 * Just increment ->rcu_read_lock_nesting, shared state will be updated
 221 * if we block.
 222 */
 223void __rcu_read_lock(void)
 224{
 225	current->rcu_read_lock_nesting++;
 226	barrier();  /* needed if we ever invoke rcu_read_lock in rcutree.c */
 227}
 228EXPORT_SYMBOL_GPL(__rcu_read_lock);
 229
 230/*
 231 * Check for preempted RCU readers blocking the current grace period
 232 * for the specified rcu_node structure.  If the caller needs a reliable
 233 * answer, it must hold the rcu_node's ->lock.
 234 */
 235static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
 236{
 237	return rnp->gp_tasks != NULL;
 238}
 239
 240/*
 241 * Record a quiescent state for all tasks that were previously queued
 242 * on the specified rcu_node structure and that were blocking the current
 243 * RCU grace period.  The caller must hold the specified rnp->lock with
 244 * irqs disabled, and this lock is released upon return, but irqs remain
 245 * disabled.
 246 */
 247static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
 248	__releases(rnp->lock)
 249{
 250	unsigned long mask;
 251	struct rcu_node *rnp_p;
 252
 253	if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) {
 254		raw_spin_unlock_irqrestore(&rnp->lock, flags);
 255		return;  /* Still need more quiescent states! */
 256	}
 257
 258	rnp_p = rnp->parent;
 259	if (rnp_p == NULL) {
 260		/*
 261		 * Either there is only one rcu_node in the tree,
 262		 * or tasks were kicked up to root rcu_node due to
 263		 * CPUs going offline.
 264		 */
 265		rcu_report_qs_rsp(&rcu_preempt_state, flags);
 266		return;
 267	}
 268
 269	/* Report up the rest of the hierarchy. */
 270	mask = rnp->grpmask;
 271	raw_spin_unlock(&rnp->lock);	/* irqs remain disabled. */
 272	raw_spin_lock(&rnp_p->lock);	/* irqs already disabled. */
 273	rcu_report_qs_rnp(mask, &rcu_preempt_state, rnp_p, flags);
 274}
 275
 276/*
 277 * Advance a ->blkd_tasks-list pointer to the next entry, instead
 278 * returning NULL if at the end of the list.
 279 */
 280static struct list_head *rcu_next_node_entry(struct task_struct *t,
 281					     struct rcu_node *rnp)
 282{
 283	struct list_head *np;
 284
 285	np = t->rcu_node_entry.next;
 286	if (np == &rnp->blkd_tasks)
 287		np = NULL;
 288	return np;
 289}
 290
 291/*
 292 * Handle special cases during rcu_read_unlock(), such as needing to
 293 * notify RCU core processing or task having blocked during the RCU
 294 * read-side critical section.
 295 */
 296static noinline void rcu_read_unlock_special(struct task_struct *t)
 297{
 298	int empty;
 299	int empty_exp;
 300	unsigned long flags;
 301	struct list_head *np;
 302	struct rcu_node *rnp;
 303	int special;
 304
 305	/* NMI handlers cannot block and cannot safely manipulate state. */
 306	if (in_nmi())
 307		return;
 308
 309	local_irq_save(flags);
 310
 311	/*
 312	 * If RCU core is waiting for this CPU to exit critical section,
 313	 * let it know that we have done so.
 314	 */
 315	special = t->rcu_read_unlock_special;
 316	if (special & RCU_READ_UNLOCK_NEED_QS) {
 317		rcu_preempt_qs(smp_processor_id());
 318	}
 319
 320	/* Hardware IRQ handlers cannot block. */
 321	if (in_irq() || in_serving_softirq()) {
 322		local_irq_restore(flags);
 323		return;
 324	}
 325
 326	/* Clean up if blocked during RCU read-side critical section. */
 327	if (special & RCU_READ_UNLOCK_BLOCKED) {
 328		t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BLOCKED;
 329
 330		/*
 331		 * Remove this task from the list it blocked on.  The
 332		 * task can migrate while we acquire the lock, but at
 333		 * most one time.  So at most two passes through loop.
 334		 */
 335		for (;;) {
 336			rnp = t->rcu_blocked_node;
 337			raw_spin_lock(&rnp->lock);  /* irqs already disabled. */
 338			if (rnp == t->rcu_blocked_node)
 339				break;
 340			raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
 341		}
 342		empty = !rcu_preempt_blocked_readers_cgp(rnp);
 343		empty_exp = !rcu_preempted_readers_exp(rnp);
 344		smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */
 345		np = rcu_next_node_entry(t, rnp);
 346		list_del_init(&t->rcu_node_entry);
 347		if (&t->rcu_node_entry == rnp->gp_tasks)
 348			rnp->gp_tasks = np;
 349		if (&t->rcu_node_entry == rnp->exp_tasks)
 350			rnp->exp_tasks = np;
 351#ifdef CONFIG_RCU_BOOST
 352		if (&t->rcu_node_entry == rnp->boost_tasks)
 353			rnp->boost_tasks = np;
 354		/* Snapshot and clear ->rcu_boosted with rcu_node lock held. */
 355		if (t->rcu_boosted) {
 356			special |= RCU_READ_UNLOCK_BOOSTED;
 357			t->rcu_boosted = 0;
 358		}
 359#endif /* #ifdef CONFIG_RCU_BOOST */
 360		t->rcu_blocked_node = NULL;
 361
 362		/*
 363		 * If this was the last task on the current list, and if
 364		 * we aren't waiting on any CPUs, report the quiescent state.
 365		 * Note that rcu_report_unblock_qs_rnp() releases rnp->lock.
 366		 */
 367		if (empty)
 368			raw_spin_unlock_irqrestore(&rnp->lock, flags);
 369		else
 370			rcu_report_unblock_qs_rnp(rnp, flags);
 371
 372#ifdef CONFIG_RCU_BOOST
 373		/* Unboost if we were boosted. */
 374		if (special & RCU_READ_UNLOCK_BOOSTED) {
 375			rt_mutex_unlock(t->rcu_boost_mutex);
 376			t->rcu_boost_mutex = NULL;
 377		}
 378#endif /* #ifdef CONFIG_RCU_BOOST */
 379
 380		/*
 381		 * If this was the last task on the expedited lists,
 382		 * then we need to report up the rcu_node hierarchy.
 383		 */
 384		if (!empty_exp && !rcu_preempted_readers_exp(rnp))
 385			rcu_report_exp_rnp(&rcu_preempt_state, rnp);
 386	} else {
 387		local_irq_restore(flags);
 388	}
 389}
 390
 391/*
 392 * Tree-preemptible RCU implementation for rcu_read_unlock().
 393 * Decrement ->rcu_read_lock_nesting.  If the result is zero (outermost
 394 * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then
 395 * invoke rcu_read_unlock_special() to clean up after a context switch
 396 * in an RCU read-side critical section and other special cases.
 397 */
 398void __rcu_read_unlock(void)
 399{
 400	struct task_struct *t = current;
 401
 402	barrier();  /* needed if we ever invoke rcu_read_unlock in rcutree.c */
 403	if (t->rcu_read_lock_nesting != 1)
 404		--t->rcu_read_lock_nesting;
 405	else {
 406		t->rcu_read_lock_nesting = INT_MIN;
 407		barrier();  /* assign before ->rcu_read_unlock_special load */
 408		if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
 409			rcu_read_unlock_special(t);
 410		barrier();  /* ->rcu_read_unlock_special load before assign */
 411		t->rcu_read_lock_nesting = 0;
 412	}
 413#ifdef CONFIG_PROVE_LOCKING
 414	{
 415		int rrln = ACCESS_ONCE(t->rcu_read_lock_nesting);
 416
 417		WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2);
 418	}
 419#endif /* #ifdef CONFIG_PROVE_LOCKING */
 420}
 421EXPORT_SYMBOL_GPL(__rcu_read_unlock);
 422
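/*
 * Illustrative sketch, not part of the original file: a typical reader
 * built on rcu_read_lock()/rcu_read_unlock(), which map to the
 * __rcu_read_lock()/__rcu_read_unlock() implementations above when
 * CONFIG_TREE_PREEMPT_RCU is set.  The structure "foo", the global
 * pointer "gfp", and read_foo_a() are hypothetical names used only
 * for illustration.
 */
struct foo {
	int a;
};

static struct foo __rcu *gfp;

static int read_foo_a(void)
{
	struct foo *p;
	int ret = 0;

	rcu_read_lock();		/* Enter RCU read-side critical section. */
	p = rcu_dereference(gfp);	/* Fetch the RCU-protected pointer. */
	if (p != NULL)
		ret = p->a;
	rcu_read_unlock();		/* May invoke rcu_read_unlock_special(). */
	return ret;
}
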
 423#ifdef CONFIG_RCU_CPU_STALL_VERBOSE
 424
 425/*
 426 * Dump detailed information for all tasks blocking the current RCU
 427 * grace period on the specified rcu_node structure.
 428 */
 429static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp)
 430{
 431	unsigned long flags;
 432	struct task_struct *t;
 433
 434	if (!rcu_preempt_blocked_readers_cgp(rnp))
 435		return;
 436	raw_spin_lock_irqsave(&rnp->lock, flags);
 437	t = list_entry(rnp->gp_tasks,
 438		       struct task_struct, rcu_node_entry);
 439	list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry)
 440		sched_show_task(t);
 441	raw_spin_unlock_irqrestore(&rnp->lock, flags);
 442}
 443
 444/*
 445 * Dump detailed information for all tasks blocking the current RCU
 446 * grace period.
 447 */
 448static void rcu_print_detail_task_stall(struct rcu_state *rsp)
 449{
 450	struct rcu_node *rnp = rcu_get_root(rsp);
 451
 452	rcu_print_detail_task_stall_rnp(rnp);
 453	rcu_for_each_leaf_node(rsp, rnp)
 454		rcu_print_detail_task_stall_rnp(rnp);
 455}
 456
 457#else /* #ifdef CONFIG_RCU_CPU_STALL_VERBOSE */
 458
 459static void rcu_print_detail_task_stall(struct rcu_state *rsp)
 460{
 461}
 462
 463#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_VERBOSE */
 464
 465/*
 466 * Scan the current list of tasks blocked within RCU read-side critical
 467 * sections, printing out the tid of each.
 468 */
 469static void rcu_print_task_stall(struct rcu_node *rnp)
 470{
 471	struct task_struct *t;
 472
 473	if (!rcu_preempt_blocked_readers_cgp(rnp))
 474		return;
 475	t = list_entry(rnp->gp_tasks,
 476		       struct task_struct, rcu_node_entry);
 477	list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry)
 478		printk(" P%d", t->pid);
 479}
 480
 481/*
 482 * Suppress preemptible RCU's CPU stall warnings by pushing the
 483 * time of the next stall-warning message comfortably far into the
 484 * future.
 485 */
 486static void rcu_preempt_stall_reset(void)
 487{
 488	rcu_preempt_state.jiffies_stall = jiffies + ULONG_MAX / 2;
 489}
 490
 491/*
 492 * Check that the list of blocked tasks for the newly completed grace
 493 * period is in fact empty.  It is a serious bug to complete a grace
 494 * period that still has RCU readers blocked!  This function must be
 495 * invoked -before- updating this rnp's ->gpnum, and the rnp's ->lock
 496 * must be held by the caller.
 497 *
 498 * Also, if there are blocked tasks on the list, they automatically
 499 * block the newly created grace period, so set up ->gp_tasks accordingly.
 500 */
 501static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
 502{
 503	WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp));
 504	if (!list_empty(&rnp->blkd_tasks))
 505		rnp->gp_tasks = rnp->blkd_tasks.next;
 506	WARN_ON_ONCE(rnp->qsmask);
 507}
 508
 509#ifdef CONFIG_HOTPLUG_CPU
 510
 511/*
 512 * Handle tasklist migration for case in which all CPUs covered by the
 513 * specified rcu_node have gone offline.  Move them up to the root
 514 * rcu_node.  The reason for not just moving them to the immediate
 515 * parent is to remove the need for rcu_read_unlock_special() to
 516 * make more than two attempts to acquire the target rcu_node's lock.
 517 * Returns true if there were tasks blocking the current RCU grace
 518 * period.
 519 *
 520 * Returns 1 if there was previously a task blocking the current grace
 521 * period on the specified rcu_node structure.
 522 *
 523 * The caller must hold rnp->lock with irqs disabled.
 524 */
 525static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
 526				     struct rcu_node *rnp,
 527				     struct rcu_data *rdp)
 528{
 529	struct list_head *lp;
 530	struct list_head *lp_root;
 531	int retval = 0;
 532	struct rcu_node *rnp_root = rcu_get_root(rsp);
 533	struct task_struct *t;
 534
 535	if (rnp == rnp_root) {
 536		WARN_ONCE(1, "Last CPU thought to be offlined?");
 537		return 0;  /* Shouldn't happen: at least one CPU online. */
 538	}
 539
 540	/* If we are on an internal node, complain bitterly. */
 541	WARN_ON_ONCE(rnp != rdp->mynode);
 542
 543	/*
 544	 * Move tasks up to root rcu_node.  Don't try to get fancy for
 545	 * this corner-case operation -- just put this node's tasks
 546	 * at the head of the root node's list, and update the root node's
 547	 * ->gp_tasks and ->exp_tasks pointers to those of this node's,
 548	 * if non-NULL.  This might result in waiting for more tasks than
 549	 * absolutely necessary, but this is a good performance/complexity
 550	 * tradeoff.
 551	 */
 552	if (rcu_preempt_blocked_readers_cgp(rnp))
 553		retval |= RCU_OFL_TASKS_NORM_GP;
 554	if (rcu_preempted_readers_exp(rnp))
 555		retval |= RCU_OFL_TASKS_EXP_GP;
 556	lp = &rnp->blkd_tasks;
 557	lp_root = &rnp_root->blkd_tasks;
 558	while (!list_empty(lp)) {
 559		t = list_entry(lp->next, typeof(*t), rcu_node_entry);
 560		raw_spin_lock(&rnp_root->lock); /* irqs already disabled */
 561		list_del(&t->rcu_node_entry);
 562		t->rcu_blocked_node = rnp_root;
 563		list_add(&t->rcu_node_entry, lp_root);
 564		if (&t->rcu_node_entry == rnp->gp_tasks)
 565			rnp_root->gp_tasks = rnp->gp_tasks;
 566		if (&t->rcu_node_entry == rnp->exp_tasks)
 567			rnp_root->exp_tasks = rnp->exp_tasks;
 568#ifdef CONFIG_RCU_BOOST
 569		if (&t->rcu_node_entry == rnp->boost_tasks)
 570			rnp_root->boost_tasks = rnp->boost_tasks;
 571#endif /* #ifdef CONFIG_RCU_BOOST */
 572		raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */
 573	}
 574
 575#ifdef CONFIG_RCU_BOOST
 576	/* In case root is being boosted and leaf is not. */
 577	raw_spin_lock(&rnp_root->lock); /* irqs already disabled */
 578	if (rnp_root->boost_tasks != NULL &&
 579	    rnp_root->boost_tasks != rnp_root->gp_tasks)
 580		rnp_root->boost_tasks = rnp_root->gp_tasks;
 581	raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */
 582#endif /* #ifdef CONFIG_RCU_BOOST */
 583
 584	rnp->gp_tasks = NULL;
 585	rnp->exp_tasks = NULL;
 586	return retval;
 587}
 588
 589/*
 590 * Do CPU-offline processing for preemptible RCU.
 591 */
 592static void rcu_preempt_offline_cpu(int cpu)
 593{
 594	__rcu_offline_cpu(cpu, &rcu_preempt_state);
 595}
 596
 597#endif /* #ifdef CONFIG_HOTPLUG_CPU */
 598
 599/*
 600 * Check for a quiescent state from the current CPU.  When a task blocks,
 601 * the task is recorded in the corresponding CPU's rcu_node structure,
 602 * which is checked elsewhere.
 603 *
 604 * Caller must disable hard irqs.
 605 */
 606static void rcu_preempt_check_callbacks(int cpu)
 607{
 608	struct task_struct *t = current;
 609
 610	if (t->rcu_read_lock_nesting == 0) {
 611		rcu_preempt_qs(cpu);
 612		return;
 613	}
 614	if (t->rcu_read_lock_nesting > 0 &&
 615	    per_cpu(rcu_preempt_data, cpu).qs_pending)
 616		t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS;
 617}
 618
 619/*
 620 * Process callbacks for preemptible RCU.
 621 */
 622static void rcu_preempt_process_callbacks(void)
 623{
 624	__rcu_process_callbacks(&rcu_preempt_state,
 625				&__get_cpu_var(rcu_preempt_data));
 626}
 627
 628#ifdef CONFIG_RCU_BOOST
 629
 630static void rcu_preempt_do_callbacks(void)
 631{
 632	rcu_do_batch(&rcu_preempt_state, &__get_cpu_var(rcu_preempt_data));
 633}
 634
 635#endif /* #ifdef CONFIG_RCU_BOOST */
 636
 637/*
 638 * Queue a preemptible-RCU callback for invocation after a grace period.
 639 */
 640void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
 641{
 642	__call_rcu(head, func, &rcu_preempt_state);
 643}
 644EXPORT_SYMBOL_GPL(call_rcu);
 645
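/*
 * Illustrative sketch, not part of the original file: asynchronous
 * reclamation with call_rcu().  The callback runs only after a full
 * grace period has elapsed, so it must not assume it executes in the
 * caller's context.  "struct bar", bar_free_rcu(), and bar_release()
 * are hypothetical names.
 */
struct bar {
	struct rcu_head rh;
	int data;
};

static void bar_free_rcu(struct rcu_head *head)
{
	struct bar *bp = container_of(head, struct bar, rh);

	kfree(bp);	/* Safe: all pre-existing readers have finished. */
}

static void bar_release(struct bar *bp)
{
	/* Caller must already have unpublished bp from RCU-visible structures. */
	call_rcu(&bp->rh, bar_free_rcu);
}
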
 646/**
 647 * synchronize_rcu - wait until a grace period has elapsed.
 648 *
 649 * Control will return to the caller some time after a full grace
 650 * period has elapsed, in other words after all currently executing RCU
 651 * read-side critical sections have completed.  Note, however, that
 652 * upon return from synchronize_rcu(), the caller might well be executing
 653 * concurrently with new RCU read-side critical sections that began while
 654 * synchronize_rcu() was waiting.  RCU read-side critical sections are
 655 * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested.
 656 */
 657void synchronize_rcu(void)
 658{
 659	struct rcu_synchronize rcu;
 660
 661	if (!rcu_scheduler_active)
 662		return;
 663
 664	init_rcu_head_on_stack(&rcu.head);
 665	init_completion(&rcu.completion);
 666	/* Will wake me after RCU finished. */
 667	call_rcu(&rcu.head, wakeme_after_rcu);
 668	/* Wait for it. */
 669	wait_for_completion(&rcu.completion);
 670	destroy_rcu_head_on_stack(&rcu.head);
 671}
 672EXPORT_SYMBOL_GPL(synchronize_rcu);
 673
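/*
 * Illustrative sketch, not part of the original file: the classic
 * synchronous update pattern built on synchronize_rcu(), reusing the
 * hypothetical "struct foo" and "gfp" from the reader sketch earlier
 * in this file.  The update side is assumed to be serialized by a lock
 * that is not shown here.
 */
static void update_foo_a(int new_a)
{
	struct foo *newp;
	struct foo *oldp;

	newp = kmalloc(sizeof(*newp), GFP_KERNEL);
	if (newp == NULL)
		return;
	newp->a = new_a;
	oldp = rcu_dereference_protected(gfp, 1);	/* Update side serialized. */
	rcu_assign_pointer(gfp, newp);	/* Publish the new version. */
	synchronize_rcu();		/* Wait for pre-existing readers to finish. */
	kfree(oldp);			/* No reader can still hold oldp. */
}
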
 674static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq);
 675static long sync_rcu_preempt_exp_count;
 676static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex);
 677
 678/*
 679 * Return non-zero if there are any tasks in RCU read-side critical
 680 * sections blocking the current preemptible-RCU expedited grace period.
 681 * If there is no preemptible-RCU expedited grace period currently in
 682 * progress, returns zero unconditionally.
 683 */
 684static int rcu_preempted_readers_exp(struct rcu_node *rnp)
 685{
 686	return rnp->exp_tasks != NULL;
 687}
 688
 689/*
  690 * Return non-zero if there is no RCU expedited grace period in progress
 691 * for the specified rcu_node structure, in other words, if all CPUs and
 692 * tasks covered by the specified rcu_node structure have done their bit
 693 * for the current expedited grace period.  Works only for preemptible
  694 * RCU -- other RCU implementations use other means.
 695 *
 696 * Caller must hold sync_rcu_preempt_exp_mutex.
 697 */
 698static int sync_rcu_preempt_exp_done(struct rcu_node *rnp)
 699{
 700	return !rcu_preempted_readers_exp(rnp) &&
 701	       ACCESS_ONCE(rnp->expmask) == 0;
 702}
 703
 704/*
 705 * Report the exit from RCU read-side critical section for the last task
 706 * that queued itself during or before the current expedited preemptible-RCU
 707 * grace period.  This event is reported either to the rcu_node structure on
 708 * which the task was queued or to one of that rcu_node structure's ancestors,
 709 * recursively up the tree.  (Calm down, calm down, we do the recursion
 710 * iteratively!)
 711 *
 712 * Caller must hold sync_rcu_preempt_exp_mutex.
 713 */
 714static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp)
 715{
 716	unsigned long flags;
 717	unsigned long mask;
 718
 719	raw_spin_lock_irqsave(&rnp->lock, flags);
 720	for (;;) {
 721		if (!sync_rcu_preempt_exp_done(rnp)) {
 722			raw_spin_unlock_irqrestore(&rnp->lock, flags);
 723			break;
 724		}
 725		if (rnp->parent == NULL) {
 726			raw_spin_unlock_irqrestore(&rnp->lock, flags);
 727			wake_up(&sync_rcu_preempt_exp_wq);
 728			break;
 729		}
 730		mask = rnp->grpmask;
 731		raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
 732		rnp = rnp->parent;
 733		raw_spin_lock(&rnp->lock); /* irqs already disabled */
 734		rnp->expmask &= ~mask;
 735	}
 736}
 737
 738/*
 739 * Snapshot the tasks blocking the newly started preemptible-RCU expedited
 740 * grace period for the specified rcu_node structure.  If there are no such
 741 * tasks, report it up the rcu_node hierarchy.
 742 *
 743 * Caller must hold sync_rcu_preempt_exp_mutex and rsp->onofflock.
 744 */
 745static void
 746sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
 747{
 748	unsigned long flags;
 749	int must_wait = 0;
 750
 751	raw_spin_lock_irqsave(&rnp->lock, flags);
 752	if (list_empty(&rnp->blkd_tasks))
 753		raw_spin_unlock_irqrestore(&rnp->lock, flags);
 754	else {
 755		rnp->exp_tasks = rnp->blkd_tasks.next;
 756		rcu_initiate_boost(rnp, flags);  /* releases rnp->lock */
 757		must_wait = 1;
 758	}
 759	if (!must_wait)
 760		rcu_report_exp_rnp(rsp, rnp);
 761}
 762
 763/*
 764 * Wait for an rcu-preempt grace period, but expedite it.  The basic idea
 765 * is to invoke synchronize_sched_expedited() to push all the tasks to
 766 * the ->blkd_tasks lists and wait for this list to drain.
 767 */
 768void synchronize_rcu_expedited(void)
 769{
 770	unsigned long flags;
 771	struct rcu_node *rnp;
 772	struct rcu_state *rsp = &rcu_preempt_state;
 773	long snap;
 774	int trycount = 0;
 775
 776	smp_mb(); /* Caller's modifications seen first by other CPUs. */
 777	snap = ACCESS_ONCE(sync_rcu_preempt_exp_count) + 1;
 778	smp_mb(); /* Above access cannot bleed into critical section. */
 779
 780	/*
 781	 * Acquire lock, falling back to synchronize_rcu() if too many
 782	 * lock-acquisition failures.  Of course, if someone does the
 783	 * expedited grace period for us, just leave.
 784	 */
 785	while (!mutex_trylock(&sync_rcu_preempt_exp_mutex)) {
 786		if (trycount++ < 10)
 787			udelay(trycount * num_online_cpus());
 788		else {
 789			synchronize_rcu();
 790			return;
 791		}
 792		if ((ACCESS_ONCE(sync_rcu_preempt_exp_count) - snap) > 0)
 793			goto mb_ret; /* Others did our work for us. */
 794	}
 795	if ((ACCESS_ONCE(sync_rcu_preempt_exp_count) - snap) > 0)
 796		goto unlock_mb_ret; /* Others did our work for us. */
 797
 798	/* force all RCU readers onto ->blkd_tasks lists. */
 799	synchronize_sched_expedited();
 800
 801	raw_spin_lock_irqsave(&rsp->onofflock, flags);
 802
 803	/* Initialize ->expmask for all non-leaf rcu_node structures. */
 804	rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) {
 805		raw_spin_lock(&rnp->lock); /* irqs already disabled. */
 806		rnp->expmask = rnp->qsmaskinit;
 807		raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
 808	}
 809
 810	/* Snapshot current state of ->blkd_tasks lists. */
 811	rcu_for_each_leaf_node(rsp, rnp)
 812		sync_rcu_preempt_exp_init(rsp, rnp);
 813	if (NUM_RCU_NODES > 1)
 814		sync_rcu_preempt_exp_init(rsp, rcu_get_root(rsp));
 815
 816	raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
 817
 818	/* Wait for snapshotted ->blkd_tasks lists to drain. */
 819	rnp = rcu_get_root(rsp);
 820	wait_event(sync_rcu_preempt_exp_wq,
 821		   sync_rcu_preempt_exp_done(rnp));
 822
 823	/* Clean up and exit. */
 824	smp_mb(); /* ensure expedited GP seen before counter increment. */
 825	ACCESS_ONCE(sync_rcu_preempt_exp_count)++;
 826unlock_mb_ret:
 827	mutex_unlock(&sync_rcu_preempt_exp_mutex);
 828mb_ret:
 829	smp_mb(); /* ensure subsequent action seen after grace period. */
 830}
 831EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
 832
 833/*
 834 * Check to see if there is any immediate preemptible-RCU-related work
 835 * to be done.
 836 */
 837static int rcu_preempt_pending(int cpu)
 838{
 839	return __rcu_pending(&rcu_preempt_state,
 840			     &per_cpu(rcu_preempt_data, cpu));
 841}
 842
 843/*
 844 * Does preemptible RCU need the CPU to stay out of dynticks mode?
 845 */
 846static int rcu_preempt_needs_cpu(int cpu)
 847{
 848	return !!per_cpu(rcu_preempt_data, cpu).nxtlist;
 849}
 850
 851/**
 852 * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete.
 853 */
 854void rcu_barrier(void)
 855{
 856	_rcu_barrier(&rcu_preempt_state, call_rcu);
 857}
 858EXPORT_SYMBOL_GPL(rcu_barrier);
 859
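/*
 * Illustrative sketch, not part of the original file: a module that has
 * posted call_rcu() callbacks typically invokes rcu_barrier() on unload,
 * after it has stopped posting new callbacks, so that no callback can
 * run after the module text is freed.  example_exit() is a hypothetical
 * name.
 */
static void __exit example_exit(void)
{
	/* Stop posting new call_rcu() callbacks first (not shown). */
	rcu_barrier();	/* Wait for all in-flight callbacks to be invoked. */
	/* Now module data structures may be freed and the module unloaded. */
}
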
 860/*
 861 * Initialize preemptible RCU's per-CPU data.
 862 */
 863static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
 864{
 865	rcu_init_percpu_data(cpu, &rcu_preempt_state, 1);
 866}
 867
 868/*
 869 * Move preemptible RCU's callbacks from dying CPU to other online CPU.
 870 */
 871static void rcu_preempt_send_cbs_to_online(void)
 872{
 873	rcu_send_cbs_to_online(&rcu_preempt_state);
 874}
 875
 876/*
 877 * Initialize preemptible RCU's state structures.
 878 */
 879static void __init __rcu_init_preempt(void)
 880{
 881	rcu_init_one(&rcu_preempt_state, &rcu_preempt_data);
 882}
 883
 884/*
 885 * Check for a task exiting while in a preemptible-RCU read-side
 886 * critical section, clean up if so.  No need to issue warnings,
 887 * as debug_check_no_locks_held() already does this if lockdep
 888 * is enabled.
 889 */
 890void exit_rcu(void)
 891{
 892	struct task_struct *t = current;
 893
 894	if (t->rcu_read_lock_nesting == 0)
 895		return;
 896	t->rcu_read_lock_nesting = 1;
 897	__rcu_read_unlock();
 898}
 899
 900#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */
 901
 902static struct rcu_state *rcu_state = &rcu_sched_state;
 903
 904/*
 905 * Tell them what RCU they are running.
 906 */
 907static void __init rcu_bootup_announce(void)
 908{
 909	printk(KERN_INFO "Hierarchical RCU implementation.\n");
 910	rcu_bootup_announce_oddness();
 911}
 912
 913/*
 914 * Return the number of RCU batches processed thus far for debug & stats.
 915 */
 916long rcu_batches_completed(void)
 917{
 918	return rcu_batches_completed_sched();
 919}
 920EXPORT_SYMBOL_GPL(rcu_batches_completed);
 921
 922/*
 923 * Force a quiescent state for RCU, which, because there is no preemptible
 924 * RCU, becomes the same as rcu-sched.
 925 */
 926void rcu_force_quiescent_state(void)
 927{
 928	rcu_sched_force_quiescent_state();
 929}
 930EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
 931
 932/*
 933 * Because preemptible RCU does not exist, we never have to check for
 934 * CPUs being in quiescent states.
 935 */
 936static void rcu_preempt_note_context_switch(int cpu)
 937{
 938}
 939
 940/*
 941 * Because preemptible RCU does not exist, there are never any preempted
 942 * RCU readers.
 943 */
 944static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
 945{
 946	return 0;
 947}
 948
 949#ifdef CONFIG_HOTPLUG_CPU
 950
 951/* Because preemptible RCU does not exist, no quieting of tasks. */
 952static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
 953{
 954	raw_spin_unlock_irqrestore(&rnp->lock, flags);
 955}
 956
 957#endif /* #ifdef CONFIG_HOTPLUG_CPU */
 958
 959/*
 960 * Because preemptible RCU does not exist, we never have to check for
 961 * tasks blocked within RCU read-side critical sections.
 962 */
 963static void rcu_print_detail_task_stall(struct rcu_state *rsp)
 964{
 965}
 966
 967/*
 968 * Because preemptible RCU does not exist, we never have to check for
 969 * tasks blocked within RCU read-side critical sections.
 970 */
 971static void rcu_print_task_stall(struct rcu_node *rnp)
 972{
 973}
 974
 975/*
 976 * Because preemptible RCU does not exist, there is no need to suppress
 977 * its CPU stall warnings.
 978 */
 979static void rcu_preempt_stall_reset(void)
 980{
 981}
 982
 983/*
 984 * Because there is no preemptible RCU, there can be no readers blocked,
 985 * so there is no need to check for blocked tasks.  So check only for
 986 * bogus qsmask values.
 987 */
 988static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
 989{
 990	WARN_ON_ONCE(rnp->qsmask);
 991}
 992
 993#ifdef CONFIG_HOTPLUG_CPU
 994
 995/*
 996 * Because preemptible RCU does not exist, it never needs to migrate
 997 * tasks that were blocked within RCU read-side critical sections, and
 998 * such non-existent tasks cannot possibly have been blocking the current
 999 * grace period.
1000 */
1001static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
1002				     struct rcu_node *rnp,
1003				     struct rcu_data *rdp)
1004{
1005	return 0;
1006}
1007
1008/*
1009 * Because preemptible RCU does not exist, it never needs CPU-offline
1010 * processing.
1011 */
1012static void rcu_preempt_offline_cpu(int cpu)
1013{
1014}
1015
1016#endif /* #ifdef CONFIG_HOTPLUG_CPU */
1017
1018/*
1019 * Because preemptible RCU does not exist, it never has any callbacks
1020 * to check.
1021 */
1022static void rcu_preempt_check_callbacks(int cpu)
1023{
1024}
1025
1026/*
1027 * Because preemptible RCU does not exist, it never has any callbacks
1028 * to process.
1029 */
1030static void rcu_preempt_process_callbacks(void)
1031{
1032}
1033
1034/*
1035 * Wait for an rcu-preempt grace period, but make it happen quickly.
1036 * But because preemptible RCU does not exist, map to rcu-sched.
1037 */
1038void synchronize_rcu_expedited(void)
1039{
1040	synchronize_sched_expedited();
1041}
1042EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
1043
1044#ifdef CONFIG_HOTPLUG_CPU
1045
1046/*
1047 * Because preemptible RCU does not exist, there is never any need to
1048 * report on tasks preempted in RCU read-side critical sections during
1049 * expedited RCU grace periods.
1050 */
1051static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp)
1052{
1053	return;
1054}
1055
1056#endif /* #ifdef CONFIG_HOTPLUG_CPU */
1057
1058/*
1059 * Because preemptible RCU does not exist, it never has any work to do.
1060 */
1061static int rcu_preempt_pending(int cpu)
1062{
1063	return 0;
1064}
1065
1066/*
1067 * Because preemptible RCU does not exist, it never needs any CPU.
1068 */
1069static int rcu_preempt_needs_cpu(int cpu)
1070{
1071	return 0;
1072}
1073
1074/*
1075 * Because preemptible RCU does not exist, rcu_barrier() is just
1076 * another name for rcu_barrier_sched().
1077 */
1078void rcu_barrier(void)
1079{
1080	rcu_barrier_sched();
1081}
1082EXPORT_SYMBOL_GPL(rcu_barrier);
1083
1084/*
1085 * Because preemptible RCU does not exist, there is no per-CPU
1086 * data to initialize.
1087 */
1088static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
1089{
1090}
1091
1092/*
1093 * Because there is no preemptible RCU, there are no callbacks to move.
1094 */
1095static void rcu_preempt_send_cbs_to_online(void)
1096{
1097}
1098
1099/*
1100 * Because preemptible RCU does not exist, it need not be initialized.
1101 */
1102static void __init __rcu_init_preempt(void)
1103{
1104}
1105
1106#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
1107
1108#ifdef CONFIG_RCU_BOOST
1109
1110#include "rtmutex_common.h"
1111
1112#ifdef CONFIG_RCU_TRACE
1113
1114static void rcu_initiate_boost_trace(struct rcu_node *rnp)
1115{
1116	if (list_empty(&rnp->blkd_tasks))
1117		rnp->n_balk_blkd_tasks++;
1118	else if (rnp->exp_tasks == NULL && rnp->gp_tasks == NULL)
1119		rnp->n_balk_exp_gp_tasks++;
1120	else if (rnp->gp_tasks != NULL && rnp->boost_tasks != NULL)
1121		rnp->n_balk_boost_tasks++;
1122	else if (rnp->gp_tasks != NULL && rnp->qsmask != 0)
1123		rnp->n_balk_notblocked++;
1124	else if (rnp->gp_tasks != NULL &&
1125		 ULONG_CMP_LT(jiffies, rnp->boost_time))
1126		rnp->n_balk_notyet++;
1127	else
1128		rnp->n_balk_nos++;
1129}
1130
1131#else /* #ifdef CONFIG_RCU_TRACE */
1132
1133static void rcu_initiate_boost_trace(struct rcu_node *rnp)
1134{
1135}
1136
1137#endif /* #else #ifdef CONFIG_RCU_TRACE */
1138
1139/*
1140 * Carry out RCU priority boosting on the task indicated by ->exp_tasks
1141 * or ->boost_tasks, advancing the pointer to the next task in the
1142 * ->blkd_tasks list.
1143 *
1144 * Note that irqs must be enabled: boosting the task can block.
1145 * Returns 1 if there are more tasks needing to be boosted.
1146 */
1147static int rcu_boost(struct rcu_node *rnp)
1148{
1149	unsigned long flags;
1150	struct rt_mutex mtx;
1151	struct task_struct *t;
1152	struct list_head *tb;
1153
1154	if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL)
1155		return 0;  /* Nothing left to boost. */
1156
1157	raw_spin_lock_irqsave(&rnp->lock, flags);
1158
1159	/*
1160	 * Recheck under the lock: all tasks in need of boosting
1161	 * might exit their RCU read-side critical sections on their own.
1162	 */
1163	if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL) {
1164		raw_spin_unlock_irqrestore(&rnp->lock, flags);
1165		return 0;
1166	}
1167
1168	/*
1169	 * Preferentially boost tasks blocking expedited grace periods.
1170	 * This cannot starve the normal grace periods because a second
1171	 * expedited grace period must boost all blocked tasks, including
1172	 * those blocking the pre-existing normal grace period.
1173	 */
1174	if (rnp->exp_tasks != NULL) {
1175		tb = rnp->exp_tasks;
1176		rnp->n_exp_boosts++;
1177	} else {
1178		tb = rnp->boost_tasks;
1179		rnp->n_normal_boosts++;
1180	}
1181	rnp->n_tasks_boosted++;
1182
1183	/*
1184	 * We boost task t by manufacturing an rt_mutex that appears to
1185	 * be held by task t.  We leave a pointer to that rt_mutex where
1186	 * task t can find it, and task t will release the mutex when it
1187	 * exits its outermost RCU read-side critical section.  Then
1188	 * simply acquiring this artificial rt_mutex will boost task
1189	 * t's priority.  (Thanks to tglx for suggesting this approach!)
1190	 *
1191	 * Note that task t must acquire rnp->lock to remove itself from
1192	 * the ->blkd_tasks list, which it will do from exit() if from
1193	 * nowhere else.  We therefore are guaranteed that task t will
1194	 * stay around at least until we drop rnp->lock.  Note that
1195	 * rnp->lock also resolves races between our priority boosting
1196	 * and task t's exiting its outermost RCU read-side critical
1197	 * section.
1198	 */
1199	t = container_of(tb, struct task_struct, rcu_node_entry);
1200	rt_mutex_init_proxy_locked(&mtx, t);
1201	t->rcu_boost_mutex = &mtx;
1202	t->rcu_boosted = 1;
1203	raw_spin_unlock_irqrestore(&rnp->lock, flags);
1204	rt_mutex_lock(&mtx);  /* Side effect: boosts task t's priority. */
1205	rt_mutex_unlock(&mtx);  /* Keep lockdep happy. */
1206
1207	return rnp->exp_tasks != NULL || rnp->boost_tasks != NULL;
1208}
1209
1210/*
1211 * Timer handler to initiate waking up of boost kthreads that
1212 * have yielded the CPU due to excessive numbers of tasks to
1213 * boost.  We wake up the per-rcu_node kthread, which in turn
1214 * will wake up the booster kthread.
1215 */
1216static void rcu_boost_kthread_timer(unsigned long arg)
1217{
1218	invoke_rcu_node_kthread((struct rcu_node *)arg);
1219}
1220
1221/*
1222 * Priority-boosting kthread.  One per leaf rcu_node and one for the
1223 * root rcu_node.
1224 */
1225static int rcu_boost_kthread(void *arg)
1226{
1227	struct rcu_node *rnp = (struct rcu_node *)arg;
1228	int spincnt = 0;
1229	int more2boost;
1230
1231	for (;;) {
1232		rnp->boost_kthread_status = RCU_KTHREAD_WAITING;
1233		rcu_wait(rnp->boost_tasks || rnp->exp_tasks);
1234		rnp->boost_kthread_status = RCU_KTHREAD_RUNNING;
1235		more2boost = rcu_boost(rnp);
1236		if (more2boost)
1237			spincnt++;
1238		else
1239			spincnt = 0;
1240		if (spincnt > 10) {
1241			rcu_yield(rcu_boost_kthread_timer, (unsigned long)rnp);
1242			spincnt = 0;
1243		}
1244	}
1245	/* NOTREACHED */
1246	return 0;
1247}
1248
1249/*
1250 * Check to see if it is time to start boosting RCU readers that are
1251 * blocking the current grace period, and, if so, tell the per-rcu_node
1252 * kthread to start boosting them.  If there is an expedited grace
1253 * period in progress, it is always time to boost.
1254 *
1255 * The caller must hold rnp->lock, which this function releases,
1256 * but irqs remain disabled.  The ->boost_kthread_task is immortal,
1257 * so we don't need to worry about it going away.
1258 */
1259static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
1260{
1261	struct task_struct *t;
1262
1263	if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) {
1264		rnp->n_balk_exp_gp_tasks++;
1265		raw_spin_unlock_irqrestore(&rnp->lock, flags);
1266		return;
1267	}
1268	if (rnp->exp_tasks != NULL ||
1269	    (rnp->gp_tasks != NULL &&
1270	     rnp->boost_tasks == NULL &&
1271	     rnp->qsmask == 0 &&
1272	     ULONG_CMP_GE(jiffies, rnp->boost_time))) {
1273		if (rnp->exp_tasks == NULL)
1274			rnp->boost_tasks = rnp->gp_tasks;
1275		raw_spin_unlock_irqrestore(&rnp->lock, flags);
1276		t = rnp->boost_kthread_task;
1277		if (t != NULL)
1278			wake_up_process(t);
1279	} else {
1280		rcu_initiate_boost_trace(rnp);
1281		raw_spin_unlock_irqrestore(&rnp->lock, flags);
1282	}
1283}
1284
1285/*
1286 * Wake up the per-CPU kthread to invoke RCU callbacks.
1287 */
1288static void invoke_rcu_callbacks_kthread(void)
1289{
1290	unsigned long flags;
1291
1292	local_irq_save(flags);
1293	__this_cpu_write(rcu_cpu_has_work, 1);
1294	if (__this_cpu_read(rcu_cpu_kthread_task) == NULL) {
1295		local_irq_restore(flags);
1296		return;
1297	}
1298	wake_up_process(__this_cpu_read(rcu_cpu_kthread_task));
1299	local_irq_restore(flags);
1300}
1301
1302/*
1303 * Set the affinity of the boost kthread.  The CPU-hotplug locks are
1304 * held, so no one should be messing with the existence of the boost
1305 * kthread.
1306 */
1307static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp,
1308					  cpumask_var_t cm)
1309{
1310	struct task_struct *t;
1311
1312	t = rnp->boost_kthread_task;
1313	if (t != NULL)
1314		set_cpus_allowed_ptr(rnp->boost_kthread_task, cm);
1315}
1316
1317#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000)
1318
1319/*
1320 * Do priority-boost accounting for the start of a new grace period.
1321 */
1322static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
1323{
1324	rnp->boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES;
1325}
1326
1327/*
1328 * Create an RCU-boost kthread for the specified node if one does not
1329 * already exist.  We only create this kthread for preemptible RCU.
1330 * Returns zero if all is well, a negated errno otherwise.
1331 */
1332static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
1333						 struct rcu_node *rnp,
1334						 int rnp_index)
1335{
1336	unsigned long flags;
1337	struct sched_param sp;
1338	struct task_struct *t;
1339
1340	if (&rcu_preempt_state != rsp)
1341		return 0;
1342	rsp->boost = 1;
1343	if (rnp->boost_kthread_task != NULL)
1344		return 0;
1345	t = kthread_create(rcu_boost_kthread, (void *)rnp,
1346			   "rcub%d", rnp_index);
1347	if (IS_ERR(t))
1348		return PTR_ERR(t);
1349	raw_spin_lock_irqsave(&rnp->lock, flags);
1350	rnp->boost_kthread_task = t;
1351	raw_spin_unlock_irqrestore(&rnp->lock, flags);
1352	sp.sched_priority = RCU_KTHREAD_PRIO;
1353	sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
1354	wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */
1355	return 0;
1356}
1357
1358#ifdef CONFIG_HOTPLUG_CPU
1359
1360/*
 1361 * Stop the RCU's per-CPU kthread when its CPU goes offline.
1362 */
1363static void rcu_stop_cpu_kthread(int cpu)
1364{
1365	struct task_struct *t;
1366
1367	/* Stop the CPU's kthread. */
1368	t = per_cpu(rcu_cpu_kthread_task, cpu);
1369	if (t != NULL) {
1370		per_cpu(rcu_cpu_kthread_task, cpu) = NULL;
1371		kthread_stop(t);
1372	}
1373}
1374
1375#endif /* #ifdef CONFIG_HOTPLUG_CPU */
1376
1377static void rcu_kthread_do_work(void)
1378{
1379	rcu_do_batch(&rcu_sched_state, &__get_cpu_var(rcu_sched_data));
1380	rcu_do_batch(&rcu_bh_state, &__get_cpu_var(rcu_bh_data));
1381	rcu_preempt_do_callbacks();
1382}
1383
1384/*
1385 * Wake up the specified per-rcu_node-structure kthread.
1386 * Because the per-rcu_node kthreads are immortal, we don't need
1387 * to do anything to keep them alive.
1388 */
1389static void invoke_rcu_node_kthread(struct rcu_node *rnp)
1390{
1391	struct task_struct *t;
1392
1393	t = rnp->node_kthread_task;
1394	if (t != NULL)
1395		wake_up_process(t);
1396}
1397
1398/*
1399 * Set the specified CPU's kthread to run RT or not, as specified by
1400 * the to_rt argument.  The CPU-hotplug locks are held, so the task
1401 * is not going away.
1402 */
1403static void rcu_cpu_kthread_setrt(int cpu, int to_rt)
1404{
1405	int policy;
1406	struct sched_param sp;
1407	struct task_struct *t;
1408
1409	t = per_cpu(rcu_cpu_kthread_task, cpu);
1410	if (t == NULL)
1411		return;
1412	if (to_rt) {
1413		policy = SCHED_FIFO;
1414		sp.sched_priority = RCU_KTHREAD_PRIO;
1415	} else {
1416		policy = SCHED_NORMAL;
1417		sp.sched_priority = 0;
1418	}
1419	sched_setscheduler_nocheck(t, policy, &sp);
1420}
1421
1422/*
1423 * Timer handler to initiate the waking up of per-CPU kthreads that
1424 * have yielded the CPU due to excess numbers of RCU callbacks.
1425 * We wake up the per-rcu_node kthread, which in turn will wake up
1426 * the booster kthread.
1427 */
1428static void rcu_cpu_kthread_timer(unsigned long arg)
1429{
1430	struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, arg);
1431	struct rcu_node *rnp = rdp->mynode;
1432
1433	atomic_or(rdp->grpmask, &rnp->wakemask);
1434	invoke_rcu_node_kthread(rnp);
1435}
1436
1437/*
1438 * Drop to non-real-time priority and yield, but only after posting a
1439 * timer that will cause us to regain our real-time priority if we
1440 * remain preempted.  Either way, we restore our real-time priority
1441 * before returning.
1442 */
1443static void rcu_yield(void (*f)(unsigned long), unsigned long arg)
1444{
1445	struct sched_param sp;
1446	struct timer_list yield_timer;
1447
1448	setup_timer_on_stack(&yield_timer, f, arg);
1449	mod_timer(&yield_timer, jiffies + 2);
1450	sp.sched_priority = 0;
1451	sched_setscheduler_nocheck(current, SCHED_NORMAL, &sp);
1452	set_user_nice(current, 19);
1453	schedule();
1454	sp.sched_priority = RCU_KTHREAD_PRIO;
1455	sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
1456	del_timer(&yield_timer);
1457}
1458
1459/*
1460 * Handle cases where the rcu_cpu_kthread() ends up on the wrong CPU.
1461 * This can happen while the corresponding CPU is either coming online
1462 * or going offline.  We cannot wait until the CPU is fully online
1463 * before starting the kthread, because the various notifier functions
1464 * can wait for RCU grace periods.  So we park rcu_cpu_kthread() until
1465 * the corresponding CPU is online.
1466 *
1467 * Return 1 if the kthread needs to stop, 0 otherwise.
1468 *
1469 * Caller must disable bh.  This function can momentarily enable it.
1470 */
1471static int rcu_cpu_kthread_should_stop(int cpu)
1472{
1473	while (cpu_is_offline(cpu) ||
1474	       !cpumask_equal(&current->cpus_allowed, cpumask_of(cpu)) ||
1475	       smp_processor_id() != cpu) {
1476		if (kthread_should_stop())
1477			return 1;
1478		per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
1479		per_cpu(rcu_cpu_kthread_cpu, cpu) = raw_smp_processor_id();
1480		local_bh_enable();
1481		schedule_timeout_uninterruptible(1);
1482		if (!cpumask_equal(&current->cpus_allowed, cpumask_of(cpu)))
1483			set_cpus_allowed_ptr(current, cpumask_of(cpu));
1484		local_bh_disable();
1485	}
1486	per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu;
1487	return 0;
1488}
1489
1490/*
1491 * Per-CPU kernel thread that invokes RCU callbacks.  This replaces the
1492 * earlier RCU softirq.
1493 */
1494static int rcu_cpu_kthread(void *arg)
1495{
1496	int cpu = (int)(long)arg;
1497	unsigned long flags;
1498	int spincnt = 0;
1499	unsigned int *statusp = &per_cpu(rcu_cpu_kthread_status, cpu);
1500	char work;
1501	char *workp = &per_cpu(rcu_cpu_has_work, cpu);
1502
1503	for (;;) {
1504		*statusp = RCU_KTHREAD_WAITING;
1505		rcu_wait(*workp != 0 || kthread_should_stop());
1506		local_bh_disable();
1507		if (rcu_cpu_kthread_should_stop(cpu)) {
1508			local_bh_enable();
1509			break;
1510		}
1511		*statusp = RCU_KTHREAD_RUNNING;
1512		per_cpu(rcu_cpu_kthread_loops, cpu)++;
1513		local_irq_save(flags);
1514		work = *workp;
1515		*workp = 0;
1516		local_irq_restore(flags);
1517		if (work)
1518			rcu_kthread_do_work();
1519		local_bh_enable();
1520		if (*workp != 0)
1521			spincnt++;
1522		else
1523			spincnt = 0;
1524		if (spincnt > 10) {
1525			*statusp = RCU_KTHREAD_YIELDING;
1526			rcu_yield(rcu_cpu_kthread_timer, (unsigned long)cpu);
1527			spincnt = 0;
1528		}
1529	}
1530	*statusp = RCU_KTHREAD_STOPPED;
1531	return 0;
1532}
1533
1534/*
1535 * Spawn a per-CPU kthread, setting up affinity and priority.
1536 * Because the CPU hotplug lock is held, no other CPU will be attempting
1537 * to manipulate rcu_cpu_kthread_task.  There might be another CPU
1538 * attempting to access it during boot, but the locking in kthread_bind()
1539 * will enforce sufficient ordering.
1540 *
1541 * Please note that we cannot simply refuse to wake up the per-CPU
1542 * kthread because kthreads are created in TASK_UNINTERRUPTIBLE state,
1543 * which can result in softlockup complaints if the task ends up being
1544 * idle for more than a couple of minutes.
1545 *
1546 * However, please note also that we cannot bind the per-CPU kthread to its
1547 * CPU until that CPU is fully online.  We also cannot wait until the
1548 * CPU is fully online before we create its per-CPU kthread, as this would
1549 * deadlock the system when CPU notifiers tried waiting for grace
1550 * periods.  So we bind the per-CPU kthread to its CPU only if the CPU
1551 * is online.  If its CPU is not yet fully online, then the code in
1552 * rcu_cpu_kthread() will wait until it is fully online, and then do
1553 * the binding.
1554 */
1555static int __cpuinit rcu_spawn_one_cpu_kthread(int cpu)
1556{
1557	struct sched_param sp;
1558	struct task_struct *t;
1559
1560	if (!rcu_scheduler_fully_active ||
1561	    per_cpu(rcu_cpu_kthread_task, cpu) != NULL)
1562		return 0;
1563	t = kthread_create(rcu_cpu_kthread, (void *)(long)cpu, "rcuc%d", cpu);
1564	if (IS_ERR(t))
1565		return PTR_ERR(t);
1566	if (cpu_online(cpu))
1567		kthread_bind(t, cpu);
1568	per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu;
1569	WARN_ON_ONCE(per_cpu(rcu_cpu_kthread_task, cpu) != NULL);
1570	sp.sched_priority = RCU_KTHREAD_PRIO;
1571	sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
1572	per_cpu(rcu_cpu_kthread_task, cpu) = t;
1573	wake_up_process(t); /* Get to TASK_INTERRUPTIBLE quickly. */
1574	return 0;
1575}
1576
1577/*
1578 * Per-rcu_node kthread, which is in charge of waking up the per-CPU
1579 * kthreads when needed.  We ignore requests to wake up kthreads
1580 * for offline CPUs, which is OK because force_quiescent_state()
1581 * takes care of this case.
1582 */
1583static int rcu_node_kthread(void *arg)
1584{
1585	int cpu;
1586	unsigned long flags;
1587	unsigned long mask;
1588	struct rcu_node *rnp = (struct rcu_node *)arg;
1589	struct sched_param sp;
1590	struct task_struct *t;
1591
1592	for (;;) {
1593		rnp->node_kthread_status = RCU_KTHREAD_WAITING;
1594		rcu_wait(atomic_read(&rnp->wakemask) != 0);
1595		rnp->node_kthread_status = RCU_KTHREAD_RUNNING;
1596		raw_spin_lock_irqsave(&rnp->lock, flags);
1597		mask = atomic_xchg(&rnp->wakemask, 0);
1598		rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */
1599		for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) {
1600			if ((mask & 0x1) == 0)
1601				continue;
1602			preempt_disable();
1603			t = per_cpu(rcu_cpu_kthread_task, cpu);
1604			if (!cpu_online(cpu) || t == NULL) {
1605				preempt_enable();
1606				continue;
1607			}
1608			per_cpu(rcu_cpu_has_work, cpu) = 1;
1609			sp.sched_priority = RCU_KTHREAD_PRIO;
1610			sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
1611			preempt_enable();
1612		}
1613	}
1614	/* NOTREACHED */
1615	rnp->node_kthread_status = RCU_KTHREAD_STOPPED;
1616	return 0;
1617}
1618
1619/*
1620 * Set the per-rcu_node kthread's affinity to cover all CPUs that are
1621 * served by the rcu_node in question.  The CPU hotplug lock is still
1622 * held, so the value of rnp->qsmaskinit will be stable.
1623 *
1624 * We don't include outgoingcpu in the affinity set, use -1 if there is
1625 * no outgoing CPU.  If there are no CPUs left in the affinity set,
1626 * this function allows the kthread to execute on any CPU.
1627 */
1628static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
1629{
1630	cpumask_var_t cm;
1631	int cpu;
1632	unsigned long mask = rnp->qsmaskinit;
1633
1634	if (rnp->node_kthread_task == NULL)
1635		return;
1636	if (!alloc_cpumask_var(&cm, GFP_KERNEL))
1637		return;
1638	cpumask_clear(cm);
1639	for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1)
1640		if ((mask & 0x1) && cpu != outgoingcpu)
1641			cpumask_set_cpu(cpu, cm);
1642	if (cpumask_weight(cm) == 0) {
1643		cpumask_setall(cm);
1644		for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++)
1645			cpumask_clear_cpu(cpu, cm);
1646		WARN_ON_ONCE(cpumask_weight(cm) == 0);
1647	}
1648	set_cpus_allowed_ptr(rnp->node_kthread_task, cm);
1649	rcu_boost_kthread_setaffinity(rnp, cm);
1650	free_cpumask_var(cm);
1651}
1652
1653/*
1654 * Spawn a per-rcu_node kthread, setting priority and affinity.
1655 * Called during boot before online/offline can happen, or, if
1656 * during runtime, with the main CPU-hotplug locks held.  So only
1657 * one of these can be executing at a time.
1658 */
1659static int __cpuinit rcu_spawn_one_node_kthread(struct rcu_state *rsp,
1660						struct rcu_node *rnp)
1661{
1662	unsigned long flags;
1663	int rnp_index = rnp - &rsp->node[0];
1664	struct sched_param sp;
1665	struct task_struct *t;
1666
1667	if (!rcu_scheduler_fully_active ||
1668	    rnp->qsmaskinit == 0)
1669		return 0;
1670	if (rnp->node_kthread_task == NULL) {
1671		t = kthread_create(rcu_node_kthread, (void *)rnp,
1672				   "rcun%d", rnp_index);
1673		if (IS_ERR(t))
1674			return PTR_ERR(t);
1675		raw_spin_lock_irqsave(&rnp->lock, flags);
1676		rnp->node_kthread_task = t;
1677		raw_spin_unlock_irqrestore(&rnp->lock, flags);
1678		sp.sched_priority = 99;
1679		sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
1680		wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */
1681	}
1682	return rcu_spawn_one_boost_kthread(rsp, rnp, rnp_index);
1683}
1684
1685/*
1686 * Spawn all kthreads -- called as soon as the scheduler is running.
1687 */
1688static int __init rcu_spawn_kthreads(void)
1689{
1690	int cpu;
1691	struct rcu_node *rnp;
1692
1693	rcu_scheduler_fully_active = 1;
1694	for_each_possible_cpu(cpu) {
1695		per_cpu(rcu_cpu_has_work, cpu) = 0;
1696		if (cpu_online(cpu))
1697			(void)rcu_spawn_one_cpu_kthread(cpu);
1698	}
1699	rnp = rcu_get_root(rcu_state);
1700	(void)rcu_spawn_one_node_kthread(rcu_state, rnp);
1701	if (NUM_RCU_NODES > 1) {
1702		rcu_for_each_leaf_node(rcu_state, rnp)
1703			(void)rcu_spawn_one_node_kthread(rcu_state, rnp);
1704	}
1705	return 0;
1706}
1707early_initcall(rcu_spawn_kthreads);
1708
1709static void __cpuinit rcu_prepare_kthreads(int cpu)
1710{
1711	struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu);
1712	struct rcu_node *rnp = rdp->mynode;
1713
1714	/* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */
1715	if (rcu_scheduler_fully_active) {
1716		(void)rcu_spawn_one_cpu_kthread(cpu);
1717		if (rnp->node_kthread_task == NULL)
1718			(void)rcu_spawn_one_node_kthread(rcu_state, rnp);
1719	}
1720}
1721
1722#else /* #ifdef CONFIG_RCU_BOOST */
1723
1724static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
1725{
1726	raw_spin_unlock_irqrestore(&rnp->lock, flags);
1727}
1728
1729static void invoke_rcu_callbacks_kthread(void)
1730{
1731	WARN_ON_ONCE(1);
1732}
1733
1734static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
1735{
1736}
1737
1738#ifdef CONFIG_HOTPLUG_CPU
1739
1740static void rcu_stop_cpu_kthread(int cpu)
1741{
1742}
1743
1744#endif /* #ifdef CONFIG_HOTPLUG_CPU */
1745
1746static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
1747{
1748}
1749
1750static void rcu_cpu_kthread_setrt(int cpu, int to_rt)
1751{
1752}
1753
1754static int __init rcu_scheduler_really_started(void)
1755{
1756	rcu_scheduler_fully_active = 1;
1757	return 0;
1758}
1759early_initcall(rcu_scheduler_really_started);
1760
1761static void __cpuinit rcu_prepare_kthreads(int cpu)
1762{
1763}
1764
1765#endif /* #else #ifdef CONFIG_RCU_BOOST */
1766
1767#ifndef CONFIG_SMP
1768
1769void synchronize_sched_expedited(void)
1770{
1771	cond_resched();
1772}
1773EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
1774
1775#else /* #ifndef CONFIG_SMP */
1776
1777static atomic_t sync_sched_expedited_started = ATOMIC_INIT(0);
1778static atomic_t sync_sched_expedited_done = ATOMIC_INIT(0);
1779
1780static int synchronize_sched_expedited_cpu_stop(void *data)
1781{
1782	/*
1783	 * There must be a full memory barrier on each affected CPU
1784	 * between the time that try_stop_cpus() is called and the
1785	 * time that it returns.
1786	 *
1787	 * In the current initial implementation of cpu_stop, the
1788	 * above condition is already met when the control reaches
1789	 * this point and the following smp_mb() is not strictly
1790	 * necessary.  Do smp_mb() anyway for documentation and
1791	 * robustness against future implementation changes.
1792	 */
1793	smp_mb(); /* See above comment block. */
1794	return 0;
1795}
1796
1797/*
1798 * Wait for an rcu-sched grace period to elapse, but use a "big hammer"
1799 * approach to force the grace period to end quickly.  This consumes
1800 * significant time on all CPUs, and is thus not recommended for
1801 * any sort of common-case code.
1802 *
1803 * Note that it is illegal to call this function while holding any
1804 * lock that is acquired by a CPU-hotplug notifier.  Failing to
1805 * observe this restriction will result in deadlock.
1806 *
1807 * This implementation can be thought of as an application of ticket
1808 * locking to RCU, with sync_sched_expedited_started and
1809 * sync_sched_expedited_done taking on the roles of the halves
1810 * of the ticket-lock word.  Each task atomically increments
1811 * sync_sched_expedited_started upon entry, snapshotting the old value,
1812 * then attempts to stop all the CPUs.  If this succeeds, then each
1813 * CPU will have executed a context switch, resulting in an RCU-sched
1814 * grace period.  We are then done, so we use atomic_cmpxchg() to
1815 * update sync_sched_expedited_done to match our snapshot -- but
1816 * only if someone else has not already advanced past our snapshot.
1817 *
1818 * On the other hand, if try_stop_cpus() fails, we check the value
1819 * of sync_sched_expedited_done.  If it has advanced past our
1820 * initial snapshot, then someone else must have forced a grace period
1821 * some time after we took our snapshot.  In this case, our work is
1822 * done for us, and we can simply return.  Otherwise, we try again,
1823 * but keep our initial snapshot for purposes of checking for someone
1824 * doing our work for us.
1825 *
1826 * If we fail too many times in a row, we fall back to synchronize_sched().
1827 */
1828void synchronize_sched_expedited(void)
1829{
1830	int firstsnap, s, snap, trycount = 0;
1831
1832	/* Note that atomic_inc_return() implies full memory barrier. */
1833	firstsnap = snap = atomic_inc_return(&sync_sched_expedited_started);
1834	get_online_cpus();
1835
1836	/*
1837	 * Each pass through the following loop attempts to force a
1838	 * context switch on each CPU.
1839	 */
1840	while (try_stop_cpus(cpu_online_mask,
1841			     synchronize_sched_expedited_cpu_stop,
1842			     NULL) == -EAGAIN) {
1843		put_online_cpus();
1844
1845		/* No joy, try again later.  Or just synchronize_sched(). */
1846		if (trycount++ < 10)
1847			udelay(trycount * num_online_cpus());
1848		else {
1849			synchronize_sched();
1850			return;
1851		}
1852
1853		/* Check to see if someone else did our work for us. */
1854		s = atomic_read(&sync_sched_expedited_done);
1855		if (UINT_CMP_GE((unsigned)s, (unsigned)firstsnap)) {
1856			smp_mb(); /* ensure test happens before caller kfree */
1857			return;
1858		}
1859
1860		/*
1861		 * Refetching sync_sched_expedited_started allows later
1862		 * callers to piggyback on our grace period.  We subtract
1863		 * 1 to get the same token that the last incrementer got.
1864		 * We retry after they started, so our grace period works
1865		 * for them, and they started after our first try, so their
1866		 * grace period works for us.
1867		 */
1868		get_online_cpus();
1869		snap = atomic_read(&sync_sched_expedited_started) - 1;
1870		smp_mb(); /* ensure read is before try_stop_cpus(). */
1871	}
1872
1873	/*
1874	 * Everyone up to our most recent fetch is covered by our grace
1875	 * period.  Update the counter, but only if our work is still
1876	 * relevant -- which it won't be if someone who started later
1877	 * than we did beat us to the punch.
1878	 */
1879	do {
1880		s = atomic_read(&sync_sched_expedited_done);
1881		if (UINT_CMP_GE((unsigned)s, (unsigned)snap)) {
1882			smp_mb(); /* ensure test happens before caller kfree */
1883			break;
1884		}
1885	} while (atomic_cmpxchg(&sync_sched_expedited_done, s, snap) != s);
1886
1887	put_online_cpus();
1888}
1889EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
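
/*
 * Illustrative sketch, not part of the kernel build: the ticket-style
 * bookkeeping described in the comment above, modelled in user space with
 * C11 atomics.  The helper names are made up; only the two counters and
 * the "has someone already covered my snapshot?" test correspond to
 * sync_sched_expedited_started/_done and UINT_CMP_GE() above.  The cpu_stop
 * machinery and the wrap-safe comparisons are deliberately omitted.
 */
#if 0	/* user-space illustration only */
#include <stdatomic.h>
#include <stdbool.h>

static atomic_int started;	/* models sync_sched_expedited_started */
static atomic_int done;		/* models sync_sched_expedited_done */

/* Take a ticket on entry, as atomic_inc_return() does above. */
static int take_ticket(void)
{
	return atomic_fetch_add(&started, 1) + 1;
}

/* True if a grace period completed at or after our snapshot. */
static bool someone_did_our_work(int snap)
{
	return atomic_load(&done) >= snap;	/* kernel uses wrap-safe UINT_CMP_GE() */
}

/* Publish our grace period, unless a later caller already went past us. */
static void record_done(int snap)
{
	int s = atomic_load(&done);

	while (s < snap && !atomic_compare_exchange_weak(&done, &s, snap))
		;	/* a failed CAS reloads s; the loop re-checks it */
}

int main(void)
{
	int snap = take_ticket();

	if (!someone_did_our_work(snap))	/* nobody beat us to it... */
		record_done(snap);		/* ...so record our grace period */
	return 0;
}
#endif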
1890
1891#endif /* #else #ifndef CONFIG_SMP */
1892
1893#if !defined(CONFIG_RCU_FAST_NO_HZ)
1894
1895/*
1896 * Check to see if any future RCU-related work will need to be done
1897 * by the current CPU, even if none need be done immediately, returning
1898 * 1 if so.  This function is part of the RCU implementation; it is -not-
1899 * an exported member of the RCU API.
1900 *
1901 * Because we have preemptible RCU, just check whether this CPU needs
1902 * any flavor of RCU.  Do not chew up lots of CPU cycles with preemption
1903 * disabled in a most-likely vain attempt to cause RCU not to need this CPU.
1904 */
1905int rcu_needs_cpu(int cpu)
1906{
1907	return rcu_needs_cpu_quick_check(cpu);
1908}
1909
1910/*
1911 * Check to see if we need to continue a callback-flush operation to
1912 * allow the last CPU to enter dyntick-idle mode.  But fast dyntick-idle
1913 * entry is not configured, so we never need to.
1914 */
1915static void rcu_needs_cpu_flush(void)
1916{
1917}
1918
1919#else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */
1920
1921#define RCU_NEEDS_CPU_FLUSHES 5
1922static DEFINE_PER_CPU(int, rcu_dyntick_drain);
1923static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff);
1924
1925/*
1926 * Check to see if any future RCU-related work will need to be done
1927 * by the current CPU, even if none need be done immediately, returning
1928 * 1 if so.  This function is part of the RCU implementation; it is -not-
1929 * an exported member of the RCU API.
1930 *
1931 * Because we are not supporting preemptible RCU, attempt to accelerate
1932 * any current grace periods so that RCU no longer needs this CPU, but
1933 * only if all other CPUs are already in dynticks-idle mode.  This will
1934 * allow the CPU cores to be powered down immediately, as opposed to after
1935 * waiting many milliseconds for grace periods to elapse.
1936 *
1937 * Because it is not legal to invoke rcu_process_callbacks() with irqs
1938 * disabled, we do one pass of force_quiescent_state(), then do an
1939 * invoke_rcu_core() to cause rcu_process_callbacks() to be invoked
1940 * later.  The per-cpu rcu_dyntick_drain variable controls the sequencing.
1941 */
1942int rcu_needs_cpu(int cpu)
1943{
1944	int c = 0;
1945	int snap;
1946	int thatcpu;
1947
1948	/* Check for being in the holdoff period. */
1949	if (per_cpu(rcu_dyntick_holdoff, cpu) == jiffies)
1950		return rcu_needs_cpu_quick_check(cpu);
1951
1952	/* Don't bother unless we are the last non-dyntick-idle CPU. */
1953	for_each_online_cpu(thatcpu) {
1954		if (thatcpu == cpu)
1955			continue;
1956		snap = atomic_add_return(0, &per_cpu(rcu_dynticks,
1957						     thatcpu).dynticks);
1958		smp_mb(); /* Order sampling of snap with end of grace period. */
1959		if ((snap & 0x1) != 0) {
1960			per_cpu(rcu_dyntick_drain, cpu) = 0;
1961			per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1;
1962			return rcu_needs_cpu_quick_check(cpu);
1963		}
1964	}
1965
1966	/* Check and update the rcu_dyntick_drain sequencing. */
1967	if (per_cpu(rcu_dyntick_drain, cpu) <= 0) {
1968		/* First time through, initialize the counter. */
1969		per_cpu(rcu_dyntick_drain, cpu) = RCU_NEEDS_CPU_FLUSHES;
1970	} else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) {
1971		/* We have hit the limit, so time to give up. */
1972		per_cpu(rcu_dyntick_holdoff, cpu) = jiffies;
1973		return rcu_needs_cpu_quick_check(cpu);
1974	}
1975
1976	/* Do one step pushing remaining RCU callbacks through. */
1977	if (per_cpu(rcu_sched_data, cpu).nxtlist) {
1978		rcu_sched_qs(cpu);
1979		force_quiescent_state(&rcu_sched_state, 0);
1980		c = c || per_cpu(rcu_sched_data, cpu).nxtlist;
1981	}
1982	if (per_cpu(rcu_bh_data, cpu).nxtlist) {
1983		rcu_bh_qs(cpu);
1984		force_quiescent_state(&rcu_bh_state, 0);
1985		c = c || per_cpu(rcu_bh_data, cpu).nxtlist;
1986	}
1987
1988	/* If RCU callbacks are still pending, RCU still needs this CPU. */
1989	if (c)
1990		invoke_rcu_core();
1991	return c;
1992}
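
/*
 * Illustrative sketch, not part of the kernel build: the rcu_dyntick_drain/
 * rcu_dyntick_holdoff sequencing above, reduced to a tiny state machine.
 * The step() helper and the fixed jiffies value are made up; the point is
 * only that a CPU gets up to RCU_NEEDS_CPU_FLUSHES attempts to drain its
 * callbacks, then holds off for the remainder of the current jiffy.
 */
#if 0	/* user-space illustration only */
#include <stdio.h>

#define FLUSHES 5	/* stands in for RCU_NEEDS_CPU_FLUSHES */

static int drain;
static unsigned long holdoff = -1UL;

/* One call per rcu_needs_cpu() invocation; returns 1 while we keep trying. */
static int step(unsigned long jiffies)
{
	if (holdoff == jiffies)
		return 0;		/* in holdoff: fall back to the quick check */
	if (drain <= 0) {
		drain = FLUSHES;	/* first attempt during this idle entry */
	} else if (--drain <= 0) {
		holdoff = jiffies;	/* out of attempts: hold off this jiffy */
		return 0;
	}
	return 1;			/* keep pushing callbacks through */
}

int main(void)
{
	int i;

	for (i = 0; i < 8; i++)		/* attempts 0-4 print 1, the rest 0 */
		printf("attempt %d -> %d\n", i, step(100));
	return 0;
}
#endif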
1993
1994/*
1995 * Check to see if we need to continue a callback-flush operation to
1996 * allow the last CPU to enter dyntick-idle mode.
1997 */
1998static void rcu_needs_cpu_flush(void)
1999{
2000	int cpu = smp_processor_id();
2001	unsigned long flags;
2002
2003	if (per_cpu(rcu_dyntick_drain, cpu) <= 0)
2004		return;
2005	local_irq_save(flags);
2006	(void)rcu_needs_cpu(cpu);
2007	local_irq_restore(flags);
2008}
2009
2010#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */