Update to 3.4-final.
[linux-flexiantxendom0-3.2.10.git] / kernel / rcutree.c
index b579e4f..d0c5baf 100644
@@ -36,9 +36,9 @@
 #include <linux/interrupt.h>
 #include <linux/sched.h>
 #include <linux/nmi.h>
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 #include <linux/bitops.h>
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/completion.h>
 #include <linux/moduleparam.h>
 #include <linux/percpu.h>
 #include <linux/kernel_stat.h>
 #include <linux/wait.h>
 #include <linux/kthread.h>
+#include <linux/prefetch.h>
+#include <linux/delay.h>
+#include <linux/stop_machine.h>
 
 #include "rcutree.h"
+#include <trace/events/rcu.h>
+
+#include "rcu.h"
 
 /* Data structures. */
 
 static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];
 
 #define RCU_STATE_INITIALIZER(structname) { \
-       .level = { &structname.node[0] }, \
+       .level = { &structname##_state.node[0] }, \
        .levelcnt = { \
                NUM_RCU_LVL_0,  /* root of hierarchy. */ \
                NUM_RCU_LVL_1, \
@@ -65,28 +71,53 @@ static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];
                NUM_RCU_LVL_3, \
                NUM_RCU_LVL_4, /* == MAX_RCU_LVLS */ \
        }, \
-       .signaled = RCU_GP_IDLE, \
+       .fqs_state = RCU_GP_IDLE, \
        .gpnum = -300, \
        .completed = -300, \
-       .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname.onofflock), \
-       .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname.fqslock), \
+       .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.onofflock), \
+       .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.fqslock), \
        .n_force_qs = 0, \
        .n_force_qs_ngp = 0, \
        .name = #structname, \
 }
 
-struct rcu_state rcu_sched_state = RCU_STATE_INITIALIZER(rcu_sched_state);
+struct rcu_state rcu_sched_state = RCU_STATE_INITIALIZER(rcu_sched);
 DEFINE_PER_CPU(struct rcu_data, rcu_sched_data);
 
-struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state);
+struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh);
 DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
 
 static struct rcu_state *rcu_state;
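
The initializer change above switches the macro argument from the full variable name (e.g. rcu_sched_state) to the flavor name (rcu_sched), pasting "_state" back on with the ## operator so that the .name string no longer carries the suffix.  A minimal stand-alone sketch of the same token-pasting pattern (the demo_* and *_node0 names are made up for illustration, not kernel symbols):

#include <stdio.h>

struct demo_state {
	const char *name;	/* stringized flavor name, e.g. "rcu_sched" */
	int *node0;		/* points into the pasted *_state global    */
};

static int rcu_sched_state_node0;	/* stand-in for rcu_sched_state.node[0] */

#define DEMO_STATE_INITIALIZER(structname) {	\
	.name  = #structname,			\
	.node0 = &structname##_state_node0,	\
}

static struct demo_state demo = DEMO_STATE_INITIALIZER(rcu_sched);

int main(void)
{
	printf("name=%s node0=%p\n", demo.name, (void *)demo.node0);
	return 0;
}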
 
+/*
+ * The rcu_scheduler_active variable transitions from zero to one just
+ * before the first task is spawned.  So when this variable is zero, RCU
+ * can assume that there is but one task, allowing RCU to (for example)
+ * optimize synchronize_sched() to a simple barrier().  When this variable
+ * is one, RCU must actually do all the hard work required to detect real
+ * grace periods.  This variable is also used to suppress boot-time false
+ * positives from lockdep-RCU error checking.
+ */
 int rcu_scheduler_active __read_mostly;
 EXPORT_SYMBOL_GPL(rcu_scheduler_active);
 
 /*
+ * The rcu_scheduler_fully_active variable transitions from zero to one
+ * during the early_initcall() processing, which is after the scheduler
+ * is capable of creating new tasks.  So RCU processing (for example,
+ * creating tasks for RCU priority boosting) must be delayed until after
+ * rcu_scheduler_fully_active transitions from zero to one.  We also
+ * currently delay invocation of any RCU callbacks until after this point.
+ *
+ * It might later prove better for people registering RCU callbacks during
+ * early boot to take responsibility for these callbacks, but one step at
+ * a time.
+ */
+static int rcu_scheduler_fully_active __read_mostly;
+
+#ifdef CONFIG_RCU_BOOST
+
+/*
  * Control variables for per-CPU and per-rcu_node kthreads.  These
  * handle all flavors of RCU.
  */
@@ -94,14 +125,13 @@ static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task);
 DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
 DEFINE_PER_CPU(int, rcu_cpu_kthread_cpu);
 DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
-static DEFINE_PER_CPU(wait_queue_head_t, rcu_cpu_wq);
 DEFINE_PER_CPU(char, rcu_cpu_has_work);
-static char rcu_kthreads_spawnable;
 
-static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu);
-static void invoke_rcu_cpu_kthread(void);
+#endif /* #ifdef CONFIG_RCU_BOOST */
 
-#define RCU_KTHREAD_PRIO 1     /* RT priority for per-CPU kthreads. */
+static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu);
+static void invoke_rcu_core(void);
+static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp);
 
 /*
  * Track the rcutorture test sequence number and the update version
@@ -129,43 +159,50 @@ static int rcu_gp_in_progress(struct rcu_state *rsp)
  * Note a quiescent state.  Because we do not need to know
  * how many quiescent states passed, just if there was at least
  * one since the start of the grace period, this just sets a flag.
+ * The caller must have disabled preemption.
  */
 void rcu_sched_qs(int cpu)
 {
        struct rcu_data *rdp = &per_cpu(rcu_sched_data, cpu);
 
-       rdp->passed_quiesc_completed = rdp->gpnum - 1;
+       rdp->passed_quiesce_gpnum = rdp->gpnum;
        barrier();
-       rdp->passed_quiesc = 1;
+       if (rdp->passed_quiesce == 0)
+               trace_rcu_grace_period("rcu_sched", rdp->gpnum, "cpuqs");
+       rdp->passed_quiesce = 1;
 }
 
 void rcu_bh_qs(int cpu)
 {
        struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu);
 
-       rdp->passed_quiesc_completed = rdp->gpnum - 1;
+       rdp->passed_quiesce_gpnum = rdp->gpnum;
        barrier();
-       rdp->passed_quiesc = 1;
+       if (rdp->passed_quiesce == 0)
+               trace_rcu_grace_period("rcu_bh", rdp->gpnum, "cpuqs");
+       rdp->passed_quiesce = 1;
 }
 
 /*
  * Note a context switch.  This is a quiescent state for RCU-sched,
  * and requires special handling for preemptible RCU.
+ * The caller must have disabled preemption.
  */
 void rcu_note_context_switch(int cpu)
 {
+       trace_rcu_utilization("Start context switch");
        rcu_sched_qs(cpu);
        rcu_preempt_note_context_switch(cpu);
+       trace_rcu_utilization("End context switch");
 }
+EXPORT_SYMBOL_GPL(rcu_note_context_switch);
 
-#ifdef CONFIG_NO_HZ
 DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
-       .dynticks_nesting = 1,
+       .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE,
        .dynticks = ATOMIC_INIT(1),
 };
-#endif /* #ifdef CONFIG_NO_HZ */
 
-static int blimit = 10;                /* Maximum callbacks per softirq. */
+static int blimit = 10;                /* Maximum callbacks per rcu_do_batch. */
 static int qhimark = 10000;    /* If this many pending, ignore blimit. */
 static int qlowmark = 100;     /* Once only this many pending, use blimit. */
 
@@ -173,8 +210,11 @@ module_param(blimit, int, 0);
 module_param(qhimark, int, 0);
 module_param(qlowmark, int, 0);
 
-int rcu_cpu_stall_suppress __read_mostly;
+int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */
+int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT;
+
 module_param(rcu_cpu_stall_suppress, int, 0644);
+module_param(rcu_cpu_stall_timeout, int, 0644);
 
 static void force_quiescent_state(struct rcu_state *rsp, int relaxed);
 static int rcu_pending(int cpu);
@@ -266,8 +306,6 @@ static struct rcu_node *rcu_get_root(struct rcu_state *rsp)
        return &rsp->node[0];
 }
 
-#ifdef CONFIG_SMP
-
 /*
  * If the specified CPU is offline, tell the caller that it is in
  * a quiescent state.  Otherwise, whack it with a reschedule IPI.
@@ -282,87 +320,216 @@ static struct rcu_node *rcu_get_root(struct rcu_state *rsp)
 static int rcu_implicit_offline_qs(struct rcu_data *rdp)
 {
        /*
-        * If the CPU is offline, it is in a quiescent state.  We can
-        * trust its state not to change because interrupts are disabled.
+        * If the CPU is offline for more than a jiffy, it is in a quiescent
+        * state.  We can trust its state not to change because interrupts
+        * are disabled.  The reason for the jiffy's worth of slack is to
+        * handle CPUs initializing on the way up and finding their way
+        * to the idle loop on the way down.
         */
-       if (cpu_is_offline(rdp->cpu)) {
+       if (cpu_is_offline(rdp->cpu) &&
+           ULONG_CMP_LT(rdp->rsp->gp_start + 2, jiffies)) {
+               trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, "ofl");
                rdp->offline_fqs++;
                return 1;
        }
-
-       /* If preemptible RCU, no point in sending reschedule IPI. */
-       if (rdp->preemptible)
-               return 0;
-
-       /* The CPU is online, so send it a reschedule IPI. */
-       if (rdp->cpu != smp_processor_id())
-               smp_send_reschedule(rdp->cpu);
-       else
-               set_need_resched();
-       rdp->resched_ipi++;
        return 0;
 }
 
-#endif /* #ifdef CONFIG_SMP */
+/*
+ * rcu_idle_enter_common - inform RCU that current CPU is moving towards idle
+ *
+ * If the new value of the ->dynticks_nesting counter now is zero,
+ * we really have entered idle, and must do the appropriate accounting.
+ * The caller must have disabled interrupts.
+ */
+static void rcu_idle_enter_common(struct rcu_dynticks *rdtp, long long oldval)
+{
+       trace_rcu_dyntick("Start", oldval, 0);
+       if (!is_idle_task(current)) {
+               struct task_struct *idle = idle_task(smp_processor_id());
 
-#ifdef CONFIG_NO_HZ
+               trace_rcu_dyntick("Error on entry: not idle task", oldval, 0);
+               ftrace_dump(DUMP_ALL);
+               WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
+                         current->pid, current->comm,
+                         idle->pid, idle->comm); /* must be idle task! */
+       }
+       rcu_prepare_for_idle(smp_processor_id());
+       /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
+       smp_mb__before_atomic_inc();  /* See above. */
+       atomic_inc(&rdtp->dynticks);
+       smp_mb__after_atomic_inc();  /* Force ordering with next sojourn. */
+       WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
+
+       /*
+        * The idle task is not permitted to enter the idle loop while
+        * in an RCU read-side critical section.
+        */
+       rcu_lockdep_assert(!lock_is_held(&rcu_lock_map),
+                          "Illegal idle entry in RCU read-side critical section.");
+       rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map),
+                          "Illegal idle entry in RCU-bh read-side critical section.");
+       rcu_lockdep_assert(!lock_is_held(&rcu_sched_lock_map),
+                          "Illegal idle entry in RCU-sched read-side critical section.");
+}
 
 /**
- * rcu_enter_nohz - inform RCU that current CPU is entering nohz
+ * rcu_idle_enter - inform RCU that current CPU is entering idle
  *
- * Enter nohz mode, in other words, -leave- the mode in which RCU
+ * Enter idle mode, in other words, -leave- the mode in which RCU
  * read-side critical sections can occur.  (Though RCU read-side
- * critical sections can occur in irq handlers in nohz mode, a possibility
- * handled by rcu_irq_enter() and rcu_irq_exit()).
+ * critical sections can occur in irq handlers in idle, a possibility
+ * handled by irq_enter() and irq_exit().)
+ *
+ * We crowbar the ->dynticks_nesting field to zero to allow for
+ * the possibility of usermode upcalls having messed up our count
+ * of interrupt nesting level during the prior busy period.
  */
-void rcu_enter_nohz(void)
+void rcu_idle_enter(void)
 {
        unsigned long flags;
+       long long oldval;
        struct rcu_dynticks *rdtp;
 
        local_irq_save(flags);
        rdtp = &__get_cpu_var(rcu_dynticks);
-       if (--rdtp->dynticks_nesting) {
-               local_irq_restore(flags);
-               return;
-       }
-       /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
-       smp_mb__before_atomic_inc();  /* See above. */
-       atomic_inc(&rdtp->dynticks);
-       smp_mb__after_atomic_inc();  /* Force ordering with next sojourn. */
-       WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
+       oldval = rdtp->dynticks_nesting;
+       WARN_ON_ONCE((oldval & DYNTICK_TASK_NEST_MASK) == 0);
+       if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE)
+               rdtp->dynticks_nesting = 0;
+       else
+               rdtp->dynticks_nesting -= DYNTICK_TASK_NEST_VALUE;
+       rcu_idle_enter_common(rdtp, oldval);
        local_irq_restore(flags);
-
-       /* If the interrupt queued a callback, get out of dyntick mode. */
-       if (in_irq() &&
-           (__get_cpu_var(rcu_sched_data).nxtlist ||
-            __get_cpu_var(rcu_bh_data).nxtlist ||
-            rcu_preempt_needs_cpu(smp_processor_id())))
-               set_need_resched();
 }
+EXPORT_SYMBOL_GPL(rcu_idle_enter);
 
-/*
- * rcu_exit_nohz - inform RCU that current CPU is leaving nohz
+/**
+ * rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle
+ *
+ * Exit from an interrupt handler, which might possibly result in entering
+ * idle mode, in other words, leaving the mode in which read-side critical
+ * sections can occur.
+ *
+ * This code assumes that the idle loop never does anything that might
+ * result in unbalanced calls to irq_enter() and irq_exit().  If your
+ * architecture violates this assumption, RCU will give you what you
+ * deserve, good and hard.  But very infrequently and irreproducibly.
+ *
+ * Use things like work queues to work around this limitation.
  *
- * Exit nohz mode, in other words, -enter- the mode in which RCU
- * read-side critical sections normally occur.
+ * You have been warned.
  */
-void rcu_exit_nohz(void)
+void rcu_irq_exit(void)
 {
        unsigned long flags;
+       long long oldval;
        struct rcu_dynticks *rdtp;
 
        local_irq_save(flags);
        rdtp = &__get_cpu_var(rcu_dynticks);
-       if (rdtp->dynticks_nesting++) {
-               local_irq_restore(flags);
-               return;
-       }
+       oldval = rdtp->dynticks_nesting;
+       rdtp->dynticks_nesting--;
+       WARN_ON_ONCE(rdtp->dynticks_nesting < 0);
+       if (rdtp->dynticks_nesting)
+               trace_rcu_dyntick("--=", oldval, rdtp->dynticks_nesting);
+       else
+               rcu_idle_enter_common(rdtp, oldval);
+       local_irq_restore(flags);
+}
+
+/*
+ * rcu_idle_exit_common - inform RCU that current CPU is moving away from idle
+ *
+ * If the new value of the ->dynticks_nesting counter was previously zero,
+ * we really have exited idle, and must do the appropriate accounting.
+ * The caller must have disabled interrupts.
+ */
+static void rcu_idle_exit_common(struct rcu_dynticks *rdtp, long long oldval)
+{
        smp_mb__before_atomic_inc();  /* Force ordering w/previous sojourn. */
        atomic_inc(&rdtp->dynticks);
        /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */
        smp_mb__after_atomic_inc();  /* See above. */
        WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
+       rcu_cleanup_after_idle(smp_processor_id());
+       trace_rcu_dyntick("End", oldval, rdtp->dynticks_nesting);
+       if (!is_idle_task(current)) {
+               struct task_struct *idle = idle_task(smp_processor_id());
+
+               trace_rcu_dyntick("Error on exit: not idle task",
+                                 oldval, rdtp->dynticks_nesting);
+               ftrace_dump(DUMP_ALL);
+               WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
+                         current->pid, current->comm,
+                         idle->pid, idle->comm); /* must be idle task! */
+       }
+}
+
+/**
+ * rcu_idle_exit - inform RCU that current CPU is leaving idle
+ *
+ * Exit idle mode, in other words, -enter- the mode in which RCU
+ * read-side critical sections can occur.
+ *
+ * We crowbar the ->dynticks_nesting field to DYNTICK_TASK_NEST to
+ * allow for the possibility of usermode upcalls messing up our count
+ * of interrupt nesting level during the busy period that is just
+ * now starting.
+ */
+void rcu_idle_exit(void)
+{
+       unsigned long flags;
+       struct rcu_dynticks *rdtp;
+       long long oldval;
+
+       local_irq_save(flags);
+       rdtp = &__get_cpu_var(rcu_dynticks);
+       oldval = rdtp->dynticks_nesting;
+       WARN_ON_ONCE(oldval < 0);
+       if (oldval & DYNTICK_TASK_NEST_MASK)
+               rdtp->dynticks_nesting += DYNTICK_TASK_NEST_VALUE;
+       else
+               rdtp->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
+       rcu_idle_exit_common(rdtp, oldval);
+       local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(rcu_idle_exit);
+
+/**
+ * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle
+ *
+ * Enter an interrupt handler, which might possibly result in exiting
+ * idle mode, in other words, entering the mode in which read-side critical
+ * sections can occur.
+ *
+ * Note that the Linux kernel is fully capable of entering an interrupt
+ * handler that it never exits, for example when doing upcalls to
+ * user mode!  This code assumes that the idle loop never does upcalls to
+ * user mode.  If your architecture does do upcalls from the idle loop (or
+ * does anything else that results in unbalanced calls to the irq_enter()
+ * and irq_exit() functions), RCU will give you what you deserve, good
+ * and hard.  But very infrequently and irreproducibly.
+ *
+ * Use things like work queues to work around this limitation.
+ *
+ * You have been warned.
+ */
+void rcu_irq_enter(void)
+{
+       unsigned long flags;
+       struct rcu_dynticks *rdtp;
+       long long oldval;
+
+       local_irq_save(flags);
+       rdtp = &__get_cpu_var(rcu_dynticks);
+       oldval = rdtp->dynticks_nesting;
+       rdtp->dynticks_nesting++;
+       WARN_ON_ONCE(rdtp->dynticks_nesting == 0);
+       if (oldval)
+               trace_rcu_dyntick("++=", oldval, rdtp->dynticks_nesting);
+       else
+               rcu_idle_exit_common(rdtp, oldval);
        local_irq_restore(flags);
 }
 
@@ -409,30 +576,81 @@ void rcu_nmi_exit(void)
        WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
 }
 
+#ifdef CONFIG_PROVE_RCU
+
 /**
- * rcu_irq_enter - inform RCU of entry to hard irq context
+ * rcu_is_cpu_idle - see if RCU thinks that the current CPU is idle
  *
- * If the CPU was idle with dynamic ticks active, this updates the
- * rdtp->dynticks to let the RCU handling know that the CPU is active.
+ * If the current CPU is in its idle loop and is neither in an interrupt
+ * nor an NMI handler, return true.
  */
-void rcu_irq_enter(void)
+int rcu_is_cpu_idle(void)
 {
-       rcu_exit_nohz();
+       int ret;
+
+       preempt_disable();
+       ret = (atomic_read(&__get_cpu_var(rcu_dynticks).dynticks) & 0x1) == 0;
+       preempt_enable();
+       return ret;
 }
+EXPORT_SYMBOL(rcu_is_cpu_idle);
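
rcu_is_cpu_idle() relies on the parity convention for ->dynticks noted in the WARN_ON_ONCE() checks above: the counter is incremented on every transition into and out of idle, so an even value means "idle, no RCU readers possible" and an odd value means "non-idle".  A stand-alone user-space sketch of that convention (the model_* names are illustrative only):

#include <stdio.h>

static unsigned int dynticks = 1;	/* odd: the CPU starts out non-idle */

static int model_is_cpu_idle(void)
{
	return (dynticks & 0x1) == 0;	/* same test as rcu_is_cpu_idle() */
}

int main(void)
{
	printf("running: idle=%d\n", model_is_cpu_idle());	/* 0 */
	dynticks++;			/* rcu_idle_enter_common() */
	printf("idle:    idle=%d\n", model_is_cpu_idle());	/* 1 */
	dynticks++;			/* rcu_idle_exit_common()  */
	printf("running: idle=%d\n", model_is_cpu_idle());	/* 0 */
	return 0;
}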
 
-/**
- * rcu_irq_exit - inform RCU of exit from hard irq context
+#ifdef CONFIG_HOTPLUG_CPU
+
+/*
+ * Is the current CPU online?  Disable preemption to avoid false positives
+ * that could otherwise happen due to the current CPU number being sampled,
+ * this task being preempted, its old CPU being taken offline, resuming
+ * on some other CPU, then determining that its old CPU is now offline.
+ * It is OK to use RCU on an offline processor during initial boot, hence
+ * the check for rcu_scheduler_fully_active.  Note also that it is OK
+ * for a CPU coming online to use RCU for one jiffy prior to marking itself
+ * online in the cpu_online_mask.  Similarly, it is OK for a CPU going
+ * offline to continue to use RCU for one jiffy after marking itself
+ * offline in the cpu_online_mask.  This leniency is necessary given the
+ * non-atomic nature of the online and offline processing, for example,
+ * the fact that a CPU enters the scheduler after completing the CPU_DYING
+ * notifiers.
+ *
+ * This is also why RCU internally marks CPUs online during the
+ * CPU_UP_PREPARE phase and offline during the CPU_DEAD phase.
  *
- * If the CPU was idle with dynamic ticks active, update the rdp->dynticks
- * to put let the RCU handling be aware that the CPU is going back to idle
- * with no ticks.
+ * Disable checking if in an NMI handler because we cannot safely report
+ * errors from NMI handlers anyway.
  */
-void rcu_irq_exit(void)
+bool rcu_lockdep_current_cpu_online(void)
 {
-       rcu_enter_nohz();
+       struct rcu_data *rdp;
+       struct rcu_node *rnp;
+       bool ret;
+
+       if (in_nmi())
+               return 1;
+       preempt_disable();
+       rdp = &__get_cpu_var(rcu_sched_data);
+       rnp = rdp->mynode;
+       ret = (rdp->grpmask & rnp->qsmaskinit) ||
+             !rcu_scheduler_fully_active;
+       preempt_enable();
+       return ret;
 }
+EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online);
+
+#endif /* #ifdef CONFIG_HOTPLUG_CPU */
 
-#ifdef CONFIG_SMP
+#endif /* #ifdef CONFIG_PROVE_RCU */
+
+/**
+ * rcu_is_cpu_rrupt_from_idle - see if idle or immediately interrupted from idle
+ *
+ * If the current CPU is idle or running at a first-level (not nested)
+ * interrupt from idle, return true.  The caller must have at least
+ * disabled preemption.
+ */
+int rcu_is_cpu_rrupt_from_idle(void)
+{
+       return __get_cpu_var(rcu_dynticks).dynticks_nesting <= 1;
+}
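
rcu_is_cpu_rrupt_from_idle() and the idle-entry/exit paths above all key off ->dynticks_nesting: process-level idle entry drops it to zero, irq entry and exit increment and decrement it, and "idle or first-level interrupt from idle" is simply nesting <= 1.  A minimal user-space model of that bookkeeping, ignoring the DYNTICK_TASK_* bit encoding the real code uses (model_* names are illustrative):

#include <stdio.h>

static long long nesting = 1LL << 62;	/* large task-level value, like DYNTICK_TASK_EXIT_IDLE */

static void model_idle_enter(void) { nesting = 0; }	/* rcu_idle_enter() */
static void model_irq_enter(void)  { nesting++; }	/* rcu_irq_enter()  */
static void model_irq_exit(void)   { nesting--; }	/* rcu_irq_exit()   */
static int  model_rrupt_from_idle(void) { return nesting <= 1; }

int main(void)
{
	printf("running task:  %d\n", model_rrupt_from_idle());	/* 0 */
	model_idle_enter();		/* CPU enters the idle loop   */
	printf("idle loop:     %d\n", model_rrupt_from_idle());	/* 1 */
	model_irq_enter();		/* scheduling-clock interrupt */
	printf("irq from idle: %d\n", model_rrupt_from_idle());	/* 1 */
	model_irq_enter();		/* nested interrupt           */
	printf("nested irq:    %d\n", model_rrupt_from_idle());	/* 0 */
	model_irq_exit();
	model_irq_exit();
	return 0;
}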
 
 /*
  * Snapshot the specified CPU's dynticks counter so that we can later
@@ -442,7 +660,7 @@ void rcu_irq_exit(void)
 static int dyntick_save_progress_counter(struct rcu_data *rdp)
 {
        rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks);
-       return 0;
+       return (rdp->dynticks_snap & 0x1) == 0;
 }
 
 /*
@@ -453,11 +671,11 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp)
  */
 static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
 {
-       unsigned long curr;
-       unsigned long snap;
+       unsigned int curr;
+       unsigned int snap;
 
-       curr = (unsigned long)atomic_add_return(0, &rdp->dynticks->dynticks);
-       snap = (unsigned long)rdp->dynticks_snap;
+       curr = (unsigned int)atomic_add_return(0, &rdp->dynticks->dynticks);
+       snap = (unsigned int)rdp->dynticks_snap;
 
        /*
         * If the CPU passed through or entered a dynticks idle phase with
@@ -467,7 +685,8 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
         * read-side critical section that started before the beginning
         * of the current RCU grace period.
         */
-       if ((curr & 0x1) == 0 || ULONG_CMP_GE(curr, snap + 2)) {
+       if ((curr & 0x1) == 0 || UINT_CMP_GE(curr, snap + 2)) {
+               trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, "dti");
                rdp->dynticks_fqs++;
                return 1;
        }
@@ -476,32 +695,28 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
        return rcu_implicit_offline_qs(rdp);
 }
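
The test above is the "passed through idle" heuristic used by force_quiescent_state(): either the counter is even right now (the CPU is idle), or it has advanced by at least two since the snapshot, meaning a complete idle sojourn happened in between; either way the CPU cannot still be in a pre-existing RCU read-side critical section.  A small user-space illustration, with UINT_CMP_GE() written out as assumed from the kernel headers of this era:

#include <limits.h>
#include <stdio.h>

#define UINT_CMP_GE(a, b)	(UINT_MAX / 2 >= (a) - (b))	/* wrap-safe >= */

/* Return 1 if the CPU is idle, or passed through idle, since the snapshot. */
static int passed_through_idle(unsigned int snap, unsigned int curr)
{
	return (curr & 0x1) == 0 || UINT_CMP_GE(curr, snap + 2);
}

int main(void)
{
	printf("%d\n", passed_through_idle(3, 3));	/* odd, unchanged: 0       */
	printf("%d\n", passed_through_idle(3, 4));	/* even, idle right now: 1 */
	printf("%d\n", passed_through_idle(3, 5));	/* full idle sojourn: 1    */
	return 0;
}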
 
-#endif /* #ifdef CONFIG_SMP */
-
-#else /* #ifdef CONFIG_NO_HZ */
-
-#ifdef CONFIG_SMP
-
-static int dyntick_save_progress_counter(struct rcu_data *rdp)
+static int jiffies_till_stall_check(void)
 {
-       return 0;
-}
+       int till_stall_check = ACCESS_ONCE(rcu_cpu_stall_timeout);
 
-static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
-{
-       return rcu_implicit_offline_qs(rdp);
+       /*
+        * Limit check must be consistent with the Kconfig limits
+        * for CONFIG_RCU_CPU_STALL_TIMEOUT.
+        */
+       if (till_stall_check < 3) {
+               ACCESS_ONCE(rcu_cpu_stall_timeout) = 3;
+               till_stall_check = 3;
+       } else if (till_stall_check > 300) {
+               ACCESS_ONCE(rcu_cpu_stall_timeout) = 300;
+               till_stall_check = 300;
+       }
+       return till_stall_check * HZ + RCU_STALL_DELAY_DELTA;
 }
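
jiffies_till_stall_check() converts the rcu_cpu_stall_timeout module parameter (settable at runtime via sysfs) from seconds to jiffies, clamping it into the same 3..300 range that Kconfig enforces.  A stand-alone sketch of the clamping, with HZ and RCU_STALL_DELAY_DELTA replaced by assumed example values:

#include <stdio.h>

#define HZ 1000				/* assumed tick rate for the example */
#define RCU_STALL_DELAY_DELTA 0		/* assumed: no debug-config slack    */

static int model_till_stall_check(int timeout_sec)
{
	if (timeout_sec < 3)
		timeout_sec = 3;
	else if (timeout_sec > 300)
		timeout_sec = 300;
	return timeout_sec * HZ + RCU_STALL_DELAY_DELTA;
}

int main(void)
{
	printf("%d\n", model_till_stall_check(60));	/* 60000 jiffies     */
	printf("%d\n", model_till_stall_check(999));	/* clamped to 300000 */
	printf("%d\n", model_till_stall_check(0));	/* clamped up to 3000 */
	return 0;
}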
 
-#endif /* #ifdef CONFIG_SMP */
-
-#endif /* #else #ifdef CONFIG_NO_HZ */
-
-int rcu_cpu_stall_suppress __read_mostly;
-
 static void record_gp_stall_check_time(struct rcu_state *rsp)
 {
        rsp->gp_start = jiffies;
-       rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_CHECK;
+       rsp->jiffies_stall = jiffies + jiffies_till_stall_check();
 }
 
 static void print_other_cpu_stall(struct rcu_state *rsp)
@@ -509,6 +724,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
        int cpu;
        long delta;
        unsigned long flags;
+       int ndetected = 0;
        struct rcu_node *rnp = rcu_get_root(rsp);
 
        /* Only let one CPU complain about others per time interval. */
@@ -519,13 +735,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
                raw_spin_unlock_irqrestore(&rnp->lock, flags);
                return;
        }
-       rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
-
-       /*
-        * Now rat on any tasks that got kicked up to the root rcu_node
-        * due to CPU offlining.
-        */
-       rcu_print_task_stall(rnp);
+       rsp->jiffies_stall = jiffies + 3 * jiffies_till_stall_check() + 3;
        raw_spin_unlock_irqrestore(&rnp->lock, flags);
 
        /*
@@ -533,21 +743,38 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
         * See Documentation/RCU/stallwarn.txt for info on how to debug
         * RCU CPU stall warnings.
         */
-       printk(KERN_ERR "INFO: %s detected stalls on CPUs/tasks: {",
+       printk(KERN_ERR "INFO: %s detected stalls on CPUs/tasks:",
               rsp->name);
+       print_cpu_stall_info_begin();
        rcu_for_each_leaf_node(rsp, rnp) {
                raw_spin_lock_irqsave(&rnp->lock, flags);
-               rcu_print_task_stall(rnp);
+               ndetected += rcu_print_task_stall(rnp);
                raw_spin_unlock_irqrestore(&rnp->lock, flags);
                if (rnp->qsmask == 0)
                        continue;
                for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++)
-                       if (rnp->qsmask & (1UL << cpu))
-                               printk(" %d", rnp->grplo + cpu);
+                       if (rnp->qsmask & (1UL << cpu)) {
+                               print_cpu_stall_info(rsp, rnp->grplo + cpu);
+                               ndetected++;
+                       }
        }
-       printk("} (detected by %d, t=%ld jiffies)\n",
+
+       /*
+        * Now rat on any tasks that got kicked up to the root rcu_node
+        * due to CPU offlining.
+        */
+       rnp = rcu_get_root(rsp);
+       raw_spin_lock_irqsave(&rnp->lock, flags);
+       ndetected += rcu_print_task_stall(rnp);
+       raw_spin_unlock_irqrestore(&rnp->lock, flags);
+
+       print_cpu_stall_info_end();
+       printk(KERN_CONT "(detected by %d, t=%ld jiffies)\n",
               smp_processor_id(), (long)(jiffies - rsp->gp_start));
-       trigger_all_cpu_backtrace();
+       if (ndetected == 0)
+               printk(KERN_ERR "INFO: Stall ended before state dump start\n");
+       else if (!trigger_all_cpu_backtrace())
+               dump_stack();
 
        /* If so configured, complain about tasks blocking the grace period. */
 
@@ -566,14 +793,18 @@ static void print_cpu_stall(struct rcu_state *rsp)
         * See Documentation/RCU/stallwarn.txt for info on how to debug
         * RCU CPU stall warnings.
         */
-       printk(KERN_ERR "INFO: %s detected stall on CPU %d (t=%lu jiffies)\n",
-              rsp->name, smp_processor_id(), jiffies - rsp->gp_start);
-       trigger_all_cpu_backtrace();
+       printk(KERN_ERR "INFO: %s self-detected stall on CPU", rsp->name);
+       print_cpu_stall_info_begin();
+       print_cpu_stall_info(rsp, smp_processor_id());
+       print_cpu_stall_info_end();
+       printk(KERN_CONT " (t=%lu jiffies)\n", jiffies - rsp->gp_start);
+       if (!trigger_all_cpu_backtrace())
+               dump_stack();
 
        raw_spin_lock_irqsave(&rnp->lock, flags);
        if (ULONG_CMP_GE(jiffies, rsp->jiffies_stall))
-               rsp->jiffies_stall =
-                       jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
+               rsp->jiffies_stall = jiffies +
+                                    3 * jiffies_till_stall_check() + 3;
        raw_spin_unlock_irqrestore(&rnp->lock, flags);
 
        set_need_resched();  /* kick ourselves to get things going. */
@@ -581,21 +812,24 @@ static void print_cpu_stall(struct rcu_state *rsp)
 
 static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
 {
-       long delta;
+       unsigned long j;
+       unsigned long js;
        struct rcu_node *rnp;
 
        if (rcu_cpu_stall_suppress)
                return;
-       delta = jiffies - ACCESS_ONCE(rsp->jiffies_stall);
+       j = ACCESS_ONCE(jiffies);
+       js = ACCESS_ONCE(rsp->jiffies_stall);
        rnp = rdp->mynode;
-       if ((ACCESS_ONCE(rnp->qsmask) & rdp->grpmask) && delta >= 0) {
+       if ((ACCESS_ONCE(rnp->qsmask) & rdp->grpmask) && ULONG_CMP_GE(j, js)) {
 
                /* We haven't checked in, so go dump stack. */
                print_cpu_stall(rsp);
 
-       } else if (rcu_gp_in_progress(rsp) && delta >= RCU_STALL_RAT_DELAY) {
+       } else if (rcu_gp_in_progress(rsp) &&
+                  ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY)) {
 
-               /* They had two time units to dump stack, so complain. */
+               /* They had a few time units to dump stack, so complain. */
                print_other_cpu_stall(rsp);
        }
 }
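
check_cpu_stall() now compares the current jiffies value against the recorded stall deadline with ULONG_CMP_GE() instead of a signed delta, so the test remains meaningful when the counters wrap.  A stand-alone sketch of the idiom; the macro body mirrors the kernel definition and is reproduced here as an assumption:

#include <limits.h>
#include <stdio.h>

/* "a is at or past b" on a circle of size ULONG_MAX + 1, i.e. a - b is small. */
#define ULONG_CMP_GE(a, b)	(ULONG_MAX / 2 >= (a) - (b))

int main(void)
{
	unsigned long js = ULONG_MAX - 5;	/* stall deadline, just before wrap */
	unsigned long j  = 10UL;		/* current jiffies, after the wrap  */

	printf("deadline reached? %d\n", ULONG_CMP_GE(j, js));	/* 1 */
	printf("deadline reached? %d\n", ULONG_CMP_GE(js, j));	/* 0 */
	return 0;
}
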
@@ -647,11 +881,13 @@ static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct
                 * go looking for one.
                 */
                rdp->gpnum = rnp->gpnum;
+               trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpustart");
                if (rnp->qsmask & rdp->grpmask) {
                        rdp->qs_pending = 1;
-                       rdp->passed_quiesc = 0;
+                       rdp->passed_quiesce = 0;
                } else
                        rdp->qs_pending = 0;
+               zero_cpu_stall_ticks(rdp);
        }
 }
 
@@ -710,6 +946,7 @@ __rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat
 
                /* Remember that we saw this grace-period completion. */
                rdp->completed = rnp->completed;
+               trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpuend");
 
                /*
                 * If we were in an extended quiescent state, we may have
@@ -787,6 +1024,10 @@ rcu_start_gp_per_cpu(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat
  * in preparation for detecting the next grace period.  The caller must hold
  * the root node's ->lock, which is released before return.  Hard irqs must
  * be disabled.
+ *
+ * Note that it is legal for a dying CPU (which is marked as offline) to
+ * invoke this function.  This can happen when the dying CPU reports its
+ * quiescent state.
  */
 static void
 rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
@@ -795,52 +1036,37 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
        struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
        struct rcu_node *rnp = rcu_get_root(rsp);
 
-       if (!cpu_needs_another_gp(rsp, rdp) || rsp->fqs_active) {
-               if (cpu_needs_another_gp(rsp, rdp))
-                       rsp->fqs_need_gp = 1;
-               if (rnp->completed == rsp->completed) {
-                       raw_spin_unlock_irqrestore(&rnp->lock, flags);
-                       return;
-               }
-               raw_spin_unlock(&rnp->lock);     /* irqs remain disabled. */
+       if (!rcu_scheduler_fully_active ||
+           !cpu_needs_another_gp(rsp, rdp)) {
+               /*
+                * Either the scheduler hasn't yet spawned the first
+                * non-idle task or this CPU does not need another
+                * grace period.  Either way, don't start a new grace
+                * period.
+                */
+               raw_spin_unlock_irqrestore(&rnp->lock, flags);
+               return;
+       }
 
+       if (rsp->fqs_active) {
                /*
-                * Propagate new ->completed value to rcu_node structures
-                * so that other CPUs don't have to wait until the start
-                * of the next grace period to process their callbacks.
+                * This CPU needs a grace period, but force_quiescent_state()
+                * is running.  Tell it to start one on this CPU's behalf.
                 */
-               rcu_for_each_node_breadth_first(rsp, rnp) {
-                       raw_spin_lock(&rnp->lock); /* irqs already disabled. */
-                       rnp->completed = rsp->completed;
-                       raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
-               }
-               local_irq_restore(flags);
+               rsp->fqs_need_gp = 1;
+               raw_spin_unlock_irqrestore(&rnp->lock, flags);
                return;
        }
 
        /* Advance to a new grace period and initialize state. */
        rsp->gpnum++;
-       WARN_ON_ONCE(rsp->signaled == RCU_GP_INIT);
-       rsp->signaled = RCU_GP_INIT; /* Hold off force_quiescent_state. */
+       trace_rcu_grace_period(rsp->name, rsp->gpnum, "start");
+       WARN_ON_ONCE(rsp->fqs_state == RCU_GP_INIT);
+       rsp->fqs_state = RCU_GP_INIT; /* Hold off force_quiescent_state. */
        rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
        record_gp_stall_check_time(rsp);
-
-       /* Special-case the common single-level case. */
-       if (NUM_RCU_NODES == 1) {
-               rcu_preempt_check_blocked_tasks(rnp);
-               rnp->qsmask = rnp->qsmaskinit;
-               rnp->gpnum = rsp->gpnum;
-               rnp->completed = rsp->completed;
-               rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */
-               rcu_start_gp_per_cpu(rsp, rnp, rdp);
-               rcu_preempt_boost_start_gp(rnp);
-               raw_spin_unlock_irqrestore(&rnp->lock, flags);
-               return;
-       }
-
        raw_spin_unlock(&rnp->lock);  /* leave irqs disabled. */
 
-
        /* Exclude any concurrent CPU-hotplug operations. */
        raw_spin_lock(&rsp->onofflock);  /* irqs already disabled. */
 
@@ -870,12 +1096,15 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
                if (rnp == rdp->mynode)
                        rcu_start_gp_per_cpu(rsp, rnp, rdp);
                rcu_preempt_boost_start_gp(rnp);
+               trace_rcu_grace_period_init(rsp->name, rnp->gpnum,
+                                           rnp->level, rnp->grplo,
+                                           rnp->grphi, rnp->qsmask);
                raw_spin_unlock(&rnp->lock);    /* irqs remain disabled. */
        }
 
        rnp = rcu_get_root(rsp);
        raw_spin_lock(&rnp->lock);              /* irqs already disabled. */
-       rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */
+       rsp->fqs_state = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */
        raw_spin_unlock(&rnp->lock);            /* irqs remain disabled. */
        raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
 }
@@ -891,6 +1120,8 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
        __releases(rcu_get_root(rsp)->lock)
 {
        unsigned long gp_duration;
+       struct rcu_node *rnp = rcu_get_root(rsp);
+       struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
 
        WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
 
@@ -902,8 +1133,42 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
        gp_duration = jiffies - rsp->gp_start;
        if (gp_duration > rsp->gp_max)
                rsp->gp_max = gp_duration;
-       rsp->completed = rsp->gpnum;
-       rsp->signaled = RCU_GP_IDLE;
+
+       /*
+        * We know the grace period is complete, but to everyone else
+        * it appears to still be ongoing.  But it is also the case
+        * that to everyone else it looks like there is nothing that
+        * they can do to advance the grace period.  It is therefore
+        * safe for us to drop the lock in order to mark the grace
+        * period as completed in all of the rcu_node structures.
+        *
+        * But if this CPU needs another grace period, it will take
+        * care of this while initializing the next grace period.
+        * We use RCU_WAIT_TAIL instead of the usual RCU_DONE_TAIL
+        * because the callbacks have not yet been advanced: Those
+        * callbacks are waiting on the grace period that just now
+        * completed.
+        */
+       if (*rdp->nxttail[RCU_WAIT_TAIL] == NULL) {
+               raw_spin_unlock(&rnp->lock);     /* irqs remain disabled. */
+
+               /*
+                * Propagate new ->completed value to rcu_node structures
+                * so that other CPUs don't have to wait until the start
+                * of the next grace period to process their callbacks.
+                */
+               rcu_for_each_node_breadth_first(rsp, rnp) {
+                       raw_spin_lock(&rnp->lock); /* irqs already disabled. */
+                       rnp->completed = rsp->gpnum;
+                       raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
+               }
+               rnp = rcu_get_root(rsp);
+               raw_spin_lock(&rnp->lock); /* irqs already disabled. */
+       }
+
+       rsp->completed = rsp->gpnum;  /* Declare the grace period complete. */
+       trace_rcu_grace_period(rsp->name, rsp->completed, "end");
+       rsp->fqs_state = RCU_GP_IDLE;
        rcu_start_gp(rsp, flags);  /* releases root node's rnp->lock. */
 }
 
@@ -931,6 +1196,10 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
                        return;
                }
                rnp->qsmask &= ~mask;
+               trace_rcu_quiescent_state_report(rsp->name, rnp->gpnum,
+                                                mask, rnp->qsmask, rnp->level,
+                                                rnp->grplo, rnp->grphi,
+                                                !!rnp->gp_tasks);
                if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) {
 
                        /* Other bits still set at this level, so done. */
@@ -969,7 +1238,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
  * based on quiescent states detected in an earlier grace period!
  */
 static void
-rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp)
+rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastgp)
 {
        unsigned long flags;
        unsigned long mask;
@@ -977,17 +1246,15 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long las
 
        rnp = rdp->mynode;
        raw_spin_lock_irqsave(&rnp->lock, flags);
-       if (lastcomp != rnp->completed) {
+       if (lastgp != rnp->gpnum || rnp->completed == rnp->gpnum) {
 
                /*
-                * Someone beat us to it for this grace period, so leave.
-                * The race with GP start is resolved by the fact that we
-                * hold the leaf rcu_node lock, so that the per-CPU bits
-                * cannot yet be initialized -- so we would simply find our
-                * CPU's bit already cleared in rcu_report_qs_rnp() if this
-                * race occurred.
+                * The grace period in which this quiescent state was
+                * recorded has ended, so don't report it upwards.
+                * We will instead need a new quiescent state that lies
+                * within the current grace period.
                 */
-               rdp->passed_quiesc = 0; /* try again later! */
+               rdp->passed_quiesce = 0;        /* need qs for new gp. */
                raw_spin_unlock_irqrestore(&rnp->lock, flags);
                return;
        }
@@ -1031,73 +1298,129 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
         * Was there a quiescent state since the beginning of the grace
         * period? If no, then exit and wait for the next call.
         */
-       if (!rdp->passed_quiesc)
+       if (!rdp->passed_quiesce)
                return;
 
        /*
         * Tell RCU we are done (but rcu_report_qs_rdp() will be the
         * judge of that).
         */
-       rcu_report_qs_rdp(rdp->cpu, rsp, rdp, rdp->passed_quiesc_completed);
+       rcu_report_qs_rdp(rdp->cpu, rsp, rdp, rdp->passed_quiesce_gpnum);
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
 
 /*
  * Move a dying CPU's RCU callbacks to online CPU's callback list.
- * Synchronization is not required because this function executes
- * in stop_machine() context.
+ * Also record a quiescent state for this CPU for the current grace period.
+ * Synchronization and interrupt disabling are not required because
+ * this function executes in stop_machine() context.  Therefore, cleanup
+ * operations that might block must be done later from the CPU_DEAD
+ * notifier.
+ *
+ * Note that the outgoing CPU's bit has already been cleared in the
+ * cpu_online_mask.  This allows us to randomly pick a callback
+ * destination from the bits set in that mask.
  */
-static void rcu_send_cbs_to_online(struct rcu_state *rsp)
+static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
 {
        int i;
-       /* current DYING CPU is cleared in the cpu_online_mask */
+       unsigned long mask;
        int receive_cpu = cpumask_any(cpu_online_mask);
        struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
        struct rcu_data *receive_rdp = per_cpu_ptr(rsp->rda, receive_cpu);
+       RCU_TRACE(struct rcu_node *rnp = rdp->mynode); /* For dying CPU. */
+
+       /* First, adjust the counts. */
+       if (rdp->nxtlist != NULL) {
+               receive_rdp->qlen_lazy += rdp->qlen_lazy;
+               receive_rdp->qlen += rdp->qlen;
+               rdp->qlen_lazy = 0;
+               rdp->qlen = 0;
+       }
 
-       if (rdp->nxtlist == NULL)
-               return;  /* irqs disabled, so comparison is stable. */
+       /*
+        * Next, move ready-to-invoke callbacks to be invoked on some
+        * other CPU.  These will not be required to pass through another
+        * grace period:  They are done, regardless of CPU.
+        */
+       if (rdp->nxtlist != NULL &&
+           rdp->nxttail[RCU_DONE_TAIL] != &rdp->nxtlist) {
+               struct rcu_head *oldhead;
+               struct rcu_head **oldtail;
+               struct rcu_head **newtail;
+
+               oldhead = rdp->nxtlist;
+               oldtail = receive_rdp->nxttail[RCU_DONE_TAIL];
+               rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL];
+               *rdp->nxttail[RCU_DONE_TAIL] = *oldtail;
+               *receive_rdp->nxttail[RCU_DONE_TAIL] = oldhead;
+               newtail = rdp->nxttail[RCU_DONE_TAIL];
+               for (i = RCU_DONE_TAIL; i < RCU_NEXT_SIZE; i++) {
+                       if (receive_rdp->nxttail[i] == oldtail)
+                               receive_rdp->nxttail[i] = newtail;
+                       if (rdp->nxttail[i] == newtail)
+                               rdp->nxttail[i] = &rdp->nxtlist;
+               }
+       }
 
-       *receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist;
-       receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
-       receive_rdp->qlen += rdp->qlen;
-       receive_rdp->n_cbs_adopted += rdp->qlen;
-       rdp->n_cbs_orphaned += rdp->qlen;
+       /*
+        * Finally, put the rest of the callbacks at the end of the list.
+        * The ones that made it partway through get to start over:  We
+        * cannot assume that grace periods are synchronized across CPUs.
+        * (We could splice RCU_WAIT_TAIL into RCU_NEXT_READY_TAIL, but
+        * this does not seem compelling.  Not yet, anyway.)
+        */
+       if (rdp->nxtlist != NULL) {
+               *receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist;
+               receive_rdp->nxttail[RCU_NEXT_TAIL] =
+                               rdp->nxttail[RCU_NEXT_TAIL];
+               receive_rdp->n_cbs_adopted += rdp->qlen;
+               rdp->n_cbs_orphaned += rdp->qlen;
+
+               rdp->nxtlist = NULL;
+               for (i = 0; i < RCU_NEXT_SIZE; i++)
+                       rdp->nxttail[i] = &rdp->nxtlist;
+       }
 
-       rdp->nxtlist = NULL;
-       for (i = 0; i < RCU_NEXT_SIZE; i++)
-               rdp->nxttail[i] = &rdp->nxtlist;
-       rdp->qlen = 0;
+       /*
+        * Record a quiescent state for the dying CPU.  This is safe
+        * only because we have already cleared out the callbacks.
+        * (Otherwise, the RCU core might try to schedule the invocation
+        * of callbacks on this now-offline CPU, which would be bad.)
+        */
+       mask = rdp->grpmask;    /* rnp->grplo is constant. */
+       trace_rcu_grace_period(rsp->name,
+                              rnp->gpnum + 1 - !!(rnp->qsmask & mask),
+                              "cpuofl");
+       rcu_report_qs_rdp(smp_processor_id(), rsp, rdp, rsp->gpnum);
+       /* Note that rcu_report_qs_rdp() might call trace_rcu_grace_period(). */
 }
 
 /*
- * Remove the outgoing CPU from the bitmasks in the rcu_node hierarchy
- * and move all callbacks from the outgoing CPU to the current one.
+ * The CPU has been completely removed, and some other CPU is reporting
+ * this fact from process context.  Do the remainder of the cleanup.
  * There can only be one CPU hotplug operation at a time, so no other
  * CPU can be attempting to update rcu_cpu_kthread_task.
  */
-static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
+static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
 {
        unsigned long flags;
        unsigned long mask;
        int need_report = 0;
        struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
-       struct rcu_node *rnp;
-       struct task_struct *t;
+       struct rcu_node *rnp = rdp->mynode;  /* Outgoing CPU's rnp. */
 
-       /* Stop the CPU's kthread. */
-       t = per_cpu(rcu_cpu_kthread_task, cpu);
-       if (t != NULL) {
-               per_cpu(rcu_cpu_kthread_task, cpu) = NULL;
-               kthread_stop(t);
-       }
+       /* Adjust any no-longer-needed kthreads. */
+       rcu_stop_cpu_kthread(cpu);
+       rcu_node_kthread_setaffinity(rnp, -1);
+
+       /* Remove the dying CPU from the bitmasks in the rcu_node hierarchy. */
 
        /* Exclude any attempts to start a new grace period. */
        raw_spin_lock_irqsave(&rsp->onofflock, flags);
 
        /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */
-       rnp = rdp->mynode;      /* this is the outgoing CPU's rnp. */
        mask = rdp->grpmask;    /* rnp->grplo is constant. */
        do {
                raw_spin_lock(&rnp->lock);      /* irqs already disabled. */
@@ -1128,45 +1451,16 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
        else
                raw_spin_unlock_irqrestore(&rnp->lock, flags);
        if (need_report & RCU_OFL_TASKS_EXP_GP)
-               rcu_report_exp_rnp(rsp, rnp);
-
-       /*
-        * If there are no more online CPUs for this rcu_node structure,
-        * kill the rcu_node structure's kthread.  Otherwise, adjust its
-        * affinity.
-        */
-       t = rnp->node_kthread_task;
-       if (t != NULL &&
-           rnp->qsmaskinit == 0) {
-               raw_spin_lock_irqsave(&rnp->lock, flags);
-               rnp->node_kthread_task = NULL;
-               raw_spin_unlock_irqrestore(&rnp->lock, flags);
-               kthread_stop(t);
-               rcu_stop_boost_kthread(rnp);
-       } else
-               rcu_node_kthread_setaffinity(rnp, -1);
-}
-
-/*
- * Remove the specified CPU from the RCU hierarchy and move any pending
- * callbacks that it might have to the current CPU.  This code assumes
- * that at least one CPU in the system will remain running at all times.
- * Any attempt to offline -all- CPUs is likely to strand RCU callbacks.
- */
-static void rcu_offline_cpu(int cpu)
-{
-       __rcu_offline_cpu(cpu, &rcu_sched_state);
-       __rcu_offline_cpu(cpu, &rcu_bh_state);
-       rcu_preempt_offline_cpu(cpu);
+               rcu_report_exp_rnp(rsp, rnp, true);
 }
 
 #else /* #ifdef CONFIG_HOTPLUG_CPU */
 
-static void rcu_send_cbs_to_online(struct rcu_state *rsp)
+static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
 {
 }
 
-static void rcu_offline_cpu(int cpu)
+static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
 {
 }
 
@@ -1180,17 +1474,25 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
 {
        unsigned long flags;
        struct rcu_head *next, *list, **tail;
-       int count;
+       int bl, count, count_lazy;
 
        /* If no callbacks are ready, just return.*/
-       if (!cpu_has_callbacks_ready_to_invoke(rdp))
+       if (!cpu_has_callbacks_ready_to_invoke(rdp)) {
+               trace_rcu_batch_start(rsp->name, rdp->qlen_lazy, rdp->qlen, 0);
+               trace_rcu_batch_end(rsp->name, 0, !!ACCESS_ONCE(rdp->nxtlist),
+                                   need_resched(), is_idle_task(current),
+                                   rcu_is_callbacks_kthread());
                return;
+       }
 
        /*
         * Extract the list of ready callbacks, disabling to prevent
         * races with call_rcu() from interrupt handlers.
         */
        local_irq_save(flags);
+       WARN_ON_ONCE(cpu_is_offline(smp_processor_id()));
+       bl = rdp->blimit;
+       trace_rcu_batch_start(rsp->name, rdp->qlen_lazy, rdp->qlen, bl);
        list = rdp->nxtlist;
        rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL];
        *rdp->nxttail[RCU_DONE_TAIL] = NULL;
@@ -1201,20 +1503,28 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
        local_irq_restore(flags);
 
        /* Invoke callbacks. */
-       count = 0;
+       count = count_lazy = 0;
        while (list) {
                next = list->next;
                prefetch(next);
                debug_rcu_head_unqueue(list);
-               list->func(list);
+               if (__rcu_reclaim(rsp->name, list))
+                       count_lazy++;
                list = next;
-               if (++count >= rdp->blimit)
+               /* Stop only if limit reached and CPU has something to do. */
+               if (++count >= bl &&
+                   (need_resched() ||
+                    (!is_idle_task(current) && !rcu_is_callbacks_kthread())))
                        break;
        }
 
        local_irq_save(flags);
+       trace_rcu_batch_end(rsp->name, count, !!list, need_resched(),
+                           is_idle_task(current),
+                           rcu_is_callbacks_kthread());
 
        /* Update count, and requeue any remaining callbacks. */
+       rdp->qlen_lazy -= count_lazy;
        rdp->qlen -= count;
        rdp->n_cbs_invoked += count;
        if (list != NULL) {
@@ -1240,25 +1550,25 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
 
        local_irq_restore(flags);
 
-       /* Re-raise the RCU softirq if there are callbacks remaining. */
+       /* Re-invoke RCU core processing if there are callbacks remaining. */
        if (cpu_has_callbacks_ready_to_invoke(rdp))
-               invoke_rcu_cpu_kthread();
+               invoke_rcu_core();
 }
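
The extraction at the top of rcu_do_batch() depends on the segmented layout of the per-CPU callback list: ->nxtlist is a single linked list, and ->nxttail[RCU_DONE_TAIL] points at the ->next field of the last callback that is ready to invoke (or at &->nxtlist when that segment is empty).  A stand-alone user-space sketch of the same pointer manipulation (struct cb and the variable names are illustrative, and the real code also fixes up the remaining ->nxttail[] entries afterwards):

#include <stddef.h>
#include <stdio.h>

struct cb {
	struct cb *next;
	int id;
};

int main(void)
{
	struct cb c = { NULL, 3 };		/* still waiting for a grace period */
	struct cb b = { &c, 2 };		/* ready to invoke                  */
	struct cb a = { &b, 1 };		/* ready to invoke                  */
	struct cb *nxtlist = &a;		/* head of the whole list           */
	struct cb **done_tail = &b.next;	/* ->next of last ready callback    */
	struct cb *list;

	/* Extract the ready segment: the same three steps rcu_do_batch() uses. */
	list = nxtlist;
	nxtlist = *done_tail;
	*done_tail = NULL;

	for (; list != NULL; list = list->next)
		printf("invoke callback %d\n", list->id);	/* 1 then 2 */
	printf("still queued: callback %d\n", nxtlist->id);	/* 3        */
	return 0;
}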
 
 /*
  * Check to see if this CPU is in a non-context-switch quiescent state
  * (user mode or idle loop for rcu, non-softirq execution for rcu_bh).
- * Also schedule the RCU softirq handler.
+ * Also schedule RCU core processing.
  *
- * This function must be called with hardirqs disabled.  It is normally
+ * This function must be called from hardirq context.  It is normally
  * invoked from the scheduling-clock interrupt.  If rcu_pending returns
  * false, there is no point in invoking rcu_check_callbacks().
  */
 void rcu_check_callbacks(int cpu, int user)
 {
-       if (user ||
-           (idle_cpu(cpu) && rcu_scheduler_active &&
-            !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
+       trace_rcu_utilization("Start scheduler-tick");
+       increment_cpu_stall_ticks();
+       if (user || rcu_is_cpu_rrupt_from_idle()) {
 
                /*
                 * Get here if this CPU took its interrupt from user
@@ -1288,11 +1598,10 @@ void rcu_check_callbacks(int cpu, int user)
        }
        rcu_preempt_check_callbacks(cpu);
        if (rcu_pending(cpu))
-               invoke_rcu_cpu_kthread();
+               invoke_rcu_core();
+       trace_rcu_utilization("End scheduler-tick");
 }
 
-#ifdef CONFIG_SMP
-
 /*
  * Scan the leaf rcu_node structures, processing dyntick state for any that
  * have not yet encountered a quiescent state, using the function specified.
@@ -1316,8 +1625,7 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *))
                        return;
                }
                if (rnp->qsmask == 0) {
-                       rcu_initiate_boost(rnp);
-                       raw_spin_unlock_irqrestore(&rnp->lock, flags);
+                       rcu_initiate_boost(rnp, flags); /* releases rnp->lock */
                        continue;
                }
                cpu = rnp->grplo;
@@ -1336,10 +1644,10 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *))
                raw_spin_unlock_irqrestore(&rnp->lock, flags);
        }
        rnp = rcu_get_root(rsp);
-       raw_spin_lock_irqsave(&rnp->lock, flags);
-       if (rnp->qsmask == 0)
-               rcu_initiate_boost(rnp);
-       raw_spin_unlock_irqrestore(&rnp->lock, flags);
+       if (rnp->qsmask == 0) {
+               raw_spin_lock_irqsave(&rnp->lock, flags);
+               rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */
+       }
 }
 
 /*
@@ -1351,10 +1659,14 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
        unsigned long flags;
        struct rcu_node *rnp = rcu_get_root(rsp);
 
-       if (!rcu_gp_in_progress(rsp))
+       trace_rcu_utilization("Start fqs");
+       if (!rcu_gp_in_progress(rsp)) {
+               trace_rcu_utilization("End fqs");
                return;  /* No grace period in progress, nothing to force. */
+       }
        if (!raw_spin_trylock_irqsave(&rsp->fqslock, flags)) {
                rsp->n_force_qs_lh++; /* Inexact, can lose counts.  Tough! */
+               trace_rcu_utilization("End fqs");
                return; /* Someone else is already on the job. */
        }
        if (relaxed && ULONG_CMP_GE(rsp->jiffies_force_qs, jiffies))
@@ -1368,7 +1680,7 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
                goto unlock_fqs_ret;  /* no GP in progress, time updated. */
        }
        rsp->fqs_active = 1;
-       switch (rsp->signaled) {
+       switch (rsp->fqs_state) {
        case RCU_GP_IDLE:
        case RCU_GP_INIT:
 
@@ -1384,7 +1696,7 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
                force_qs_rnp(rsp, dyntick_save_progress_counter);
                raw_spin_lock(&rnp->lock);  /* irqs already disabled */
                if (rcu_gp_in_progress(rsp))
-                       rsp->signaled = RCU_FORCE_QS;
+                       rsp->fqs_state = RCU_FORCE_QS;
                break;
 
        case RCU_FORCE_QS:
@@ -1403,26 +1715,19 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
                raw_spin_unlock(&rsp->fqslock); /* irqs remain disabled */
                rsp->fqs_need_gp = 0;
                rcu_start_gp(rsp, flags); /* releases rnp->lock */
+               trace_rcu_utilization("End fqs");
                return;
        }
        raw_spin_unlock(&rnp->lock);  /* irqs remain disabled */
 unlock_fqs_ret:
        raw_spin_unlock_irqrestore(&rsp->fqslock, flags);
+       trace_rcu_utilization("End fqs");
 }
 
-#else /* #ifdef CONFIG_SMP */
-
-static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
-{
-       set_need_resched();
-}
-
-#endif /* #else #ifdef CONFIG_SMP */
-
 /*
- * This does the RCU processing work from softirq context for the
- * specified rcu_state and rcu_data structures.  This may be called
- * only from the CPU to whom the rdp belongs.
+ * This does the RCU core processing work for the specified rcu_state
+ * and rcu_data structures.  This may be called only from the CPU to
+ * whom the rdp belongs.
  */
 static void
 __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
@@ -1454,372 +1759,54 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
        }
 
        /* If there are callbacks ready, invoke them. */
-       rcu_do_batch(rsp, rdp);
+       if (cpu_has_callbacks_ready_to_invoke(rdp))
+               invoke_rcu_callbacks(rsp, rdp);
 }
 
 /*
- * Do softirq processing for the current CPU.
+ * Do RCU core processing for the current CPU.
  */
-static void rcu_process_callbacks(void)
+static void rcu_process_callbacks(struct softirq_action *unused)
 {
+       trace_rcu_utilization("Start RCU core");
        __rcu_process_callbacks(&rcu_sched_state,
                                &__get_cpu_var(rcu_sched_data));
        __rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data));
        rcu_preempt_process_callbacks();
-
-       /* If we are last CPU on way to dyntick-idle mode, accelerate it. */
-       rcu_needs_cpu_flush();
-}
-
-/*
- * Wake up the current CPU's kthread.  This replaces raise_softirq()
- * in earlier versions of RCU.  Note that because we are running on
- * the current CPU with interrupts disabled, the rcu_cpu_kthread_task
- * cannot disappear out from under us.
- */
-static void invoke_rcu_cpu_kthread(void)
-{
-       unsigned long flags;
-
-       local_irq_save(flags);
-       __this_cpu_write(rcu_cpu_has_work, 1);
-       if (__this_cpu_read(rcu_cpu_kthread_task) == NULL) {
-               local_irq_restore(flags);
-               return;
-       }
-       wake_up(&__get_cpu_var(rcu_cpu_wq));
-       local_irq_restore(flags);
-}
-
-/*
- * Wake up the specified per-rcu_node-structure kthread.
- * The caller must hold ->lock.
- */
-static void invoke_rcu_node_kthread(struct rcu_node *rnp)
-{
-       struct task_struct *t;
-
-       t = rnp->node_kthread_task;
-       if (t != NULL)
-               wake_up_process(t);
-}
-
-/*
- * Set the specified CPU's kthread to run RT or not, as specified by
- * the to_rt argument.  The CPU-hotplug locks are held, so the task
- * is not going away.
- */
-static void rcu_cpu_kthread_setrt(int cpu, int to_rt)
-{
-       int policy;
-       struct sched_param sp;
-       struct task_struct *t;
-
-       t = per_cpu(rcu_cpu_kthread_task, cpu);
-       if (t == NULL)
-               return;
-       if (to_rt) {
-               policy = SCHED_FIFO;
-               sp.sched_priority = RCU_KTHREAD_PRIO;
-       } else {
-               policy = SCHED_NORMAL;
-               sp.sched_priority = 0;
-       }
-       sched_setscheduler_nocheck(t, policy, &sp);
-}
-
-/*
- * Timer handler to initiate the waking up of per-CPU kthreads that
- * have yielded the CPU due to excess numbers of RCU callbacks.
- * We wake up the per-rcu_node kthread, which in turn will wake up
- * the booster kthread.
- */
-static void rcu_cpu_kthread_timer(unsigned long arg)
-{
-       unsigned long flags;
-       struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, arg);
-       struct rcu_node *rnp = rdp->mynode;
-
-       raw_spin_lock_irqsave(&rnp->lock, flags);
-       rnp->wakemask |= rdp->grpmask;
-       invoke_rcu_node_kthread(rnp);
-       raw_spin_unlock_irqrestore(&rnp->lock, flags);
-}
-
-/*
- * Drop to non-real-time priority and yield, but only after posting a
- * timer that will cause us to regain our real-time priority if we
- * remain preempted.  Either way, we restore our real-time priority
- * before returning.
- */
-static void rcu_yield(void (*f)(unsigned long), unsigned long arg)
-{
-       struct sched_param sp;
-       struct timer_list yield_timer;
-
-       setup_timer_on_stack(&yield_timer, f, arg);
-       mod_timer(&yield_timer, jiffies + 2);
-       sp.sched_priority = 0;
-       sched_setscheduler_nocheck(current, SCHED_NORMAL, &sp);
-       schedule();
-       sp.sched_priority = RCU_KTHREAD_PRIO;
-       sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
-       del_timer(&yield_timer);
-}
-
-/*
- * Handle cases where the rcu_cpu_kthread() ends up on the wrong CPU.
- * This can happen while the corresponding CPU is either coming online
- * or going offline.  We cannot wait until the CPU is fully online
- * before starting the kthread, because the various notifier functions
- * can wait for RCU grace periods.  So we park rcu_cpu_kthread() until
- * the corresponding CPU is online.
- *
- * Return 1 if the kthread needs to stop, 0 otherwise.
- *
- * Caller must disable bh.  This function can momentarily enable it.
- */
-static int rcu_cpu_kthread_should_stop(int cpu)
-{
-       while (cpu_is_offline(cpu) ||
-              !cpumask_equal(&current->cpus_allowed, cpumask_of(cpu)) ||
-              smp_processor_id() != cpu) {
-               if (kthread_should_stop())
-                       return 1;
-               per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
-               per_cpu(rcu_cpu_kthread_cpu, cpu) = raw_smp_processor_id();
-               local_bh_enable();
-               schedule_timeout_uninterruptible(1);
-               if (!cpumask_equal(&current->cpus_allowed, cpumask_of(cpu)))
-                       set_cpus_allowed_ptr(current, cpumask_of(cpu));
-               local_bh_disable();
-       }
-       per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu;
-       return 0;
-}
-
-/*
- * Per-CPU kernel thread that invokes RCU callbacks.  This replaces the
- * earlier RCU softirq.
- */
-static int rcu_cpu_kthread(void *arg)
-{
-       int cpu = (int)(long)arg;
-       unsigned long flags;
-       int spincnt = 0;
-       unsigned int *statusp = &per_cpu(rcu_cpu_kthread_status, cpu);
-       wait_queue_head_t *wqp = &per_cpu(rcu_cpu_wq, cpu);
-       char work;
-       char *workp = &per_cpu(rcu_cpu_has_work, cpu);
-
-       for (;;) {
-               *statusp = RCU_KTHREAD_WAITING;
-               wait_event_interruptible(*wqp,
-                                        *workp != 0 || kthread_should_stop());
-               local_bh_disable();
-               if (rcu_cpu_kthread_should_stop(cpu)) {
-                       local_bh_enable();
-                       break;
-               }
-               *statusp = RCU_KTHREAD_RUNNING;
-               per_cpu(rcu_cpu_kthread_loops, cpu)++;
-               local_irq_save(flags);
-               work = *workp;
-               *workp = 0;
-               local_irq_restore(flags);
-               if (work)
-                       rcu_process_callbacks();
-               local_bh_enable();
-               if (*workp != 0)
-                       spincnt++;
-               else
-                       spincnt = 0;
-               if (spincnt > 10) {
-                       *statusp = RCU_KTHREAD_YIELDING;
-                       rcu_yield(rcu_cpu_kthread_timer, (unsigned long)cpu);
-                       spincnt = 0;
-               }
-       }
-       *statusp = RCU_KTHREAD_STOPPED;
-       return 0;
-}
-
-/*
- * Spawn a per-CPU kthread, setting up affinity and priority.
- * Because the CPU hotplug lock is held, no other CPU will be attempting
- * to manipulate rcu_cpu_kthread_task.  There might be another CPU
- * attempting to access it during boot, but the locking in kthread_bind()
- * will enforce sufficient ordering.
- */
-static int __cpuinit rcu_spawn_one_cpu_kthread(int cpu)
-{
-       struct sched_param sp;
-       struct task_struct *t;
-
-       if (!rcu_kthreads_spawnable ||
-           per_cpu(rcu_cpu_kthread_task, cpu) != NULL)
-               return 0;
-       t = kthread_create(rcu_cpu_kthread, (void *)(long)cpu, "rcuc%d", cpu);
-       if (IS_ERR(t))
-               return PTR_ERR(t);
-       kthread_bind(t, cpu);
-       per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu;
-       WARN_ON_ONCE(per_cpu(rcu_cpu_kthread_task, cpu) != NULL);
-       per_cpu(rcu_cpu_kthread_task, cpu) = t;
-       wake_up_process(t);
-       sp.sched_priority = RCU_KTHREAD_PRIO;
-       sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
-       return 0;
+       trace_rcu_utilization("End RCU core");
 }
 
 /*
- * Per-rcu_node kthread, which is in charge of waking up the per-CPU
- * kthreads when needed.  We ignore requests to wake up kthreads
- * for offline CPUs, which is OK because force_quiescent_state()
- * takes care of this case.
- */
-static int rcu_node_kthread(void *arg)
-{
-       int cpu;
-       unsigned long flags;
-       unsigned long mask;
-       struct rcu_node *rnp = (struct rcu_node *)arg;
-       struct sched_param sp;
-       struct task_struct *t;
-
-       for (;;) {
-               rnp->node_kthread_status = RCU_KTHREAD_WAITING;
-               wait_event_interruptible(rnp->node_wq, rnp->wakemask != 0 ||
-                                                      kthread_should_stop());
-               if (kthread_should_stop())
-                       break;
-               rnp->node_kthread_status = RCU_KTHREAD_RUNNING;
-               raw_spin_lock_irqsave(&rnp->lock, flags);
-               mask = rnp->wakemask;
-               rnp->wakemask = 0;
-               rcu_initiate_boost(rnp);
-               raw_spin_unlock_irqrestore(&rnp->lock, flags);
-               for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) {
-                       if ((mask & 0x1) == 0)
-                               continue;
-                       preempt_disable();
-                       t = per_cpu(rcu_cpu_kthread_task, cpu);
-                       if (!cpu_online(cpu) || t == NULL) {
-                               preempt_enable();
-                               continue;
-                       }
-                       per_cpu(rcu_cpu_has_work, cpu) = 1;
-                       sp.sched_priority = RCU_KTHREAD_PRIO;
-                       sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
-                       preempt_enable();
-               }
-       }
-       rnp->node_kthread_status = RCU_KTHREAD_STOPPED;
-       return 0;
-}
-
-/*
- * Set the per-rcu_node kthread's affinity to cover all CPUs that are
- * served by the rcu_node in question.  The CPU hotplug lock is still
- * held, so the value of rnp->qsmaskinit will be stable.
- *
- * We don't include outgoingcpu in the affinity set, use -1 if there is
- * no outgoing CPU.  If there are no CPUs left in the affinity set,
- * this function allows the kthread to execute on any CPU.
+ * Schedule RCU callback invocation.  If the specified type of RCU
+ * does not support RCU priority boosting, just do a direct call,
+ * otherwise wake up the per-CPU kernel kthread.  Note that because we
+ * are running on the current CPU with interrupts disabled, the
+ * rcu_cpu_kthread_task cannot disappear out from under us.
  */
-static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
+static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
 {
-       cpumask_var_t cm;
-       int cpu;
-       unsigned long mask = rnp->qsmaskinit;
-
-       if (rnp->node_kthread_task == NULL || mask == 0)
+       if (unlikely(!ACCESS_ONCE(rcu_scheduler_fully_active)))
                return;
-       if (!alloc_cpumask_var(&cm, GFP_KERNEL))
+       if (likely(!rsp->boost)) {
+               rcu_do_batch(rsp, rdp);
                return;
-       cpumask_clear(cm);
-       for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1)
-               if ((mask & 0x1) && cpu != outgoingcpu)
-                       cpumask_set_cpu(cpu, cm);
-       if (cpumask_weight(cm) == 0) {
-               cpumask_setall(cm);
-               for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++)
-                       cpumask_clear_cpu(cpu, cm);
-               WARN_ON_ONCE(cpumask_weight(cm) == 0);
        }
-       set_cpus_allowed_ptr(rnp->node_kthread_task, cm);
-       rcu_boost_kthread_setaffinity(rnp, cm);
-       free_cpumask_var(cm);
+       invoke_rcu_callbacks_kthread();
 }
 
-/*
- * Spawn a per-rcu_node kthread, setting priority and affinity.
- * Called during boot before online/offline can happen, or, if
- * during runtime, with the main CPU-hotplug locks held.  So only
- * one of these can be executing at a time.
- */
-static int __cpuinit rcu_spawn_one_node_kthread(struct rcu_state *rsp,
-                                               struct rcu_node *rnp)
+static void invoke_rcu_core(void)
 {
-       unsigned long flags;
-       int rnp_index = rnp - &rsp->node[0];
-       struct sched_param sp;
-       struct task_struct *t;
-
-       if (!rcu_kthreads_spawnable ||
-           rnp->qsmaskinit == 0)
-               return 0;
-       if (rnp->node_kthread_task == NULL) {
-               t = kthread_create(rcu_node_kthread, (void *)rnp,
-                                  "rcun%d", rnp_index);
-               if (IS_ERR(t))
-                       return PTR_ERR(t);
-               raw_spin_lock_irqsave(&rnp->lock, flags);
-               rnp->node_kthread_task = t;
-               raw_spin_unlock_irqrestore(&rnp->lock, flags);
-               wake_up_process(t);
-               sp.sched_priority = 99;
-               sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
-       }
-       return rcu_spawn_one_boost_kthread(rsp, rnp, rnp_index);
+       raise_softirq(RCU_SOFTIRQ);
 }
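/*
 * Minimal sketch (not part of this patch) of the dispatch policy above:
 * without priority boosting, callbacks are invoked directly from RCU core;
 * with boosting, the per-CPU kthread is woken instead, and RCU core itself
 * is entered by raising RCU_SOFTIRQ.  run_batch(), wake_cb_kthread() and
 * raise_core_softirq() are hypothetical stand-ins.
 */
extern void run_batch(void);
extern void wake_cb_kthread(void);
extern void raise_core_softirq(void);

static void invoke_callbacks_sketch(int boost)
{
	if (!boost) {
		run_batch();		/* common case: invoke callbacks inline */
		return;
	}
	wake_cb_kthread();		/* boosting: hand off to the kthread */
}

static void invoke_core_sketch(void)
{
	raise_core_softirq();		/* deferred to rcu_process_callbacks() */
}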
 
-/*
- * Spawn all kthreads -- called as soon as the scheduler is running.
- */
-static int __init rcu_spawn_kthreads(void)
-{
-       int cpu;
-       struct rcu_node *rnp;
-
-       rcu_kthreads_spawnable = 1;
-       for_each_possible_cpu(cpu) {
-               init_waitqueue_head(&per_cpu(rcu_cpu_wq, cpu));
-               per_cpu(rcu_cpu_has_work, cpu) = 0;
-               if (cpu_online(cpu))
-                       (void)rcu_spawn_one_cpu_kthread(cpu);
-       }
-       rnp = rcu_get_root(rcu_state);
-       init_waitqueue_head(&rnp->node_wq);
-       rcu_init_boost_waitqueue(rnp);
-       (void)rcu_spawn_one_node_kthread(rcu_state, rnp);
-       if (NUM_RCU_NODES > 1)
-               rcu_for_each_leaf_node(rcu_state, rnp) {
-                       init_waitqueue_head(&rnp->node_wq);
-                       rcu_init_boost_waitqueue(rnp);
-                       (void)rcu_spawn_one_node_kthread(rcu_state, rnp);
-               }
-       return 0;
-}
-early_initcall(rcu_spawn_kthreads);
-
 static void
 __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
-          struct rcu_state *rsp)
+          struct rcu_state *rsp, bool lazy)
 {
        unsigned long flags;
        struct rcu_data *rdp;
 
+       WARN_ON_ONCE((unsigned long)head & 0x3); /* Misaligned rcu_head! */
        debug_rcu_head_queue(head);
        head->func = func;
        head->next = NULL;
@@ -1838,6 +1825,21 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
        /* Add the callback to our list. */
        *rdp->nxttail[RCU_NEXT_TAIL] = head;
        rdp->nxttail[RCU_NEXT_TAIL] = &head->next;
+       rdp->qlen++;
+       if (lazy)
+               rdp->qlen_lazy++;
+
+       if (__is_kfree_rcu_offset((unsigned long)func))
+               trace_rcu_kfree_callback(rsp->name, head, (unsigned long)func,
+                                        rdp->qlen_lazy, rdp->qlen);
+       else
+               trace_rcu_callback(rsp->name, head, rdp->qlen_lazy, rdp->qlen);
+
+       /* If interrupts were disabled, don't dive into RCU core. */
+       if (irqs_disabled_flags(flags)) {
+               local_irq_restore(flags);
+               return;
+       }
 
        /*
         * Force the grace period if too many callbacks or too long waiting.
@@ -1846,7 +1848,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
         * invoking force_quiescent_state() if the newly enqueued callback
         * is the only one waiting for a grace period to complete.
         */
-       if (unlikely(++rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) {
+       if (unlikely(rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) {
 
                /* Are we ignoring a completed grace period? */
                rcu_process_gp_end(rsp, rdp);
@@ -1878,16 +1880,16 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
  */
 void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
 {
-       __call_rcu(head, func, &rcu_sched_state);
+       __call_rcu(head, func, &rcu_sched_state, 0);
 }
 EXPORT_SYMBOL_GPL(call_rcu_sched);
 
 /*
- * Queue an RCU for invocation after a quicker grace period.
+ * Queue an RCU callback for invocation after a quicker grace period.
  */
 void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
 {
-       __call_rcu(head, func, &rcu_bh_state);
+       __call_rcu(head, func, &rcu_bh_state, 0);
 }
 EXPORT_SYMBOL_GPL(call_rcu_bh);
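/*
 * Typical use of the call_rcu_sched() API updated above (illustrative,
 * not part of this patch): embed an rcu_head in the protected structure,
 * unpublish the structure, then free it from the callback once a grace
 * period has elapsed.  "struct foo" and foo_release() are examples only.
 * The new "lazy" argument to __call_rcu() is set by kfree_rcu()-style
 * callers whose callback does nothing but free memory.
 */
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct foo {
	int data;
	struct rcu_head rcu;
};

static void foo_rcu_free(struct rcu_head *head)
{
	struct foo *fp = container_of(head, struct foo, rcu);

	kfree(fp);
}

static void foo_release(struct foo *fp)
{
	/* Caller has already removed fp from all RCU-visible lists. */
	call_rcu_sched(&fp->rcu, foo_rcu_free);
}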
 
@@ -1916,18 +1918,13 @@ EXPORT_SYMBOL_GPL(call_rcu_bh);
  */
 void synchronize_sched(void)
 {
-       struct rcu_synchronize rcu;
-
+       rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) &&
+                          !lock_is_held(&rcu_lock_map) &&
+                          !lock_is_held(&rcu_sched_lock_map),
+                          "Illegal synchronize_sched() in RCU-sched read-side critical section");
        if (rcu_blocking_is_gp())
                return;
-
-       init_rcu_head_on_stack(&rcu.head);
-       init_completion(&rcu.completion);
-       /* Will wake me after RCU finished. */
-       call_rcu_sched(&rcu.head, wakeme_after_rcu);
-       /* Wait for it. */
-       wait_for_completion(&rcu.completion);
-       destroy_rcu_head_on_stack(&rcu.head);
+       wait_rcu_gp(call_rcu_sched);
 }
 EXPORT_SYMBOL_GPL(synchronize_sched);
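/*
 * Canonical updater pattern for synchronize_sched() (illustrative, not
 * part of this patch), reusing the hypothetical struct foo from the
 * previous sketch: publish the new version, wait for pre-existing
 * readers, then free the old version.  The lockdep assertion added above
 * catches the deadlock case in which this is attempted from within an
 * RCU-sched read-side critical section.  "gp" and "gp_lock" are
 * hypothetical globals.
 */
#include <linux/spinlock.h>

static struct foo __rcu *gp;		/* hypothetical RCU-protected pointer */
static DEFINE_SPINLOCK(gp_lock);	/* hypothetical update-side lock */

static void foo_update(struct foo *new_fp)
{
	struct foo *old_fp;

	spin_lock(&gp_lock);
	old_fp = rcu_dereference_protected(gp, lockdep_is_held(&gp_lock));
	rcu_assign_pointer(gp, new_fp);
	spin_unlock(&gp_lock);
	synchronize_sched();	/* wait for all pre-existing readers */
	kfree(old_fp);
}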
 
@@ -1942,21 +1939,137 @@ EXPORT_SYMBOL_GPL(synchronize_sched);
  */
 void synchronize_rcu_bh(void)
 {
-       struct rcu_synchronize rcu;
-
+       rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) &&
+                          !lock_is_held(&rcu_lock_map) &&
+                          !lock_is_held(&rcu_sched_lock_map),
+                          "Illegal synchronize_rcu_bh() in RCU-bh read-side critical section");
        if (rcu_blocking_is_gp())
                return;
-
-       init_rcu_head_on_stack(&rcu.head);
-       init_completion(&rcu.completion);
-       /* Will wake me after RCU finished. */
-       call_rcu_bh(&rcu.head, wakeme_after_rcu);
-       /* Wait for it. */
-       wait_for_completion(&rcu.completion);
-       destroy_rcu_head_on_stack(&rcu.head);
+       wait_rcu_gp(call_rcu_bh);
 }
 EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
 
+static atomic_t sync_sched_expedited_started = ATOMIC_INIT(0);
+static atomic_t sync_sched_expedited_done = ATOMIC_INIT(0);
+
+static int synchronize_sched_expedited_cpu_stop(void *data)
+{
+       /*
+        * There must be a full memory barrier on each affected CPU
+        * between the time that try_stop_cpus() is called and the
+        * time that it returns.
+        *
+        * In the current initial implementation of cpu_stop, the
+        * above condition is already met when the control reaches
+        * this point and the following smp_mb() is not strictly
+        * necessary.  Do smp_mb() anyway for documentation and
+        * robustness against future implementation changes.
+        */
+       smp_mb(); /* See above comment block. */
+       return 0;
+}
+
+/**
+ * synchronize_sched_expedited - Brute-force RCU-sched grace period
+ *
+ * Wait for an RCU-sched grace period to elapse, but use a "big hammer"
+ * approach to force the grace period to end quickly.  This consumes
+ * significant time on all CPUs and is unfriendly to real-time workloads,
+ * so it is not recommended for any sort of common-case code.  In fact,
+ * if you are using synchronize_sched_expedited() in a loop, please
+ * restructure your code to batch your updates, and then use a single
+ * synchronize_sched() instead.
+ *
+ * Note that it is illegal to call this function while holding any lock
+ * that is acquired by a CPU-hotplug notifier.  And yes, it is also illegal
+ * to call this function from a CPU-hotplug notifier.  Failing to observe
+ * these restrictions will result in deadlock.
+ *
+ * This implementation can be thought of as an application of ticket
+ * locking to RCU, with sync_sched_expedited_started and
+ * sync_sched_expedited_done taking on the roles of the halves
+ * of the ticket-lock word.  Each task atomically increments
+ * sync_sched_expedited_started upon entry, snapshotting the old value,
+ * then attempts to stop all the CPUs.  If this succeeds, then each
+ * CPU will have executed a context switch, resulting in an RCU-sched
+ * grace period.  We are then done, so we use atomic_cmpxchg() to
+ * update sync_sched_expedited_done to match our snapshot -- but
+ * only if someone else has not already advanced past our snapshot.
+ *
+ * On the other hand, if try_stop_cpus() fails, we check the value
+ * of sync_sched_expedited_done.  If it has advanced past our
+ * initial snapshot, then someone else must have forced a grace period
+ * some time after we took our snapshot.  In this case, our work is
+ * done for us, and we can simply return.  Otherwise, we try again,
+ * but keep our initial snapshot for purposes of checking for someone
+ * doing our work for us.
+ *
+ * If we fail too many times in a row, we fall back to synchronize_sched().
+ */
+void synchronize_sched_expedited(void)
+{
+       int firstsnap, s, snap, trycount = 0;
+
+       /* Note that atomic_inc_return() implies full memory barrier. */
+       firstsnap = snap = atomic_inc_return(&sync_sched_expedited_started);
+       get_online_cpus();
+       WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id()));
+
+       /*
+        * Each pass through the following loop attempts to force a
+        * context switch on each CPU.
+        */
+       while (try_stop_cpus(cpu_online_mask,
+                            synchronize_sched_expedited_cpu_stop,
+                            NULL) == -EAGAIN) {
+               put_online_cpus();
+
+               /* No joy, try again later.  Or just synchronize_sched(). */
+               if (trycount++ < 10)
+                       udelay(trycount * num_online_cpus());
+               else {
+                       synchronize_sched();
+                       return;
+               }
+
+               /* Check to see if someone else did our work for us. */
+               s = atomic_read(&sync_sched_expedited_done);
+               if (UINT_CMP_GE((unsigned)s, (unsigned)firstsnap)) {
+                       smp_mb(); /* ensure test happens before caller kfree */
+                       return;
+               }
+
+               /*
+                * Refetching sync_sched_expedited_started allows later
+                * callers to piggyback on our grace period.  The snapshot
+                * is therefore retaken here, after the failed attempt.
+                * We retry after they started, so our grace period works
+                * for them, and they started after our first try, so their
+                * grace period works for us.
+                */
+               get_online_cpus();
+               snap = atomic_read(&sync_sched_expedited_started);
+               smp_mb(); /* ensure read is before try_stop_cpus(). */
+       }
+
+       /*
+        * Everyone up to our most recent fetch is covered by our grace
+        * period.  Update the counter, but only if our work is still
+        * relevant -- which it won't be if someone who started later
+        * than we did beat us to the punch.
+        */
+       do {
+               s = atomic_read(&sync_sched_expedited_done);
+               if (UINT_CMP_GE((unsigned)s, (unsigned)snap)) {
+                       smp_mb(); /* ensure test happens before caller kfree */
+                       break;
+               }
+       } while (atomic_cmpxchg(&sync_sched_expedited_done, s, snap) != s);
+
+       put_online_cpus();
+}
+EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
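/*
 * User-space sketch (not kernel code) of the started/done "ticket"
 * scheme described in the comment above, using C11 atomics.
 * force_all_cpus() and slow_path_wait() are hypothetical stand-ins for
 * try_stop_cpus() and synchronize_sched(); memory ordering relies on the
 * default sequentially consistent atomics rather than explicit barriers.
 */
#include <stdatomic.h>
#include <stdbool.h>

static atomic_int started;
static atomic_int done;

extern bool force_all_cpus(void);	/* may fail, like try_stop_cpus() */
extern void slow_path_wait(void);	/* fallback grace-period wait */

static void expedited_sketch(void)
{
	int firstsnap, snap, s, trycount = 0;

	/* Take a ticket; anyone finishing after this point covers us. */
	firstsnap = snap = atomic_fetch_add(&started, 1) + 1;

	while (!force_all_cpus()) {
		/* Did someone else complete a grace period for us? */
		s = atomic_load(&done);
		if (s - firstsnap >= 0)
			return;

		if (++trycount > 10) {
			slow_path_wait();
			return;
		}

		/* Retry; refetch so later starters can piggyback on us. */
		snap = atomic_load(&started);
	}

	/* Advance "done" to our snapshot unless someone got there first. */
	do {
		s = atomic_load(&done);
		if (s - snap >= 0)
			break;
	} while (!atomic_compare_exchange_weak(&done, &s, snap));
}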
+
 /*
  * Check to see if there is any immediate RCU-related work to be done
  * by the current CPU, for the specified type of RCU, returning 1 if so.
@@ -1974,7 +2087,8 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
        check_cpu_stall(rsp, rdp);
 
        /* Is the RCU core waiting for a quiescent state from this CPU? */
-       if (rdp->qs_pending && !rdp->passed_quiesc) {
+       if (rcu_scheduler_fully_active &&
+           rdp->qs_pending && !rdp->passed_quiesce) {
 
                /*
                 * If force_quiescent_state() coming soon and this CPU
@@ -1986,7 +2100,7 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
                    ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs) - 1,
                                 jiffies))
                        set_need_resched();
-       } else if (rdp->qs_pending && rdp->passed_quiesc) {
+       } else if (rdp->qs_pending && rdp->passed_quiesce) {
                rdp->n_rp_report_qs++;
                return 1;
        }
@@ -2044,12 +2158,12 @@ static int rcu_pending(int cpu)
  * by the current CPU, even if none need be done immediately, returning
  * 1 if so.
  */
-static int rcu_needs_cpu_quick_check(int cpu)
+static int rcu_cpu_has_callbacks(int cpu)
 {
        /* RCU callbacks either ready or pending? */
        return per_cpu(rcu_sched_data, cpu).nxtlist ||
               per_cpu(rcu_bh_data, cpu).nxtlist ||
-              rcu_preempt_needs_cpu(cpu);
+              rcu_preempt_cpu_has_callbacks(cpu);
 }
 
 static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL};
@@ -2144,11 +2258,13 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
        rdp->nxtlist = NULL;
        for (i = 0; i < RCU_NEXT_SIZE; i++)
                rdp->nxttail[i] = &rdp->nxtlist;
+       rdp->qlen_lazy = 0;
        rdp->qlen = 0;
-#ifdef CONFIG_NO_HZ
        rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
-#endif /* #ifdef CONFIG_NO_HZ */
+       WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE);
+       WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1);
        rdp->cpu = cpu;
+       rdp->rsp = rsp;
        raw_spin_unlock_irqrestore(&rnp->lock, flags);
 }
 
@@ -2168,13 +2284,15 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
 
        /* Set up local state, ensuring consistent view of global state. */
        raw_spin_lock_irqsave(&rnp->lock, flags);
-       rdp->passed_quiesc = 0;  /* We could be racing with new GP, */
-       rdp->qs_pending = 1;     /*  so set up to respond to current GP. */
        rdp->beenonline = 1;     /* We have now been online. */
        rdp->preemptible = preemptible;
        rdp->qlen_last_fqs_check = 0;
        rdp->n_force_qs_snap = rsp->n_force_qs;
        rdp->blimit = blimit;
+       rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
+       atomic_set(&rdp->dynticks->dynticks,
+                  (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1);
+       rcu_prepare_for_idle_init(cpu);
        raw_spin_unlock(&rnp->lock);            /* irqs remain disabled. */
 
        /*
@@ -2194,9 +2312,17 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
                rnp->qsmaskinit |= mask;
                mask = rnp->grpmask;
                if (rnp == rdp->mynode) {
-                       rdp->gpnum = rnp->completed; /* if GP in progress... */
+                       /*
+                        * If there is a grace period in progress, we will
+                        * set up to wait for it next time we run the
+                        * RCU core code.
+                        */
+                       rdp->gpnum = rnp->completed;
                        rdp->completed = rnp->completed;
-                       rdp->passed_quiesc_completed = rnp->completed - 1;
+                       rdp->passed_quiesce = 0;
+                       rdp->qs_pending = 0;
+                       rdp->passed_quiesce_gpnum = rnp->gpnum - 1;
+                       trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpuonl");
                }
                raw_spin_unlock(&rnp->lock); /* irqs already disabled. */
                rnp = rnp->parent;
@@ -2205,26 +2331,13 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
        raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
 }
 
-static void __cpuinit rcu_online_cpu(int cpu)
+static void __cpuinit rcu_prepare_cpu(int cpu)
 {
        rcu_init_percpu_data(cpu, &rcu_sched_state, 0);
        rcu_init_percpu_data(cpu, &rcu_bh_state, 0);
        rcu_preempt_init_percpu_data(cpu);
 }
 
-static void __cpuinit rcu_online_kthreads(int cpu)
-{
-       struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu);
-       struct rcu_node *rnp = rdp->mynode;
-
-       /* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */
-       if (rcu_kthreads_spawnable) {
-               (void)rcu_spawn_one_cpu_kthread(cpu);
-               if (rnp->node_kthread_task == NULL)
-                       (void)rcu_spawn_one_node_kthread(rcu_state, rnp);
-       }
-}
-
 /*
  * Handle CPU online/offline notification events.
  */
@@ -2235,11 +2348,12 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
        struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu);
        struct rcu_node *rnp = rdp->mynode;
 
+       trace_rcu_utilization("Start CPU hotplug");
        switch (action) {
        case CPU_UP_PREPARE:
        case CPU_UP_PREPARE_FROZEN:
-               rcu_online_cpu(cpu);
-               rcu_online_kthreads(cpu);
+               rcu_prepare_cpu(cpu);
+               rcu_prepare_kthreads(cpu);
                break;
        case CPU_ONLINE:
        case CPU_DOWN_FAILED:
@@ -2257,19 +2371,23 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
                 * touch any data without introducing corruption. We send the
                 * dying CPU's callbacks to an arbitrarily chosen online CPU.
                 */
-               rcu_send_cbs_to_online(&rcu_bh_state);
-               rcu_send_cbs_to_online(&rcu_sched_state);
-               rcu_preempt_send_cbs_to_online();
+               rcu_cleanup_dying_cpu(&rcu_bh_state);
+               rcu_cleanup_dying_cpu(&rcu_sched_state);
+               rcu_preempt_cleanup_dying_cpu();
+               rcu_cleanup_after_idle(cpu);
                break;
        case CPU_DEAD:
        case CPU_DEAD_FROZEN:
        case CPU_UP_CANCELED:
        case CPU_UP_CANCELED_FROZEN:
-               rcu_offline_cpu(cpu);
+               rcu_cleanup_dead_cpu(cpu, &rcu_bh_state);
+               rcu_cleanup_dead_cpu(cpu, &rcu_sched_state);
+               rcu_preempt_cleanup_dead_cpu(cpu);
                break;
        default:
                break;
        }
+       trace_rcu_utilization("End CPU hotplug");
        return NOTIFY_OK;
 }
 
@@ -2389,6 +2507,7 @@ void __init rcu_init(void)
        rcu_init_one(&rcu_sched_state, &rcu_sched_data);
        rcu_init_one(&rcu_bh_state, &rcu_bh_data);
        __rcu_init_preempt();
+       open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
 
        /*
         * We don't need protection against CPU-hotplug here because