Update to 3.4-final.

[linux-flexiantxendom0-3.2.10.git] / kernel / sched / core.c
diff --git a/kernel/sched/core.c b/kernel/sched/core.c

index 699ff14..9a7fe31 100644 (file)
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -71,9 +71,12 @@
  #include <linux/ftrace.h>
  #include <linux/slab.h>
  #include <linux/init_task.h>
+#include <linux/binfmts.h>
  
+#include <asm/switch_to.h>
  #include <asm/tlb.h>
  #include <asm/irq_regs.h>
+#include <asm/mutex.h>
  #ifdef CONFIG_PARAVIRT
  #include <asm/paravirt.h>
  #endif
@@ -149,7 +152,7 @@ static int sched_feat_show(struct seq_file *m, void *v)
  {
         int i;
  
-       for (i = 0; sched_feat_names[i]; i++) {
+       for (i = 0; i < __SCHED_FEAT_NR; i++) {
                 if (!(sysctl_sched_features & (1UL << i)))
                         seq_puts(m, "NO_");
                 seq_printf(m, "%s ", sched_feat_names[i]);
@@ -159,6 +162,36 @@ static int sched_feat_show(struct seq_file *m, void *v)
         return 0;
  }
  
+#ifdef HAVE_JUMP_LABEL
+
+#define jump_label_key__true  STATIC_KEY_INIT_TRUE
+#define jump_label_key__false STATIC_KEY_INIT_FALSE
+
+#define SCHED_FEAT(name, enabled)      \
+       jump_label_key__##enabled ,
+
+struct static_key sched_feat_keys[__SCHED_FEAT_NR] = {
+#include "features.h"
+};
+
+#undef SCHED_FEAT
+
+static void sched_feat_disable(int i)
+{
+       if (static_key_enabled(&sched_feat_keys[i]))
+               static_key_slow_dec(&sched_feat_keys[i]);
+}
+
+static void sched_feat_enable(int i)
+{
+       if (!static_key_enabled(&sched_feat_keys[i]))
+               static_key_slow_inc(&sched_feat_keys[i]);
+}
+#else
+static void sched_feat_disable(int i) { };
+static void sched_feat_enable(int i) { };
+#endif /* HAVE_JUMP_LABEL */
+
  static ssize_t
  sched_feat_write(struct file *filp, const char __user *ubuf,
                 size_t cnt, loff_t *ppos)
@@ -182,17 +215,20 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
                 cmp += 3;
         }
  
-       for (i = 0; sched_feat_names[i]; i++) {
+       for (i = 0; i < __SCHED_FEAT_NR; i++) {
                 if (strcmp(cmp, sched_feat_names[i]) == 0) {
-                       if (neg)
+                       if (neg) {
                                 sysctl_sched_features &= ~(1UL << i);
-                       else
+                               sched_feat_disable(i);
+                       } else {
                                 sysctl_sched_features |= (1UL << i);
+                               sched_feat_enable(i);
+                       }
                         break;
                 }
         }
  
-       if (!sched_feat_names[i])
+       if (i == __SCHED_FEAT_NR)
                 return -EINVAL;
  
         *ppos += cnt;
@@ -221,8 +257,7 @@ static __init int sched_init_debug(void)
         return 0;
  }
  late_initcall(sched_init_debug);
-
-#endif
+#endif /* CONFIG_SCHED_DEBUG */
  
  /*
   * Number of tasks to iterate in a single balance run.
@@ -691,9 +726,6 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
         p->sched_class->dequeue_task(rq, p, flags);
  }
  
-/*
- * activate_task - move a task to the runqueue.
- */
  void activate_task(struct rq *rq, struct task_struct *p, int flags)
  {
         if (task_contributes_to_load(p))
@@ -702,9 +734,6 @@ void activate_task(struct rq *rq, struct task_struct *p, int flags)
         enqueue_task(rq, p, flags);
  }
  
-/*
- * deactivate_task - remove a task from the runqueue.
- */
  void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
  {
         if (task_contributes_to_load(p))
@@ -867,7 +896,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
         delta -= irq_delta;
  #endif
  #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
-       if (static_branch((&paravirt_steal_rq_enabled))) {
+       if (static_key_false((&paravirt_steal_rq_enabled))) {
                 u64 st;
  
                 steal = paravirt_steal_clock(cpu_of(rq));
@@ -896,14 +925,14 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
  #ifdef CONFIG_IRQ_TIME_ACCOUNTING
  static int irqtime_account_hi_update(void)
  {
-       struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+       u64 *cpustat = kcpustat_this_cpu->cpustat;
         unsigned long flags;
         u64 latest_ns;
         int ret = 0;
  
         local_irq_save(flags);
         latest_ns = this_cpu_read(cpu_hardirq_time);
-       if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->irq))
+       if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_IRQ])
                 ret = 1;
         local_irq_restore(flags);
         return ret;
@@ -911,14 +940,14 @@ static int irqtime_account_hi_update(void)
  
  static int irqtime_account_si_update(void)
  {
-       struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+       u64 *cpustat = kcpustat_this_cpu->cpustat;
         unsigned long flags;
         u64 latest_ns;
         int ret = 0;
  
         local_irq_save(flags);
         latest_ns = this_cpu_read(cpu_softirq_time);
-       if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->softirq))
+       if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_SOFTIRQ])
                 ret = 1;
         local_irq_restore(flags);
         return ret;
@@ -1236,29 +1265,59 @@ EXPORT_SYMBOL_GPL(kick_process);
   */
  static int select_fallback_rq(int cpu, struct task_struct *p)
  {
-       int dest_cpu;
         const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu));
+       enum { cpuset, possible, fail } state = cpuset;
+       int dest_cpu;
  
         /* Look for allowed, online CPU in same node. */
-       for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask)
+       for_each_cpu(dest_cpu, nodemask) {
+               if (!cpu_online(dest_cpu))
+                       continue;
+               if (!cpu_active(dest_cpu))
+                       continue;
                 if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
                         return dest_cpu;
+       }
  
-       /* Any allowed, online CPU? */
-       dest_cpu = cpumask_any_and(tsk_cpus_allowed(p), cpu_active_mask);
-       if (dest_cpu < nr_cpu_ids)
-               return dest_cpu;
+       for (;;) {
+               /* Any allowed, online CPU? */
+               for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) {
+                       if (!cpu_online(dest_cpu))
+                               continue;
+                       if (!cpu_active(dest_cpu))
+                               continue;
+                       goto out;
+               }
  
-       /* No more Mr. Nice Guy. */
-       dest_cpu = cpuset_cpus_allowed_fallback(p);
-       /*
-        * Don't tell them about moving exiting tasks or
-        * kernel threads (both mm NULL), since they never
-        * leave kernel.
-        */
-       if (p->mm && printk_ratelimit()) {
-               printk(KERN_INFO "process %d (%s) no longer affine to cpu%d\n",
-                               task_pid_nr(p), p->comm, cpu);
+               switch (state) {
+               case cpuset:
+                       /* No more Mr. Nice Guy. */
+                       cpuset_cpus_allowed_fallback(p);
+                       state = possible;
+                       break;
+
+               case possible:
+                       do_set_cpus_allowed(p, cpu_possible_mask);
+                       state = fail;
+                       break;
+
+               case fail:
+                       BUG();
+                       break;
+               }
+       }
+
+out:
+       if (state != cpuset) {
+               /*
+                * Don't tell them about moving exiting tasks or
+                * kernel threads (both mm NULL), since they never
+                * leave kernel.
+                */
+               if (p->mm && printk_ratelimit()) {
+                       printk_sched("process %d (%s) no longer affine to cpu%d\n",
+                                       task_pid_nr(p), p->comm, cpu);
+               }
         }
  
         return dest_cpu;
@@ -1479,6 +1538,11 @@ static int ttwu_activate_remote(struct task_struct *p, int wake_flags)
  
  }
  #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
+
+bool cpus_share_cache(int this_cpu, int that_cpu)
+{
+       return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
+}
  #endif /* CONFIG_SMP */
  
  static void ttwu_queue(struct task_struct *p, int cpu)
@@ -1486,7 +1550,7 @@ static void ttwu_queue(struct task_struct *p, int cpu)
         struct rq *rq = cpu_rq(cpu);
  
  #if defined(CONFIG_SMP)
-       if (sched_feat(TTWU_QUEUE) && cpu != smp_processor_id()) {
+       if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {
                 sched_clock_cpu(cpu); /* sync clocks x-cpu */
                 ttwu_queue_remote(p, cpu);
                 return;
@@ -1900,6 +1964,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
         local_irq_enable();
  #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
         finish_lock_switch(rq, prev);
+       finish_arch_post_lock_switch();
  
         fire_sched_in_preempt_notifiers(current);
         if (mm)
@@ -2234,13 +2299,10 @@ calc_load_n(unsigned long load, unsigned long exp,
   * Once we've updated the global active value, we need to apply the exponential
   * weights adjusted to the number of cycles missed.
   */
-static void calc_global_nohz(unsigned long ticks)
+static void calc_global_nohz(void)
  {
         long delta, active, n;
  
-       if (time_before(jiffies, calc_load_update))
-               return;
-
         /*
          * If we crossed a calc_load_update boundary, make sure to fold
          * any pending idle changes, the respective CPUs might have
@@ -2252,31 +2314,25 @@ static void calc_global_nohz(unsigned long ticks)
                 atomic_long_add(delta, &calc_load_tasks);
  
         /*
-        * If we were idle for multiple load cycles, apply them.
+        * It could be that the one fold was all it took; if so, we are done!
          */
-       if (ticks >= LOAD_FREQ) {
-               n = ticks / LOAD_FREQ;
+       if (time_before(jiffies, calc_load_update + 10))
+               return;
  
-               active = atomic_long_read(&calc_load_tasks);
-               active = active > 0 ? active * FIXED_1 : 0;
+       /*
+        * Catch up: fold in however many periods we are still behind.
+        */
+       delta = jiffies - calc_load_update - 10;
+       n = 1 + (delta / LOAD_FREQ);
  
-               avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
-               avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
-               avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
+       active = atomic_long_read(&calc_load_tasks);
+       active = active > 0 ? active * FIXED_1 : 0;
  
-               calc_load_update += n * LOAD_FREQ;
-       }
+       avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
+       avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
+       avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
  
-       /*
-        * Its possible the remainder of the above division also crosses
-        * a LOAD_FREQ period, the regular check in calc_global_load()
-        * which comes after this will take care of that.
-        *
-        * Consider us being 11 ticks before a cycle completion, and us
-        * sleeping for 4*LOAD_FREQ + 22 ticks, then the above code will
-        * age us 4 cycles, and the test in calc_global_load() will
-        * pick up the final one.
-        */
+       calc_load_update += n * LOAD_FREQ;
  }
  #else
  void calc_load_account_idle(struct rq *this_rq)
@@ -2288,7 +2344,7 @@ static inline long calc_load_fold_idle(void)
         return 0;
  }
  
-static void calc_global_nohz(unsigned long ticks)
+static void calc_global_nohz(void)
  {
  }
  #endif
@@ -2316,8 +2372,6 @@ void calc_global_load(unsigned long ticks)
  {
         long active;
  
-       calc_global_nohz(ticks);
-
         if (time_before(jiffies, calc_load_update + 10))
                 return;
  
@@ -2329,6 +2383,16 @@ void calc_global_load(unsigned long ticks)
         avenrun[2] = calc_load(avenrun[2], EXP_15, active);
  
         calc_load_update += LOAD_FREQ;
+
+       /*
+        * Account one period with whatever state we found before
+        * folding in the nohz state and ageing the entire idle period.
+        *
+        * This avoids losing a sample when we go idle between
+        * calc_load_account_active() (10 ticks ago) and now and thus
+        * under-accounting.
+        */
+       calc_global_nohz();
  }
  
  /*
@@ -2500,8 +2564,10 @@ unlock:
  #endif
  
  DEFINE_PER_CPU(struct kernel_stat, kstat);
+DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);
  
  EXPORT_PER_CPU_SYMBOL(kstat);
+EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
  
  /*
   * Return any ns on the sched_clock that have not yet been accounted in
@@ -2554,6 +2620,84 @@ unsigned long long task_sched_runtime(struct task_struct *p)
         return ns;
  }
  
+#ifdef CONFIG_CGROUP_CPUACCT
+struct cgroup_subsys cpuacct_subsys;
+struct cpuacct root_cpuacct;
+#endif
+
+static inline void task_group_account_field(struct task_struct *p, int index,
+                                           u64 tmp)
+{
+#ifdef CONFIG_CGROUP_CPUACCT
+       struct kernel_cpustat *kcpustat;
+       struct cpuacct *ca;
+#endif
+       /*
+        * Since all updates are sure to touch the root cgroup, we
+        * get ourselves ahead and touch it first. If the root cgroup
+        * is the only cgroup, then nothing else should be necessary.
+        *
+        */
+       __get_cpu_var(kernel_cpustat).cpustat[index] += tmp;
+
+#ifdef CONFIG_CGROUP_CPUACCT
+       if (unlikely(!cpuacct_subsys.active))
+               return;
+
+       rcu_read_lock();
+       ca = task_ca(p);
+       while (ca && (ca != &root_cpuacct)) {
+               kcpustat = this_cpu_ptr(ca->cpustat);
+               kcpustat->cpustat[index] += tmp;
+               ca = parent_ca(ca);
+       }
+       rcu_read_unlock();
+#endif
+}
+
+
+#if !defined(CONFIG_XEN) || defined(CONFIG_VIRT_CPU_ACCOUNTING)
+# define cputime_to_u64(t) ((__force u64)(t))
+#else
+# include <linux/syscore_ops.h>
+# define NS_PER_TICK (1000000000 / HZ)
+
+static DEFINE_PER_CPU(u64, steal_snapshot);
+static DEFINE_PER_CPU(unsigned int, steal_residual);
+
+static u64 cputime_to_u64(cputime_t t)
+{
+       u64 s = this_vcpu_read(runstate.time[RUNSTATE_runnable]);
+       unsigned long adj = div_u64_rem(s - __this_cpu_read(steal_snapshot)
+                                         + __this_cpu_read(steal_residual),
+                                       NS_PER_TICK,
+                                       &__get_cpu_var(steal_residual));
+
+       __this_cpu_write(steal_snapshot, s);
+       if (t < jiffies_to_cputime(adj))
+               return 0;
+
+       return (__force u64)(t - jiffies_to_cputime(adj));
+}
+
+static void steal_resume(void)
+{
+       cputime_to_u64(((cputime_t)1 << (BITS_PER_LONG * sizeof(cputime_t)
+                                        / sizeof(long) - 1)) - 1);
+}
+
+static struct syscore_ops steal_syscore_ops = {
+       .resume = steal_resume,
+};
+
+static int __init steal_register(void)
+{
+       register_syscore_ops(&steal_syscore_ops);
+       return 0;
+}
+core_initcall(steal_register);
+#endif
+
  /*
   * Account user cpu time to a process.
   * @p: the process that the cpu time gets accounted to
@@ -2563,22 +2707,18 @@ unsigned long long task_sched_runtime(struct task_struct *p)
  void account_user_time(struct task_struct *p, cputime_t cputime,
                        cputime_t cputime_scaled)
  {
-       struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
-       cputime64_t tmp;
+       int index;
  
         /* Add user time to process. */
-       p->utime = cputime_add(p->utime, cputime);
-       p->utimescaled = cputime_add(p->utimescaled, cputime_scaled);
+       p->utime += cputime;
+       p->utimescaled += cputime_scaled;
         account_group_user_time(p, cputime);
  
+       index = (TASK_NICE(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;
+
         /* Add user time to cpustat. */
-       tmp = cputime_to_cputime64(cputime);
-       if (TASK_NICE(p) > 0)
-               cpustat->nice = cputime64_add(cpustat->nice, tmp);
-       else
-               cpustat->user = cputime64_add(cpustat->user, tmp);
+       task_group_account_field(p, index, cputime_to_u64(cputime));
  
-       cpuacct_update_stats(p, CPUACCT_STAT_USER, cputime);
         /* Account for user time used */
         acct_update_integrals(p);
  }
@@ -2592,24 +2732,21 @@ void account_user_time(struct task_struct *p, cputime_t cputime,
  static void account_guest_time(struct task_struct *p, cputime_t cputime,
                                cputime_t cputime_scaled)
  {
-       cputime64_t tmp;
-       struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
-
-       tmp = cputime_to_cputime64(cputime);
+       u64 *cpustat = kcpustat_this_cpu->cpustat;
  
         /* Add guest time to process. */
-       p->utime = cputime_add(p->utime, cputime);
-       p->utimescaled = cputime_add(p->utimescaled, cputime_scaled);
+       p->utime += cputime;
+       p->utimescaled += cputime_scaled;
         account_group_user_time(p, cputime);
-       p->gtime = cputime_add(p->gtime, cputime);
+       p->gtime += cputime;
  
         /* Add guest time to cpustat. */
         if (TASK_NICE(p) > 0) {
-               cpustat->nice = cputime64_add(cpustat->nice, tmp);
-               cpustat->guest_nice = cputime64_add(cpustat->guest_nice, tmp);
+               cpustat[CPUTIME_NICE] += (__force u64) cputime;
+               cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime;
         } else {
-               cpustat->user = cputime64_add(cpustat->user, tmp);
-               cpustat->guest = cputime64_add(cpustat->guest, tmp);
+               cpustat[CPUTIME_USER] += (__force u64) cputime;
+               cpustat[CPUTIME_GUEST] += (__force u64) cputime;
         }
  }
  
@@ -2622,18 +2759,15 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime,
   */
  static inline
  void __account_system_time(struct task_struct *p, cputime_t cputime,
-                       cputime_t cputime_scaled, cputime64_t *target_cputime64)
+                       cputime_t cputime_scaled, int index)
  {
-       cputime64_t tmp = cputime_to_cputime64(cputime);
-
         /* Add system time to process. */
-       p->stime = cputime_add(p->stime, cputime);
-       p->stimescaled = cputime_add(p->stimescaled, cputime_scaled);
+       p->stime += cputime;
+       p->stimescaled += cputime_scaled;
         account_group_system_time(p, cputime);
  
         /* Add system time to cpustat. */
-       *target_cputime64 = cputime64_add(*target_cputime64, tmp);
-       cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
+       task_group_account_field(p, index, cputime_to_u64(cputime));
  
         /* Account for system time used */
         acct_update_integrals(p);
@@ -2649,8 +2783,7 @@ void __account_system_time(struct task_struct *p, cputime_t cputime,
  void account_system_time(struct task_struct *p, int hardirq_offset,
                          cputime_t cputime, cputime_t cputime_scaled)
  {
-       struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
-       cputime64_t *target_cputime64;
+       int index;
  
         if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
                 account_guest_time(p, cputime, cputime_scaled);
@@ -2658,13 +2791,13 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
         }
  
         if (hardirq_count() - hardirq_offset)
-               target_cputime64 = &cpustat->irq;
+               index = CPUTIME_IRQ;
         else if (in_serving_softirq())
-               target_cputime64 = &cpustat->softirq;
+               index = CPUTIME_SOFTIRQ;
         else
-               target_cputime64 = &cpustat->system;
+               index = CPUTIME_SYSTEM;
  
-       __account_system_time(p, cputime, cputime_scaled, target_cputime64);
+       __account_system_time(p, cputime, cputime_scaled, index);
  }
  
  /*
@@ -2673,10 +2806,9 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
   */
  void account_steal_time(cputime_t cputime)
  {
-       struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
-       cputime64_t cputime64 = cputime_to_cputime64(cputime);
+       u64 *cpustat = kcpustat_this_cpu->cpustat;
  
-       cpustat->steal = cputime64_add(cpustat->steal, cputime64);
+       cpustat[CPUTIME_STEAL] += (__force u64) cputime;
  }
  
  /*
@@ -2685,20 +2817,19 @@ void account_steal_time(cputime_t cputime)
   */
  void account_idle_time(cputime_t cputime)
  {
-       struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
-       cputime64_t cputime64 = cputime_to_cputime64(cputime);
+       u64 *cpustat = kcpustat_this_cpu->cpustat;
         struct rq *rq = this_rq();
  
         if (atomic_read(&rq->nr_iowait) > 0)
-               cpustat->iowait = cputime64_add(cpustat->iowait, cputime64);
+               cpustat[CPUTIME_IOWAIT] += cputime_to_u64(cputime);
         else
-               cpustat->idle = cputime64_add(cpustat->idle, cputime64);
+               cpustat[CPUTIME_IDLE] += cputime_to_u64(cputime);
  }
  
  static __always_inline bool steal_account_process_tick(void)
  {
  #ifdef CONFIG_PARAVIRT
-       if (static_branch(&paravirt_steal_enabled)) {
+       if (static_key_false(&paravirt_steal_enabled)) {
                 u64 steal, st = 0;
  
                 steal = paravirt_steal_clock(smp_processor_id());
@@ -2742,16 +2873,15 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
                                                 struct rq *rq)
  {
         cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
-       cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy);
-       struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+       u64 *cpustat = kcpustat_this_cpu->cpustat;
  
         if (steal_account_process_tick())
                 return;
  
         if (irqtime_account_hi_update()) {
-               cpustat->irq = cputime64_add(cpustat->irq, tmp);
+               cpustat[CPUTIME_IRQ] += cputime_to_u64(cputime_one_jiffy);
         } else if (irqtime_account_si_update()) {
-               cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
+               cpustat[CPUTIME_SOFTIRQ] += cputime_to_u64(cputime_one_jiffy);
         } else if (this_cpu_ksoftirqd() == p) {
                 /*
                  * ksoftirqd time do not get accounted in cpu_softirq_time.
@@ -2759,7 +2889,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
                  * Also, p->stime needs to be updated for ksoftirqd.
                  */
                 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
-                                       &cpustat->softirq);
+                                       CPUTIME_SOFTIRQ);
         } else if (user_tick) {
                 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
         } else if (p == rq->idle) {
@@ -2768,7 +2898,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
                 account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled);
         } else {
                 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
-                                       &cpustat->system);
+                                       CPUTIME_SYSTEM);
         }
  }
  
@@ -2867,7 +2997,7 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
  
  void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
  {
-       cputime_t rtime, utime = p->utime, total = cputime_add(utime, p->stime);
+       cputime_t rtime, utime = p->utime, total = utime + p->stime;
  
         /*
          * Use CFS's precise accounting:
@@ -2875,11 +3005,11 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
         rtime = nsecs_to_cputime(p->se.sum_exec_runtime);
  
         if (total) {
-               u64 temp = rtime;
+               u64 temp = (__force u64) rtime;
  
-               temp *= utime;
-               do_div(temp, total);
-               utime = (cputime_t)temp;
+               temp *= (__force u64) utime;
+               do_div(temp, (__force u32) total);
+               utime = (__force cputime_t) temp;
         } else
                 utime = rtime;
  
@@ -2887,7 +3017,7 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
          * Compare with previous values, to keep monotonicity:
          */
         p->prev_utime = max(p->prev_utime, utime);
-       p->prev_stime = max(p->prev_stime, cputime_sub(rtime, p->prev_utime));
+       p->prev_stime = max(p->prev_stime, rtime - p->prev_utime);
  
         *ut = p->prev_utime;
         *st = p->prev_stime;
@@ -2904,21 +3034,20 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
  
         thread_group_cputime(p, &cputime);
  
-       total = cputime_add(cputime.utime, cputime.stime);
+       total = cputime.utime + cputime.stime;
         rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
  
         if (total) {
-               u64 temp = rtime;
+               u64 temp = (__force u64) rtime;
  
-               temp *= cputime.utime;
-               do_div(temp, total);
-               utime = (cputime_t)temp;
+               temp *= (__force u64) cputime.utime;
+               do_div(temp, (__force u32) total);
+               utime = (__force cputime_t) temp;
         } else
                 utime = rtime;
  
         sig->prev_utime = max(sig->prev_utime, utime);
-       sig->prev_stime = max(sig->prev_stime,
-                             cputime_sub(rtime, sig->prev_utime));
+       sig->prev_stime = max(sig->prev_stime, rtime - sig->prev_utime);
  
         *ut = sig->prev_utime;
         *st = sig->prev_stime;
@@ -3015,7 +3144,8 @@ EXPORT_SYMBOL(sub_preempt_count);
   */
  static noinline void __schedule_bug(struct task_struct *prev)
  {
-       struct pt_regs *regs = get_irq_regs();
+       if (oops_in_progress)
+               return;
  
         printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
                 prev->comm, prev->pid, preempt_count());
@@ -3024,11 +3154,7 @@ static noinline void __schedule_bug(struct task_struct *prev)
         print_modules();
         if (irqs_disabled())
                 print_irqtrace_events(prev);
-
-       if (regs)
-               show_regs(regs);
-       else
-               dump_stack();
+       dump_stack();
  }
  
  /*
@@ -3162,14 +3288,14 @@ need_resched:
  
         post_schedule(rq);
  
-       preempt_enable_no_resched();
+       sched_preempt_enable_no_resched();
         if (need_resched())
                 goto need_resched;
  }
  
  static inline void sched_submit_work(struct task_struct *tsk)
  {
-       if (!tsk->state)
+       if (!tsk->state || tsk_is_pi_blocked(tsk))
                 return;
         /*
          * If we are going to sleep and we have plugged IO queued,
@@ -3188,7 +3314,24 @@ asmlinkage void __sched schedule(void)
  }
  EXPORT_SYMBOL(schedule);
  
+/**
+ * schedule_preempt_disabled - called with preemption disabled
+ *
+ * Returns with preemption disabled. Note: preempt_count must be 1
+ */
+void __sched schedule_preempt_disabled(void)
+{
+       sched_preempt_enable_no_resched();
+       schedule();
+       preempt_disable();
+}
+
  #ifdef CONFIG_MUTEX_SPIN_ON_OWNER
+#include <asm/mutex.h>
+
+#ifndef arch_cpu_is_running
+#define arch_cpu_is_running(cpu) true
+#endif
  
  static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
  {
@@ -3203,7 +3346,8 @@ static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
          */
         barrier();
  
-       return owner->on_cpu;
+       return owner->on_cpu
+              && arch_cpu_is_running(task_thread_info(owner)->cpu);
  }
  
  /*
@@ -3348,9 +3492,9 @@ EXPORT_SYMBOL(__wake_up);
  /*
   * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
   */
-void __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
+void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr)
  {
-       __wake_up_common(q, mode, 1, 0, NULL);
+       __wake_up_common(q, mode, nr, 0, NULL);
  }
  EXPORT_SYMBOL_GPL(__wake_up_locked);
  
@@ -3709,6 +3853,24 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
  
         rq = __task_rq_lock(p);
  
+       /*
+        * Idle task boosting is a no-no in general. There is one
+        * exception, when PREEMPT_RT and NOHZ are active:
+        *
+        * The idle task calls get_next_timer_interrupt() and holds
+        * the timer wheel base->lock on the CPU and another CPU wants
+        * to access the timer (probably to cancel it). We can safely
+        * ignore the boosting request, as the idle CPU runs this code
+        * with interrupts disabled and will complete the lock
+        * protected section without being interrupted. So there is no
+        * real need to boost.
+        */
+       if (unlikely(p == rq->idle)) {
+               WARN_ON(p != rq->curr);
+               WARN_ON(p->pi_blocked_on);
+               goto out_unlock;
+       }
+
         trace_sched_pi_setprio(p, prio);
         oldprio = p->prio;
         prev_class = p->sched_class;
@@ -3732,11 +3894,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
                 enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
  
         check_class_changed(rq, p, prev_class, oldprio);
+out_unlock:
         __task_rq_unlock(rq);
  }
-
  #endif
-
  void set_user_nice(struct task_struct *p, long nice)
  {
         int old_prio, delta, on_rq;
@@ -4070,7 +4231,7 @@ recheck:
         on_rq = p->on_rq;
         running = task_current(rq, p);
         if (on_rq)
-               deactivate_task(rq, p, 0);
+               dequeue_task(rq, p, 0);
         if (running)
                 p->sched_class->put_prev_task(rq, p);
  
@@ -4083,7 +4244,7 @@ recheck:
         if (running)
                 p->sched_class->set_curr_task(rq);
         if (on_rq)
-               activate_task(rq, p, 0);
+               enqueue_task(rq, p, 0);
  
         check_class_changed(rq, p, prev_class, oldprio);
         task_rq_unlock(rq, p, &flags);
@@ -4266,7 +4427,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
                 goto out_free_cpus_allowed;
         }
         retval = -EPERM;
-       if (!check_same_owner(p) && !task_ns_capable(p, CAP_SYS_NICE))
+       if (!check_same_owner(p) && !ns_capable(task_user_ns(p), CAP_SYS_NICE))
                 goto out_unlock;
  
         retval = security_task_setscheduler(p);
@@ -4416,7 +4577,7 @@ SYSCALL_DEFINE0(sched_yield)
         __release(rq->lock);
         spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
         do_raw_spin_unlock(&rq->lock);
-       preempt_enable_no_resched();
+       sched_preempt_enable_no_resched();
  
         schedule();
  
@@ -4490,8 +4651,24 @@ EXPORT_SYMBOL(__cond_resched_softirq);
  /**
   * yield - yield the current processor to other threads.
   *
- * This is a shortcut for kernel-space yielding - it marks the
- * thread runnable and calls sys_sched_yield().
+ * Do not ever use this function, there's a 99% chance you're doing it wrong.
+ *
+ * The scheduler is at all times free to pick the calling task as the most
+ * eligible task to run; if removing the yield() call from your code breaks
+ * it, it's already broken.
+ *
+ * Typical broken usage is:
+ *
+ * while (!event)
+ *     yield();
+ *
+ * where one assumes that yield() will let 'the other' process run that will
+ * make event true. If the current task is a SCHED_FIFO task that will never
+ * happen. Never use yield() as a progress guarantee!!
+ *
+ * If you want to use yield() to wait for something, use wait_event().
+ * If you want to use yield() to be 'nice' for others, use cond_resched().
+ * If you still want to use yield(), do not!
   */
  void __sched yield(void)
  {
@@ -4722,7 +4899,7 @@ void sched_show_task(struct task_struct *p)
         free = stack_not_used(p);
  #endif
         printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
-               task_pid_nr(p), task_pid_nr(p->real_parent),
+               task_pid_nr(p), task_pid_nr(rcu_dereference(p->real_parent)),
                 (unsigned long)task_thread_info(p)->flags);
  
         show_stack(p, NULL);
@@ -4934,9 +5111,9 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
          * placed properly.
          */
         if (p->on_rq) {
-               deactivate_task(rq_src, p, 0);
+               dequeue_task(rq_src, p, 0);
                 set_task_cpu(p, dest_cpu);
-               activate_task(rq_dest, p, 0);
+               enqueue_task(rq_dest, p, 0);
                 check_preempt_curr(rq_dest, p, 0);
         }
  done:
@@ -5112,7 +5289,7 @@ static void sd_free_ctl_entry(struct ctl_table **tablep)
  static void
  set_table_entry(struct ctl_table *entry,
                 const char *procname, void *data, int maxlen,
-               mode_t mode, proc_handler *proc_handler)
+               umode_t mode, proc_handler *proc_handler)
  {
         entry->procname = procname;
         entry->data = data;
@@ -5323,7 +5500,7 @@ static int __cpuinit sched_cpu_active(struct notifier_block *nfb,
                                       unsigned long action, void *hcpu)
  {
         switch (action & ~CPU_TASKS_FROZEN) {
-       case CPU_ONLINE:
+       case CPU_STARTING:
         case CPU_DOWN_FAILED:
                 set_cpu_active((long)hcpu, true);
                 return NOTIFY_OK;
@@ -5689,6 +5866,31 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu)
  }
  
  /*
+ * Keep a special pointer to the highest sched_domain that has
+ * SD_SHARE_PKG_RESOURCES set (Last Level Cache Domain); this
+ * allows us to avoid some pointer chasing in select_idle_sibling().
+ *
+ * Also keep a unique ID per domain (we use the first cpu number in
+ * the cpumask of the domain), this allows us to quickly tell if
+ * two cpus are in the same cache domain, see cpus_share_cache().
+ */
+DEFINE_PER_CPU(struct sched_domain *, sd_llc);
+DEFINE_PER_CPU(int, sd_llc_id);
+
+static void update_top_cache_domain(int cpu)
+{
+       struct sched_domain *sd;
+       int id = cpu;
+
+       sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
+       if (sd)
+               id = cpumask_first(sched_domain_span(sd));
+
+       rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
+       per_cpu(sd_llc_id, cpu) = id;
+}
+
+/*
   * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
   * hold the hotplug lock.
   */
@@ -5727,6 +5929,8 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
         tmp = rq->sd;
         rcu_assign_pointer(rq->sd, sd);
         destroy_sched_domains(tmp, cpu);
+
+       update_top_cache_domain(cpu);
  }
  
  /* cpus with isolated domains */
@@ -6226,6 +6430,8 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
                         if (!sg)
                                 return -ENOMEM;
  
+                       sg->next = sg;
+
                         *per_cpu_ptr(sdd->sg, j) = sg;
  
                         sgp = kzalloc_node(sizeof(struct sched_group_power),
@@ -6249,16 +6455,26 @@ static void __sdt_free(const struct cpumask *cpu_map)
                 struct sd_data *sdd = &tl->data;
  
                 for_each_cpu(j, cpu_map) {
-                       struct sched_domain *sd = *per_cpu_ptr(sdd->sd, j);
-                       if (sd && (sd->flags & SD_OVERLAP))
-                               free_sched_groups(sd->groups, 0);
-                       kfree(*per_cpu_ptr(sdd->sd, j));
-                       kfree(*per_cpu_ptr(sdd->sg, j));
-                       kfree(*per_cpu_ptr(sdd->sgp, j));
+                       struct sched_domain *sd;
+
+                       if (sdd->sd) {
+                               sd = *per_cpu_ptr(sdd->sd, j);
+                               if (sd && (sd->flags & SD_OVERLAP))
+                                       free_sched_groups(sd->groups, 0);
+                               kfree(*per_cpu_ptr(sdd->sd, j));
+                       }
+
+                       if (sdd->sg)
+                               kfree(*per_cpu_ptr(sdd->sg, j));
+                       if (sdd->sgp)
+                               kfree(*per_cpu_ptr(sdd->sgp, j));
                 }
                 free_percpu(sdd->sd);
+               sdd->sd = NULL;
                 free_percpu(sdd->sg);
+               sdd->sg = NULL;
                 free_percpu(sdd->sgp);
+               sdd->sgp = NULL;
         }
  }
  
@@ -6584,54 +6800,52 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
  }
  
  #ifdef CONFIG_SCHED_MC
-static ssize_t sched_mc_power_savings_show(struct sysdev_class *class,
-                                          struct sysdev_class_attribute *attr,
-                                          char *page)
+static ssize_t sched_mc_power_savings_show(struct device *dev,
+                                          struct device_attribute *attr,
+                                          char *buf)
  {
-       return sprintf(page, "%u\n", sched_mc_power_savings);
+       return sprintf(buf, "%u\n", sched_mc_power_savings);
  }
-static ssize_t sched_mc_power_savings_store(struct sysdev_class *class,
-                                           struct sysdev_class_attribute *attr,
+static ssize_t sched_mc_power_savings_store(struct device *dev,
+                                           struct device_attribute *attr,
                                             const char *buf, size_t count)
  {
         return sched_power_savings_store(buf, count, 0);
  }
-static SYSDEV_CLASS_ATTR(sched_mc_power_savings, 0644,
-                        sched_mc_power_savings_show,
-                        sched_mc_power_savings_store);
+static DEVICE_ATTR(sched_mc_power_savings, 0644,
+                  sched_mc_power_savings_show,
+                  sched_mc_power_savings_store);
  #endif
  
  #ifdef CONFIG_SCHED_SMT
-static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev,
-                                           struct sysdev_class_attribute *attr,
-                                           char *page)
+static ssize_t sched_smt_power_savings_show(struct device *dev,
+                                           struct device_attribute *attr,
+                                           char *buf)
  {
-       return sprintf(page, "%u\n", sched_smt_power_savings);
+       return sprintf(buf, "%u\n", sched_smt_power_savings);
  }
-static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev,
-                                            struct sysdev_class_attribute *attr,
+static ssize_t sched_smt_power_savings_store(struct device *dev,
+                                           struct device_attribute *attr,
                                              const char *buf, size_t count)
  {
         return sched_power_savings_store(buf, count, 1);
  }
-static SYSDEV_CLASS_ATTR(sched_smt_power_savings, 0644,
+static DEVICE_ATTR(sched_smt_power_savings, 0644,
                    sched_smt_power_savings_show,
                    sched_smt_power_savings_store);
  #endif
  
-int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
+int __init sched_create_sysfs_power_savings_entries(struct device *dev)
  {
         int err = 0;
  
  #ifdef CONFIG_SCHED_SMT
         if (smt_capable())
-               err = sysfs_create_file(&cls->kset.kobj,
-                                       &attr_sched_smt_power_savings.attr);
+               err = device_create_file(dev, &dev_attr_sched_smt_power_savings);
  #endif
  #ifdef CONFIG_SCHED_MC
         if (!err && mc_capable())
-               err = sysfs_create_file(&cls->kset.kobj,
-                                       &attr_sched_mc_power_savings.attr);
+               err = device_create_file(dev, &dev_attr_sched_mc_power_savings);
  #endif
         return err;
  }
@@ -6779,8 +6993,15 @@ void __init sched_init(void)
         INIT_LIST_HEAD(&root_task_group.children);
         INIT_LIST_HEAD(&root_task_group.siblings);
         autogroup_init(&init_task);
+
  #endif /* CONFIG_CGROUP_SCHED */
  
+#ifdef CONFIG_CGROUP_CPUACCT
+       root_cpuacct.cpustat = &kernel_cpustat;
+       root_cpuacct.cpuusage = alloc_percpu(u64);
+       /* Too early, not expected to fail */
+       BUG_ON(!root_cpuacct.cpuusage);
+#endif
         for_each_possible_cpu(i) {
                 struct rq *rq;
  
@@ -6840,6 +7061,9 @@ void __init sched_init(void)
                 rq->online = 0;
                 rq->idle_stamp = 0;
                 rq->avg_idle = 2*sysctl_sched_migration_cost;
+
+               INIT_LIST_HEAD(&rq->cfs_tasks);
+
                 rq_attach_root(rq, &def_root_domain);
  #ifdef CONFIG_NO_HZ
                 rq->nohz_flags = 0;
@@ -6936,10 +7160,10 @@ static void normalize_task(struct rq *rq, struct task_struct *p)
  
         on_rq = p->on_rq;
         if (on_rq)
-               deactivate_task(rq, p, 0);
+               dequeue_task(rq, p, 0);
         __setscheduler(rq, p, SCHED_NORMAL, 0);
         if (on_rq) {
-               activate_task(rq, p, 0);
+               enqueue_task(rq, p, 0);
                 resched_task(rq->curr);
         }
  
@@ -7038,10 +7262,6 @@ void set_curr_task(int cpu, struct task_struct *p)
  
  #endif
  
-#ifdef CONFIG_RT_GROUP_SCHED
-#else /* !CONFIG_RT_GROUP_SCHED */
-#endif /* CONFIG_RT_GROUP_SCHED */
-
  #ifdef CONFIG_CGROUP_SCHED
  /* task_group_lock serializes the addition/removal of task groups */
  static DEFINE_SPINLOCK(task_group_lock);
@@ -7150,9 +7370,6 @@ void sched_move_task(struct task_struct *tsk)
  }
  #endif /* CONFIG_CGROUP_SCHED */
  
-#ifdef CONFIG_FAIR_GROUP_SCHED
-#endif
-
  #if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH)
  static unsigned long to_ratio(u64 period, u64 runtime)
  {
@@ -7441,8 +7658,7 @@ static inline struct task_group *cgroup_tg(struct cgroup *cgrp)
                             struct task_group, css);
  }
  
-static struct cgroup_subsys_state *
-cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
+static struct cgroup_subsys_state *cpu_cgroup_create(struct cgroup *cgrp)
  {
         struct task_group *tg, *parent;
  
@@ -7459,37 +7675,43 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
         return &tg->css;
  }
  
-static void
-cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
+static void cpu_cgroup_destroy(struct cgroup *cgrp)
  {
         struct task_group *tg = cgroup_tg(cgrp);
  
         sched_destroy_group(tg);
  }
  
-static int
-cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
+static int cpu_cgroup_can_attach(struct cgroup *cgrp,
+                                struct cgroup_taskset *tset)
  {
+       struct task_struct *task;
+
+       cgroup_taskset_for_each(task, cgrp, tset) {
  #ifdef CONFIG_RT_GROUP_SCHED
-       if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk))
-               return -EINVAL;
+               if (!sched_rt_can_attach(cgroup_tg(cgrp), task))
+                       return -EINVAL;
  #else
-       /* We don't support RT-tasks being in separate groups */
-       if (tsk->sched_class != &fair_sched_class)
-               return -EINVAL;
+               /* We don't support RT-tasks being in separate groups */
+               if (task->sched_class != &fair_sched_class)
+                       return -EINVAL;
  #endif
+       }
         return 0;
  }
  
-static void
-cpu_cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
+static void cpu_cgroup_attach(struct cgroup *cgrp,
+                             struct cgroup_taskset *tset)
  {
-       sched_move_task(tsk);
+       struct task_struct *task;
+
+       cgroup_taskset_for_each(task, cgrp, tset)
+               sched_move_task(task);
  }
  
  static void
-cpu_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
-               struct cgroup *old_cgrp, struct task_struct *task)
+cpu_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp,
+               struct task_struct *task)
  {
         /*
          * cgroup_exit() is called in the copy_process() failure path.
@@ -7620,9 +7842,6 @@ int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
         period = (u64)cfs_period_us * NSEC_PER_USEC;
         quota = tg->cfs_bandwidth.quota;
  
-       if (period <= 0)
-               return -EINVAL;
-
         return tg_set_cfs_bandwidth(tg, period, quota);
  }
  
@@ -7822,8 +8041,8 @@ struct cgroup_subsys cpu_cgroup_subsys = {
         .name           = "cpu",
         .create         = cpu_cgroup_create,
         .destroy        = cpu_cgroup_destroy,
-       .can_attach_task = cpu_cgroup_can_attach_task,
-       .attach_task    = cpu_cgroup_attach_task,
+       .can_attach     = cpu_cgroup_can_attach,
+       .attach         = cpu_cgroup_attach,
         .exit           = cpu_cgroup_exit,
         .populate       = cpu_cgroup_populate,
         .subsys_id      = cpu_cgroup_subsys_id,
@@ -7841,38 +8060,15 @@ struct cgroup_subsys cpu_cgroup_subsys = {
   * (balbir@in.ibm.com).
   */
  
-/* track cpu usage of a group of tasks and its child groups */
-struct cpuacct {
-       struct cgroup_subsys_state css;
-       /* cpuusage holds pointer to a u64-type object on every cpu */
-       u64 __percpu *cpuusage;
-       struct percpu_counter cpustat[CPUACCT_STAT_NSTATS];
-       struct cpuacct *parent;
-};
-
-struct cgroup_subsys cpuacct_subsys;
-
-/* return cpu accounting group corresponding to this container */
-static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
-{
-       return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id),
-                           struct cpuacct, css);
-}
-
-/* return cpu accounting group to which this task belongs */
-static inline struct cpuacct *task_ca(struct task_struct *tsk)
-{
-       return container_of(task_subsys_state(tsk, cpuacct_subsys_id),
-                           struct cpuacct, css);
-}
-
  /* create a new cpu accounting group */
-static struct cgroup_subsys_state *cpuacct_create(
-       struct cgroup_subsys *ss, struct cgroup *cgrp)
+static struct cgroup_subsys_state *cpuacct_create(struct cgroup *cgrp)
  {
-       struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
-       int i;
+       struct cpuacct *ca;
+
+       if (!cgrp->parent)
+               return &root_cpuacct.css;
  
+       ca = kzalloc(sizeof(*ca), GFP_KERNEL);
         if (!ca)
                 goto out;
  
@@ -7880,18 +8076,13 @@ static struct cgroup_subsys_state *cpuacct_create(
         if (!ca->cpuusage)
                 goto out_free_ca;
  
-       for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
-               if (percpu_counter_init(&ca->cpustat[i], 0))
-                       goto out_free_counters;
-
-       if (cgrp->parent)
-               ca->parent = cgroup_ca(cgrp->parent);
+       ca->cpustat = alloc_percpu(struct kernel_cpustat);
+       if (!ca->cpustat)
+               goto out_free_cpuusage;
  
         return &ca->css;
  
-out_free_counters:
-       while (--i >= 0)
-               percpu_counter_destroy(&ca->cpustat[i]);
+out_free_cpuusage:
         free_percpu(ca->cpuusage);
  out_free_ca:
         kfree(ca);
@@ -7900,14 +8091,11 @@ out:
  }
  
  /* destroy an existing cpu accounting group */
-static void
-cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
+static void cpuacct_destroy(struct cgroup *cgrp)
  {
         struct cpuacct *ca = cgroup_ca(cgrp);
-       int i;
  
-       for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
-               percpu_counter_destroy(&ca->cpustat[i]);
+       free_percpu(ca->cpustat);
         free_percpu(ca->cpuusage);
         kfree(ca);
  }
@@ -8000,16 +8188,31 @@ static const char *cpuacct_stat_desc[] = {
  };
  
  static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,
-               struct cgroup_map_cb *cb)
+                             struct cgroup_map_cb *cb)
  {
         struct cpuacct *ca = cgroup_ca(cgrp);
-       int i;
+       int cpu;
+       s64 val = 0;
+
+       for_each_online_cpu(cpu) {
+               struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
+               val += kcpustat->cpustat[CPUTIME_USER];
+               val += kcpustat->cpustat[CPUTIME_NICE];
+       }
+       val = cputime64_to_clock_t(val);
+       cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val);
  
-       for (i = 0; i < CPUACCT_STAT_NSTATS; i++) {
-               s64 val = percpu_counter_read(&ca->cpustat[i]);
-               val = cputime64_to_clock_t(val);
-               cb->fill(cb, cpuacct_stat_desc[i], val);
+       val = 0;
+       for_each_online_cpu(cpu) {
+               struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
+               val += kcpustat->cpustat[CPUTIME_SYSTEM];
+               val += kcpustat->cpustat[CPUTIME_IRQ];
+               val += kcpustat->cpustat[CPUTIME_SOFTIRQ];
         }
+
+       val = cputime64_to_clock_t(val);
+       cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val);
+
         return 0;
  }
  
@@ -8053,7 +8256,7 @@ void cpuacct_charge(struct task_struct *tsk, u64 cputime)
  
         ca = task_ca(tsk);
  
-       for (; ca; ca = ca->parent) {
+       for (; ca; ca = parent_ca(ca)) {
                 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
                 *cpuusage += cputime;
         }
@@ -8061,45 +8264,6 @@ void cpuacct_charge(struct task_struct *tsk, u64 cputime)
         rcu_read_unlock();
  }
  
-/*
- * When CONFIG_VIRT_CPU_ACCOUNTING is enabled one jiffy can be very large
- * in cputime_t units. As a result, cpuacct_update_stats calls
- * percpu_counter_add with values large enough to always overflow the
- * per cpu batch limit causing bad SMP scalability.
- *
- * To fix this we scale percpu_counter_batch by cputime_one_jiffy so we
- * batch the same amount of time with CONFIG_VIRT_CPU_ACCOUNTING disabled
- * and enabled. We cap it at INT_MAX which is the largest allowed batch value.
- */
-#ifdef CONFIG_SMP
-#define CPUACCT_BATCH  \
-       min_t(long, percpu_counter_batch * cputime_one_jiffy, INT_MAX)
-#else
-#define CPUACCT_BATCH  0
-#endif
-
-/*
- * Charge the system/user time to the task's accounting group.
- */
-void cpuacct_update_stats(struct task_struct *tsk,
-               enum cpuacct_stat_index idx, cputime_t val)
-{
-       struct cpuacct *ca;
-       int batch = CPUACCT_BATCH;
-
-       if (unlikely(!cpuacct_subsys.active))
-               return;
-
-       rcu_read_lock();
-       ca = task_ca(tsk);
-
-       do {
-               __percpu_counter_add(&ca->cpustat[idx], val, batch);
-               ca = ca->parent;
-       } while (ca);
-       rcu_read_unlock();
-}
-
  struct cgroup_subsys cpuacct_subsys = {
         .name = "cpuacct",
         .create = cpuacct_create,