Update to 3.4-final.
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index dbbe35f..9a7fe31 100644
 #include <linux/ftrace.h>
 #include <linux/slab.h>
 #include <linux/init_task.h>
+#include <linux/binfmts.h>
 
+#include <asm/switch_to.h>
 #include <asm/tlb.h>
 #include <asm/irq_regs.h>
+#include <asm/mutex.h>
 #ifdef CONFIG_PARAVIRT
 #include <asm/paravirt.h>
 #endif
@@ -149,7 +152,7 @@ static int sched_feat_show(struct seq_file *m, void *v)
 {
        int i;
 
-       for (i = 0; sched_feat_names[i]; i++) {
+       for (i = 0; i < __SCHED_FEAT_NR; i++) {
                if (!(sysctl_sched_features & (1UL << i)))
                        seq_puts(m, "NO_");
                seq_printf(m, "%s ", sched_feat_names[i]);
@@ -159,6 +162,36 @@ static int sched_feat_show(struct seq_file *m, void *v)
        return 0;
 }
 
+#ifdef HAVE_JUMP_LABEL
+
+#define jump_label_key__true  STATIC_KEY_INIT_TRUE
+#define jump_label_key__false STATIC_KEY_INIT_FALSE
+
+#define SCHED_FEAT(name, enabled)      \
+       jump_label_key__##enabled ,
+
+struct static_key sched_feat_keys[__SCHED_FEAT_NR] = {
+#include "features.h"
+};
+
+#undef SCHED_FEAT
+
+static void sched_feat_disable(int i)
+{
+       if (static_key_enabled(&sched_feat_keys[i]))
+               static_key_slow_dec(&sched_feat_keys[i]);
+}
+
+static void sched_feat_enable(int i)
+{
+       if (!static_key_enabled(&sched_feat_keys[i]))
+               static_key_slow_inc(&sched_feat_keys[i]);
+}
+#else
+static void sched_feat_disable(int i) { };
+static void sched_feat_enable(int i) { };
+#endif /* HAVE_JUMP_LABEL */
+
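The sched_feat() test itself lives in kernel/sched/sched.h and is not part of this hunk; with jump labels the companion definition is expected to look roughly like the sketch below, so that a disabled feature costs a patched-out branch rather than a sysctl_sched_features bitmask test (the bitmask form remains the !HAVE_JUMP_LABEL fallback):

	/* Sketch of the expected kernel/sched/sched.h side, not part of this diff. */
	#ifdef HAVE_JUMP_LABEL
	#define SCHED_FEAT(name, enabled)					\
	static __always_inline bool static_branch_##name(struct static_key *key) \
	{									\
		return static_key_##enabled(key);				\
	}
	#include "features.h"
	#undef SCHED_FEAT

	#define sched_feat(x) (static_branch_##x(&sched_feat_keys[__SCHED_FEAT_##x]))
	#else
	#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
	#endif

The sched_feat_enable()/sched_feat_disable() helpers added above keep these keys in sync with the sysctl bitmask when a feature is toggled through debugfs.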
 static ssize_t
 sched_feat_write(struct file *filp, const char __user *ubuf,
                size_t cnt, loff_t *ppos)
@@ -182,17 +215,20 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
                cmp += 3;
        }
 
-       for (i = 0; sched_feat_names[i]; i++) {
+       for (i = 0; i < __SCHED_FEAT_NR; i++) {
                if (strcmp(cmp, sched_feat_names[i]) == 0) {
-                       if (neg)
+                       if (neg) {
                                sysctl_sched_features &= ~(1UL << i);
-                       else
+                               sched_feat_disable(i);
+                       } else {
                                sysctl_sched_features |= (1UL << i);
+                               sched_feat_enable(i);
+                       }
                        break;
                }
        }
 
-       if (!sched_feat_names[i])
+       if (i == __SCHED_FEAT_NR)
                return -EINVAL;
 
        *ppos += cnt;
@@ -221,8 +257,7 @@ static __init int sched_init_debug(void)
        return 0;
 }
 late_initcall(sched_init_debug);
-
-#endif
+#endif /* CONFIG_SCHED_DEBUG */
 
 /*
  * Number of tasks to iterate in a single balance run.
@@ -691,9 +726,6 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
        p->sched_class->dequeue_task(rq, p, flags);
 }
 
-/*
- * activate_task - move a task to the runqueue.
- */
 void activate_task(struct rq *rq, struct task_struct *p, int flags)
 {
        if (task_contributes_to_load(p))
@@ -702,9 +734,6 @@ void activate_task(struct rq *rq, struct task_struct *p, int flags)
        enqueue_task(rq, p, flags);
 }
 
-/*
- * deactivate_task - remove a task from the runqueue.
- */
 void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
 {
        if (task_contributes_to_load(p))
@@ -867,7 +896,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
        delta -= irq_delta;
 #endif
 #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
-       if (static_branch((&paravirt_steal_rq_enabled))) {
+       if (static_key_false((&paravirt_steal_rq_enabled))) {
                u64 st;
 
                steal = paravirt_steal_clock(cpu_of(rq));
@@ -903,7 +932,7 @@ static int irqtime_account_hi_update(void)
 
        local_irq_save(flags);
        latest_ns = this_cpu_read(cpu_hardirq_time);
-       if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat[CPUTIME_IRQ]))
+       if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_IRQ])
                ret = 1;
        local_irq_restore(flags);
        return ret;
@@ -918,7 +947,7 @@ static int irqtime_account_si_update(void)
 
        local_irq_save(flags);
        latest_ns = this_cpu_read(cpu_softirq_time);
-       if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat[CPUTIME_SOFTIRQ]))
+       if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_SOFTIRQ])
                ret = 1;
        local_irq_restore(flags);
        return ret;
@@ -1236,29 +1265,59 @@ EXPORT_SYMBOL_GPL(kick_process);
  */
 static int select_fallback_rq(int cpu, struct task_struct *p)
 {
-       int dest_cpu;
        const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu));
+       enum { cpuset, possible, fail } state = cpuset;
+       int dest_cpu;
 
        /* Look for allowed, online CPU in same node. */
-       for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask)
+       for_each_cpu(dest_cpu, nodemask) {
+               if (!cpu_online(dest_cpu))
+                       continue;
+               if (!cpu_active(dest_cpu))
+                       continue;
                if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
                        return dest_cpu;
+       }
 
-       /* Any allowed, online CPU? */
-       dest_cpu = cpumask_any_and(tsk_cpus_allowed(p), cpu_active_mask);
-       if (dest_cpu < nr_cpu_ids)
-               return dest_cpu;
+       for (;;) {
+               /* Any allowed, online CPU? */
+               for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) {
+                       if (!cpu_online(dest_cpu))
+                               continue;
+                       if (!cpu_active(dest_cpu))
+                               continue;
+                       goto out;
+               }
 
-       /* No more Mr. Nice Guy. */
-       dest_cpu = cpuset_cpus_allowed_fallback(p);
-       /*
-        * Don't tell them about moving exiting tasks or
-        * kernel threads (both mm NULL), since they never
-        * leave kernel.
-        */
-       if (p->mm && printk_ratelimit()) {
-               printk(KERN_INFO "process %d (%s) no longer affine to cpu%d\n",
-                               task_pid_nr(p), p->comm, cpu);
+               switch (state) {
+               case cpuset:
+                       /* No more Mr. Nice Guy. */
+                       cpuset_cpus_allowed_fallback(p);
+                       state = possible;
+                       break;
+
+               case possible:
+                       do_set_cpus_allowed(p, cpu_possible_mask);
+                       state = fail;
+                       break;
+
+               case fail:
+                       BUG();
+                       break;
+               }
+       }
+
+out:
+       if (state != cpuset) {
+               /*
+                * Don't tell them about moving exiting tasks or
+                * kernel threads (both mm NULL), since they never
+                * leave kernel.
+                */
+               if (p->mm && printk_ratelimit()) {
+                       printk_sched("process %d (%s) no longer affine to cpu%d\n",
+                                       task_pid_nr(p), p->comm, cpu);
+               }
        }
 
        return dest_cpu;
@@ -1479,6 +1538,11 @@ static int ttwu_activate_remote(struct task_struct *p, int wake_flags)
 
 }
 #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
+
+bool cpus_share_cache(int this_cpu, int that_cpu)
+{
+       return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
+}
 #endif /* CONFIG_SMP */
 
 static void ttwu_queue(struct task_struct *p, int cpu)
@@ -1486,7 +1550,7 @@ static void ttwu_queue(struct task_struct *p, int cpu)
        struct rq *rq = cpu_rq(cpu);
 
 #if defined(CONFIG_SMP)
-       if (sched_feat(TTWU_QUEUE) && cpu != smp_processor_id()) {
+       if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {
                sched_clock_cpu(cpu); /* sync clocks x-cpu */
                ttwu_queue_remote(p, cpu);
                return;
@@ -1900,6 +1964,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
        local_irq_enable();
 #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
        finish_lock_switch(rq, prev);
+       finish_arch_post_lock_switch();
 
        fire_sched_in_preempt_notifiers(current);
        if (mm)
@@ -2234,13 +2299,10 @@ calc_load_n(unsigned long load, unsigned long exp,
  * Once we've updated the global active value, we need to apply the exponential
  * weights adjusted to the number of cycles missed.
  */
-static void calc_global_nohz(unsigned long ticks)
+static void calc_global_nohz(void)
 {
        long delta, active, n;
 
-       if (time_before(jiffies, calc_load_update))
-               return;
-
        /*
         * If we crossed a calc_load_update boundary, make sure to fold
         * any pending idle changes, the respective CPUs might have
@@ -2252,31 +2314,25 @@ static void calc_global_nohz(unsigned long ticks)
                atomic_long_add(delta, &calc_load_tasks);
 
        /*
-        * If we were idle for multiple load cycles, apply them.
+        * It could be that one fold was all it took, we're done!
         */
-       if (ticks >= LOAD_FREQ) {
-               n = ticks / LOAD_FREQ;
+       if (time_before(jiffies, calc_load_update + 10))
+               return;
 
-               active = atomic_long_read(&calc_load_tasks);
-               active = active > 0 ? active * FIXED_1 : 0;
+       /*
+        * Catch-up, fold however many we are behind still
+        */
+       delta = jiffies - calc_load_update - 10;
+       n = 1 + (delta / LOAD_FREQ);
 
-               avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
-               avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
-               avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
+       active = atomic_long_read(&calc_load_tasks);
+       active = active > 0 ? active * FIXED_1 : 0;
 
-               calc_load_update += n * LOAD_FREQ;
-       }
+       avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
+       avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
+       avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
 
-       /*
-        * Its possible the remainder of the above division also crosses
-        * a LOAD_FREQ period, the regular check in calc_global_load()
-        * which comes after this will take care of that.
-        *
-        * Consider us being 11 ticks before a cycle completion, and us
-        * sleeping for 4*LOAD_FREQ + 22 ticks, then the above code will
-        * age us 4 cycles, and the test in calc_global_load() will
-        * pick up the final one.
-        */
+       calc_load_update += n * LOAD_FREQ;
 }
 #else
 void calc_load_account_idle(struct rq *this_rq)
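What the catch-up path above amounts to: after n missed LOAD_FREQ periods, each avenrun[] value is decayed n times toward the current active count using the scheduler's fixed-point constants (FIXED_1, EXP_1/EXP_5/EXP_15). A minimal illustrative equivalent, ignoring the rounding term and the fixed_power_int() shortcut the real calc_load_n() uses:

	/* Illustrative only -- the real calc_load_n() computes exp^n in one go. */
	static unsigned long demo_calc_load_n(unsigned long load, unsigned long exp,
					      unsigned long active, unsigned int n)
	{
		while (n--)
			load = (load * exp + active * (FIXED_1 - exp)) / FIXED_1;
		return load;
	}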
@@ -2288,7 +2344,7 @@ static inline long calc_load_fold_idle(void)
        return 0;
 }
 
-static void calc_global_nohz(unsigned long ticks)
+static void calc_global_nohz(void)
 {
 }
 #endif
@@ -2316,8 +2372,6 @@ void calc_global_load(unsigned long ticks)
 {
        long active;
 
-       calc_global_nohz(ticks);
-
        if (time_before(jiffies, calc_load_update + 10))
                return;
 
@@ -2329,6 +2383,16 @@ void calc_global_load(unsigned long ticks)
        avenrun[2] = calc_load(avenrun[2], EXP_15, active);
 
        calc_load_update += LOAD_FREQ;
+
+       /*
+        * Account one period with whatever state we found before
+        * folding in the nohz state and ageing the entire idle period.
+        *
+        * This avoids losing a sample when we go idle between
+        * calc_load_account_active() (10 ticks ago) and now and thus
+        * under-accounting.
+        */
+       calc_global_nohz();
 }
 
 /*
@@ -2556,6 +2620,84 @@ unsigned long long task_sched_runtime(struct task_struct *p)
        return ns;
 }
 
+#ifdef CONFIG_CGROUP_CPUACCT
+struct cgroup_subsys cpuacct_subsys;
+struct cpuacct root_cpuacct;
+#endif
+
+static inline void task_group_account_field(struct task_struct *p, int index,
+                                           u64 tmp)
+{
+#ifdef CONFIG_CGROUP_CPUACCT
+       struct kernel_cpustat *kcpustat;
+       struct cpuacct *ca;
+#endif
+       /*
+        * Since all updates are sure to touch the root cgroup, we
+        * get ourselves ahead and touch it first. If the root cgroup
+        * is the only cgroup, then nothing else should be necessary.
+        *
+        */
+       __get_cpu_var(kernel_cpustat).cpustat[index] += tmp;
+
+#ifdef CONFIG_CGROUP_CPUACCT
+       if (unlikely(!cpuacct_subsys.active))
+               return;
+
+       rcu_read_lock();
+       ca = task_ca(p);
+       while (ca && (ca != &root_cpuacct)) {
+               kcpustat = this_cpu_ptr(ca->cpustat);
+               kcpustat->cpustat[index] += tmp;
+               ca = parent_ca(ca);
+       }
+       rcu_read_unlock();
+#endif
+}
+
+
+#if !defined(CONFIG_XEN) || defined(CONFIG_VIRT_CPU_ACCOUNTING)
+# define cputime_to_u64(t) ((__force u64)(t))
+#else
+# include <linux/syscore_ops.h>
+# define NS_PER_TICK (1000000000 / HZ)
+
+static DEFINE_PER_CPU(u64, steal_snapshot);
+static DEFINE_PER_CPU(unsigned int, steal_residual);
+
+static u64 cputime_to_u64(cputime_t t)
+{
+       u64 s = this_vcpu_read(runstate.time[RUNSTATE_runnable]);
+       unsigned long adj = div_u64_rem(s - __this_cpu_read(steal_snapshot)
+                                         + __this_cpu_read(steal_residual),
+                                       NS_PER_TICK,
+                                       &__get_cpu_var(steal_residual));
+
+       __this_cpu_write(steal_snapshot, s);
+       if (t < jiffies_to_cputime(adj))
+               return 0;
+
+       return (__force u64)(t - jiffies_to_cputime(adj));
+}
+
+static void steal_resume(void)
+{
+       cputime_to_u64(((cputime_t)1 << (BITS_PER_LONG * sizeof(cputime_t)
+                                        / sizeof(long) - 1)) - 1);
+}
+
+static struct syscore_ops steal_syscore_ops = {
+       .resume = steal_resume,
+};
+
+static int __init steal_register(void)
+{
+       register_syscore_ops(&steal_syscore_ops);
+       return 0;
+}
+core_initcall(steal_register);
+#endif
+
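To make the Xen path above concrete: suppose that since the last snapshot the hypervisor reports 2,700,000 ns of additional RUNSTATE_runnable (stolen) time, HZ is 1000 (so NS_PER_TICK is 1,000,000), and 500,000 ns of residual were carried over. div_u64_rem() then yields adj = 3 whole ticks and leaves 200,000 ns in steal_residual for the next call, and the cputime being accounted is reduced by jiffies_to_cputime(3), clamped at zero so stolen time is never charged as useful time.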
 /*
  * Account user cpu time to a process.
  * @p: the process that the cpu time gets accounted to
@@ -2565,22 +2707,18 @@ unsigned long long task_sched_runtime(struct task_struct *p)
 void account_user_time(struct task_struct *p, cputime_t cputime,
                       cputime_t cputime_scaled)
 {
-       u64 *cpustat = kcpustat_this_cpu->cpustat;
-       u64 tmp;
        int index;
 
        /* Add user time to process. */
-       p->utime = cputime_add(p->utime, cputime);
-       p->utimescaled = cputime_add(p->utimescaled, cputime_scaled);
+       p->utime += cputime;
+       p->utimescaled += cputime_scaled;
        account_group_user_time(p, cputime);
 
-       /* Add user time to cpustat. */
-       tmp = cputime_to_cputime64(cputime);
-
        index = (TASK_NICE(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;
-       cpustat[index] += tmp;
 
-       cpuacct_update_stats(p, CPUACCT_STAT_USER, cputime);
+       /* Add user time to cpustat. */
+       task_group_account_field(p, index, cputime_to_u64(cputime));
+
        /* Account for user time used */
        acct_update_integrals(p);
 }
@@ -2594,24 +2732,21 @@ void account_user_time(struct task_struct *p, cputime_t cputime,
 static void account_guest_time(struct task_struct *p, cputime_t cputime,
                               cputime_t cputime_scaled)
 {
-       u64 tmp;
        u64 *cpustat = kcpustat_this_cpu->cpustat;
 
-       tmp = cputime_to_cputime64(cputime);
-
        /* Add guest time to process. */
-       p->utime = cputime_add(p->utime, cputime);
-       p->utimescaled = cputime_add(p->utimescaled, cputime_scaled);
+       p->utime += cputime;
+       p->utimescaled += cputime_scaled;
        account_group_user_time(p, cputime);
-       p->gtime = cputime_add(p->gtime, cputime);
+       p->gtime += cputime;
 
        /* Add guest time to cpustat. */
        if (TASK_NICE(p) > 0) {
-               cpustat[CPUTIME_NICE] += tmp;
-               cpustat[CPUTIME_GUEST_NICE] += tmp;
+               cpustat[CPUTIME_NICE] += (__force u64) cputime;
+               cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime;
        } else {
-               cpustat[CPUTIME_USER] += tmp;
-               cpustat[CPUTIME_GUEST] += tmp;
+               cpustat[CPUTIME_USER] += (__force u64) cputime;
+               cpustat[CPUTIME_GUEST] += (__force u64) cputime;
        }
 }
 
@@ -2626,17 +2761,13 @@ static inline
 void __account_system_time(struct task_struct *p, cputime_t cputime,
                        cputime_t cputime_scaled, int index)
 {
-       u64 tmp = cputime_to_cputime64(cputime);
-       u64 *cpustat = kcpustat_this_cpu->cpustat;
-
        /* Add system time to process. */
-       p->stime = cputime_add(p->stime, cputime);
-       p->stimescaled = cputime_add(p->stimescaled, cputime_scaled);
+       p->stime += cputime;
+       p->stimescaled += cputime_scaled;
        account_group_system_time(p, cputime);
 
        /* Add system time to cpustat. */
-       cpustat[index] += tmp;
-       cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
+       task_group_account_field(p, index, cputime_to_u64(cputime));
 
        /* Account for system time used */
        acct_update_integrals(p);
@@ -2676,9 +2807,8 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
 void account_steal_time(cputime_t cputime)
 {
        u64 *cpustat = kcpustat_this_cpu->cpustat;
-       u64 cputime64 = cputime_to_cputime64(cputime);
 
-       cpustat[CPUTIME_STEAL] += cputime64;
+       cpustat[CPUTIME_STEAL] += (__force u64) cputime;
 }
 
 /*
@@ -2688,19 +2818,18 @@ void account_steal_time(cputime_t cputime)
 void account_idle_time(cputime_t cputime)
 {
        u64 *cpustat = kcpustat_this_cpu->cpustat;
-       u64 cputime64 = cputime_to_cputime64(cputime);
        struct rq *rq = this_rq();
 
        if (atomic_read(&rq->nr_iowait) > 0)
-               cpustat[CPUTIME_IOWAIT] += cputime64;
+               cpustat[CPUTIME_IOWAIT] += cputime_to_u64(cputime);
        else
-               cpustat[CPUTIME_IDLE] += cputime64;
+               cpustat[CPUTIME_IDLE] += cputime_to_u64(cputime);
 }
 
 static __always_inline bool steal_account_process_tick(void)
 {
 #ifdef CONFIG_PARAVIRT
-       if (static_branch(&paravirt_steal_enabled)) {
+       if (static_key_false(&paravirt_steal_enabled)) {
                u64 steal, st = 0;
 
                steal = paravirt_steal_clock(smp_processor_id());
@@ -2744,16 +2873,15 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
                                                struct rq *rq)
 {
        cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
-       u64 tmp = cputime_to_cputime64(cputime_one_jiffy);
        u64 *cpustat = kcpustat_this_cpu->cpustat;
 
        if (steal_account_process_tick())
                return;
 
        if (irqtime_account_hi_update()) {
-               cpustat[CPUTIME_IRQ] += tmp;
+               cpustat[CPUTIME_IRQ] += cputime_to_u64(cputime_one_jiffy);
        } else if (irqtime_account_si_update()) {
-               cpustat[CPUTIME_SOFTIRQ] += tmp;
+               cpustat[CPUTIME_SOFTIRQ] += cputime_to_u64(cputime_one_jiffy);
        } else if (this_cpu_ksoftirqd() == p) {
                /*
                 * ksoftirqd time do not get accounted in cpu_softirq_time.
@@ -2869,7 +2997,7 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
 
 void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
 {
-       cputime_t rtime, utime = p->utime, total = cputime_add(utime, p->stime);
+       cputime_t rtime, utime = p->utime, total = utime + p->stime;
 
        /*
         * Use CFS's precise accounting:
@@ -2877,11 +3005,11 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
        rtime = nsecs_to_cputime(p->se.sum_exec_runtime);
 
        if (total) {
-               u64 temp = rtime;
+               u64 temp = (__force u64) rtime;
 
-               temp *= utime;
-               do_div(temp, total);
-               utime = (cputime_t)temp;
+               temp *= (__force u64) utime;
+               do_div(temp, (__force u32) total);
+               utime = (__force cputime_t) temp;
        } else
                utime = rtime;
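To see what this rescaling does: if the tick-based sampling recorded utime = 6 and stime = 4 (total = 10) while CFS measured rtime = 15 of actual runtime, the precise runtime is redistributed in the same 6:4 ratio, so utime becomes 15 * 6 / 10 = 9; the remaining 6 then shows up as stime via the rtime - prev_utime computation below, subject to the monotonicity clamps.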
 
@@ -2889,7 +3017,7 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
         * Compare with previous values, to keep monotonicity:
         */
        p->prev_utime = max(p->prev_utime, utime);
-       p->prev_stime = max(p->prev_stime, cputime_sub(rtime, p->prev_utime));
+       p->prev_stime = max(p->prev_stime, rtime - p->prev_utime);
 
        *ut = p->prev_utime;
        *st = p->prev_stime;
@@ -2906,21 +3034,20 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
 
        thread_group_cputime(p, &cputime);
 
-       total = cputime_add(cputime.utime, cputime.stime);
+       total = cputime.utime + cputime.stime;
        rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
 
        if (total) {
-               u64 temp = rtime;
+               u64 temp = (__force u64) rtime;
 
-               temp *= cputime.utime;
-               do_div(temp, total);
-               utime = (cputime_t)temp;
+               temp *= (__force u64) cputime.utime;
+               do_div(temp, (__force u32) total);
+               utime = (__force cputime_t) temp;
        } else
                utime = rtime;
 
        sig->prev_utime = max(sig->prev_utime, utime);
-       sig->prev_stime = max(sig->prev_stime,
-                             cputime_sub(rtime, sig->prev_utime));
+       sig->prev_stime = max(sig->prev_stime, rtime - sig->prev_utime);
 
        *ut = sig->prev_utime;
        *st = sig->prev_stime;
@@ -3017,7 +3144,8 @@ EXPORT_SYMBOL(sub_preempt_count);
  */
 static noinline void __schedule_bug(struct task_struct *prev)
 {
-       struct pt_regs *regs = get_irq_regs();
+       if (oops_in_progress)
+               return;
 
        printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
                prev->comm, prev->pid, preempt_count());
@@ -3026,11 +3154,7 @@ static noinline void __schedule_bug(struct task_struct *prev)
        print_modules();
        if (irqs_disabled())
                print_irqtrace_events(prev);
-
-       if (regs)
-               show_regs(regs);
-       else
-               dump_stack();
+       dump_stack();
 }
 
 /*
@@ -3164,14 +3288,14 @@ need_resched:
 
        post_schedule(rq);
 
-       preempt_enable_no_resched();
+       sched_preempt_enable_no_resched();
        if (need_resched())
                goto need_resched;
 }
 
 static inline void sched_submit_work(struct task_struct *tsk)
 {
-       if (!tsk->state)
+       if (!tsk->state || tsk_is_pi_blocked(tsk))
                return;
        /*
         * If we are going to sleep and we have plugged IO queued,
@@ -3190,7 +3314,24 @@ asmlinkage void __sched schedule(void)
 }
 EXPORT_SYMBOL(schedule);
 
+/**
+ * schedule_preempt_disabled - called with preemption disabled
+ *
+ * Returns with preemption disabled. Note: preempt_count must be 1
+ */
+void __sched schedule_preempt_disabled(void)
+{
+       sched_preempt_enable_no_resched();
+       schedule();
+       preempt_disable();
+}
+
 #ifdef CONFIG_MUTEX_SPIN_ON_OWNER
+#include <asm/mutex.h>
+
+#ifndef arch_cpu_is_running
+#define arch_cpu_is_running(cpu) true
+#endif
 
 static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
 {
@@ -3205,7 +3346,8 @@ static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
         */
        barrier();
 
-       return owner->on_cpu;
+       return owner->on_cpu
+              && arch_cpu_is_running(task_thread_info(owner)->cpu);
 }
 
 /*
@@ -3350,9 +3492,9 @@ EXPORT_SYMBOL(__wake_up);
 /*
  * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
  */
-void __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
+void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr)
 {
-       __wake_up_common(q, mode, 1, 0, NULL);
+       __wake_up_common(q, mode, nr, 0, NULL);
 }
 EXPORT_SYMBOL_GPL(__wake_up_locked);
 
@@ -3711,6 +3853,24 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
 
        rq = __task_rq_lock(p);
 
+       /*
+        * Idle task boosting is a no-no in general. There is one
+        * exception, when PREEMPT_RT and NOHZ are active:
+        *
+        * The idle task calls get_next_timer_interrupt() and holds
+        * the timer wheel base->lock on the CPU and another CPU wants
+        * to access the timer (probably to cancel it). We can safely
+        * ignore the boosting request, as the idle CPU runs this code
+        * with interrupts disabled and will complete the lock
+        * protected section without being interrupted. So there is no
+        * real need to boost.
+        */
+       if (unlikely(p == rq->idle)) {
+               WARN_ON(p != rq->curr);
+               WARN_ON(p->pi_blocked_on);
+               goto out_unlock;
+       }
+
        trace_sched_pi_setprio(p, prio);
        oldprio = p->prio;
        prev_class = p->sched_class;
@@ -3734,11 +3894,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
                enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
 
        check_class_changed(rq, p, prev_class, oldprio);
+out_unlock:
        __task_rq_unlock(rq);
 }
-
 #endif
-
 void set_user_nice(struct task_struct *p, long nice)
 {
        int old_prio, delta, on_rq;
@@ -4072,7 +4231,7 @@ recheck:
        on_rq = p->on_rq;
        running = task_current(rq, p);
        if (on_rq)
-               deactivate_task(rq, p, 0);
+               dequeue_task(rq, p, 0);
        if (running)
                p->sched_class->put_prev_task(rq, p);
 
@@ -4085,7 +4244,7 @@ recheck:
        if (running)
                p->sched_class->set_curr_task(rq);
        if (on_rq)
-               activate_task(rq, p, 0);
+               enqueue_task(rq, p, 0);
 
        check_class_changed(rq, p, prev_class, oldprio);
        task_rq_unlock(rq, p, &flags);
@@ -4268,7 +4427,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
                goto out_free_cpus_allowed;
        }
        retval = -EPERM;
-       if (!check_same_owner(p) && !task_ns_capable(p, CAP_SYS_NICE))
+       if (!check_same_owner(p) && !ns_capable(task_user_ns(p), CAP_SYS_NICE))
                goto out_unlock;
 
        retval = security_task_setscheduler(p);
@@ -4418,7 +4577,7 @@ SYSCALL_DEFINE0(sched_yield)
        __release(rq->lock);
        spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
        do_raw_spin_unlock(&rq->lock);
-       preempt_enable_no_resched();
+       sched_preempt_enable_no_resched();
 
        schedule();
 
@@ -4492,8 +4651,24 @@ EXPORT_SYMBOL(__cond_resched_softirq);
 /**
  * yield - yield the current processor to other threads.
  *
- * This is a shortcut for kernel-space yielding - it marks the
- * thread runnable and calls sys_sched_yield().
+ * Do not ever use this function, there's a 99% chance you're doing it wrong.
+ *
+ * The scheduler is at all times free to pick the calling task as the most
+ * eligible task to run; if removing the yield() call from your code breaks
+ * it, it's already broken.
+ *
+ * Typical broken usage is:
+ *
+ * while (!event)
+ *     yield();
+ *
+ * where one assumes that yield() will let 'the other' process run that will
+ * make event true. If the current task is a SCHED_FIFO task, that will never
+ * happen. Never use yield() as a progress guarantee!!
+ *
+ * If you want to use yield() to wait for something, use wait_event().
+ * If you want to use yield() to be 'nice' for others, use cond_resched().
+ * If you still want to use yield(), do not!
  */
 void __sched yield(void)
 {
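As the comment recommends, code that waits for a condition should use the waitqueue API instead of spinning in yield(). A minimal sketch with a hypothetical waitqueue and flag (linux/wait.h), not taken from this file:

	/* Hypothetical example, not from this file. */
	static DECLARE_WAIT_QUEUE_HEAD(demo_wq);
	static bool demo_event;

	static void demo_waiter(void)
	{
		/* Sleeps until demo_event is set; no busy yield() loop. */
		wait_event(demo_wq, demo_event);
	}

	static void demo_signaler(void)
	{
		demo_event = true;
		wake_up(&demo_wq);	/* wakes demo_waiter() */
	}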
@@ -4724,7 +4899,7 @@ void sched_show_task(struct task_struct *p)
        free = stack_not_used(p);
 #endif
        printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
-               task_pid_nr(p), task_pid_nr(p->real_parent),
+               task_pid_nr(p), task_pid_nr(rcu_dereference(p->real_parent)),
                (unsigned long)task_thread_info(p)->flags);
 
        show_stack(p, NULL);
@@ -4936,9 +5111,9 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
         * placed properly.
         */
        if (p->on_rq) {
-               deactivate_task(rq_src, p, 0);
+               dequeue_task(rq_src, p, 0);
                set_task_cpu(p, dest_cpu);
-               activate_task(rq_dest, p, 0);
+               enqueue_task(rq_dest, p, 0);
                check_preempt_curr(rq_dest, p, 0);
        }
 done:
@@ -5114,7 +5289,7 @@ static void sd_free_ctl_entry(struct ctl_table **tablep)
 static void
 set_table_entry(struct ctl_table *entry,
                const char *procname, void *data, int maxlen,
-               mode_t mode, proc_handler *proc_handler)
+               umode_t mode, proc_handler *proc_handler)
 {
        entry->procname = procname;
        entry->data = data;
@@ -5325,7 +5500,7 @@ static int __cpuinit sched_cpu_active(struct notifier_block *nfb,
                                      unsigned long action, void *hcpu)
 {
        switch (action & ~CPU_TASKS_FROZEN) {
-       case CPU_ONLINE:
+       case CPU_STARTING:
        case CPU_DOWN_FAILED:
                set_cpu_active((long)hcpu, true);
                return NOTIFY_OK;
@@ -5691,6 +5866,31 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu)
 }
 
 /*
+ * Keep a special pointer to the highest sched_domain that has
+ * SD_SHARE_PKG_RESOURCES set (Last Level Cache Domain); this
+ * allows us to avoid some pointer chasing in select_idle_sibling().
+ *
+ * Also keep a unique ID per domain (we use the first cpu number in
+ * the cpumask of the domain), this allows us to quickly tell if
+ * two cpus are in the same cache domain, see cpus_share_cache().
+ */
+DEFINE_PER_CPU(struct sched_domain *, sd_llc);
+DEFINE_PER_CPU(int, sd_llc_id);
+
+static void update_top_cache_domain(int cpu)
+{
+       struct sched_domain *sd;
+       int id = cpu;
+
+       sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
+       if (sd)
+               id = cpumask_first(sched_domain_span(sd));
+
+       rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
+       per_cpu(sd_llc_id, cpu) = id;
+}
+
+/*
  * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
  * hold the hotplug lock.
  */
@@ -5729,6 +5929,8 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
        tmp = rq->sd;
        rcu_assign_pointer(rq->sd, sd);
        destroy_sched_domains(tmp, cpu);
+
+       update_top_cache_domain(cpu);
 }
 
 /* cpus with isolated domains */
@@ -6228,6 +6430,8 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
                        if (!sg)
                                return -ENOMEM;
 
+                       sg->next = sg;
+
                        *per_cpu_ptr(sdd->sg, j) = sg;
 
                        sgp = kzalloc_node(sizeof(struct sched_group_power),
@@ -6251,16 +6455,26 @@ static void __sdt_free(const struct cpumask *cpu_map)
                struct sd_data *sdd = &tl->data;
 
                for_each_cpu(j, cpu_map) {
-                       struct sched_domain *sd = *per_cpu_ptr(sdd->sd, j);
-                       if (sd && (sd->flags & SD_OVERLAP))
-                               free_sched_groups(sd->groups, 0);
-                       kfree(*per_cpu_ptr(sdd->sd, j));
-                       kfree(*per_cpu_ptr(sdd->sg, j));
-                       kfree(*per_cpu_ptr(sdd->sgp, j));
+                       struct sched_domain *sd;
+
+                       if (sdd->sd) {
+                               sd = *per_cpu_ptr(sdd->sd, j);
+                               if (sd && (sd->flags & SD_OVERLAP))
+                                       free_sched_groups(sd->groups, 0);
+                               kfree(*per_cpu_ptr(sdd->sd, j));
+                       }
+
+                       if (sdd->sg)
+                               kfree(*per_cpu_ptr(sdd->sg, j));
+                       if (sdd->sgp)
+                               kfree(*per_cpu_ptr(sdd->sgp, j));
                }
                free_percpu(sdd->sd);
+               sdd->sd = NULL;
                free_percpu(sdd->sg);
+               sdd->sg = NULL;
                free_percpu(sdd->sgp);
+               sdd->sgp = NULL;
        }
 }
 
@@ -6586,54 +6800,52 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
 }
 
 #ifdef CONFIG_SCHED_MC
-static ssize_t sched_mc_power_savings_show(struct sysdev_class *class,
-                                          struct sysdev_class_attribute *attr,
-                                          char *page)
+static ssize_t sched_mc_power_savings_show(struct device *dev,
+                                          struct device_attribute *attr,
+                                          char *buf)
 {
-       return sprintf(page, "%u\n", sched_mc_power_savings);
+       return sprintf(buf, "%u\n", sched_mc_power_savings);
 }
-static ssize_t sched_mc_power_savings_store(struct sysdev_class *class,
-                                           struct sysdev_class_attribute *attr,
+static ssize_t sched_mc_power_savings_store(struct device *dev,
+                                           struct device_attribute *attr,
                                            const char *buf, size_t count)
 {
        return sched_power_savings_store(buf, count, 0);
 }
-static SYSDEV_CLASS_ATTR(sched_mc_power_savings, 0644,
-                        sched_mc_power_savings_show,
-                        sched_mc_power_savings_store);
+static DEVICE_ATTR(sched_mc_power_savings, 0644,
+                  sched_mc_power_savings_show,
+                  sched_mc_power_savings_store);
 #endif
 
 #ifdef CONFIG_SCHED_SMT
-static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev,
-                                           struct sysdev_class_attribute *attr,
-                                           char *page)
+static ssize_t sched_smt_power_savings_show(struct device *dev,
+                                           struct device_attribute *attr,
+                                           char *buf)
 {
-       return sprintf(page, "%u\n", sched_smt_power_savings);
+       return sprintf(buf, "%u\n", sched_smt_power_savings);
 }
-static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev,
-                                            struct sysdev_class_attribute *attr,
+static ssize_t sched_smt_power_savings_store(struct device *dev,
+                                           struct device_attribute *attr,
                                             const char *buf, size_t count)
 {
        return sched_power_savings_store(buf, count, 1);
 }
-static SYSDEV_CLASS_ATTR(sched_smt_power_savings, 0644,
+static DEVICE_ATTR(sched_smt_power_savings, 0644,
                   sched_smt_power_savings_show,
                   sched_smt_power_savings_store);
 #endif
 
-int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
+int __init sched_create_sysfs_power_savings_entries(struct device *dev)
 {
        int err = 0;
 
 #ifdef CONFIG_SCHED_SMT
        if (smt_capable())
-               err = sysfs_create_file(&cls->kset.kobj,
-                                       &attr_sched_smt_power_savings.attr);
+               err = device_create_file(dev, &dev_attr_sched_smt_power_savings);
 #endif
 #ifdef CONFIG_SCHED_MC
        if (!err && mc_capable())
-               err = sysfs_create_file(&cls->kset.kobj,
-                                       &attr_sched_mc_power_savings.attr);
+               err = device_create_file(dev, &dev_attr_sched_mc_power_savings);
 #endif
        return err;
 }
@@ -6781,8 +6993,15 @@ void __init sched_init(void)
        INIT_LIST_HEAD(&root_task_group.children);
        INIT_LIST_HEAD(&root_task_group.siblings);
        autogroup_init(&init_task);
+
 #endif /* CONFIG_CGROUP_SCHED */
 
+#ifdef CONFIG_CGROUP_CPUACCT
+       root_cpuacct.cpustat = &kernel_cpustat;
+       root_cpuacct.cpuusage = alloc_percpu(u64);
+       /* Too early, not expected to fail */
+       BUG_ON(!root_cpuacct.cpuusage);
+#endif
        for_each_possible_cpu(i) {
                struct rq *rq;
 
@@ -6842,6 +7061,9 @@ void __init sched_init(void)
                rq->online = 0;
                rq->idle_stamp = 0;
                rq->avg_idle = 2*sysctl_sched_migration_cost;
+
+               INIT_LIST_HEAD(&rq->cfs_tasks);
+
                rq_attach_root(rq, &def_root_domain);
 #ifdef CONFIG_NO_HZ
                rq->nohz_flags = 0;
@@ -6938,10 +7160,10 @@ static void normalize_task(struct rq *rq, struct task_struct *p)
 
        on_rq = p->on_rq;
        if (on_rq)
-               deactivate_task(rq, p, 0);
+               dequeue_task(rq, p, 0);
        __setscheduler(rq, p, SCHED_NORMAL, 0);
        if (on_rq) {
-               activate_task(rq, p, 0);
+               enqueue_task(rq, p, 0);
                resched_task(rq->curr);
        }
 
@@ -7040,10 +7262,6 @@ void set_curr_task(int cpu, struct task_struct *p)
 
 #endif
 
-#ifdef CONFIG_RT_GROUP_SCHED
-#else /* !CONFIG_RT_GROUP_SCHED */
-#endif /* CONFIG_RT_GROUP_SCHED */
-
 #ifdef CONFIG_CGROUP_SCHED
 /* task_group_lock serializes the addition/removal of task groups */
 static DEFINE_SPINLOCK(task_group_lock);
@@ -7152,9 +7370,6 @@ void sched_move_task(struct task_struct *tsk)
 }
 #endif /* CONFIG_CGROUP_SCHED */
 
-#ifdef CONFIG_FAIR_GROUP_SCHED
-#endif
-
 #if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH)
 static unsigned long to_ratio(u64 period, u64 runtime)
 {
@@ -7443,8 +7658,7 @@ static inline struct task_group *cgroup_tg(struct cgroup *cgrp)
                            struct task_group, css);
 }
 
-static struct cgroup_subsys_state *
-cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
+static struct cgroup_subsys_state *cpu_cgroup_create(struct cgroup *cgrp)
 {
        struct task_group *tg, *parent;
 
@@ -7461,37 +7675,43 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
        return &tg->css;
 }
 
-static void
-cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
+static void cpu_cgroup_destroy(struct cgroup *cgrp)
 {
        struct task_group *tg = cgroup_tg(cgrp);
 
        sched_destroy_group(tg);
 }
 
-static int
-cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
+static int cpu_cgroup_can_attach(struct cgroup *cgrp,
+                                struct cgroup_taskset *tset)
 {
+       struct task_struct *task;
+
+       cgroup_taskset_for_each(task, cgrp, tset) {
 #ifdef CONFIG_RT_GROUP_SCHED
-       if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk))
-               return -EINVAL;
+               if (!sched_rt_can_attach(cgroup_tg(cgrp), task))
+                       return -EINVAL;
 #else
-       /* We don't support RT-tasks being in separate groups */
-       if (tsk->sched_class != &fair_sched_class)
-               return -EINVAL;
+               /* We don't support RT-tasks being in separate groups */
+               if (task->sched_class != &fair_sched_class)
+                       return -EINVAL;
 #endif
+       }
        return 0;
 }
 
-static void
-cpu_cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
+static void cpu_cgroup_attach(struct cgroup *cgrp,
+                             struct cgroup_taskset *tset)
 {
-       sched_move_task(tsk);
+       struct task_struct *task;
+
+       cgroup_taskset_for_each(task, cgrp, tset)
+               sched_move_task(task);
 }
 
 static void
-cpu_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
-               struct cgroup *old_cgrp, struct task_struct *task)
+cpu_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp,
+               struct task_struct *task)
 {
        /*
         * cgroup_exit() is called in the copy_process() failure path.
@@ -7622,9 +7842,6 @@ int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
        period = (u64)cfs_period_us * NSEC_PER_USEC;
        quota = tg->cfs_bandwidth.quota;
 
-       if (period <= 0)
-               return -EINVAL;
-
        return tg_set_cfs_bandwidth(tg, period, quota);
 }
 
@@ -7824,8 +8041,8 @@ struct cgroup_subsys cpu_cgroup_subsys = {
        .name           = "cpu",
        .create         = cpu_cgroup_create,
        .destroy        = cpu_cgroup_destroy,
-       .can_attach_task = cpu_cgroup_can_attach_task,
-       .attach_task    = cpu_cgroup_attach_task,
+       .can_attach     = cpu_cgroup_can_attach,
+       .attach         = cpu_cgroup_attach,
        .exit           = cpu_cgroup_exit,
        .populate       = cpu_cgroup_populate,
        .subsys_id      = cpu_cgroup_subsys_id,
@@ -7843,38 +8060,15 @@ struct cgroup_subsys cpu_cgroup_subsys = {
  * (balbir@in.ibm.com).
  */
 
-/* track cpu usage of a group of tasks and its child groups */
-struct cpuacct {
-       struct cgroup_subsys_state css;
-       /* cpuusage holds pointer to a u64-type object on every cpu */
-       u64 __percpu *cpuusage;
-       struct percpu_counter cpustat[CPUACCT_STAT_NSTATS];
-       struct cpuacct *parent;
-};
-
-struct cgroup_subsys cpuacct_subsys;
-
-/* return cpu accounting group corresponding to this container */
-static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
-{
-       return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id),
-                           struct cpuacct, css);
-}
-
-/* return cpu accounting group to which this task belongs */
-static inline struct cpuacct *task_ca(struct task_struct *tsk)
-{
-       return container_of(task_subsys_state(tsk, cpuacct_subsys_id),
-                           struct cpuacct, css);
-}
-
 /* create a new cpu accounting group */
-static struct cgroup_subsys_state *cpuacct_create(
-       struct cgroup_subsys *ss, struct cgroup *cgrp)
+static struct cgroup_subsys_state *cpuacct_create(struct cgroup *cgrp)
 {
-       struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
-       int i;
+       struct cpuacct *ca;
+
+       if (!cgrp->parent)
+               return &root_cpuacct.css;
 
+       ca = kzalloc(sizeof(*ca), GFP_KERNEL);
        if (!ca)
                goto out;
 
@@ -7882,18 +8076,13 @@ static struct cgroup_subsys_state *cpuacct_create(
        if (!ca->cpuusage)
                goto out_free_ca;
 
-       for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
-               if (percpu_counter_init(&ca->cpustat[i], 0))
-                       goto out_free_counters;
-
-       if (cgrp->parent)
-               ca->parent = cgroup_ca(cgrp->parent);
+       ca->cpustat = alloc_percpu(struct kernel_cpustat);
+       if (!ca->cpustat)
+               goto out_free_cpuusage;
 
        return &ca->css;
 
-out_free_counters:
-       while (--i >= 0)
-               percpu_counter_destroy(&ca->cpustat[i]);
+out_free_cpuusage:
        free_percpu(ca->cpuusage);
 out_free_ca:
        kfree(ca);
@@ -7902,14 +8091,11 @@ out:
 }
 
 /* destroy an existing cpu accounting group */
-static void
-cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
+static void cpuacct_destroy(struct cgroup *cgrp)
 {
        struct cpuacct *ca = cgroup_ca(cgrp);
-       int i;
 
-       for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
-               percpu_counter_destroy(&ca->cpustat[i]);
+       free_percpu(ca->cpustat);
        free_percpu(ca->cpuusage);
        kfree(ca);
 }
@@ -8002,16 +8188,31 @@ static const char *cpuacct_stat_desc[] = {
 };
 
 static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,
-               struct cgroup_map_cb *cb)
+                             struct cgroup_map_cb *cb)
 {
        struct cpuacct *ca = cgroup_ca(cgrp);
-       int i;
+       int cpu;
+       s64 val = 0;
+
+       for_each_online_cpu(cpu) {
+               struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
+               val += kcpustat->cpustat[CPUTIME_USER];
+               val += kcpustat->cpustat[CPUTIME_NICE];
+       }
+       val = cputime64_to_clock_t(val);
+       cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val);
 
-       for (i = 0; i < CPUACCT_STAT_NSTATS; i++) {
-               s64 val = percpu_counter_read(&ca->cpustat[i]);
-               val = cputime64_to_clock_t(val);
-               cb->fill(cb, cpuacct_stat_desc[i], val);
+       val = 0;
+       for_each_online_cpu(cpu) {
+               struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
+               val += kcpustat->cpustat[CPUTIME_SYSTEM];
+               val += kcpustat->cpustat[CPUTIME_IRQ];
+               val += kcpustat->cpustat[CPUTIME_SOFTIRQ];
        }
+
+       val = cputime64_to_clock_t(val);
+       cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val);
+
        return 0;
 }
 
@@ -8055,7 +8256,7 @@ void cpuacct_charge(struct task_struct *tsk, u64 cputime)
 
        ca = task_ca(tsk);
 
-       for (; ca; ca = ca->parent) {
+       for (; ca; ca = parent_ca(ca)) {
                u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
                *cpuusage += cputime;
        }
@@ -8063,45 +8264,6 @@ void cpuacct_charge(struct task_struct *tsk, u64 cputime)
        rcu_read_unlock();
 }
 
-/*
- * When CONFIG_VIRT_CPU_ACCOUNTING is enabled one jiffy can be very large
- * in cputime_t units. As a result, cpuacct_update_stats calls
- * percpu_counter_add with values large enough to always overflow the
- * per cpu batch limit causing bad SMP scalability.
- *
- * To fix this we scale percpu_counter_batch by cputime_one_jiffy so we
- * batch the same amount of time with CONFIG_VIRT_CPU_ACCOUNTING disabled
- * and enabled. We cap it at INT_MAX which is the largest allowed batch value.
- */
-#ifdef CONFIG_SMP
-#define CPUACCT_BATCH  \
-       min_t(long, percpu_counter_batch * cputime_one_jiffy, INT_MAX)
-#else
-#define CPUACCT_BATCH  0
-#endif
-
-/*
- * Charge the system/user time to the task's accounting group.
- */
-void cpuacct_update_stats(struct task_struct *tsk,
-               enum cpuacct_stat_index idx, cputime_t val)
-{
-       struct cpuacct *ca;
-       int batch = CPUACCT_BATCH;
-
-       if (unlikely(!cpuacct_subsys.active))
-               return;
-
-       rcu_read_lock();
-       ca = task_ca(tsk);
-
-       do {
-               __percpu_counter_add(&ca->cpustat[idx], val, batch);
-               ca = ca->parent;
-       } while (ca);
-       rcu_read_unlock();
-}
-
 struct cgroup_subsys cpuacct_subsys = {
        .name = "cpuacct",
        .create = cpuacct_create,