Update to 3.4-final.

[linux-flexiantxendom0-3.2.10.git] / kernel / sched / core.c
diff --git a/kernel/sched/core.c b/kernel/sched/core.c

index 457c881..9a7fe31 100644 (file)
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -71,9 +71,12 @@
  #include <linux/ftrace.h>
  #include <linux/slab.h>
  #include <linux/init_task.h>
+#include <linux/binfmts.h>
  
+#include <asm/switch_to.h>
  #include <asm/tlb.h>
  #include <asm/irq_regs.h>
+#include <asm/mutex.h>
  #ifdef CONFIG_PARAVIRT
  #include <asm/paravirt.h>
  #endif
@@ -161,13 +164,13 @@ static int sched_feat_show(struct seq_file *m, void *v)
  
  #ifdef HAVE_JUMP_LABEL
  
-#define jump_label_key__true  jump_label_key_enabled
-#define jump_label_key__false jump_label_key_disabled
+#define jump_label_key__true  STATIC_KEY_INIT_TRUE
+#define jump_label_key__false STATIC_KEY_INIT_FALSE
  
  #define SCHED_FEAT(name, enabled)      \
         jump_label_key__##enabled ,
  
-struct jump_label_key sched_feat_keys[__SCHED_FEAT_NR] = {
+struct static_key sched_feat_keys[__SCHED_FEAT_NR] = {
  #include "features.h"
  };
  
@@ -175,14 +178,14 @@ struct jump_label_key sched_feat_keys[__SCHED_FEAT_NR] = {
  
  static void sched_feat_disable(int i)
  {
-       if (jump_label_enabled(&sched_feat_keys[i]))
-               jump_label_dec(&sched_feat_keys[i]);
+       if (static_key_enabled(&sched_feat_keys[i]))
+               static_key_slow_dec(&sched_feat_keys[i]);
  }
  
  static void sched_feat_enable(int i)
  {
-       if (!jump_label_enabled(&sched_feat_keys[i]))
-               jump_label_inc(&sched_feat_keys[i]);
+       if (!static_key_enabled(&sched_feat_keys[i]))
+               static_key_slow_inc(&sched_feat_keys[i]);
  }
  #else
  static void sched_feat_disable(int i) { };
@@ -723,9 +726,6 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
         p->sched_class->dequeue_task(rq, p, flags);
  }
  
-/*
- * activate_task - move a task to the runqueue.
- */
  void activate_task(struct rq *rq, struct task_struct *p, int flags)
  {
         if (task_contributes_to_load(p))
@@ -734,9 +734,6 @@ void activate_task(struct rq *rq, struct task_struct *p, int flags)
         enqueue_task(rq, p, flags);
  }
  
-/*
- * deactivate_task - remove a task from the runqueue.
- */
  void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
  {
         if (task_contributes_to_load(p))
@@ -899,7 +896,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
         delta -= irq_delta;
  #endif
  #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
-       if (static_branch((&paravirt_steal_rq_enabled))) {
+       if (static_key_false((&paravirt_steal_rq_enabled))) {
                 u64 st;
  
                 steal = paravirt_steal_clock(cpu_of(rq));
@@ -1268,29 +1265,59 @@ EXPORT_SYMBOL_GPL(kick_process);
   */
  static int select_fallback_rq(int cpu, struct task_struct *p)
  {
-       int dest_cpu;
         const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu));
+       enum { cpuset, possible, fail } state = cpuset;
+       int dest_cpu;
  
         /* Look for allowed, online CPU in same node. */
-       for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask)
+       for_each_cpu(dest_cpu, nodemask) {
+               if (!cpu_online(dest_cpu))
+                       continue;
+               if (!cpu_active(dest_cpu))
+                       continue;
                 if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
                         return dest_cpu;
+       }
  
-       /* Any allowed, online CPU? */
-       dest_cpu = cpumask_any_and(tsk_cpus_allowed(p), cpu_active_mask);
-       if (dest_cpu < nr_cpu_ids)
-               return dest_cpu;
+       for (;;) {
+               /* Any allowed, online CPU? */
+               for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) {
+                       if (!cpu_online(dest_cpu))
+                               continue;
+                       if (!cpu_active(dest_cpu))
+                               continue;
+                       goto out;
+               }
  
-       /* No more Mr. Nice Guy. */
-       dest_cpu = cpuset_cpus_allowed_fallback(p);
-       /*
-        * Don't tell them about moving exiting tasks or
-        * kernel threads (both mm NULL), since they never
-        * leave kernel.
-        */
-       if (p->mm && printk_ratelimit()) {
-               printk(KERN_INFO "process %d (%s) no longer affine to cpu%d\n",
-                               task_pid_nr(p), p->comm, cpu);
+               switch (state) {
+               case cpuset:
+                       /* No more Mr. Nice Guy. */
+                       cpuset_cpus_allowed_fallback(p);
+                       state = possible;
+                       break;
+
+               case possible:
+                       do_set_cpus_allowed(p, cpu_possible_mask);
+                       state = fail;
+                       break;
+
+               case fail:
+                       BUG();
+                       break;
+               }
+       }
+
+out:
+       if (state != cpuset) {
+               /*
+                * Don't tell them about moving exiting tasks or
+                * kernel threads (both mm NULL), since they never
+                * leave kernel.
+                */
+               if (p->mm && printk_ratelimit()) {
+                       printk_sched("process %d (%s) no longer affine to cpu%d\n",
+                                       task_pid_nr(p), p->comm, cpu);
+               }
         }
  
         return dest_cpu;
@@ -1512,7 +1539,7 @@ static int ttwu_activate_remote(struct task_struct *p, int wake_flags)
  }
  #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
  
-static inline int ttwu_share_cache(int this_cpu, int that_cpu)
+bool cpus_share_cache(int this_cpu, int that_cpu)
  {
         return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
  }
@@ -1523,7 +1550,7 @@ static void ttwu_queue(struct task_struct *p, int cpu)
         struct rq *rq = cpu_rq(cpu);
  
  #if defined(CONFIG_SMP)
-       if (sched_feat(TTWU_QUEUE) && !ttwu_share_cache(smp_processor_id(), cpu)) {
+       if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {
                 sched_clock_cpu(cpu); /* sync clocks x-cpu */
                 ttwu_queue_remote(p, cpu);
                 return;
@@ -1937,7 +1964,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
         local_irq_enable();
  #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
         finish_lock_switch(rq, prev);
-       trace_sched_stat_sleeptime(current, rq->clock);
+       finish_arch_post_lock_switch();
  
         fire_sched_in_preempt_notifiers(current);
         if (mm)
@@ -2272,13 +2299,10 @@ calc_load_n(unsigned long load, unsigned long exp,
   * Once we've updated the global active value, we need to apply the exponential
   * weights adjusted to the number of cycles missed.
   */
-static void calc_global_nohz(unsigned long ticks)
+static void calc_global_nohz(void)
  {
         long delta, active, n;
  
-       if (time_before(jiffies, calc_load_update))
-               return;
-
         /*
          * If we crossed a calc_load_update boundary, make sure to fold
          * any pending idle changes, the respective CPUs might have
@@ -2290,31 +2314,25 @@ static void calc_global_nohz(unsigned long ticks)
                 atomic_long_add(delta, &calc_load_tasks);
  
         /*
-        * If we were idle for multiple load cycles, apply them.
+        * It could be that the one fold was all it took; we are done!
          */
-       if (ticks >= LOAD_FREQ) {
-               n = ticks / LOAD_FREQ;
+       if (time_before(jiffies, calc_load_update + 10))
+               return;
  
-               active = atomic_long_read(&calc_load_tasks);
-               active = active > 0 ? active * FIXED_1 : 0;
+       /*
+        * Catch-up, fold however many we are behind still
+        */
+       delta = jiffies - calc_load_update - 10;
+       n = 1 + (delta / LOAD_FREQ);
  
-               avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
-               avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
-               avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
+       active = atomic_long_read(&calc_load_tasks);
+       active = active > 0 ? active * FIXED_1 : 0;
  
-               calc_load_update += n * LOAD_FREQ;
-       }
+       avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
+       avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
+       avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
  
-       /*
-        * Its possible the remainder of the above division also crosses
-        * a LOAD_FREQ period, the regular check in calc_global_load()
-        * which comes after this will take care of that.
-        *
-        * Consider us being 11 ticks before a cycle completion, and us
-        * sleeping for 4*LOAD_FREQ + 22 ticks, then the above code will
-        * age us 4 cycles, and the test in calc_global_load() will
-        * pick up the final one.
-        */
+       calc_load_update += n * LOAD_FREQ;
  }
  #else
  void calc_load_account_idle(struct rq *this_rq)
@@ -2326,7 +2344,7 @@ static inline long calc_load_fold_idle(void)
         return 0;
  }
  
-static void calc_global_nohz(unsigned long ticks)
+static void calc_global_nohz(void)
  {
  }
  #endif
@@ -2354,8 +2372,6 @@ void calc_global_load(unsigned long ticks)
  {
         long active;
  
-       calc_global_nohz(ticks);
-
         if (time_before(jiffies, calc_load_update + 10))
                 return;
  
@@ -2367,6 +2383,16 @@ void calc_global_load(unsigned long ticks)
         avenrun[2] = calc_load(avenrun[2], EXP_15, active);
  
         calc_load_update += LOAD_FREQ;
+
+       /*
+        * Account one period with whatever state we found before
+        * folding in the nohz state and ageing the entire idle period.
+        *
+        * This avoids losing a sample when we go idle between
+        * calc_load_account_active() (10 ticks ago) and now and thus
+        * under-accounting.
+        */
+       calc_global_nohz();
  }
  
  /*
@@ -2630,6 +2656,48 @@ static inline void task_group_account_field(struct task_struct *p, int index,
  }
  
  
+#if !defined(CONFIG_XEN) || defined(CONFIG_VIRT_CPU_ACCOUNTING)
+# define cputime_to_u64(t) ((__force u64)(t))
+#else
+# include <linux/syscore_ops.h>
+# define NS_PER_TICK (1000000000 / HZ)
+
+static DEFINE_PER_CPU(u64, steal_snapshot);
+static DEFINE_PER_CPU(unsigned int, steal_residual);
+
+static u64 cputime_to_u64(cputime_t t)
+{
+       u64 s = this_vcpu_read(runstate.time[RUNSTATE_runnable]);
+       unsigned long adj = div_u64_rem(s - __this_cpu_read(steal_snapshot)
+                                         + __this_cpu_read(steal_residual),
+                                       NS_PER_TICK,
+                                       &__get_cpu_var(steal_residual));
+
+       __this_cpu_write(steal_snapshot, s);
+       if (t < jiffies_to_cputime(adj))
+               return 0;
+
+       return (__force u64)(t - jiffies_to_cputime(adj));
+}
+
+static void steal_resume(void)
+{
+       cputime_to_u64(((cputime_t)1 << (BITS_PER_LONG * sizeof(cputime_t)
+                                        / sizeof(long) - 1)) - 1);
+}
+
+static struct syscore_ops steal_syscore_ops = {
+       .resume = steal_resume,
+};
+
+static int __init steal_register(void)
+{
+       register_syscore_ops(&steal_syscore_ops);
+       return 0;
+}
+core_initcall(steal_register);
+#endif
+
  /*
   * Account user cpu time to a process.
   * @p: the process that the cpu time gets accounted to
@@ -2649,7 +2717,7 @@ void account_user_time(struct task_struct *p, cputime_t cputime,
         index = (TASK_NICE(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;
  
         /* Add user time to cpustat. */
-       task_group_account_field(p, index, (__force u64) cputime);
+       task_group_account_field(p, index, cputime_to_u64(cputime));
  
         /* Account for user time used */
         acct_update_integrals(p);
@@ -2699,7 +2767,7 @@ void __account_system_time(struct task_struct *p, cputime_t cputime,
         account_group_system_time(p, cputime);
  
         /* Add system time to cpustat. */
-       task_group_account_field(p, index, (__force u64) cputime);
+       task_group_account_field(p, index, cputime_to_u64(cputime));
  
         /* Account for system time used */
         acct_update_integrals(p);
@@ -2753,15 +2821,15 @@ void account_idle_time(cputime_t cputime)
         struct rq *rq = this_rq();
  
         if (atomic_read(&rq->nr_iowait) > 0)
-               cpustat[CPUTIME_IOWAIT] += (__force u64) cputime;
+               cpustat[CPUTIME_IOWAIT] += cputime_to_u64(cputime);
         else
-               cpustat[CPUTIME_IDLE] += (__force u64) cputime;
+               cpustat[CPUTIME_IDLE] += cputime_to_u64(cputime);
  }
  
  static __always_inline bool steal_account_process_tick(void)
  {
  #ifdef CONFIG_PARAVIRT
-       if (static_branch(&paravirt_steal_enabled)) {
+       if (static_key_false(&paravirt_steal_enabled)) {
                 u64 steal, st = 0;
  
                 steal = paravirt_steal_clock(smp_processor_id());
@@ -2811,9 +2879,9 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
                 return;
  
         if (irqtime_account_hi_update()) {
-               cpustat[CPUTIME_IRQ] += (__force u64) cputime_one_jiffy;
+               cpustat[CPUTIME_IRQ] += cputime_to_u64(cputime_one_jiffy);
         } else if (irqtime_account_si_update()) {
-               cpustat[CPUTIME_SOFTIRQ] += (__force u64) cputime_one_jiffy;
+               cpustat[CPUTIME_SOFTIRQ] += cputime_to_u64(cputime_one_jiffy);
         } else if (this_cpu_ksoftirqd() == p) {
                 /*
                  * ksoftirqd time do not get accounted in cpu_softirq_time.
@@ -3076,8 +3144,6 @@ EXPORT_SYMBOL(sub_preempt_count);
   */
  static noinline void __schedule_bug(struct task_struct *prev)
  {
-       struct pt_regs *regs = get_irq_regs();
-
         if (oops_in_progress)
                 return;
  
@@ -3088,11 +3154,7 @@ static noinline void __schedule_bug(struct task_struct *prev)
         print_modules();
         if (irqs_disabled())
                 print_irqtrace_events(prev);
-
-       if (regs)
-               show_regs(regs);
-       else
-               dump_stack();
+       dump_stack();
  }
  
  /*
@@ -3226,14 +3288,14 @@ need_resched:
  
         post_schedule(rq);
  
-       preempt_enable_no_resched();
+       sched_preempt_enable_no_resched();
         if (need_resched())
                 goto need_resched;
  }
  
  static inline void sched_submit_work(struct task_struct *tsk)
  {
-       if (!tsk->state)
+       if (!tsk->state || tsk_is_pi_blocked(tsk))
                 return;
         /*
          * If we are going to sleep and we have plugged IO queued,
@@ -3252,7 +3314,24 @@ asmlinkage void __sched schedule(void)
  }
  EXPORT_SYMBOL(schedule);
  
+/**
+ * schedule_preempt_disabled - called with preemption disabled
+ *
+ * Returns with preemption disabled. Note: preempt_count must be 1
+ */
+void __sched schedule_preempt_disabled(void)
+{
+       sched_preempt_enable_no_resched();
+       schedule();
+       preempt_disable();
+}
+
  #ifdef CONFIG_MUTEX_SPIN_ON_OWNER
+#include <asm/mutex.h>
+
+#ifndef arch_cpu_is_running
+#define arch_cpu_is_running(cpu) true
+#endif
  
  static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
  {
@@ -3267,7 +3346,8 @@ static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
          */
         barrier();
  
-       return owner->on_cpu;
+       return owner->on_cpu
+              && arch_cpu_is_running(task_thread_info(owner)->cpu);
  }
  
  /*
@@ -3412,9 +3492,9 @@ EXPORT_SYMBOL(__wake_up);
  /*
   * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
   */
-void __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
+void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr)
  {
-       __wake_up_common(q, mode, 1, 0, NULL);
+       __wake_up_common(q, mode, nr, 0, NULL);
  }
  EXPORT_SYMBOL_GPL(__wake_up_locked);
  
@@ -3773,6 +3853,24 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
  
         rq = __task_rq_lock(p);
  
+       /*
+        * Idle task boosting is a no-no in general. There is one
+        * exception, when PREEMPT_RT and NOHZ are active:
+        *
+        * The idle task calls get_next_timer_interrupt() and holds
+        * the timer wheel base->lock on the CPU and another CPU wants
+        * to access the timer (probably to cancel it). We can safely
+        * ignore the boosting request, as the idle CPU runs this code
+        * with interrupts disabled and will complete the lock
+        * protected section without being interrupted. So there is no
+        * real need to boost.
+        */
+       if (unlikely(p == rq->idle)) {
+               WARN_ON(p != rq->curr);
+               WARN_ON(p->pi_blocked_on);
+               goto out_unlock;
+       }
+
         trace_sched_pi_setprio(p, prio);
         oldprio = p->prio;
         prev_class = p->sched_class;
@@ -3796,11 +3894,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
                 enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
  
         check_class_changed(rq, p, prev_class, oldprio);
+out_unlock:
         __task_rq_unlock(rq);
  }
-
  #endif
-
  void set_user_nice(struct task_struct *p, long nice)
  {
         int old_prio, delta, on_rq;
@@ -4134,7 +4231,7 @@ recheck:
         on_rq = p->on_rq;
         running = task_current(rq, p);
         if (on_rq)
-               deactivate_task(rq, p, 0);
+               dequeue_task(rq, p, 0);
         if (running)
                 p->sched_class->put_prev_task(rq, p);
  
@@ -4147,7 +4244,7 @@ recheck:
         if (running)
                 p->sched_class->set_curr_task(rq);
         if (on_rq)
-               activate_task(rq, p, 0);
+               enqueue_task(rq, p, 0);
  
         check_class_changed(rq, p, prev_class, oldprio);
         task_rq_unlock(rq, p, &flags);
@@ -4330,7 +4427,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
                 goto out_free_cpus_allowed;
         }
         retval = -EPERM;
-       if (!check_same_owner(p) && !task_ns_capable(p, CAP_SYS_NICE))
+       if (!check_same_owner(p) && !ns_capable(task_user_ns(p), CAP_SYS_NICE))
                 goto out_unlock;
  
         retval = security_task_setscheduler(p);
@@ -4480,7 +4577,7 @@ SYSCALL_DEFINE0(sched_yield)
         __release(rq->lock);
         spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
         do_raw_spin_unlock(&rq->lock);
-       preempt_enable_no_resched();
+       sched_preempt_enable_no_resched();
  
         schedule();
  
@@ -4554,8 +4651,24 @@ EXPORT_SYMBOL(__cond_resched_softirq);
  /**
   * yield - yield the current processor to other threads.
   *
- * This is a shortcut for kernel-space yielding - it marks the
- * thread runnable and calls sys_sched_yield().
+ * Do not ever use this function, there's a 99% chance you're doing it wrong.
+ *
+ * The scheduler is at all times free to pick the calling task as the most
+ * eligible task to run, if removing the yield() call from your code breaks
+ * it, it's already broken.
+ *
+ * Typical broken usage is:
+ *
+ * while (!event)
+ *     yield();
+ *
+ * where one assumes that yield() will let 'the other' process run that will
+ * make event true. If the current task is a SCHED_FIFO task that will never
+ * happen. Never use yield() as a progress guarantee!!
+ *
+ * If you want to use yield() to wait for something, use wait_event().
+ * If you want to use yield() to be 'nice' for others, use cond_resched().
+ * If you still want to use yield(), do not!
   */
  void __sched yield(void)
  {
@@ -4998,9 +5111,9 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
          * placed properly.
          */
         if (p->on_rq) {
-               deactivate_task(rq_src, p, 0);
+               dequeue_task(rq_src, p, 0);
                 set_task_cpu(p, dest_cpu);
-               activate_task(rq_dest, p, 0);
+               enqueue_task(rq_dest, p, 0);
                 check_preempt_curr(rq_dest, p, 0);
         }
  done:
@@ -5176,7 +5289,7 @@ static void sd_free_ctl_entry(struct ctl_table **tablep)
  static void
  set_table_entry(struct ctl_table *entry,
                 const char *procname, void *data, int maxlen,
-               mode_t mode, proc_handler *proc_handler)
+               umode_t mode, proc_handler *proc_handler)
  {
         entry->procname = procname;
         entry->data = data;
@@ -5387,7 +5500,7 @@ static int __cpuinit sched_cpu_active(struct notifier_block *nfb,
                                       unsigned long action, void *hcpu)
  {
         switch (action & ~CPU_TASKS_FROZEN) {
-       case CPU_ONLINE:
+       case CPU_STARTING:
         case CPU_DOWN_FAILED:
                 set_cpu_active((long)hcpu, true);
                 return NOTIFY_OK;
@@ -5759,7 +5872,7 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu)
   *
   * Also keep a unique ID per domain (we use the first cpu number in
   * the cpumask of the domain), this allows us to quickly tell if
- * two cpus are in the same cache domain, see ttwu_share_cache().
+ * two cpus are in the same cache domain, see cpus_share_cache().
   */
  DEFINE_PER_CPU(struct sched_domain *, sd_llc);
  DEFINE_PER_CPU(int, sd_llc_id);
@@ -6317,6 +6430,8 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
                         if (!sg)
                                 return -ENOMEM;
  
+                       sg->next = sg;
+
                         *per_cpu_ptr(sdd->sg, j) = sg;
  
                         sgp = kzalloc_node(sizeof(struct sched_group_power),
@@ -6340,16 +6455,26 @@ static void __sdt_free(const struct cpumask *cpu_map)
                 struct sd_data *sdd = &tl->data;
  
                 for_each_cpu(j, cpu_map) {
-                       struct sched_domain *sd = *per_cpu_ptr(sdd->sd, j);
-                       if (sd && (sd->flags & SD_OVERLAP))
-                               free_sched_groups(sd->groups, 0);
-                       kfree(*per_cpu_ptr(sdd->sd, j));
-                       kfree(*per_cpu_ptr(sdd->sg, j));
-                       kfree(*per_cpu_ptr(sdd->sgp, j));
+                       struct sched_domain *sd;
+
+                       if (sdd->sd) {
+                               sd = *per_cpu_ptr(sdd->sd, j);
+                               if (sd && (sd->flags & SD_OVERLAP))
+                                       free_sched_groups(sd->groups, 0);
+                               kfree(*per_cpu_ptr(sdd->sd, j));
+                       }
+
+                       if (sdd->sg)
+                               kfree(*per_cpu_ptr(sdd->sg, j));
+                       if (sdd->sgp)
+                               kfree(*per_cpu_ptr(sdd->sgp, j));
                 }
                 free_percpu(sdd->sd);
+               sdd->sd = NULL;
                 free_percpu(sdd->sg);
+               sdd->sg = NULL;
                 free_percpu(sdd->sgp);
+               sdd->sgp = NULL;
         }
  }
  
@@ -6675,54 +6800,52 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
  }
  
  #ifdef CONFIG_SCHED_MC
-static ssize_t sched_mc_power_savings_show(struct sysdev_class *class,
-                                          struct sysdev_class_attribute *attr,
-                                          char *page)
+static ssize_t sched_mc_power_savings_show(struct device *dev,
+                                          struct device_attribute *attr,
+                                          char *buf)
  {
-       return sprintf(page, "%u\n", sched_mc_power_savings);
+       return sprintf(buf, "%u\n", sched_mc_power_savings);
  }
-static ssize_t sched_mc_power_savings_store(struct sysdev_class *class,
-                                           struct sysdev_class_attribute *attr,
+static ssize_t sched_mc_power_savings_store(struct device *dev,
+                                           struct device_attribute *attr,
                                             const char *buf, size_t count)
  {
         return sched_power_savings_store(buf, count, 0);
  }
-static SYSDEV_CLASS_ATTR(sched_mc_power_savings, 0644,
-                        sched_mc_power_savings_show,
-                        sched_mc_power_savings_store);
+static DEVICE_ATTR(sched_mc_power_savings, 0644,
+                  sched_mc_power_savings_show,
+                  sched_mc_power_savings_store);
  #endif
  
  #ifdef CONFIG_SCHED_SMT
-static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev,
-                                           struct sysdev_class_attribute *attr,
-                                           char *page)
+static ssize_t sched_smt_power_savings_show(struct device *dev,
+                                           struct device_attribute *attr,
+                                           char *buf)
  {
-       return sprintf(page, "%u\n", sched_smt_power_savings);
+       return sprintf(buf, "%u\n", sched_smt_power_savings);
  }
-static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev,
-                                            struct sysdev_class_attribute *attr,
+static ssize_t sched_smt_power_savings_store(struct device *dev,
+                                           struct device_attribute *attr,
                                              const char *buf, size_t count)
  {
         return sched_power_savings_store(buf, count, 1);
  }
-static SYSDEV_CLASS_ATTR(sched_smt_power_savings, 0644,
+static DEVICE_ATTR(sched_smt_power_savings, 0644,
                    sched_smt_power_savings_show,
                    sched_smt_power_savings_store);
  #endif
  
-int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
+int __init sched_create_sysfs_power_savings_entries(struct device *dev)
  {
         int err = 0;
  
  #ifdef CONFIG_SCHED_SMT
         if (smt_capable())
-               err = sysfs_create_file(&cls->kset.kobj,
-                                       &attr_sched_smt_power_savings.attr);
+               err = device_create_file(dev, &dev_attr_sched_smt_power_savings);
  #endif
  #ifdef CONFIG_SCHED_MC
         if (!err && mc_capable())
-               err = sysfs_create_file(&cls->kset.kobj,
-                                       &attr_sched_mc_power_savings.attr);
+               err = device_create_file(dev, &dev_attr_sched_mc_power_savings);
  #endif
         return err;
  }
@@ -6938,6 +7061,9 @@ void __init sched_init(void)
                 rq->online = 0;
                 rq->idle_stamp = 0;
                 rq->avg_idle = 2*sysctl_sched_migration_cost;
+
+               INIT_LIST_HEAD(&rq->cfs_tasks);
+
                 rq_attach_root(rq, &def_root_domain);
  #ifdef CONFIG_NO_HZ
                 rq->nohz_flags = 0;
@@ -7034,10 +7160,10 @@ static void normalize_task(struct rq *rq, struct task_struct *p)
  
         on_rq = p->on_rq;
         if (on_rq)
-               deactivate_task(rq, p, 0);
+               dequeue_task(rq, p, 0);
         __setscheduler(rq, p, SCHED_NORMAL, 0);
         if (on_rq) {
-               activate_task(rq, p, 0);
+               enqueue_task(rq, p, 0);
                 resched_task(rq->curr);
         }
  
@@ -7532,8 +7658,7 @@ static inline struct task_group *cgroup_tg(struct cgroup *cgrp)
                             struct task_group, css);
  }
  
-static struct cgroup_subsys_state *
-cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
+static struct cgroup_subsys_state *cpu_cgroup_create(struct cgroup *cgrp)
  {
         struct task_group *tg, *parent;
  
@@ -7550,37 +7675,43 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
         return &tg->css;
  }
  
-static void
-cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
+static void cpu_cgroup_destroy(struct cgroup *cgrp)
  {
         struct task_group *tg = cgroup_tg(cgrp);
  
         sched_destroy_group(tg);
  }
  
-static int
-cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
+static int cpu_cgroup_can_attach(struct cgroup *cgrp,
+                                struct cgroup_taskset *tset)
  {
+       struct task_struct *task;
+
+       cgroup_taskset_for_each(task, cgrp, tset) {
  #ifdef CONFIG_RT_GROUP_SCHED
-       if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk))
-               return -EINVAL;
+               if (!sched_rt_can_attach(cgroup_tg(cgrp), task))
+                       return -EINVAL;
  #else
-       /* We don't support RT-tasks being in separate groups */
-       if (tsk->sched_class != &fair_sched_class)
-               return -EINVAL;
+               /* We don't support RT-tasks being in separate groups */
+               if (task->sched_class != &fair_sched_class)
+                       return -EINVAL;
  #endif
+       }
         return 0;
  }
  
-static void
-cpu_cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
+static void cpu_cgroup_attach(struct cgroup *cgrp,
+                             struct cgroup_taskset *tset)
  {
-       sched_move_task(tsk);
+       struct task_struct *task;
+
+       cgroup_taskset_for_each(task, cgrp, tset)
+               sched_move_task(task);
  }
  
  static void
-cpu_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
-               struct cgroup *old_cgrp, struct task_struct *task)
+cpu_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp,
+               struct task_struct *task)
  {
         /*
          * cgroup_exit() is called in the copy_process() failure path.
@@ -7910,8 +8041,8 @@ struct cgroup_subsys cpu_cgroup_subsys = {
         .name           = "cpu",
         .create         = cpu_cgroup_create,
         .destroy        = cpu_cgroup_destroy,
-       .can_attach_task = cpu_cgroup_can_attach_task,
-       .attach_task    = cpu_cgroup_attach_task,
+       .can_attach     = cpu_cgroup_can_attach,
+       .attach         = cpu_cgroup_attach,
         .exit           = cpu_cgroup_exit,
         .populate       = cpu_cgroup_populate,
         .subsys_id      = cpu_cgroup_subsys_id,
@@ -7930,8 +8061,7 @@ struct cgroup_subsys cpu_cgroup_subsys = {
   */
  
  /* create a new cpu accounting group */
-static struct cgroup_subsys_state *cpuacct_create(
-       struct cgroup_subsys *ss, struct cgroup *cgrp)
+static struct cgroup_subsys_state *cpuacct_create(struct cgroup *cgrp)
  {
         struct cpuacct *ca;
  
@@ -7961,8 +8091,7 @@ out:
  }
  
  /* destroy an existing cpu accounting group */
-static void
-cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
+static void cpuacct_destroy(struct cgroup *cgrp)
  {
         struct cpuacct *ca = cgroup_ca(cgrp);