Merge branch 'sched/arch' into sched/urgent

author Ingo Molnar <mingo@kernel.org>

Thu, 29 Mar 2012 10:48:15 +0000 (12:48 +0200)

committer Ingo Molnar <mingo@kernel.org>

Thu, 29 Mar 2012 10:48:22 +0000 (12:48 +0200)
author Ingo Molnar <mingo@kernel.org>
Thu, 29 Mar 2012 10:48:15 +0000 (12:48 +0200)
committer Ingo Molnar <mingo@kernel.org>
Thu, 29 Mar 2012 10:48:22 +0000 (12:48 +0200)
diff --combined kernel/sched/core.c

index 929fd85,423f40f..985f6e5
--- 1/kernel/sched/core.c
--- 2/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@@ -1263,59 -1263,29 +1263,59 @@@ EXPORT_SYMBOL_GPL(kick_process)
    */
   static int select_fallback_rq(int cpu, struct task_struct *p)
   {
- -      int dest_cpu;
         const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu));
+ +      enum { cpuset, possible, fail } state = cpuset;
+ +      int dest_cpu;
   
         /* Look for allowed, online CPU in same node. */
- -      for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask)
+ +      for_each_cpu_mask(dest_cpu, *nodemask) {
+ +              if (!cpu_online(dest_cpu))
+ +                      continue;
+ +              if (!cpu_active(dest_cpu))
+ +                      continue;
                 if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
                         return dest_cpu;
+ +      }
   
- -      /* Any allowed, online CPU? */
- -      dest_cpu = cpumask_any_and(tsk_cpus_allowed(p), cpu_active_mask);
- -      if (dest_cpu < nr_cpu_ids)
- -              return dest_cpu;
+ +      for (;;) {
+ +              /* Any allowed, online CPU? */
+ +              for_each_cpu_mask(dest_cpu, *tsk_cpus_allowed(p)) {
+ +                      if (!cpu_online(dest_cpu))
+ +                              continue;
+ +                      if (!cpu_active(dest_cpu))
+ +                              continue;
+ +                      goto out;
+ +              }
   
- -      /* No more Mr. Nice Guy. */
- -      dest_cpu = cpuset_cpus_allowed_fallback(p);
- -      /*
- -       * Don't tell them about moving exiting tasks or
- -       * kernel threads (both mm NULL), since they never
- -       * leave kernel.
- -       */
- -      if (p->mm && printk_ratelimit()) {
- -              printk(KERN_INFO "process %d (%s) no longer affine to cpu%d\n",
- -                              task_pid_nr(p), p->comm, cpu);
+ +              switch (state) {
+ +              case cpuset:
+ +                      /* No more Mr. Nice Guy. */
+ +                      cpuset_cpus_allowed_fallback(p);
+ +                      state = possible;
+ +                      break;
+ +
+ +              case possible:
+ +                      do_set_cpus_allowed(p, cpu_possible_mask);
+ +                      state = fail;
+ +                      break;
+ +
+ +              case fail:
+ +                      BUG();
+ +                      break;
+ +              }
+ +      }
+ +
+ +out:
+ +      if (state != cpuset) {
+ +              /*
+ +               * Don't tell them about moving exiting tasks or
+ +               * kernel threads (both mm NULL), since they never
+ +               * leave kernel.
+ +               */
+ +              if (p->mm && printk_ratelimit()) {
+ +                      printk_sched("process %d (%s) no longer affine to cpu%d\n",
+ +                                      task_pid_nr(p), p->comm, cpu);
+ +              }
         }
   
         return dest_cpu;
@@@ -1537,7 -1507,7 +1537,7 @@@ static int ttwu_activate_remote(struct 
   }
   #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
   
- -static inline int ttwu_share_cache(int this_cpu, int that_cpu)
+ +bool cpus_share_cache(int this_cpu, int that_cpu)
   {
         return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
   }
@@@ -1548,7 -1518,7 +1548,7 @@@ static void ttwu_queue(struct task_stru
         struct rq *rq = cpu_rq(cpu);
   
   #if defined(CONFIG_SMP)
- -      if (sched_feat(TTWU_QUEUE) && !ttwu_share_cache(smp_processor_id(), cpu)) {
+ +      if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {
                 sched_clock_cpu(cpu); /* sync clocks x-cpu */
                 ttwu_queue_remote(p, cpu);
                 return;
@@@ -1962,6 -1932,7 +1962,7 @@@ static void finish_task_switch(struct r
         local_irq_enable();
   #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
         finish_lock_switch(rq, prev);
+       finish_arch_post_lock_switch();
   
         fire_sched_in_preempt_notifiers(current);
         if (mm)
@@@ -2296,10 -2267,13 +2297,10 @@@ calc_load_n(unsigned long load, unsigne
    * Once we've updated the global active value, we need to apply the exponential
    * weights adjusted to the number of cycles missed.
    */
- -static void calc_global_nohz(unsigned long ticks)
+ +static void calc_global_nohz(void)
   {
         long delta, active, n;
   
- -      if (time_before(jiffies, calc_load_update))
- -              return;
- -
         /*
          * If we crossed a calc_load_update boundary, make sure to fold
          * any pending idle changes, the respective CPUs might have
@@@ -2311,25 -2285,31 +2312,25 @@@
                 atomic_long_add(delta, &calc_load_tasks);
   
         /*
- -       * If we were idle for multiple load cycles, apply them.
+ +       * It could be that the one fold was all it took; we're done!
          */
- -      if (ticks >= LOAD_FREQ) {
- -              n = ticks / LOAD_FREQ;
+ +      if (time_before(jiffies, calc_load_update + 10))
+ +              return;
   
- -              active = atomic_long_read(&calc_load_tasks);
- -              active = active > 0 ? active * FIXED_1 : 0;
+ +      /*
+ +       * Catch-up, fold however many we are behind still
+ +       */
+ +      delta = jiffies - calc_load_update - 10;
+ +      n = 1 + (delta / LOAD_FREQ);
   
- -              avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
- -              avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
- -              avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
+ +      active = atomic_long_read(&calc_load_tasks);
+ +      active = active > 0 ? active * FIXED_1 : 0;
   
- -              calc_load_update += n * LOAD_FREQ;
- -      }
+ +      avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
+ +      avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
+ +      avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
   
- -      /*
- -       * Its possible the remainder of the above division also crosses
- -       * a LOAD_FREQ period, the regular check in calc_global_load()
- -       * which comes after this will take care of that.
- -       *
- -       * Consider us being 11 ticks before a cycle completion, and us
- -       * sleeping for 4*LOAD_FREQ + 22 ticks, then the above code will
- -       * age us 4 cycles, and the test in calc_global_load() will
- -       * pick up the final one.
- -       */
+ +      calc_load_update += n * LOAD_FREQ;
   }
   #else
   void calc_load_account_idle(struct rq *this_rq)
@@@ -2341,7 -2321,7 +2342,7 @@@ static inline long calc_load_fold_idle(
         return 0;
   }
   
- -static void calc_global_nohz(unsigned long ticks)
+ +static void calc_global_nohz(void)
   {
   }
   #endif
@@@ -2369,6 -2349,8 +2370,6 @@@ void calc_global_load(unsigned long tic
   {
         long active;
   
- -      calc_global_nohz(ticks);
- -
         if (time_before(jiffies, calc_load_update + 10))
                 return;
   
@@@ -2380,16 -2362,6 +2381,16 @@@
         avenrun[2] = calc_load(avenrun[2], EXP_15, active);
   
         calc_load_update += LOAD_FREQ;
+ +
+ +      /*
+ +       * Account one period with whatever state we found before
+ +       * folding in the nohz state and ageing the entire idle period.
+ +       *
+ +       * This avoids losing a sample when we go idle between
+ +       * calc_load_account_active() (10 ticks ago) and now and thus
+ +       * under-accounting.
+ +       */
+ +      calc_global_nohz();
   }
   
   /*
@@@ -3099,6 -3071,8 +3100,6 @@@ EXPORT_SYMBOL(sub_preempt_count)
    */
   static noinline void __schedule_bug(struct task_struct *prev)
   {
- -      struct pt_regs *regs = get_irq_regs();
- -
         if (oops_in_progress)
                 return;
   
@@@ -3109,7 -3083,11 +3110,7 @@@
         print_modules();
         if (irqs_disabled())
                 print_irqtrace_events(prev);
- -
- -      if (regs)
- -              show_regs(regs);
- -      else
- -              dump_stack();
+ +      dump_stack();
   }
   
   /*
@@@ -3243,14 -3221,14 +3244,14 @@@ need_resched
   
         post_schedule(rq);
   
- -      preempt_enable_no_resched();
+ +      sched_preempt_enable_no_resched();
         if (need_resched())
                 goto need_resched;
   }
   
   static inline void sched_submit_work(struct task_struct *tsk)
   {
- -      if (!tsk->state)
+ +      if (!tsk->state || tsk_is_pi_blocked(tsk))
                 return;
         /*
          * If we are going to sleep and we have plugged IO queued,
@@@ -3269,18 -3247,6 +3270,18 @@@ asmlinkage void __sched schedule(void
   }
   EXPORT_SYMBOL(schedule);
   
+ +/**
+ + * schedule_preempt_disabled - called with preemption disabled
+ + *
+ + * Returns with preemption disabled. Note: preempt_count must be 1
+ + */
+ +void __sched schedule_preempt_disabled(void)
+ +{
+ +      sched_preempt_enable_no_resched();
+ +      schedule();
+ +      preempt_disable();
+ +}
+ +
   #ifdef CONFIG_MUTEX_SPIN_ON_OWNER
   
   static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
@@@ -3441,9 -3407,9 +3442,9 @@@ EXPORT_SYMBOL(__wake_up)
   /*
    * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
    */
- -void __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
+ +void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr)
   {
- -      __wake_up_common(q, mode, 1, 0, NULL);
+ +      __wake_up_common(q, mode, nr, 0, NULL);
   }
   EXPORT_SYMBOL_GPL(__wake_up_locked);
   
@@@ -3802,24 -3768,6 +3803,24 @@@ void rt_mutex_setprio(struct task_struc
   
         rq = __task_rq_lock(p);
   
+ +      /*
+ +       * Idle task boosting is a no-no in general. There is one
+ +       * exception, when PREEMPT_RT and NOHZ is active:
+ +       *
+ +       * The idle task calls get_next_timer_interrupt() and holds
+ +       * the timer wheel base->lock on the CPU and another CPU wants
+ +       * to access the timer (probably to cancel it). We can safely
+ +       * ignore the boosting request, as the idle CPU runs this code
+ +       * with interrupts disabled and will complete the lock
+ +       * protected section without being interrupted. So there is no
+ +       * real need to boost.
+ +       */
+ +      if (unlikely(p == rq->idle)) {
+ +              WARN_ON(p != rq->curr);
+ +              WARN_ON(p->pi_blocked_on);
+ +              goto out_unlock;
+ +      }
+ +
         trace_sched_pi_setprio(p, prio);
         oldprio = p->prio;
         prev_class = p->sched_class;
@@@ -3843,10 -3791,11 +3844,10 @@@
                 enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
   
         check_class_changed(rq, p, prev_class, oldprio);
+ +out_unlock:
         __task_rq_unlock(rq);
   }
- -
   #endif
- -
   void set_user_nice(struct task_struct *p, long nice)
   {
         int old_prio, delta, on_rq;
@@@ -4526,7 -4475,7 +4527,7 @@@ SYSCALL_DEFINE0(sched_yield
         __release(rq->lock);
         spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
         do_raw_spin_unlock(&rq->lock);
- -      preempt_enable_no_resched();
+ +      sched_preempt_enable_no_resched();
   
         schedule();
   
@@@ -4600,24 -4549,8 +4601,24 @@@ EXPORT_SYMBOL(__cond_resched_softirq)
   /**
    * yield - yield the current processor to other threads.
    *
- - * This is a shortcut for kernel-space yielding - it marks the
- - * thread runnable and calls sys_sched_yield().
+ + * Do not ever use this function, there's a 99% chance you're doing it wrong.
+ + *
+ + * The scheduler is at all times free to pick the calling task as the most
+ + * eligible task to run, if removing the yield() call from your code breaks
+ + * it, it's already broken.
+ + *
+ + * Typical broken usage is:
+ + *
+ + * while (!event)
+ + *    yield();
+ + *
+ + * where one assumes that yield() will let 'the other' process run that will
+ + * make event true. If the current task is a SCHED_FIFO task that will never
+ + * happen. Never use yield() as a progress guarantee!!
+ + *
+ + * If you want to use yield() to wait for something, use wait_event().
+ + * If you want to use yield() to be 'nice' for others, use cond_resched().
+ + * If you still want to use yield(), do not!
    */
   void __sched yield(void)
   {
@@@ -5449,7 -5382,7 +5450,7 @@@ static int __cpuinit sched_cpu_active(s
                                       unsigned long action, void *hcpu)
   {
         switch (action & ~CPU_TASKS_FROZEN) {
- -      case CPU_ONLINE:
+ +      case CPU_STARTING:
         case CPU_DOWN_FAILED:
                 set_cpu_active((long)hcpu, true);
                 return NOTIFY_OK;
@@@ -5821,7 -5754,7 +5822,7 @@@ static void destroy_sched_domains(struc
    *
    * Also keep a unique ID per domain (we use the first cpu number in
    * the cpumask of the domain), this allows us to quickly tell if
- - * two cpus are in the same cache domain, see ttwu_share_cache().
+ + * two cpus are in the same cache domain, see cpus_share_cache().
    */
   DEFINE_PER_CPU(struct sched_domain *, sd_llc);
   DEFINE_PER_CPU(int, sd_llc_id);
@@@ -6998,9 -6931,6 +6999,9 @@@ void __init sched_init(void
                 rq->online = 0;
                 rq->idle_stamp = 0;
                 rq->avg_idle = 2*sysctl_sched_migration_cost;
+ +
+ +              INIT_LIST_HEAD(&rq->cfs_tasks);
+ +
                 rq_attach_root(rq, &def_root_domain);
   #ifdef CONFIG_NO_HZ
                 rq->nohz_flags = 0;
diff --combined kernel/sched/sched.h

index 753bdd5,d72483d..2f7a723
--- 1/kernel/sched/sched.h
--- 2/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@@ -36,7 -36,11 +36,7 @@@ extern __read_mostly int scheduler_runn
   
   /*
    * These are the 'tuning knobs' of the scheduler:
- - *
- - * default timeslice is 100 msecs (used only for SCHED_RR tasks).
- - * Timeslices get refilled after they expire.
    */
- -#define DEF_TIMESLICE         (100 * HZ / 1000)
   
   /*
    * single value that denotes runtime == period, ie unlimited time.
@@@ -212,6 -216,9 +212,6 @@@ struct cfs_rq 
         struct rb_root tasks_timeline;
         struct rb_node *rb_leftmost;
   
- -      struct list_head tasks;
- -      struct list_head *balance_iterator;
- -
         /*
          * 'curr' points to currently running entity on this cfs_rq.
          * It is set to NULL otherwise (i.e when none are currently running).
@@@ -239,6 -246,11 +239,6 @@@
   
   #ifdef CONFIG_SMP
         /*
- -       * the part of load.weight contributed by tasks
- -       */
- -      unsigned long task_weight;
- -
- -      /*
          *   h_load = weight * f(tg)
          *
          * Where f(tg) is the recursive weight fraction assigned to
@@@ -412,8 -424,6 +412,8 @@@ struct rq 
         int cpu;
         int online;
   
+ +      struct list_head cfs_tasks;
+ +
         u64 rt_avg;
         u64 age_stamp;
         u64 idle_stamp;
@@@ -452,6 -462,7 +452,6 @@@
         unsigned int yld_count;
   
         /* schedule() stats */
- -      unsigned int sched_switch;
         unsigned int sched_count;
         unsigned int sched_goidle;
   
@@@ -681,6 -692,9 +681,9 @@@ static inline int task_running(struct r
   #ifndef finish_arch_switch
   # define finish_arch_switch(prev)     do { } while (0)
   #endif
+ #ifndef finish_arch_post_lock_switch
+ # define finish_arch_post_lock_switch()       do { } while (0)
+ #endif
   
   #ifndef __ARCH_WANT_UNLOCKED_CTXSW
   static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
author	Ingo Molnar <mingo@kernel.org>
	Thu, 29 Mar 2012 10:48:15 +0000 (12:48 +0200)
committer	Ingo Molnar <mingo@kernel.org>
	Thu, 29 Mar 2012 10:48:22 +0000 (12:48 +0200)
		1	2
kernel/sched/core.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/sched/sched.h	patch \|	diff1 \|	diff2 \|	blob \| history