UBUNTU: Ubuntu-2.6.38-12.51

[linux-flexiantxendom0-natty.git] / kernel / sched_fair.c
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c

index d52b97a..7406f36 100644 (file)
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -89,6 +89,13 @@ unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
  
  const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
  
+/*
+ * The exponential sliding window over which load is averaged for shares
+ * distribution.
+ * (default: 10msec)
+ */
+unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
+
  static const struct sched_class fair_sched_class;
  
  /**************************************************************
@@ -146,8 +153,20 @@ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
  static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
  {
         if (!cfs_rq->on_list) {
-               list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
+               /*
+                * Ensure we either appear before our parent (if already
+                * enqueued) or force our parent to appear after us when it is
+                * enqueued.  The fact that we always enqueue bottom-up
+                * reduces this to two cases.
+                */
+               if (cfs_rq->tg->parent &&
+                   cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
+                       list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
+                               &rq_of(cfs_rq)->leaf_cfs_rq_list);
+               } else {
+                       list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
                                 &rq_of(cfs_rq)->leaf_cfs_rq_list);
+               }
  
                 cfs_rq->on_list = 1;
         }
@@ -520,6 +539,9 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
         return calc_delta_fair(sched_slice(cfs_rq, se), se);
  }
  
+static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update);
+static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta);
+
  /*
   * Update the current task's runtime statistics. Skip current tasks that
   * are not in our scheduling class.
@@ -539,6 +561,10 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
  
         curr->vruntime += delta_exec_weighted;
         update_min_vruntime(cfs_rq);
+
+#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
+       cfs_rq->load_unacc_exec_time += delta_exec;
+#endif
  }
  
  static void update_curr(struct cfs_rq *cfs_rq)
@@ -673,21 +699,54 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
         cfs_rq->nr_running--;
  }
  
-#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
-static void update_cfs_load(struct cfs_rq *cfs_rq, int lb)
+#ifdef CONFIG_FAIR_GROUP_SCHED
+# ifdef CONFIG_SMP
+static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq,
+                                           int global_update)
+{
+       struct task_group *tg = cfs_rq->tg;
+       long load_avg;
+
+       load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1);
+       load_avg -= cfs_rq->load_contribution;
+
+       if (global_update || abs(load_avg) > cfs_rq->load_contribution / 8) {
+               atomic_add(load_avg, &tg->load_weight);
+               cfs_rq->load_contribution += load_avg;
+       }
+}
+
+static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
  {
-       u64 period = sched_avg_period();
+       u64 period = sysctl_sched_shares_window;
         u64 now, delta;
+       unsigned long load = cfs_rq->load.weight;
  
-       if (!cfs_rq)
+       if (cfs_rq->tg == &root_task_group)
                 return;
  
-       now = rq_of(cfs_rq)->clock;
+       now = rq_of(cfs_rq)->clock_task;
         delta = now - cfs_rq->load_stamp;
  
+       /* truncate load history at 4 idle periods */
+       if (cfs_rq->load_stamp > cfs_rq->load_last &&
+           now - cfs_rq->load_last > 4 * period) {
+               cfs_rq->load_period = 0;
+               cfs_rq->load_avg = 0;
+       }
+
         cfs_rq->load_stamp = now;
+       cfs_rq->load_unacc_exec_time = 0;
         cfs_rq->load_period += delta;
-       cfs_rq->load_avg += delta * cfs_rq->load.weight;
+       if (load) {
+               cfs_rq->load_last = now;
+               cfs_rq->load_avg += delta * load;
+       }
+
+       /* consider updating load contribution on each fold or truncate */
+       if (global_update || cfs_rq->load_period > period
+           || !cfs_rq->load_period)
+               update_cfs_rq_load_contribution(cfs_rq, global_update);
  
         while (cfs_rq->load_period > period) {
                 /*
@@ -700,17 +759,64 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int lb)
                 cfs_rq->load_avg /= 2;
         }
  
-       if (lb && !cfs_rq->nr_running) {
-               if (cfs_rq->load_avg < (period / 8))
-                       list_del_leaf_cfs_rq(cfs_rq);
+       if (!cfs_rq->curr && !cfs_rq->nr_running && !cfs_rq->load_avg)
+               list_del_leaf_cfs_rq(cfs_rq);
+}
+
+static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg,
+                               long weight_delta)
+{
+       long load_weight, load, shares;
+
+       load = cfs_rq->load.weight + weight_delta;
+
+       load_weight = atomic_read(&tg->load_weight);
+       load_weight -= cfs_rq->load_contribution;
+       load_weight += load;
+
+       shares = (tg->shares * load);
+       if (load_weight)
+               shares /= load_weight;
+
+       if (shares < MIN_SHARES)
+               shares = MIN_SHARES;
+       if (shares > tg->shares)
+               shares = tg->shares;
+
+       return shares;
+}
+
+static void update_entity_shares_tick(struct cfs_rq *cfs_rq)
+{
+       if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) {
+               update_cfs_load(cfs_rq, 0);
+               update_cfs_shares(cfs_rq, 0);
         }
  }
+# else /* CONFIG_SMP */
+static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
+{
+}
+
+static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg,
+                               long weight_delta)
+{
+       return tg->shares;
+}
  
+static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq)
+{
+}
+# endif /* CONFIG_SMP */
  static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
                             unsigned long weight)
  {
-       if (se->on_rq)
+       if (se->on_rq) {
+               /* commit outstanding execution time */
+               if (cfs_rq->curr == se)
+                       update_curr(cfs_rq);
                 account_entity_dequeue(cfs_rq, se);
+       }
  
         update_load_set(&se->load, weight);
  
@@ -722,41 +828,32 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta)
  {
         struct task_group *tg;
         struct sched_entity *se;
-       long load_weight, load, shares;
-
-       if (!cfs_rq)
-               return;
+       long shares;
  
         tg = cfs_rq->tg;
         se = tg->se[cpu_of(rq_of(cfs_rq))];
         if (!se)
                 return;
-
-       load = cfs_rq->load.weight + weight_delta;
-
-       load_weight = atomic_read(&tg->load_weight);
-       load_weight -= cfs_rq->load_contribution;
-       load_weight += load;
-
-       shares = (tg->shares * load);
-       if (load_weight)
-               shares /= load_weight;
-
-       if (shares < MIN_SHARES)
-               shares = MIN_SHARES;
-       if (shares > tg->shares)
-               shares = tg->shares;
+#ifndef CONFIG_SMP
+       if (likely(se->load.weight == tg->shares))
+               return;
+#endif
+       shares = calc_cfs_shares(cfs_rq, tg, weight_delta);
  
         reweight_entity(cfs_rq_of(se), se, shares);
  }
  #else /* CONFIG_FAIR_GROUP_SCHED */
-static inline void update_cfs_load(struct cfs_rq *cfs_rq, int lb)
+static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
  {
  }
  
  static inline void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta)
  {
  }
+
+static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq)
+{
+}
  #endif /* CONFIG_FAIR_GROUP_SCHED */
  
  static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -990,6 +1087,9 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
                 struct sched_entity *se = __pick_next_entity(cfs_rq);
                 s64 delta = curr->vruntime - se->vruntime;
  
+               if (delta < 0)
+                       return;
+
                 if (delta > ideal_runtime)
                         resched_task(rq_of(cfs_rq)->curr);
         }
@@ -1073,6 +1173,11 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
          */
         update_curr(cfs_rq);
  
+       /*
+        * Update share accounting for long-running entities.
+        */
+       update_entity_shares_tick(cfs_rq);
+
  #ifdef CONFIG_SCHED_HRTICK
         /*
          * queued ticks are scheduled to match the slice, so don't bother
@@ -1285,27 +1390,27 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
                 return wl;
  
         for_each_sched_entity(se) {
-               long S, rw, s, a, b;
+               long lw, w;
  
-               S = se->my_q->tg->shares;
-               s = se->load.weight;
-               rw = se->my_q->load.weight;
+               tg = se->my_q->tg;
+               w = se->my_q->load.weight;
  
-               a = S*(rw + wl);
-               b = S*rw + s*wg;
+               /* use this cpu's instantaneous contribution */
+               lw = atomic_read(&tg->load_weight);
+               lw -= se->my_q->load_contribution;
+               lw += w + wg;
  
-               wl = s*(a-b);
+               wl += w;
  
-               if (likely(b))
-                       wl /= b;
+               if (lw > 0 && wl < lw)
+                       wl = (wl * tg->shares) / lw;
+               else
+                       wl = tg->shares;
  
-               /*
-                * Assume the group is already running and will
-                * thus already be accounted for in the weight.
-                *
-                * That is, moving shares between CPUs, does not
-                * alter the group weight.
-                */
+               /* zero point is MIN_SHARES */
+               if (wl < MIN_SHARES)
+                       wl = MIN_SHARES;
+               wl -= se->load.weight;
                 wg = 0;
         }
  
@@ -1324,7 +1429,7 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu,
  
  static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
  {
-       unsigned long this_load, load;
+       s64 this_load, load;
         int idx, this_cpu, prev_cpu;
         unsigned long tl_per_task;
         struct task_group *tg;
@@ -1363,8 +1468,8 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
          * Otherwise check if either cpus are near enough in load to allow this
          * task to be woken on this_cpu.
          */
-       if (this_load) {
-               unsigned long this_eff_load, prev_eff_load;
+       if (this_load > 0) {
+               s64 this_eff_load, prev_eff_load;
  
                 this_eff_load = 100;
                 this_eff_load *= power_of(prev_cpu);
@@ -1739,12 +1844,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
         struct cfs_rq *cfs_rq = task_cfs_rq(curr);
         int scale = cfs_rq->nr_running >= sched_nr_latency;
  
-       if (unlikely(rt_prio(p->prio)))
-               goto preempt;
-
-       if (unlikely(p->sched_class != &fair_sched_class))
-               return;
-
         if (unlikely(se == pse))
                 return;
  
@@ -1849,10 +1948,6 @@ static void pull_task(struct rq *src_rq, struct task_struct *p,
         set_task_cpu(p, this_cpu);
         activate_task(this_rq, p, 0);
         check_preempt_curr(this_rq, p, 0);
-
-       /* re-arm NEWIDLE balancing when moving tasks */
-       src_rq->avg_idle = this_rq->avg_idle = 2*sysctl_sched_migration_cost;
-       this_rq->idle_stamp = 0;
  }
  
  /*
@@ -1948,21 +2043,20 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
               enum cpu_idle_type idle, int *all_pinned,
               int *this_best_prio, struct cfs_rq *busiest_cfs_rq)
  {
-       int loops = 0, pulled = 0, pinned = 0;
+       int loops = 0, pulled = 0;
         long rem_load_move = max_load_move;
         struct task_struct *p, *n;
  
         if (max_load_move == 0)
                 goto out;
  
-       pinned = 1;
-
         list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) {
                 if (loops++ > sysctl_sched_nr_migrate)
                         break;
  
                 if ((p->se.load.weight >> 1) > rem_load_move ||
-                   !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned))
+                   !can_migrate_task(p, busiest, this_cpu, sd, idle,
+                                     all_pinned))
                         continue;
  
                 pull_task(busiest, p, this_rq, this_cpu);
@@ -1997,9 +2091,6 @@ out:
          */
         schedstat_add(sd, lb_gained[idle], pulled);
  
-       if (all_pinned)
-               *all_pinned = pinned;
-
         return max_load_move - rem_load_move;
  }
  
@@ -2007,12 +2098,11 @@ out:
  /*
   * update tg->load_weight by folding this cpu's load_avg
   */
-static int tg_shares_up(struct task_group *tg, int cpu)
+static int update_shares_cpu(struct task_group *tg, int cpu)
  {
         struct cfs_rq *cfs_rq;
         unsigned long flags;
         struct rq *rq;
-       long load_avg;
  
         if (!tg->se[cpu])
                 return 0;
@@ -2025,11 +2115,6 @@ static int tg_shares_up(struct task_group *tg, int cpu)
         update_rq_clock(rq);
         update_cfs_load(cfs_rq, 1);
  
-       load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1);
-       load_avg -= cfs_rq->load_contribution;
-       atomic_add(load_avg, &tg->load_weight);
-       cfs_rq->load_contribution += load_avg;
-
         /*
          * We need to update shares after updating tg->load_weight in
          * order to adjust the weight of groups with long running tasks.
@@ -2047,14 +2132,8 @@ static void update_shares(int cpu)
         struct rq *rq = cpu_rq(cpu);
  
         rcu_read_lock();
-       for_each_leaf_cfs_rq(rq, cfs_rq) {
-               struct task_group *tg = cfs_rq->tg;
-
-               do {
-                       tg_shares_up(tg, cpu);
-                       tg = tg->parent;
-               } while (tg);
-       }
+       for_each_leaf_cfs_rq(rq, cfs_rq)
+               update_shares_cpu(cfs_rq->tg, cpu);
         rcu_read_unlock();
  }
  
@@ -2178,13 +2257,16 @@ struct sd_lb_stats {
         unsigned long this_load_per_task;
         unsigned long this_nr_running;
         unsigned long this_has_capacity;
+       unsigned int  this_idle_cpus;
  
         /* Statistics of the busiest group */
+       unsigned int  busiest_idle_cpus;
         unsigned long max_load;
         unsigned long busiest_load_per_task;
         unsigned long busiest_nr_running;
         unsigned long busiest_group_capacity;
         unsigned long busiest_has_capacity;
+       unsigned int  busiest_group_weight;
  
         int group_imb; /* Is there imbalance in this sd */
  #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
@@ -2206,6 +2288,8 @@ struct sg_lb_stats {
         unsigned long sum_nr_running; /* Nr tasks running in the group */
         unsigned long sum_weighted_load; /* Weighted load of group's tasks */
         unsigned long group_capacity;
+       unsigned long idle_cpus;
+       unsigned long group_weight;
         int group_imb; /* Is there an imbalance in the group ? */
         int group_has_capacity; /* Is there extra capacity in the group? */
  };
@@ -2574,7 +2658,8 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
                 sgs->group_load += load;
                 sgs->sum_nr_running += rq->nr_running;
                 sgs->sum_weighted_load += weighted_cpuload(i);
-
+               if (idle_cpu(i))
+                       sgs->idle_cpus++;
         }
  
         /*
@@ -2612,6 +2697,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
         sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
         if (!sgs->group_capacity)
                 sgs->group_capacity = fix_small_capacity(sd, group);
+       sgs->group_weight = group->group_weight;
  
         if (sgs->group_capacity > sgs->sum_nr_running)
                 sgs->group_has_capacity = 1;
@@ -2719,13 +2805,16 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
                         sds->this_nr_running = sgs.sum_nr_running;
                         sds->this_load_per_task = sgs.sum_weighted_load;
                         sds->this_has_capacity = sgs.group_has_capacity;
+                       sds->this_idle_cpus = sgs.idle_cpus;
                 } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) {
                         sds->max_load = sgs.avg_load;
                         sds->busiest = sg;
                         sds->busiest_nr_running = sgs.sum_nr_running;
+                       sds->busiest_idle_cpus = sgs.idle_cpus;
                         sds->busiest_group_capacity = sgs.group_capacity;
                         sds->busiest_load_per_task = sgs.sum_weighted_load;
                         sds->busiest_has_capacity = sgs.group_has_capacity;
+                       sds->busiest_group_weight = sgs.group_weight;
                         sds->group_imb = sgs.group_imb;
                 }
  
@@ -3003,8 +3092,26 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
         if (sds.this_load >= sds.avg_load)
                 goto out_balanced;
  
-       if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
-               goto out_balanced;
+       /*
+        * In the CPU_NEWLY_IDLE case, use imbalance_pct to be conservative.
+        * To check for busy balance, use !idle_cpu() instead of
+        * CPU_NOT_IDLE, because HT siblings will use CPU_NOT_IDLE
+        * even when they are idle.
+        */
+       if (idle == CPU_NEWLY_IDLE || !idle_cpu(this_cpu)) {
+               if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
+                       goto out_balanced;
+       } else {
+               /*
+                * This cpu is idle. If the busiest group doesn't have
+                * more tasks than the number of available cpus and
+                * there is no imbalance between this and the busiest group
+                * wrt idle cpus, it is balanced.
+                */
+               if ((sds.this_idle_cpus  <= sds.busiest_idle_cpus + 1) &&
+                   sds.busiest_nr_running <= sds.busiest_group_weight)
+                       goto out_balanced;
+       }
  
  force_balance:
         /* Looks like there is an imbalance. Compute it */
@@ -3186,6 +3293,7 @@ redo:
                  * still unbalanced. ld_moved simply stays zero, so it is
                  * correctly treated as an imbalance.
                  */
+               all_pinned = 1;
                 local_irq_save(flags);
                 double_rq_lock(this_rq, busiest);
                 ld_moved = move_tasks(this_rq, this_cpu, busiest,
@@ -3321,6 +3429,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
          */
         raw_spin_unlock(&this_rq->lock);
  
+       update_shares(this_cpu);
         for_each_domain(this_cpu, sd) {
                 unsigned long interval;
                 int balance = 1;
@@ -3337,8 +3446,10 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
                 interval = msecs_to_jiffies(sd->balance_interval);
                 if (time_after(next_balance, sd->last_balance + interval))
                         next_balance = sd->last_balance + interval;
-               if (pulled_task)
+               if (pulled_task) {
+                       this_rq->idle_stamp = 0;
                         break;
+               }
         }
  
         raw_spin_lock(&this_rq->lock);