const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
+/*
+ * The exponential sliding window over which load is averaged for shares
+ * distribution.
+ * (default: 10msec)
+ */
+unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
+
static const struct sched_class fair_sched_class;
/**************************************************************
return calc_delta_fair(sched_slice(cfs_rq, se), se);
}
+static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update);
+static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta);
+
/*
* Update the current task's runtime statistics. Skip current tasks that
* are not in our scheduling class.
curr->vruntime += delta_exec_weighted;
update_min_vruntime(cfs_rq);
+
+#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
+ cfs_rq->load_unacc_exec_time += delta_exec;
+#endif
}
static void update_curr(struct cfs_rq *cfs_rq)
cfs_rq->nr_running--;
}
-#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
-static void update_cfs_load(struct cfs_rq *cfs_rq)
+#ifdef CONFIG_FAIR_GROUP_SCHED
+# ifdef CONFIG_SMP
+static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq,
+ int global_update)
{
- u64 period = sched_avg_period();
+ struct task_group *tg = cfs_rq->tg;
+ long load_avg;
+ /*
+ * Publish this cfs_rq's averaged load to its task group: the change
+ * since the last publication is added to tg->load_weight and recorded
+ * in cfs_rq->load_contribution. The average is load_avg divided by
+ * load_period + 1, so the divisor is non-zero when load_period is 0.
+ */
+ load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1);
+ load_avg -= cfs_rq->load_contribution; /* delta vs. last published value */
+ /* Publish when forced, or when the delta exceeds 1/8 of the contribution. */
+ if (global_update || abs(load_avg) > cfs_rq->load_contribution / 8) {
+ atomic_add(load_avg, &tg->load_weight);
+ cfs_rq->load_contribution += load_avg;
+ }
+}
+
+static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
+{
+ u64 period = sysctl_sched_shares_window;
u64 now, delta;
unsigned long load = cfs_rq->load.weight;
- if (!cfs_rq)
+ if (cfs_rq->tg == &root_task_group)
return;
- now = rq_of(cfs_rq)->clock;
+ now = rq_of(cfs_rq)->clock_task;
delta = now - cfs_rq->load_stamp;
/* truncate load history at 4 idle periods */
}
cfs_rq->load_stamp = now;
+ cfs_rq->load_unacc_exec_time = 0;
cfs_rq->load_period += delta;
if (load) {
cfs_rq->load_last = now;
cfs_rq->load_avg += delta * load;
}
+ /* consider updating load contribution on each fold or truncate */
+ if (global_update || cfs_rq->load_period > period
+ || !cfs_rq->load_period)
+ update_cfs_rq_load_contribution(cfs_rq, global_update);
+
while (cfs_rq->load_period > period) {
/*
* Inline assembly required to prevent the compiler
list_del_leaf_cfs_rq(cfs_rq);
}
+static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg,
+ long weight_delta)
+{
+ long load_weight, load, shares;
+ /*
+ * Compute the portion of tg->shares for this cfs_rq: tg->shares scaled
+ * by load / load_weight and clamped to [MIN_SHARES, tg->shares].
+ * weight_delta is added to the queue's current load weight first.
+ */
+ load = cfs_rq->load.weight + weight_delta;
+ /* Group total with this queue's published contribution replaced by load. */
+ load_weight = atomic_read(&tg->load_weight);
+ load_weight -= cfs_rq->load_contribution;
+ load_weight += load;
+ /* The division is skipped when load_weight is 0. */
+ shares = (tg->shares * load);
+ if (load_weight)
+ shares /= load_weight;
+ /* Clamp the result to [MIN_SHARES, tg->shares]. */
+ if (shares < MIN_SHARES)
+ shares = MIN_SHARES;
+ if (shares > tg->shares)
+ shares = tg->shares;
+
+ return shares;
+}
+
+static void update_entity_shares_tick(struct cfs_rq *cfs_rq)
+{ /* Refresh load and shares once unaccounted exec time exceeds the shares window. */
+ if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) {
+ update_cfs_load(cfs_rq, 0); /* non-global update */
+ update_cfs_shares(cfs_rq, 0); /* weight_delta of 0 */
+ }
+}
+# else /* CONFIG_SMP */
+static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
+{ /* No-op stub for the !CONFIG_SMP build. */
+}
+/* !CONFIG_SMP variant: always returns the group's full tg->shares. */
+static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg,
+ long weight_delta)
+{
+ return tg->shares;
+}
+/* No-op stub for the !CONFIG_SMP build. */
+static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq)
+{
+}
+# endif /* CONFIG_SMP */
static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
unsigned long weight)
{
- if (se->on_rq)
+ if (se->on_rq) {
+ /* commit outstanding execution time */
+ if (cfs_rq->curr == se)
+ update_curr(cfs_rq);
account_entity_dequeue(cfs_rq, se);
+ }
update_load_set(&se->load, weight);
{
struct task_group *tg;
struct sched_entity *se;
- long load_weight, load, shares;
-
- if (!cfs_rq)
- return;
+ long shares;
tg = cfs_rq->tg;
se = tg->se[cpu_of(rq_of(cfs_rq))];
if (!se)
return;
-
- load = cfs_rq->load.weight + weight_delta;
-
- load_weight = atomic_read(&tg->load_weight);
- load_weight -= cfs_rq->load_contribution;
- load_weight += load;
-
- shares = (tg->shares * load);
- if (load_weight)
- shares /= load_weight;
-
- if (shares < MIN_SHARES)
- shares = MIN_SHARES;
- if (shares > tg->shares)
- shares = tg->shares;
+#ifndef CONFIG_SMP
+ if (likely(se->load.weight == tg->shares))
+ return;
+#endif
+ shares = calc_cfs_shares(cfs_rq, tg, weight_delta);
reweight_entity(cfs_rq_of(se), se, shares);
}
#else /* CONFIG_FAIR_GROUP_SCHED */
-static inline void update_cfs_load(struct cfs_rq *cfs_rq)
+static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
{
}
static inline void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta)
{
}
+/* No-op stub used when CONFIG_FAIR_GROUP_SCHED is not set. */
+static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq)
+{
+}
#endif /* CONFIG_FAIR_GROUP_SCHED */
static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
* Update run-time statistics of the 'current'.
*/
update_curr(cfs_rq);
- update_cfs_load(cfs_rq);
+ update_cfs_load(cfs_rq, 0);
update_cfs_shares(cfs_rq, se->load.weight);
account_entity_enqueue(cfs_rq, se);
if (se != cfs_rq->curr)
__dequeue_entity(cfs_rq, se);
se->on_rq = 0;
- update_cfs_load(cfs_rq);
+ update_cfs_load(cfs_rq, 0);
account_entity_dequeue(cfs_rq, se);
update_min_vruntime(cfs_rq);
update_cfs_shares(cfs_rq, 0);
struct sched_entity *se = __pick_next_entity(cfs_rq);
s64 delta = curr->vruntime - se->vruntime;
+ if (delta < 0)
+ return;
+
if (delta > ideal_runtime)
resched_task(rq_of(cfs_rq)->curr);
}
*/
update_curr(cfs_rq);
+ /*
+ * Update share accounting for long-running entities.
+ */
+ update_entity_shares_tick(cfs_rq);
+
#ifdef CONFIG_SCHED_HRTICK
/*
* queued ticks are scheduled to match the slice, so don't bother
for_each_sched_entity(se) {
struct cfs_rq *cfs_rq = cfs_rq_of(se);
- update_cfs_load(cfs_rq);
+ update_cfs_load(cfs_rq, 0);
update_cfs_shares(cfs_rq, 0);
}
for_each_sched_entity(se) {
struct cfs_rq *cfs_rq = cfs_rq_of(se);
- update_cfs_load(cfs_rq);
+ update_cfs_load(cfs_rq, 0);
update_cfs_shares(cfs_rq, 0);
}
return wl;
for_each_sched_entity(se) {
- long S, rw, s, a, b;
+ long lw, w;
- S = se->my_q->tg->shares;
- s = se->load.weight;
- rw = se->my_q->load.weight;
+ tg = se->my_q->tg;
+ w = se->my_q->load.weight;
- a = S*(rw + wl);
- b = S*rw + s*wg;
+ /* use this cpu's instantaneous contribution */
+ lw = atomic_read(&tg->load_weight);
+ lw -= se->my_q->load_contribution;
+ lw += w + wg;
- wl = s*(a-b);
+ wl += w;
- if (likely(b))
- wl /= b;
+ if (lw > 0 && wl < lw)
+ wl = (wl * tg->shares) / lw;
+ else
+ wl = tg->shares;
- /*
- * Assume the group is already running and will
- * thus already be accounted for in the weight.
- *
- * That is, moving shares between CPUs, does not
- * alter the group weight.
- */
+ /* zero point is MIN_SHARES */
+ if (wl < MIN_SHARES)
+ wl = MIN_SHARES;
+ wl -= se->load.weight;
wg = 0;
}
static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
{
- unsigned long this_load, load;
+ s64 this_load, load;
int idx, this_cpu, prev_cpu;
unsigned long tl_per_task;
struct task_group *tg;
* Otherwise check if either cpus are near enough in load to allow this
* task to be woken on this_cpu.
*/
- if (this_load) {
- unsigned long this_eff_load, prev_eff_load;
+ if (this_load > 0) {
+ s64 this_eff_load, prev_eff_load;
this_eff_load = 100;
this_eff_load *= power_of(prev_cpu);
struct cfs_rq *cfs_rq = task_cfs_rq(curr);
int scale = cfs_rq->nr_running >= sched_nr_latency;
- if (unlikely(rt_prio(p->prio)))
- goto preempt;
-
- if (unlikely(p->sched_class != &fair_sched_class))
- return;
-
if (unlikely(se == pse))
return;
set_task_cpu(p, this_cpu);
activate_task(this_rq, p, 0);
check_preempt_curr(this_rq, p, 0);
-
- /* re-arm NEWIDLE balancing when moving tasks */
- src_rq->avg_idle = this_rq->avg_idle = 2*sysctl_sched_migration_cost;
- this_rq->idle_stamp = 0;
}
/*
enum cpu_idle_type idle, int *all_pinned,
int *this_best_prio, struct cfs_rq *busiest_cfs_rq)
{
- int loops = 0, pulled = 0, pinned = 0;
+ int loops = 0, pulled = 0;
long rem_load_move = max_load_move;
struct task_struct *p, *n;
if (max_load_move == 0)
goto out;
- pinned = 1;
-
list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) {
if (loops++ > sysctl_sched_nr_migrate)
break;
if ((p->se.load.weight >> 1) > rem_load_move ||
- !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned))
+ !can_migrate_task(p, busiest, this_cpu, sd, idle,
+ all_pinned))
continue;
pull_task(busiest, p, this_rq, this_cpu);
*/
schedstat_add(sd, lb_gained[idle], pulled);
- if (all_pinned)
- *all_pinned = pinned;
-
return max_load_move - rem_load_move;
}
struct cfs_rq *cfs_rq;
unsigned long flags;
struct rq *rq;
- long load_avg;
if (!tg->se[cpu])
return 0;
raw_spin_lock_irqsave(&rq->lock, flags);
update_rq_clock(rq);
- update_cfs_load(cfs_rq);
-
- load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1);
- load_avg -= cfs_rq->load_contribution;
- atomic_add(load_avg, &tg->load_weight);
- cfs_rq->load_contribution += load_avg;
+ update_cfs_load(cfs_rq, 1);
/*
* We need to update shares after updating tg->load_weight in
unsigned long this_load_per_task;
unsigned long this_nr_running;
unsigned long this_has_capacity;
+ unsigned int this_idle_cpus;
/* Statistics of the busiest group */
+ unsigned int busiest_idle_cpus;
unsigned long max_load;
unsigned long busiest_load_per_task;
unsigned long busiest_nr_running;
unsigned long busiest_group_capacity;
unsigned long busiest_has_capacity;
+ unsigned int busiest_group_weight;
int group_imb; /* Is there imbalance in this sd */
#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
unsigned long sum_nr_running; /* Nr tasks running in the group */
unsigned long sum_weighted_load; /* Weighted load of group's tasks */
unsigned long group_capacity;
+ unsigned long idle_cpus;
+ unsigned long group_weight;
int group_imb; /* Is there an imbalance in the group ? */
int group_has_capacity; /* Is there extra capacity in the group? */
};
sgs->group_load += load;
sgs->sum_nr_running += rq->nr_running;
sgs->sum_weighted_load += weighted_cpuload(i);
-
+ if (idle_cpu(i))
+ sgs->idle_cpus++;
}
/*
sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
if (!sgs->group_capacity)
sgs->group_capacity = fix_small_capacity(sd, group);
+ sgs->group_weight = group->group_weight;
if (sgs->group_capacity > sgs->sum_nr_running)
sgs->group_has_capacity = 1;
sds->this_nr_running = sgs.sum_nr_running;
sds->this_load_per_task = sgs.sum_weighted_load;
sds->this_has_capacity = sgs.group_has_capacity;
+ sds->this_idle_cpus = sgs.idle_cpus;
} else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) {
sds->max_load = sgs.avg_load;
sds->busiest = sg;
sds->busiest_nr_running = sgs.sum_nr_running;
+ sds->busiest_idle_cpus = sgs.idle_cpus;
sds->busiest_group_capacity = sgs.group_capacity;
sds->busiest_load_per_task = sgs.sum_weighted_load;
sds->busiest_has_capacity = sgs.group_has_capacity;
+ sds->busiest_group_weight = sgs.group_weight;
sds->group_imb = sgs.group_imb;
}
if (sds.this_load >= sds.avg_load)
goto out_balanced;
- if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
- goto out_balanced;
+ /*
+ * In the CPU_NEWLY_IDLE case, use imbalance_pct to be conservative.
+ * To check for busy balance, use !idle_cpu() instead of
+ * CPU_NOT_IDLE, because HT siblings will use CPU_NOT_IDLE
+ * even when they are idle.
+ */
+ if (idle == CPU_NEWLY_IDLE || !idle_cpu(this_cpu)) {
+ if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
+ goto out_balanced;
+ } else {
+ /*
+ * This cpu is idle. If the busiest group doesn't have
+ * more tasks than the number of available cpus and
+ * there is no imbalance between this and the busiest group
+ * with respect to idle cpus, it is balanced.
+ */
+ if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) &&
+ sds.busiest_nr_running <= sds.busiest_group_weight)
+ goto out_balanced;
+ }
force_balance:
/* Looks like there is an imbalance. Compute it */
* still unbalanced. ld_moved simply stays zero, so it is
* correctly treated as an imbalance.
*/
+ all_pinned = 1;
local_irq_save(flags);
double_rq_lock(this_rq, busiest);
ld_moved = move_tasks(this_rq, this_cpu, busiest,
*/
raw_spin_unlock(&this_rq->lock);
+ update_shares(this_cpu);
for_each_domain(this_cpu, sd) {
unsigned long interval;
int balance = 1;
interval = msecs_to_jiffies(sd->balance_interval);
if (time_after(next_balance, sd->last_balance + interval))
next_balance = sd->last_balance + interval;
- if (pulled_task)
+ if (pulled_task) {
+ this_rq->idle_stamp = 0;
break;
+ }
}
raw_spin_lock(&this_rq->lock);