#include <asm/tlb.h>
#include <asm/irq_regs.h>
+#include <asm/mutex.h>
#include "sched_cpupri.h"
#include "workqueue_sched.h"
+#include "sched_autogroup.h"
#define CREATE_TRACE_POINTS
#include <trace/events/sched.h>
struct task_group *parent;
struct list_head siblings;
struct list_head children;
-};
-#define root_task_group init_task_group
+#ifdef CONFIG_SCHED_AUTOGROUP
+ struct autogroup *autogroup;
+#endif
+};
/* task_group_lock serializes the addition/removal of task groups */
static DEFINE_SPINLOCK(task_group_lock);
#ifdef CONFIG_FAIR_GROUP_SCHED
-# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
+# define ROOT_TASK_GROUP_LOAD NICE_0_LOAD
/*
* A weight of 0 or 1 can cause arithmetics problems.
#define MIN_SHARES 2
#define MAX_SHARES (1UL << 18)
-static int init_task_group_load = INIT_TASK_GROUP_LOAD;
+static int root_task_group_load = ROOT_TASK_GROUP_LOAD;
#endif
/* Default task group.
* Every task in system belong to this group at bootup.
*/
-struct task_group init_task_group;
+struct task_group root_task_group;
#endif /* CONFIG_CGROUP_SCHED */
/* try_to_wake_up() stats */
unsigned int ttwu_count;
unsigned int ttwu_local;
-
- /* BKL stats */
- unsigned int bkl_count;
#endif
};
static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
-static inline
-void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
-{
- rq->curr->sched_class->check_preempt_curr(rq, p, flags);
- /*
- * A queue event has occurred, and we're going to schedule. In
- * this case, we can save a useless back to back clock update.
- */
- if (test_tsk_need_resched(p))
- rq->skip_clock_update = 1;
-}
+static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
static inline int cpu_of(struct rq *rq)
{
*/
static inline struct task_group *task_group(struct task_struct *p)
{
+ struct task_group *tg;
struct cgroup_subsys_state *css;
+ if (p->flags & PF_EXITING)
+ return &root_task_group;
+
css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
lockdep_is_held(&task_rq(p)->lock));
- return container_of(css, struct task_group, css);
+ tg = container_of(css, struct task_group, css);
+
+ return autogroup_task_group(p, tg);
}
/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
#endif /* CONFIG_CGROUP_SCHED */
-static u64 irq_time_cpu(int cpu);
-static void sched_irq_time_avg_update(struct rq *rq, u64 irq_time);
+static void update_rq_clock_task(struct rq *rq, s64 delta);
-inline void update_rq_clock(struct rq *rq)
+static void update_rq_clock(struct rq *rq)
{
- if (!rq->skip_clock_update) {
- int cpu = cpu_of(rq);
- u64 irq_time;
+ s64 delta;
- rq->clock = sched_clock_cpu(cpu);
- irq_time = irq_time_cpu(cpu);
- if (rq->clock - irq_time > rq->clock_task)
- rq->clock_task = rq->clock - irq_time;
+ if (rq->skip_clock_update)
+ return;
- sched_irq_time_avg_update(rq, irq_time);
- }
+ delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
+ rq->clock += delta;
+ update_rq_clock_task(rq, delta);
}
/*
buf[cnt] = 0;
cmp = strstrip(buf);
- if (strncmp(buf, "NO_", 3) == 0) {
+ if (strncmp(cmp, "NO_", 3) == 0) {
neg = 1;
cmp += 3;
}
* They are read and saved off onto struct rq in update_rq_clock().
* This may result in other CPU reading this CPU's irq time and can
* race with irq/account_system_vtime on this CPU. We would either get old
- * or new value (or semi updated value on 32 bit) with a side effect of
- * accounting a slice of irq time to wrong task when irq is in progress
- * while we read rq->clock. That is a worthy compromise in place of having
- * locks on each irq in account_system_time.
+ * or new value with a side effect of accounting a slice of irq time to wrong
+ * task when irq is in progress while we read rq->clock. That is a worthy
+ * compromise in place of having locks on each irq in account_system_time.
*/
static DEFINE_PER_CPU(u64, cpu_hardirq_time);
static DEFINE_PER_CPU(u64, cpu_softirq_time);
sched_clock_irqtime = 0;
}
-static u64 irq_time_cpu(int cpu)
+#ifndef CONFIG_64BIT
+static DEFINE_PER_CPU(seqcount_t, irq_time_seq);
+
+static inline void irq_time_write_begin(void)
{
- if (!sched_clock_irqtime)
- return 0;
+ __this_cpu_inc(irq_time_seq.sequence);
+ smp_wmb();
+}
+
+static inline void irq_time_write_end(void)
+{
+ smp_wmb();
+ __this_cpu_inc(irq_time_seq.sequence);
+}
+static inline u64 irq_time_read(int cpu)
+{
+ u64 irq_time;
+ unsigned seq;
+
+ do {
+ seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu));
+ irq_time = per_cpu(cpu_softirq_time, cpu) +
+ per_cpu(cpu_hardirq_time, cpu);
+ } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq));
+
+ return irq_time;
+}
+#else /* CONFIG_64BIT */
+static inline void irq_time_write_begin(void)
+{
+}
+
+static inline void irq_time_write_end(void)
+{
+}
+
+static inline u64 irq_time_read(int cpu)
+{
return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
}
+#endif /* CONFIG_64BIT */
+/*
+ * Called before incrementing preempt_count on {soft,}irq_enter
+ * and before decrementing preempt_count on {soft,}irq_exit.
+ */
void account_system_vtime(struct task_struct *curr)
{
unsigned long flags;
+ s64 delta;
int cpu;
- u64 now, delta;
if (!sched_clock_irqtime)
return;
local_irq_save(flags);
cpu = smp_processor_id();
- now = sched_clock_cpu(cpu);
- delta = now - per_cpu(irq_start_time, cpu);
- per_cpu(irq_start_time, cpu) = now;
+ delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
+ __this_cpu_add(irq_start_time, delta);
+
+ irq_time_write_begin();
/*
* We do not account for softirq time from ksoftirqd here.
* We want to continue accounting softirq time to ksoftirqd thread
* that do not consume any time, but still wants to run.
*/
if (hardirq_count())
- per_cpu(cpu_hardirq_time, cpu) += delta;
+ __this_cpu_add(cpu_hardirq_time, delta);
else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD))
- per_cpu(cpu_softirq_time, cpu) += delta;
+ __this_cpu_add(cpu_softirq_time, delta);
+ irq_time_write_end();
local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(account_system_vtime);
-static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time)
+static void update_rq_clock_task(struct rq *rq, s64 delta)
{
- if (sched_clock_irqtime && sched_feat(NONIRQ_POWER)) {
- u64 delta_irq = curr_irq_time - rq->prev_irq_time;
- rq->prev_irq_time = curr_irq_time;
- sched_rt_avg_update(rq, delta_irq);
- }
+ s64 irq_delta;
+
+ irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
+
+ /*
+ * Since irq_time is only updated on {soft,}irq_exit, we might run into
+ * this case when a previous update_rq_clock() happened inside a
+ * {soft,}irq region.
+ *
+ * When this happens, we stop ->clock_task and only update the
+ * prev_irq_time stamp to account for the part that fit, so that a next
+ * update will consume the rest. This ensures ->clock_task is
+ * monotonic.
+ *
+ * It does however cause some slight miss-attribution of {soft,}irq
+ * time, a more accurate solution would be to update the irq_time using
+ * the current rq->clock timestamp, except that would require using
+ * atomic ops.
+ */
+ if (irq_delta > delta)
+ irq_delta = delta;
+
+ rq->prev_irq_time += irq_delta;
+ delta -= irq_delta;
+ rq->clock_task += delta;
+
+ if (irq_delta && sched_feat(NONIRQ_POWER))
+ sched_rt_avg_update(rq, irq_delta);
}
-#else
+#else /* CONFIG_IRQ_TIME_ACCOUNTING */
-static u64 irq_time_cpu(int cpu)
+static void update_rq_clock_task(struct rq *rq, s64 delta)
{
- return 0;
+ rq->clock_task += delta;
}
-static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time) { }
-
-#endif
+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
#include "sched_idletask.c"
#include "sched_fair.c"
#include "sched_rt.c"
+#include "sched_autogroup.c"
#include "sched_stoptask.c"
#ifdef CONFIG_SCHED_DEBUG
# include "sched_debug.c"
p->sched_class->prio_changed(rq, p, oldprio, running);
}
+static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
+{
+ const struct sched_class *class;
+
+ if (p->sched_class == rq->curr->sched_class) {
+ rq->curr->sched_class->check_preempt_curr(rq, p, flags);
+ } else {
+ for_each_class(class) {
+ if (class == rq->curr->sched_class)
+ break;
+ if (class == p->sched_class) {
+ resched_task(rq->curr);
+ break;
+ }
+ }
+ }
+
+ /*
+ * A queue event has occurred, and we're going to schedule. In
+ * this case, we can save a useless back to back clock update.
+ */
+ if (rq->curr->se.on_rq && test_tsk_need_resched(rq->curr))
+ rq->skip_clock_update = 1;
+}
+
#ifdef CONFIG_SMP
/*
* Is this task likely cache-hot:
* The task's runqueue lock must be held.
* Returns true if you have to wait for migration thread.
*/
-static bool migrate_task(struct task_struct *p, int dest_cpu)
+static bool migrate_task(struct task_struct *p, struct rq *rq)
{
- struct rq *rq = task_rq(p);
-
/*
* If the task is not on a runqueue (and not running), then
* the next wake-up will properly place the task.
* try_to_wake_up_local - try to wake up a local task with rq lock held
* @p: the thread to be awakened
*
- * Put @p on the run-queue if it's not alredy there. The caller must
+ * Put @p on the run-queue if it's not already there. The caller must
* ensure that this_rq() is locked, @p is bound to this_rq() and not
* the current task. this_rq() stays locked over invocation.
*/
/* Want to start with kernel preemption disabled. */
task_thread_info(p)->preempt_count = 1;
#endif
+#ifdef CONFIG_SMP
plist_node_init(&p->pushable_tasks, MAX_PRIO);
+#endif
put_cpu();
}
return delta;
}
+static unsigned long
+calc_load(unsigned long load, unsigned long exp, unsigned long active)
+{
+ load *= exp;
+ load += active * (FIXED_1 - exp);
+ load += 1UL << (FSHIFT - 1);
+ return load >> FSHIFT;
+}
+
#ifdef CONFIG_NO_HZ
/*
* For NO_HZ we delay the active fold to the next LOAD_FREQ update.
return delta;
}
+
+/**
+ * fixed_power_int - compute: x^n, in O(log n) time
+ *
+ * @x: base of the power
+ * @frac_bits: fractional bits of @x
+ * @n: power to raise @x to.
+ *
+ * By exploiting the relation between the definition of the natural power
+ * function: x^n := x*x*...*x (x multiplied by itself for n times), and
+ * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i,
+ * (where: n_i \elem {0, 1}, the binary vector representing n),
+ * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is
+ * of course trivially computable in O(log_2 n), the length of our binary
+ * vector.
+ */
+static unsigned long
+fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
+{
+ unsigned long result = 1UL << frac_bits;
+
+ if (n) for (;;) {
+ if (n & 1) {
+ result *= x;
+ result += 1UL << (frac_bits - 1);
+ result >>= frac_bits;
+ }
+ n >>= 1;
+ if (!n)
+ break;
+ x *= x;
+ x += 1UL << (frac_bits - 1);
+ x >>= frac_bits;
+ }
+
+ return result;
+}
+
+/*
+ * a1 = a0 * e + a * (1 - e)
+ *
+ * a2 = a1 * e + a * (1 - e)
+ * = (a0 * e + a * (1 - e)) * e + a * (1 - e)
+ * = a0 * e^2 + a * (1 - e) * (1 + e)
+ *
+ * a3 = a2 * e + a * (1 - e)
+ * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e)
+ * = a0 * e^3 + a * (1 - e) * (1 + e + e^2)
+ *
+ * ...
+ *
+ * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1]
+ * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e)
+ * = a0 * e^n + a * (1 - e^n)
+ *
+ * [1] application of the geometric series:
+ *
+ * n 1 - x^(n+1)
+ * S_n := \Sum x^i = -------------
+ * i=0 1 - x
+ */
+static unsigned long
+calc_load_n(unsigned long load, unsigned long exp,
+ unsigned long active, unsigned int n)
+{
+
+ return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
+}
+
+/*
+ * NO_HZ can leave us missing all per-cpu ticks calling
+ * calc_load_account_active(), but since an idle CPU folds its delta into
+ * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold
+ * in the pending idle delta if our idle period crossed a load cycle boundary.
+ *
+ * Once we've updated the global active value, we need to apply the exponential
+ * weights adjusted to the number of cycles missed.
+ */
+static void calc_global_nohz(unsigned long ticks)
+{
+ long delta, active, n;
+
+ if (time_before(jiffies, calc_load_update))
+ return;
+
+ /*
+ * If we crossed a calc_load_update boundary, make sure to fold
+ * any pending idle changes, the respective CPUs might have
+ * missed the tick driven calc_load_account_active() update
+ * due to NO_HZ.
+ */
+ delta = calc_load_fold_idle();
+ if (delta)
+ atomic_long_add(delta, &calc_load_tasks);
+
+ /*
+ * If we were idle for multiple load cycles, apply them.
+ */
+ if (ticks >= LOAD_FREQ) {
+ n = ticks / LOAD_FREQ;
+
+ active = atomic_long_read(&calc_load_tasks);
+ active = active > 0 ? active * FIXED_1 : 0;
+
+ avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
+ avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
+ avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
+
+ calc_load_update += n * LOAD_FREQ;
+ }
+
+ /*
+ * Its possible the remainder of the above division also crosses
+ * a LOAD_FREQ period, the regular check in calc_global_load()
+ * which comes after this will take care of that.
+ *
+ * Consider us being 11 ticks before a cycle completion, and us
+ * sleeping for 4*LOAD_FREQ + 22 ticks, then the above code will
+ * age us 4 cycles, and the test in calc_global_load() will
+ * pick up the final one.
+ */
+}
#else
static void calc_load_account_idle(struct rq *this_rq)
{
{
return 0;
}
+
+static void calc_global_nohz(unsigned long ticks)
+{
+}
#endif
/**
loads[2] = (avenrun[2] + offset) << shift;
}
-static unsigned long
-calc_load(unsigned long load, unsigned long exp, unsigned long active)
-{
- load *= exp;
- load += active * (FIXED_1 - exp);
- return load >> FSHIFT;
-}
-
/*
* calc_load - update the avenrun load estimates 10 ticks after the
* CPUs have updated calc_load_tasks.
*/
-void calc_global_load(void)
+void calc_global_load(unsigned long ticks)
{
- unsigned long upd = calc_load_update + 10;
long active;
- if (time_before(jiffies, upd))
+ calc_global_nohz(ticks);
+
+ if (time_before(jiffies, calc_load_update + 10))
return;
active = atomic_long_read(&calc_load_tasks);
* select_task_rq() can race against ->cpus_allowed
*/
if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) &&
- likely(cpu_active(dest_cpu)) && migrate_task(p, dest_cpu)) {
+ likely(cpu_active(dest_cpu)) && migrate_task(p, rq)) {
struct migration_arg arg = { p, dest_cpu };
task_rq_unlock(rq, &flags);
schedstat_inc(this_rq(), sched_count);
#ifdef CONFIG_SCHEDSTATS
if (unlikely(prev->lock_depth >= 0)) {
- schedstat_inc(this_rq(), bkl_count);
+ schedstat_inc(this_rq(), rq_sched_info.bkl_count);
schedstat_inc(prev, sched_info.bkl_count);
}
#endif
{
if (prev->se.on_rq)
update_rq_clock(rq);
- rq->skip_clock_update = 0;
prev->sched_class->put_prev_task(rq, prev);
}
hrtick_clear(rq);
raw_spin_lock_irq(&rq->lock);
- clear_tsk_need_resched(prev);
switch_count = &prev->nivcsw;
if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
put_prev_task(rq, prev);
next = pick_next_task(rq);
+ clear_tsk_need_resched(prev);
+ rq->skip_clock_update = 0;
if (likely(prev != next)) {
sched_info_switch(prev, next);
if (task_thread_info(rq->curr) != owner || need_resched())
return 0;
- cpu_relax();
+ arch_mutex_cpu_relax();
}
return 1;
{
__wake_up_common(q, mode, 1, 0, key);
}
+EXPORT_SYMBOL_GPL(__wake_up_locked_key);
/**
* __wake_up_sync_key - wake up threads blocked on a waitqueue.
* This waits for either a completion of a specific task to be signaled or for a
* specified timeout to expire. It is interruptible. The timeout is in jiffies.
*/
-unsigned long __sched
+long __sched
wait_for_completion_interruptible_timeout(struct completion *x,
unsigned long timeout)
{
* signaled or for a specified timeout to expire. It can be
* interrupted by a kill signal. The timeout is in jiffies.
*/
-unsigned long __sched
+long __sched
wait_for_completion_killable_timeout(struct completion *x,
unsigned long timeout)
{
* assigned.
*/
if (rt_bandwidth_enabled() && rt_policy(policy) &&
- task_group(p)->rt_bandwidth.rt_runtime == 0) {
+ task_group(p)->rt_bandwidth.rt_runtime == 0 &&
+ !task_group_is_autogroup(task_group(p))) {
__task_rq_unlock(rq);
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
return -EPERM;
unsigned state;
state = p->state ? __ffs(p->state) + 1 : 0;
- printk(KERN_INFO "%-13.13s %c", p->comm,
+ printk(KERN_INFO "%-15.15s %c", p->comm,
state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
#if BITS_PER_LONG == 32
if (state == TASK_RUNNING)
* The idle tasks have their own, simple scheduling class:
*/
idle->sched_class = &idle_sched_class;
- ftrace_graph_init_task(idle);
+ ftrace_graph_init_idle_task(idle, cpu);
}
/*
goto out;
dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
- if (migrate_task(p, dest_cpu)) {
+ if (migrate_task(p, rq)) {
struct migration_arg arg = { p, dest_cpu };
/* Need help from migration thread: drop lock and wait. */
task_rq_unlock(rq, &flags);
if (cpu != group_first_cpu(sd->groups))
return;
+ sd->groups->group_weight = cpumask_weight(sched_group_cpus(sd->groups));
+
child = sd->child;
sd->groups->cpu_power = 0;
cfs_rq->tg = tg;
tg->se[cpu] = se;
- /* se could be NULL for init_task_group */
+ /* se could be NULL for root_task_group */
if (!se)
return;
ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
#ifdef CONFIG_FAIR_GROUP_SCHED
- init_task_group.se = (struct sched_entity **)ptr;
+ root_task_group.se = (struct sched_entity **)ptr;
ptr += nr_cpu_ids * sizeof(void **);
- init_task_group.cfs_rq = (struct cfs_rq **)ptr;
+ root_task_group.cfs_rq = (struct cfs_rq **)ptr;
ptr += nr_cpu_ids * sizeof(void **);
#endif /* CONFIG_FAIR_GROUP_SCHED */
#ifdef CONFIG_RT_GROUP_SCHED
- init_task_group.rt_se = (struct sched_rt_entity **)ptr;
+ root_task_group.rt_se = (struct sched_rt_entity **)ptr;
ptr += nr_cpu_ids * sizeof(void **);
- init_task_group.rt_rq = (struct rt_rq **)ptr;
+ root_task_group.rt_rq = (struct rt_rq **)ptr;
ptr += nr_cpu_ids * sizeof(void **);
#endif /* CONFIG_RT_GROUP_SCHED */
global_rt_period(), global_rt_runtime());
#ifdef CONFIG_RT_GROUP_SCHED
- init_rt_bandwidth(&init_task_group.rt_bandwidth,
+ init_rt_bandwidth(&root_task_group.rt_bandwidth,
global_rt_period(), global_rt_runtime());
#endif /* CONFIG_RT_GROUP_SCHED */
#ifdef CONFIG_CGROUP_SCHED
- list_add(&init_task_group.list, &task_groups);
- INIT_LIST_HEAD(&init_task_group.children);
-
+ list_add(&root_task_group.list, &task_groups);
+ INIT_LIST_HEAD(&root_task_group.children);
+ autogroup_init(&init_task);
#endif /* CONFIG_CGROUP_SCHED */
for_each_possible_cpu(i) {
init_cfs_rq(&rq->cfs, rq);
init_rt_rq(&rq->rt, rq);
#ifdef CONFIG_FAIR_GROUP_SCHED
- init_task_group.shares = init_task_group_load;
+ root_task_group.shares = root_task_group_load;
INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
-#ifdef CONFIG_CGROUP_SCHED
/*
- * How much cpu bandwidth does init_task_group get?
+ * How much cpu bandwidth does root_task_group get?
*
* In case of task-groups formed thr' the cgroup filesystem, it
* gets 100% of the cpu resources in the system. This overall
* system cpu resource is divided among the tasks of
- * init_task_group and its child task-groups in a fair manner,
+ * root_task_group and its child task-groups in a fair manner,
* based on each entity's (task or task-group's) weight
* (se->load.weight).
*
- * In other words, if init_task_group has 10 tasks of weight
+ * In other words, if root_task_group has 10 tasks of weight
* 1024) and two child groups A0 and A1 (of weight 1024 each),
* then A0's share of the cpu resource is:
*
* A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
*
- * We achieve this by letting init_task_group's tasks sit
- * directly in rq->cfs (i.e init_task_group->se[] = NULL).
+ * We achieve this by letting root_task_group's tasks sit
+ * directly in rq->cfs (i.e root_task_group->se[] = NULL).
*/
- init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, NULL);
-#endif
+ init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
#endif /* CONFIG_FAIR_GROUP_SCHED */
rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
#ifdef CONFIG_RT_GROUP_SCHED
INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
-#ifdef CONFIG_CGROUP_SCHED
- init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, NULL);
-#endif
+ init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
#endif
for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
#endif /* SMP */
- perf_event_init();
-
scheduler_running = 1;
}
{
struct rq *rq = cpu_rq(cpu);
unsigned long flags;
- int i;
/*
* Only empty task groups can be destroyed; so we can speculatively
return;
raw_spin_lock_irqsave(&rq->lock, flags);
- list_del_leaf_cfs_rq(tg->cfs_rq[i]);
+ list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
#else /* !CONFG_FAIR_GROUP_SCHED */
{
free_fair_sched_group(tg);
free_rt_sched_group(tg);
+ autogroup_free(tg);
kfree(tg);
}
#endif /* CONFIG_CGROUP_SCHED */
#ifdef CONFIG_FAIR_GROUP_SCHED
-static void __set_se_shares(struct sched_entity *se, unsigned long shares)
-{
- struct cfs_rq *cfs_rq = se->cfs_rq;
- int on_rq;
-
- on_rq = se->on_rq;
- if (on_rq)
- dequeue_entity(cfs_rq, se, 0);
-
- update_load_set(&se->load, shares);
-
- if (on_rq)
- enqueue_entity(cfs_rq, se, 0);
-}
-
-static void set_se_shares(struct sched_entity *se, unsigned long shares)
-{
- struct cfs_rq *cfs_rq = se->cfs_rq;
- struct rq *rq = cfs_rq->rq;
- unsigned long flags;
-
- raw_spin_lock_irqsave(&rq->lock, flags);
- __set_se_shares(se, shares);
- raw_spin_unlock_irqrestore(&rq->lock, flags);
-}
-
static DEFINE_MUTEX(shares_mutex);
int sched_group_set_shares(struct task_group *tg, unsigned long shares)
{
int i;
+ unsigned long flags;
/*
* We can't change the weight of the root cgroup.
tg->shares = shares;
for_each_possible_cpu(i) {
- /*
- * force a rebalance
- */
- set_se_shares(tg->se[i], shares);
+ struct rq *rq = cpu_rq(i);
+ struct sched_entity *se;
+
+ se = tg->se[i];
+ /* Propagate contribution to hierarchy */
+ raw_spin_lock_irqsave(&rq->lock, flags);
+ for_each_sched_entity(se)
+ update_cfs_shares(group_cfs_rq(se), 0);
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
}
done:
if (!cgrp->parent) {
/* This is early initialization for the top cgroup */
- return &init_task_group.css;
+ return &root_task_group.css;
}
parent = cgroup_tg(cgrp->parent);
}
}
+static void
+cpu_cgroup_exit(struct cgroup_subsys *ss, struct task_struct *task)
+{
+ /*
+ * cgroup_exit() is called in the copy_process() failure path.
+ * Ignore this case since the task hasn't ran yet, this avoids
+ * trying to poke a half freed task state from generic code.
+ */
+ if (!(task->flags & PF_EXITING))
+ return;
+
+ sched_move_task(task);
+}
+
#ifdef CONFIG_FAIR_GROUP_SCHED
static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype,
u64 shareval)
.destroy = cpu_cgroup_destroy,
.can_attach = cpu_cgroup_can_attach,
.attach = cpu_cgroup_attach,
+ .exit = cpu_cgroup_exit,
.populate = cpu_cgroup_populate,
.subsys_id = cpu_cgroup_subsys_id,
.early_init = 1,
};
#endif /* CONFIG_CGROUP_CPUACCT */
-#ifndef CONFIG_SMP
-
-void synchronize_sched_expedited(void)
-{
- barrier();
-}
-EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
-
-#else /* #ifndef CONFIG_SMP */
-
-static atomic_t synchronize_sched_expedited_count = ATOMIC_INIT(0);
-
-static int synchronize_sched_expedited_cpu_stop(void *data)
-{
- /*
- * There must be a full memory barrier on each affected CPU
- * between the time that try_stop_cpus() is called and the
- * time that it returns.
- *
- * In the current initial implementation of cpu_stop, the
- * above condition is already met when the control reaches
- * this point and the following smp_mb() is not strictly
- * necessary. Do smp_mb() anyway for documentation and
- * robustness against future implementation changes.
- */
- smp_mb(); /* See above comment block. */
- return 0;
-}
-
-/*
- * Wait for an rcu-sched grace period to elapse, but use "big hammer"
- * approach to force grace period to end quickly. This consumes
- * significant time on all CPUs, and is thus not recommended for
- * any sort of common-case code.
- *
- * Note that it is illegal to call this function while holding any
- * lock that is acquired by a CPU-hotplug notifier. Failing to
- * observe this restriction will result in deadlock.
- */
-void synchronize_sched_expedited(void)
-{
- int snap, trycount = 0;
-
- smp_mb(); /* ensure prior mod happens before capturing snap. */
- snap = atomic_read(&synchronize_sched_expedited_count) + 1;
- get_online_cpus();
- while (try_stop_cpus(cpu_online_mask,
- synchronize_sched_expedited_cpu_stop,
- NULL) == -EAGAIN) {
- put_online_cpus();
- if (trycount++ < 10)
- udelay(trycount * num_online_cpus());
- else {
- synchronize_sched();
- return;
- }
- if (atomic_read(&synchronize_sched_expedited_count) - snap > 0) {
- smp_mb(); /* ensure test happens before caller kfree */
- return;
- }
- get_online_cpus();
- }
- atomic_inc(&synchronize_sched_expedited_count);
- smp_mb__after_atomic_inc(); /* ensure post-GP actions seen after GP. */
- put_online_cpus();
-}
-EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
-
-#endif /* #else #ifndef CONFIG_SMP */