#include <linux/ftrace.h>
#include <linux/slab.h>
#include <linux/init_task.h>
+#include <linux/binfmts.h>
+#include <asm/switch_to.h>
#include <asm/tlb.h>
#include <asm/irq_regs.h>
+#include <asm/mutex.h>
#ifdef CONFIG_PARAVIRT
#include <asm/paravirt.h>
#endif
{
int i;
- for (i = 0; sched_feat_names[i]; i++) {
+ for (i = 0; i < __SCHED_FEAT_NR; i++) {
if (!(sysctl_sched_features & (1UL << i)))
seq_puts(m, "NO_");
seq_printf(m, "%s ", sched_feat_names[i]);
return 0;
}
+#ifdef HAVE_JUMP_LABEL
+
+#define jump_label_key__true STATIC_KEY_INIT_TRUE
+#define jump_label_key__false STATIC_KEY_INIT_FALSE
+
+#define SCHED_FEAT(name, enabled) \
+ jump_label_key__##enabled ,
+
+struct static_key sched_feat_keys[__SCHED_FEAT_NR] = {
+#include "features.h"
+};
+
+#undef SCHED_FEAT
+
+static void sched_feat_disable(int i)
+{
+ if (static_key_enabled(&sched_feat_keys[i]))
+ static_key_slow_dec(&sched_feat_keys[i]);
+}
+
+static void sched_feat_enable(int i)
+{
+ if (!static_key_enabled(&sched_feat_keys[i]))
+ static_key_slow_inc(&sched_feat_keys[i]);
+}
+#else
+static void sched_feat_disable(int i) { };
+static void sched_feat_enable(int i) { };
+#endif /* HAVE_JUMP_LABEL */
+
static ssize_t
sched_feat_write(struct file *filp, const char __user *ubuf,
size_t cnt, loff_t *ppos)
cmp += 3;
}
- for (i = 0; sched_feat_names[i]; i++) {
+ for (i = 0; i < __SCHED_FEAT_NR; i++) {
if (strcmp(cmp, sched_feat_names[i]) == 0) {
- if (neg)
+ if (neg) {
sysctl_sched_features &= ~(1UL << i);
- else
+ sched_feat_disable(i);
+ } else {
sysctl_sched_features |= (1UL << i);
+ sched_feat_enable(i);
+ }
break;
}
}
- if (!sched_feat_names[i])
+ if (i == __SCHED_FEAT_NR)
return -EINVAL;
*ppos += cnt;
return 0;
}
late_initcall(sched_init_debug);
-
-#endif
+#endif /* CONFIG_SCHED_DEBUG */
/*
* Number of tasks to iterate in a single balance run.
p->sched_class->dequeue_task(rq, p, flags);
}
-/*
- * activate_task - move a task to the runqueue.
- */
void activate_task(struct rq *rq, struct task_struct *p, int flags)
{
if (task_contributes_to_load(p))
enqueue_task(rq, p, flags);
}
-/*
- * deactivate_task - remove a task from the runqueue.
- */
void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
{
if (task_contributes_to_load(p))
delta -= irq_delta;
#endif
#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
- if (static_branch((¶virt_steal_rq_enabled))) {
+ if (static_key_false((¶virt_steal_rq_enabled))) {
u64 st;
steal = paravirt_steal_clock(cpu_of(rq));
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
static int irqtime_account_hi_update(void)
{
- struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+ u64 *cpustat = kcpustat_this_cpu->cpustat;
unsigned long flags;
u64 latest_ns;
int ret = 0;
local_irq_save(flags);
latest_ns = this_cpu_read(cpu_hardirq_time);
- if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->irq))
+ if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_IRQ])
ret = 1;
local_irq_restore(flags);
return ret;
static int irqtime_account_si_update(void)
{
- struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+ u64 *cpustat = kcpustat_this_cpu->cpustat;
unsigned long flags;
u64 latest_ns;
int ret = 0;
local_irq_save(flags);
latest_ns = this_cpu_read(cpu_softirq_time);
- if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->softirq))
+ if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_SOFTIRQ])
ret = 1;
local_irq_restore(flags);
return ret;
*/
static int select_fallback_rq(int cpu, struct task_struct *p)
{
- int dest_cpu;
const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu));
+ enum { cpuset, possible, fail } state = cpuset;
+ int dest_cpu;
/* Look for allowed, online CPU in same node. */
- for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask)
+ for_each_cpu(dest_cpu, nodemask) {
+ if (!cpu_online(dest_cpu))
+ continue;
+ if (!cpu_active(dest_cpu))
+ continue;
if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
return dest_cpu;
+ }
- /* Any allowed, online CPU? */
- dest_cpu = cpumask_any_and(tsk_cpus_allowed(p), cpu_active_mask);
- if (dest_cpu < nr_cpu_ids)
- return dest_cpu;
+ for (;;) {
+ /* Any allowed, online CPU? */
+ for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) {
+ if (!cpu_online(dest_cpu))
+ continue;
+ if (!cpu_active(dest_cpu))
+ continue;
+ goto out;
+ }
- /* No more Mr. Nice Guy. */
- dest_cpu = cpuset_cpus_allowed_fallback(p);
- /*
- * Don't tell them about moving exiting tasks or
- * kernel threads (both mm NULL), since they never
- * leave kernel.
- */
- if (p->mm && printk_ratelimit()) {
- printk(KERN_INFO "process %d (%s) no longer affine to cpu%d\n",
- task_pid_nr(p), p->comm, cpu);
+ switch (state) {
+ case cpuset:
+ /* No more Mr. Nice Guy. */
+ cpuset_cpus_allowed_fallback(p);
+ state = possible;
+ break;
+
+ case possible:
+ do_set_cpus_allowed(p, cpu_possible_mask);
+ state = fail;
+ break;
+
+ case fail:
+ BUG();
+ break;
+ }
+ }
+
+out:
+ if (state != cpuset) {
+ /*
+ * Don't tell them about moving exiting tasks or
+ * kernel threads (both mm NULL), since they never
+ * leave kernel.
+ */
+ if (p->mm && printk_ratelimit()) {
+ printk_sched("process %d (%s) no longer affine to cpu%d\n",
+ task_pid_nr(p), p->comm, cpu);
+ }
}
return dest_cpu;
}
#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
+
+bool cpus_share_cache(int this_cpu, int that_cpu)
+{
+ return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
+}
#endif /* CONFIG_SMP */
static void ttwu_queue(struct task_struct *p, int cpu)
struct rq *rq = cpu_rq(cpu);
#if defined(CONFIG_SMP)
- if (sched_feat(TTWU_QUEUE) && cpu != smp_processor_id()) {
+ if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {
sched_clock_cpu(cpu); /* sync clocks x-cpu */
ttwu_queue_remote(p, cpu);
return;
local_irq_enable();
#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
finish_lock_switch(rq, prev);
+ finish_arch_post_lock_switch();
fire_sched_in_preempt_notifiers(current);
if (mm)
* Once we've updated the global active value, we need to apply the exponential
* weights adjusted to the number of cycles missed.
*/
-static void calc_global_nohz(unsigned long ticks)
+static void calc_global_nohz(void)
{
long delta, active, n;
- if (time_before(jiffies, calc_load_update))
- return;
-
/*
* If we crossed a calc_load_update boundary, make sure to fold
* any pending idle changes, the respective CPUs might have
atomic_long_add(delta, &calc_load_tasks);
/*
- * If we were idle for multiple load cycles, apply them.
+ * It could be the one fold was all it took, we done!
*/
- if (ticks >= LOAD_FREQ) {
- n = ticks / LOAD_FREQ;
+ if (time_before(jiffies, calc_load_update + 10))
+ return;
- active = atomic_long_read(&calc_load_tasks);
- active = active > 0 ? active * FIXED_1 : 0;
+ /*
+ * Catch-up, fold however many we are behind still
+ */
+ delta = jiffies - calc_load_update - 10;
+ n = 1 + (delta / LOAD_FREQ);
- avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
- avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
- avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
+ active = atomic_long_read(&calc_load_tasks);
+ active = active > 0 ? active * FIXED_1 : 0;
- calc_load_update += n * LOAD_FREQ;
- }
+ avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
+ avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
+ avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
- /*
- * Its possible the remainder of the above division also crosses
- * a LOAD_FREQ period, the regular check in calc_global_load()
- * which comes after this will take care of that.
- *
- * Consider us being 11 ticks before a cycle completion, and us
- * sleeping for 4*LOAD_FREQ + 22 ticks, then the above code will
- * age us 4 cycles, and the test in calc_global_load() will
- * pick up the final one.
- */
+ calc_load_update += n * LOAD_FREQ;
}
#else
void calc_load_account_idle(struct rq *this_rq)
return 0;
}
-static void calc_global_nohz(unsigned long ticks)
+static void calc_global_nohz(void)
{
}
#endif
{
long active;
- calc_global_nohz(ticks);
-
if (time_before(jiffies, calc_load_update + 10))
return;
avenrun[2] = calc_load(avenrun[2], EXP_15, active);
calc_load_update += LOAD_FREQ;
+
+ /*
+ * Account one period with whatever state we found before
+ * folding in the nohz state and ageing the entire idle period.
+ *
+ * This avoids loosing a sample when we go idle between
+ * calc_load_account_active() (10 ticks ago) and now and thus
+ * under-accounting.
+ */
+ calc_global_nohz();
}
/*
#endif
DEFINE_PER_CPU(struct kernel_stat, kstat);
+DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);
EXPORT_PER_CPU_SYMBOL(kstat);
+EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
/*
* Return any ns on the sched_clock that have not yet been accounted in
return ns;
}
+#ifdef CONFIG_CGROUP_CPUACCT
+struct cgroup_subsys cpuacct_subsys;
+struct cpuacct root_cpuacct;
+#endif
+
+static inline void task_group_account_field(struct task_struct *p, int index,
+ u64 tmp)
+{
+#ifdef CONFIG_CGROUP_CPUACCT
+ struct kernel_cpustat *kcpustat;
+ struct cpuacct *ca;
+#endif
+ /*
+ * Since all updates are sure to touch the root cgroup, we
+ * get ourselves ahead and touch it first. If the root cgroup
+ * is the only cgroup, then nothing else should be necessary.
+ *
+ */
+ __get_cpu_var(kernel_cpustat).cpustat[index] += tmp;
+
+#ifdef CONFIG_CGROUP_CPUACCT
+ if (unlikely(!cpuacct_subsys.active))
+ return;
+
+ rcu_read_lock();
+ ca = task_ca(p);
+ while (ca && (ca != &root_cpuacct)) {
+ kcpustat = this_cpu_ptr(ca->cpustat);
+ kcpustat->cpustat[index] += tmp;
+ ca = parent_ca(ca);
+ }
+ rcu_read_unlock();
+#endif
+}
+
+
+#if !defined(CONFIG_XEN) || defined(CONFIG_VIRT_CPU_ACCOUNTING)
+# define cputime_to_u64(t) ((__force u64)(t))
+#else
+# include <linux/syscore_ops.h>
+# define NS_PER_TICK (1000000000 / HZ)
+
+static DEFINE_PER_CPU(u64, steal_snapshot);
+static DEFINE_PER_CPU(unsigned int, steal_residual);
+
+static u64 cputime_to_u64(cputime_t t)
+{
+ u64 s = this_vcpu_read(runstate.time[RUNSTATE_runnable]);
+ unsigned long adj = div_u64_rem(s - __this_cpu_read(steal_snapshot)
+ + __this_cpu_read(steal_residual),
+ NS_PER_TICK,
+ &__get_cpu_var(steal_residual));
+
+ __this_cpu_write(steal_snapshot, s);
+ if (t < jiffies_to_cputime(adj))
+ return 0;
+
+ return (__force u64)(t - jiffies_to_cputime(adj));
+}
+
+static void steal_resume(void)
+{
+ cputime_to_u64(((cputime_t)1 << (BITS_PER_LONG * sizeof(cputime_t)
+ / sizeof(long) - 1)) - 1);
+}
+
+static struct syscore_ops steal_syscore_ops = {
+ .resume = steal_resume,
+};
+
+static int __init steal_register(void)
+{
+ register_syscore_ops(&steal_syscore_ops);
+ return 0;
+}
+core_initcall(steal_register);
+#endif
+
/*
* Account user cpu time to a process.
* @p: the process that the cpu time gets accounted to
void account_user_time(struct task_struct *p, cputime_t cputime,
cputime_t cputime_scaled)
{
- struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
- cputime64_t tmp;
+ int index;
/* Add user time to process. */
- p->utime = cputime_add(p->utime, cputime);
- p->utimescaled = cputime_add(p->utimescaled, cputime_scaled);
+ p->utime += cputime;
+ p->utimescaled += cputime_scaled;
account_group_user_time(p, cputime);
+ index = (TASK_NICE(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;
+
/* Add user time to cpustat. */
- tmp = cputime_to_cputime64(cputime);
- if (TASK_NICE(p) > 0)
- cpustat->nice = cputime64_add(cpustat->nice, tmp);
- else
- cpustat->user = cputime64_add(cpustat->user, tmp);
+ task_group_account_field(p, index, cputime_to_u64(cputime));
- cpuacct_update_stats(p, CPUACCT_STAT_USER, cputime);
/* Account for user time used */
acct_update_integrals(p);
}
static void account_guest_time(struct task_struct *p, cputime_t cputime,
cputime_t cputime_scaled)
{
- cputime64_t tmp;
- struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
-
- tmp = cputime_to_cputime64(cputime);
+ u64 *cpustat = kcpustat_this_cpu->cpustat;
/* Add guest time to process. */
- p->utime = cputime_add(p->utime, cputime);
- p->utimescaled = cputime_add(p->utimescaled, cputime_scaled);
+ p->utime += cputime;
+ p->utimescaled += cputime_scaled;
account_group_user_time(p, cputime);
- p->gtime = cputime_add(p->gtime, cputime);
+ p->gtime += cputime;
/* Add guest time to cpustat. */
if (TASK_NICE(p) > 0) {
- cpustat->nice = cputime64_add(cpustat->nice, tmp);
- cpustat->guest_nice = cputime64_add(cpustat->guest_nice, tmp);
+ cpustat[CPUTIME_NICE] += (__force u64) cputime;
+ cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime;
} else {
- cpustat->user = cputime64_add(cpustat->user, tmp);
- cpustat->guest = cputime64_add(cpustat->guest, tmp);
+ cpustat[CPUTIME_USER] += (__force u64) cputime;
+ cpustat[CPUTIME_GUEST] += (__force u64) cputime;
}
}
*/
static inline
void __account_system_time(struct task_struct *p, cputime_t cputime,
- cputime_t cputime_scaled, cputime64_t *target_cputime64)
+ cputime_t cputime_scaled, int index)
{
- cputime64_t tmp = cputime_to_cputime64(cputime);
-
/* Add system time to process. */
- p->stime = cputime_add(p->stime, cputime);
- p->stimescaled = cputime_add(p->stimescaled, cputime_scaled);
+ p->stime += cputime;
+ p->stimescaled += cputime_scaled;
account_group_system_time(p, cputime);
/* Add system time to cpustat. */
- *target_cputime64 = cputime64_add(*target_cputime64, tmp);
- cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
+ task_group_account_field(p, index, cputime_to_u64(cputime));
/* Account for system time used */
acct_update_integrals(p);
void account_system_time(struct task_struct *p, int hardirq_offset,
cputime_t cputime, cputime_t cputime_scaled)
{
- struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
- cputime64_t *target_cputime64;
+ int index;
if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
account_guest_time(p, cputime, cputime_scaled);
}
if (hardirq_count() - hardirq_offset)
- target_cputime64 = &cpustat->irq;
+ index = CPUTIME_IRQ;
else if (in_serving_softirq())
- target_cputime64 = &cpustat->softirq;
+ index = CPUTIME_SOFTIRQ;
else
- target_cputime64 = &cpustat->system;
+ index = CPUTIME_SYSTEM;
- __account_system_time(p, cputime, cputime_scaled, target_cputime64);
+ __account_system_time(p, cputime, cputime_scaled, index);
}
/*
*/
void account_steal_time(cputime_t cputime)
{
- struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
- cputime64_t cputime64 = cputime_to_cputime64(cputime);
+ u64 *cpustat = kcpustat_this_cpu->cpustat;
- cpustat->steal = cputime64_add(cpustat->steal, cputime64);
+ cpustat[CPUTIME_STEAL] += (__force u64) cputime;
}
/*
*/
void account_idle_time(cputime_t cputime)
{
- struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
- cputime64_t cputime64 = cputime_to_cputime64(cputime);
+ u64 *cpustat = kcpustat_this_cpu->cpustat;
struct rq *rq = this_rq();
if (atomic_read(&rq->nr_iowait) > 0)
- cpustat->iowait = cputime64_add(cpustat->iowait, cputime64);
+ cpustat[CPUTIME_IOWAIT] += cputime_to_u64(cputime);
else
- cpustat->idle = cputime64_add(cpustat->idle, cputime64);
+ cpustat[CPUTIME_IDLE] += cputime_to_u64(cputime);
}
static __always_inline bool steal_account_process_tick(void)
{
#ifdef CONFIG_PARAVIRT
- if (static_branch(¶virt_steal_enabled)) {
+ if (static_key_false(¶virt_steal_enabled)) {
u64 steal, st = 0;
steal = paravirt_steal_clock(smp_processor_id());
struct rq *rq)
{
cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
- cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy);
- struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+ u64 *cpustat = kcpustat_this_cpu->cpustat;
if (steal_account_process_tick())
return;
if (irqtime_account_hi_update()) {
- cpustat->irq = cputime64_add(cpustat->irq, tmp);
+ cpustat[CPUTIME_IRQ] += cputime_to_u64(cputime_one_jiffy);
} else if (irqtime_account_si_update()) {
- cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
+ cpustat[CPUTIME_SOFTIRQ] += cputime_to_u64(cputime_one_jiffy);
} else if (this_cpu_ksoftirqd() == p) {
/*
* ksoftirqd time do not get accounted in cpu_softirq_time.
* Also, p->stime needs to be updated for ksoftirqd.
*/
__account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
- &cpustat->softirq);
+ CPUTIME_SOFTIRQ);
} else if (user_tick) {
account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
} else if (p == rq->idle) {
account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled);
} else {
__account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
- &cpustat->system);
+ CPUTIME_SYSTEM);
}
}
void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
{
- cputime_t rtime, utime = p->utime, total = cputime_add(utime, p->stime);
+ cputime_t rtime, utime = p->utime, total = utime + p->stime;
/*
* Use CFS's precise accounting:
rtime = nsecs_to_cputime(p->se.sum_exec_runtime);
if (total) {
- u64 temp = rtime;
+ u64 temp = (__force u64) rtime;
- temp *= utime;
- do_div(temp, total);
- utime = (cputime_t)temp;
+ temp *= (__force u64) utime;
+ do_div(temp, (__force u32) total);
+ utime = (__force cputime_t) temp;
} else
utime = rtime;
* Compare with previous values, to keep monotonicity:
*/
p->prev_utime = max(p->prev_utime, utime);
- p->prev_stime = max(p->prev_stime, cputime_sub(rtime, p->prev_utime));
+ p->prev_stime = max(p->prev_stime, rtime - p->prev_utime);
*ut = p->prev_utime;
*st = p->prev_stime;
thread_group_cputime(p, &cputime);
- total = cputime_add(cputime.utime, cputime.stime);
+ total = cputime.utime + cputime.stime;
rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
if (total) {
- u64 temp = rtime;
+ u64 temp = (__force u64) rtime;
- temp *= cputime.utime;
- do_div(temp, total);
- utime = (cputime_t)temp;
+ temp *= (__force u64) cputime.utime;
+ do_div(temp, (__force u32) total);
+ utime = (__force cputime_t) temp;
} else
utime = rtime;
sig->prev_utime = max(sig->prev_utime, utime);
- sig->prev_stime = max(sig->prev_stime,
- cputime_sub(rtime, sig->prev_utime));
+ sig->prev_stime = max(sig->prev_stime, rtime - sig->prev_utime);
*ut = sig->prev_utime;
*st = sig->prev_stime;
*/
static noinline void __schedule_bug(struct task_struct *prev)
{
- struct pt_regs *regs = get_irq_regs();
+ if (oops_in_progress)
+ return;
printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
prev->comm, prev->pid, preempt_count());
print_modules();
if (irqs_disabled())
print_irqtrace_events(prev);
-
- if (regs)
- show_regs(regs);
- else
- dump_stack();
+ dump_stack();
}
/*
post_schedule(rq);
- preempt_enable_no_resched();
+ sched_preempt_enable_no_resched();
if (need_resched())
goto need_resched;
}
static inline void sched_submit_work(struct task_struct *tsk)
{
- if (!tsk->state)
+ if (!tsk->state || tsk_is_pi_blocked(tsk))
return;
/*
* If we are going to sleep and we have plugged IO queued,
}
EXPORT_SYMBOL(schedule);
+/**
+ * schedule_preempt_disabled - called with preemption disabled
+ *
+ * Returns with preemption disabled. Note: preempt_count must be 1
+ */
+void __sched schedule_preempt_disabled(void)
+{
+ sched_preempt_enable_no_resched();
+ schedule();
+ preempt_disable();
+}
+
#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
+#include <asm/mutex.h>
+
+#ifndef arch_cpu_is_running
+#define arch_cpu_is_running(cpu) true
+#endif
static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
{
*/
barrier();
- return owner->on_cpu;
+ return owner->on_cpu
+ && arch_cpu_is_running(task_thread_info(owner)->cpu);
}
/*
/*
* Same as __wake_up but called with the spinlock in wait_queue_head_t held.
*/
-void __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
+void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr)
{
- __wake_up_common(q, mode, 1, 0, NULL);
+ __wake_up_common(q, mode, nr, 0, NULL);
}
EXPORT_SYMBOL_GPL(__wake_up_locked);
rq = __task_rq_lock(p);
+ /*
+ * Idle task boosting is a nono in general. There is one
+ * exception, when PREEMPT_RT and NOHZ is active:
+ *
+ * The idle task calls get_next_timer_interrupt() and holds
+ * the timer wheel base->lock on the CPU and another CPU wants
+ * to access the timer (probably to cancel it). We can safely
+ * ignore the boosting request, as the idle CPU runs this code
+ * with interrupts disabled and will complete the lock
+ * protected section without being interrupted. So there is no
+ * real need to boost.
+ */
+ if (unlikely(p == rq->idle)) {
+ WARN_ON(p != rq->curr);
+ WARN_ON(p->pi_blocked_on);
+ goto out_unlock;
+ }
+
trace_sched_pi_setprio(p, prio);
oldprio = p->prio;
prev_class = p->sched_class;
enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
check_class_changed(rq, p, prev_class, oldprio);
+out_unlock:
__task_rq_unlock(rq);
}
-
#endif
-
void set_user_nice(struct task_struct *p, long nice)
{
int old_prio, delta, on_rq;
on_rq = p->on_rq;
running = task_current(rq, p);
if (on_rq)
- deactivate_task(rq, p, 0);
+ dequeue_task(rq, p, 0);
if (running)
p->sched_class->put_prev_task(rq, p);
if (running)
p->sched_class->set_curr_task(rq);
if (on_rq)
- activate_task(rq, p, 0);
+ enqueue_task(rq, p, 0);
check_class_changed(rq, p, prev_class, oldprio);
task_rq_unlock(rq, p, &flags);
goto out_free_cpus_allowed;
}
retval = -EPERM;
- if (!check_same_owner(p) && !task_ns_capable(p, CAP_SYS_NICE))
+ if (!check_same_owner(p) && !ns_capable(task_user_ns(p), CAP_SYS_NICE))
goto out_unlock;
retval = security_task_setscheduler(p);
__release(rq->lock);
spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
do_raw_spin_unlock(&rq->lock);
- preempt_enable_no_resched();
+ sched_preempt_enable_no_resched();
schedule();
/**
* yield - yield the current processor to other threads.
*
- * This is a shortcut for kernel-space yielding - it marks the
- * thread runnable and calls sys_sched_yield().
+ * Do not ever use this function, there's a 99% chance you're doing it wrong.
+ *
+ * The scheduler is at all times free to pick the calling task as the most
+ * eligible task to run, if removing the yield() call from your code breaks
+ * it, its already broken.
+ *
+ * Typical broken usage is:
+ *
+ * while (!event)
+ * yield();
+ *
+ * where one assumes that yield() will let 'the other' process run that will
+ * make event true. If the current task is a SCHED_FIFO task that will never
+ * happen. Never use yield() as a progress guarantee!!
+ *
+ * If you want to use yield() to wait for something, use wait_event().
+ * If you want to use yield() to be 'nice' for others, use cond_resched().
+ * If you still want to use yield(), do not!
*/
void __sched yield(void)
{
free = stack_not_used(p);
#endif
printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
- task_pid_nr(p), task_pid_nr(p->real_parent),
+ task_pid_nr(p), task_pid_nr(rcu_dereference(p->real_parent)),
(unsigned long)task_thread_info(p)->flags);
show_stack(p, NULL);
* placed properly.
*/
if (p->on_rq) {
- deactivate_task(rq_src, p, 0);
+ dequeue_task(rq_src, p, 0);
set_task_cpu(p, dest_cpu);
- activate_task(rq_dest, p, 0);
+ enqueue_task(rq_dest, p, 0);
check_preempt_curr(rq_dest, p, 0);
}
done:
static void
set_table_entry(struct ctl_table *entry,
const char *procname, void *data, int maxlen,
- mode_t mode, proc_handler *proc_handler)
+ umode_t mode, proc_handler *proc_handler)
{
entry->procname = procname;
entry->data = data;
unsigned long action, void *hcpu)
{
switch (action & ~CPU_TASKS_FROZEN) {
- case CPU_ONLINE:
+ case CPU_STARTING:
case CPU_DOWN_FAILED:
set_cpu_active((long)hcpu, true);
return NOTIFY_OK;
}
/*
+ * Keep a special pointer to the highest sched_domain that has
+ * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this
+ * allows us to avoid some pointer chasing select_idle_sibling().
+ *
+ * Also keep a unique ID per domain (we use the first cpu number in
+ * the cpumask of the domain), this allows us to quickly tell if
+ * two cpus are in the same cache domain, see cpus_share_cache().
+ */
+DEFINE_PER_CPU(struct sched_domain *, sd_llc);
+DEFINE_PER_CPU(int, sd_llc_id);
+
+static void update_top_cache_domain(int cpu)
+{
+ struct sched_domain *sd;
+ int id = cpu;
+
+ sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
+ if (sd)
+ id = cpumask_first(sched_domain_span(sd));
+
+ rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
+ per_cpu(sd_llc_id, cpu) = id;
+}
+
+/*
* Attach the domain 'sd' to 'cpu' as its base domain. Callers must
* hold the hotplug lock.
*/
tmp = rq->sd;
rcu_assign_pointer(rq->sd, sd);
destroy_sched_domains(tmp, cpu);
+
+ update_top_cache_domain(cpu);
}
/* cpus with isolated domains */
if (!sg)
return -ENOMEM;
+ sg->next = sg;
+
*per_cpu_ptr(sdd->sg, j) = sg;
sgp = kzalloc_node(sizeof(struct sched_group_power),
struct sd_data *sdd = &tl->data;
for_each_cpu(j, cpu_map) {
- struct sched_domain *sd = *per_cpu_ptr(sdd->sd, j);
- if (sd && (sd->flags & SD_OVERLAP))
- free_sched_groups(sd->groups, 0);
- kfree(*per_cpu_ptr(sdd->sd, j));
- kfree(*per_cpu_ptr(sdd->sg, j));
- kfree(*per_cpu_ptr(sdd->sgp, j));
+ struct sched_domain *sd;
+
+ if (sdd->sd) {
+ sd = *per_cpu_ptr(sdd->sd, j);
+ if (sd && (sd->flags & SD_OVERLAP))
+ free_sched_groups(sd->groups, 0);
+ kfree(*per_cpu_ptr(sdd->sd, j));
+ }
+
+ if (sdd->sg)
+ kfree(*per_cpu_ptr(sdd->sg, j));
+ if (sdd->sgp)
+ kfree(*per_cpu_ptr(sdd->sgp, j));
}
free_percpu(sdd->sd);
+ sdd->sd = NULL;
free_percpu(sdd->sg);
+ sdd->sg = NULL;
free_percpu(sdd->sgp);
+ sdd->sgp = NULL;
}
}
}
#ifdef CONFIG_SCHED_MC
-static ssize_t sched_mc_power_savings_show(struct sysdev_class *class,
- struct sysdev_class_attribute *attr,
- char *page)
+static ssize_t sched_mc_power_savings_show(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
{
- return sprintf(page, "%u\n", sched_mc_power_savings);
+ return sprintf(buf, "%u\n", sched_mc_power_savings);
}
-static ssize_t sched_mc_power_savings_store(struct sysdev_class *class,
- struct sysdev_class_attribute *attr,
+static ssize_t sched_mc_power_savings_store(struct device *dev,
+ struct device_attribute *attr,
const char *buf, size_t count)
{
return sched_power_savings_store(buf, count, 0);
}
-static SYSDEV_CLASS_ATTR(sched_mc_power_savings, 0644,
- sched_mc_power_savings_show,
- sched_mc_power_savings_store);
+static DEVICE_ATTR(sched_mc_power_savings, 0644,
+ sched_mc_power_savings_show,
+ sched_mc_power_savings_store);
#endif
#ifdef CONFIG_SCHED_SMT
-static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev,
- struct sysdev_class_attribute *attr,
- char *page)
+static ssize_t sched_smt_power_savings_show(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
{
- return sprintf(page, "%u\n", sched_smt_power_savings);
+ return sprintf(buf, "%u\n", sched_smt_power_savings);
}
-static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev,
- struct sysdev_class_attribute *attr,
+static ssize_t sched_smt_power_savings_store(struct device *dev,
+ struct device_attribute *attr,
const char *buf, size_t count)
{
return sched_power_savings_store(buf, count, 1);
}
-static SYSDEV_CLASS_ATTR(sched_smt_power_savings, 0644,
+static DEVICE_ATTR(sched_smt_power_savings, 0644,
sched_smt_power_savings_show,
sched_smt_power_savings_store);
#endif
-int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
+int __init sched_create_sysfs_power_savings_entries(struct device *dev)
{
int err = 0;
#ifdef CONFIG_SCHED_SMT
if (smt_capable())
- err = sysfs_create_file(&cls->kset.kobj,
- &attr_sched_smt_power_savings.attr);
+ err = device_create_file(dev, &dev_attr_sched_smt_power_savings);
#endif
#ifdef CONFIG_SCHED_MC
if (!err && mc_capable())
- err = sysfs_create_file(&cls->kset.kobj,
- &attr_sched_mc_power_savings.attr);
+ err = device_create_file(dev, &dev_attr_sched_mc_power_savings);
#endif
return err;
}
INIT_LIST_HEAD(&root_task_group.children);
INIT_LIST_HEAD(&root_task_group.siblings);
autogroup_init(&init_task);
+
#endif /* CONFIG_CGROUP_SCHED */
+#ifdef CONFIG_CGROUP_CPUACCT
+ root_cpuacct.cpustat = &kernel_cpustat;
+ root_cpuacct.cpuusage = alloc_percpu(u64);
+ /* Too early, not expected to fail */
+ BUG_ON(!root_cpuacct.cpuusage);
+#endif
for_each_possible_cpu(i) {
struct rq *rq;
rq->online = 0;
rq->idle_stamp = 0;
rq->avg_idle = 2*sysctl_sched_migration_cost;
+
+ INIT_LIST_HEAD(&rq->cfs_tasks);
+
rq_attach_root(rq, &def_root_domain);
#ifdef CONFIG_NO_HZ
rq->nohz_flags = 0;
on_rq = p->on_rq;
if (on_rq)
- deactivate_task(rq, p, 0);
+ dequeue_task(rq, p, 0);
__setscheduler(rq, p, SCHED_NORMAL, 0);
if (on_rq) {
- activate_task(rq, p, 0);
+ enqueue_task(rq, p, 0);
resched_task(rq->curr);
}
#endif
-#ifdef CONFIG_RT_GROUP_SCHED
-#else /* !CONFIG_RT_GROUP_SCHED */
-#endif /* CONFIG_RT_GROUP_SCHED */
-
#ifdef CONFIG_CGROUP_SCHED
/* task_group_lock serializes the addition/removal of task groups */
static DEFINE_SPINLOCK(task_group_lock);
}
#endif /* CONFIG_CGROUP_SCHED */
-#ifdef CONFIG_FAIR_GROUP_SCHED
-#endif
-
#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH)
static unsigned long to_ratio(u64 period, u64 runtime)
{
struct task_group, css);
}
-static struct cgroup_subsys_state *
-cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
+static struct cgroup_subsys_state *cpu_cgroup_create(struct cgroup *cgrp)
{
struct task_group *tg, *parent;
return &tg->css;
}
-static void
-cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
+static void cpu_cgroup_destroy(struct cgroup *cgrp)
{
struct task_group *tg = cgroup_tg(cgrp);
sched_destroy_group(tg);
}
-static int
-cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
+static int cpu_cgroup_can_attach(struct cgroup *cgrp,
+ struct cgroup_taskset *tset)
{
+ struct task_struct *task;
+
+ cgroup_taskset_for_each(task, cgrp, tset) {
#ifdef CONFIG_RT_GROUP_SCHED
- if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk))
- return -EINVAL;
+ if (!sched_rt_can_attach(cgroup_tg(cgrp), task))
+ return -EINVAL;
#else
- /* We don't support RT-tasks being in separate groups */
- if (tsk->sched_class != &fair_sched_class)
- return -EINVAL;
+ /* We don't support RT-tasks being in separate groups */
+ if (task->sched_class != &fair_sched_class)
+ return -EINVAL;
#endif
+ }
return 0;
}
-static void
-cpu_cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
+static void cpu_cgroup_attach(struct cgroup *cgrp,
+ struct cgroup_taskset *tset)
{
- sched_move_task(tsk);
+ struct task_struct *task;
+
+ cgroup_taskset_for_each(task, cgrp, tset)
+ sched_move_task(task);
}
static void
-cpu_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
- struct cgroup *old_cgrp, struct task_struct *task)
+cpu_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp,
+ struct task_struct *task)
{
/*
* cgroup_exit() is called in the copy_process() failure path.
period = (u64)cfs_period_us * NSEC_PER_USEC;
quota = tg->cfs_bandwidth.quota;
- if (period <= 0)
- return -EINVAL;
-
return tg_set_cfs_bandwidth(tg, period, quota);
}
.name = "cpu",
.create = cpu_cgroup_create,
.destroy = cpu_cgroup_destroy,
- .can_attach_task = cpu_cgroup_can_attach_task,
- .attach_task = cpu_cgroup_attach_task,
+ .can_attach = cpu_cgroup_can_attach,
+ .attach = cpu_cgroup_attach,
.exit = cpu_cgroup_exit,
.populate = cpu_cgroup_populate,
.subsys_id = cpu_cgroup_subsys_id,
* (balbir@in.ibm.com).
*/
-/* track cpu usage of a group of tasks and its child groups */
-struct cpuacct {
- struct cgroup_subsys_state css;
- /* cpuusage holds pointer to a u64-type object on every cpu */
- u64 __percpu *cpuusage;
- struct percpu_counter cpustat[CPUACCT_STAT_NSTATS];
- struct cpuacct *parent;
-};
-
-struct cgroup_subsys cpuacct_subsys;
-
-/* return cpu accounting group corresponding to this container */
-static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
-{
- return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id),
- struct cpuacct, css);
-}
-
-/* return cpu accounting group to which this task belongs */
-static inline struct cpuacct *task_ca(struct task_struct *tsk)
-{
- return container_of(task_subsys_state(tsk, cpuacct_subsys_id),
- struct cpuacct, css);
-}
-
/* create a new cpu accounting group */
-static struct cgroup_subsys_state *cpuacct_create(
- struct cgroup_subsys *ss, struct cgroup *cgrp)
+static struct cgroup_subsys_state *cpuacct_create(struct cgroup *cgrp)
{
- struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
- int i;
+ struct cpuacct *ca;
+
+ if (!cgrp->parent)
+ return &root_cpuacct.css;
+ ca = kzalloc(sizeof(*ca), GFP_KERNEL);
if (!ca)
goto out;
if (!ca->cpuusage)
goto out_free_ca;
- for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
- if (percpu_counter_init(&ca->cpustat[i], 0))
- goto out_free_counters;
-
- if (cgrp->parent)
- ca->parent = cgroup_ca(cgrp->parent);
+ ca->cpustat = alloc_percpu(struct kernel_cpustat);
+ if (!ca->cpustat)
+ goto out_free_cpuusage;
return &ca->css;
-out_free_counters:
- while (--i >= 0)
- percpu_counter_destroy(&ca->cpustat[i]);
+out_free_cpuusage:
free_percpu(ca->cpuusage);
out_free_ca:
kfree(ca);
}
/* destroy an existing cpu accounting group */
-static void
-cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
+static void cpuacct_destroy(struct cgroup *cgrp)
{
struct cpuacct *ca = cgroup_ca(cgrp);
- int i;
- for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
- percpu_counter_destroy(&ca->cpustat[i]);
+ free_percpu(ca->cpustat);
free_percpu(ca->cpuusage);
kfree(ca);
}
};
static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,
- struct cgroup_map_cb *cb)
+ struct cgroup_map_cb *cb)
{
struct cpuacct *ca = cgroup_ca(cgrp);
- int i;
+ int cpu;
+ s64 val = 0;
+
+ for_each_online_cpu(cpu) {
+ struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
+ val += kcpustat->cpustat[CPUTIME_USER];
+ val += kcpustat->cpustat[CPUTIME_NICE];
+ }
+ val = cputime64_to_clock_t(val);
+ cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val);
- for (i = 0; i < CPUACCT_STAT_NSTATS; i++) {
- s64 val = percpu_counter_read(&ca->cpustat[i]);
- val = cputime64_to_clock_t(val);
- cb->fill(cb, cpuacct_stat_desc[i], val);
+ val = 0;
+ for_each_online_cpu(cpu) {
+ struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
+ val += kcpustat->cpustat[CPUTIME_SYSTEM];
+ val += kcpustat->cpustat[CPUTIME_IRQ];
+ val += kcpustat->cpustat[CPUTIME_SOFTIRQ];
}
+
+ val = cputime64_to_clock_t(val);
+ cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val);
+
return 0;
}
ca = task_ca(tsk);
- for (; ca; ca = ca->parent) {
+ for (; ca; ca = parent_ca(ca)) {
u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
*cpuusage += cputime;
}
rcu_read_unlock();
}
-/*
- * When CONFIG_VIRT_CPU_ACCOUNTING is enabled one jiffy can be very large
- * in cputime_t units. As a result, cpuacct_update_stats calls
- * percpu_counter_add with values large enough to always overflow the
- * per cpu batch limit causing bad SMP scalability.
- *
- * To fix this we scale percpu_counter_batch by cputime_one_jiffy so we
- * batch the same amount of time with CONFIG_VIRT_CPU_ACCOUNTING disabled
- * and enabled. We cap it at INT_MAX which is the largest allowed batch value.
- */
-#ifdef CONFIG_SMP
-#define CPUACCT_BATCH \
- min_t(long, percpu_counter_batch * cputime_one_jiffy, INT_MAX)
-#else
-#define CPUACCT_BATCH 0
-#endif
-
-/*
- * Charge the system/user time to the task's accounting group.
- */
-void cpuacct_update_stats(struct task_struct *tsk,
- enum cpuacct_stat_index idx, cputime_t val)
-{
- struct cpuacct *ca;
- int batch = CPUACCT_BATCH;
-
- if (unlikely(!cpuacct_subsys.active))
- return;
-
- rcu_read_lock();
- ca = task_ca(tsk);
-
- do {
- __percpu_counter_add(&ca->cpustat[idx], val, batch);
- ca = ca->parent;
- } while (ca);
- rcu_read_unlock();
-}
-
struct cgroup_subsys cpuacct_subsys = {
.name = "cpuacct",
.create = cpuacct_create,