* bump this up when changing the output format or the meaning of an existing
* format, so that tools can adapt (or abort)
*/
-#define SCHEDSTAT_VERSION 14
+#define SCHEDSTAT_VERSION 15
static int show_schedstat(struct seq_file *seq, void *v)
{
int cpu;
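+	/*
+	 * Each 32-bit word of the cpumask prints as at most 8 hex digits
+	 * plus one byte for the ',' separator (or the trailing '\0'),
+	 * hence 9 bytes per word.
+	 */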
+ int mask_len = DIV_ROUND_UP(NR_CPUS, 32) * 9;
+ char *mask_str = kmalloc(mask_len, GFP_KERNEL);
+
+ if (mask_str == NULL)
+ return -ENOMEM;
seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
seq_printf(seq, "timestamp %lu\n", jiffies);
	for_each_online_cpu(cpu) {
		struct rq *rq = cpu_rq(cpu);
#ifdef CONFIG_SMP
struct sched_domain *sd;
- int dcnt = 0;
+ int dcount = 0;
#endif
/* runqueue-specific stats */
seq_printf(seq,
- "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %llu %llu %lu",
- cpu, rq->yld_both_empty,
- rq->yld_act_empty, rq->yld_exp_empty, rq->yld_cnt,
- rq->sched_switch, rq->sched_cnt, rq->sched_goidle,
- rq->ttwu_cnt, rq->ttwu_local,
- rq->rq_sched_info.cpu_time,
- rq->rq_sched_info.run_delay, rq->rq_sched_info.pcnt);
+ "cpu%d %u %u %u %u %u %u %llu %llu %lu",
+ cpu, rq->yld_count,
+ rq->sched_switch, rq->sched_count, rq->sched_goidle,
+ rq->ttwu_count, rq->ttwu_local,
+ rq->rq_cpu_time,
+ rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount);
seq_printf(seq, "\n");
preempt_disable();
for_each_domain(cpu, sd) {
enum cpu_idle_type itype;
- char mask_str[NR_CPUS];
- cpumask_scnprintf(mask_str, NR_CPUS, sd->span);
- seq_printf(seq, "domain%d %s", dcnt++, mask_str);
+ cpumask_scnprintf(mask_str, mask_len,
+ sched_domain_span(sd));
+ seq_printf(seq, "domain%d %s", dcount++, mask_str);
for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES;
itype++) {
- seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu "
- "%lu",
- sd->lb_cnt[itype],
+ seq_printf(seq, " %u %u %u %u %u %u %u %u",
+ sd->lb_count[itype],
sd->lb_balanced[itype],
sd->lb_failed[itype],
				    sd->lb_imbalance[itype],
				    sd->lb_gained[itype],
				    sd->lb_hot_gained[itype],
sd->lb_nobusyq[itype],
sd->lb_nobusyg[itype]);
}
- seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu"
- " %lu %lu %lu\n",
- sd->alb_cnt, sd->alb_failed, sd->alb_pushed,
- sd->sbe_cnt, sd->sbe_balanced, sd->sbe_pushed,
- sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed,
+ seq_printf(seq,
+ " %u %u %u %u %u %u %u %u %u %u %u %u\n",
+ sd->alb_count, sd->alb_failed, sd->alb_pushed,
+ sd->sbe_count, sd->sbe_balanced, sd->sbe_pushed,
+ sd->sbf_count, sd->sbf_balanced, sd->sbf_pushed,
sd->ttwu_wake_remote, sd->ttwu_move_affine,
sd->ttwu_move_balance);
}
preempt_enable();
#endif
}
+ kfree(mask_str);
return 0;
}
return res;
}
-const struct file_operations proc_schedstat_operations = {
+static const struct file_operations proc_schedstat_operations = {
.open = schedstat_open,
.read = seq_read,
.llseek = seq_lseek,
.release = single_release,
};
+static int __init proc_schedstat_init(void)
+{
+ proc_create("schedstat", 0, NULL, &proc_schedstat_operations);
+ return 0;
+}
+module_init(proc_schedstat_init);
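
/*
 * A minimal userspace sketch (not part of the patch) of how a tool might
 * consume the version-15 format written above. Field order follows the
 * new "cpu%d %u %u %u %u %u %u %llu %llu %lu" line; the struct layout
 * and parsing style here are illustrative assumptions, not a kernel or
 * libc API.
 */
#include <stdio.h>

struct cpu_stats {
	unsigned yld_count, sched_switch, sched_count, sched_goidle;
	unsigned ttwu_count, ttwu_local;
	unsigned long long rq_cpu_time, run_delay;
	unsigned long pcount;
};

int main(void)
{
	char line[4096];
	int cpu, version = 0;
	struct cpu_stats st;
	FILE *f = fopen("/proc/schedstat", "r");

	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f)) {
		if (sscanf(line, "version %d", &version) == 1) {
			if (version != 15)
				break;	/* unknown format: abort, per the header comment */
		} else if (sscanf(line, "cpu%d %u %u %u %u %u %u %llu %llu %lu",
				  &cpu, &st.yld_count, &st.sched_switch,
				  &st.sched_count, &st.sched_goidle,
				  &st.ttwu_count, &st.ttwu_local,
				  &st.rq_cpu_time, &st.run_delay,
				  &st.pcount) == 10)
			printf("cpu%d: %llu ns on cpu, %llu ns waiting\n",
			       cpu, st.rq_cpu_time, st.run_delay);
	}
	fclose(f);
	return 0;
}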
+
/*
* Expects runqueue lock to be held for atomicity of update
*/
static inline void
rq_sched_info_arrive(struct rq *rq, unsigned long long delta)
{
if (rq) {
rq->rq_sched_info.run_delay += delta;
- rq->rq_sched_info.pcnt++;
+ rq->rq_sched_info.pcount++;
}
}
static inline void
rq_sched_info_depart(struct rq *rq, unsigned long long delta)
{
if (rq)
- rq->rq_sched_info.cpu_time += delta;
+ rq->rq_cpu_time += delta;
+}
+
+static inline void
+rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
+{
+ if (rq)
+ rq->rq_sched_info.run_delay += delta;
}
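+
+/*
+ * Note: rq_sched_info_dequeued() feeds the same rq_sched_info.run_delay
+ * that show_schedstat() prints in the cpu%d line, so rq-level wait time
+ * now also covers intervals closed by a dequeue, not only by an arrival
+ * on the cpu.
+ */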
# define schedstat_inc(rq, field) do { (rq)->field++; } while (0)
# define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0)
# define schedstat_set(var, val) do { var = (val); } while (0)
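
/*
 * Typical (illustrative) call sites in the scheduler proper:
 *
 *	schedstat_inc(rq, yld_count);                      in sys_sched_yield()
 *	schedstat_add(sd, lb_imbalance[idle], imbalance);  in load_balance()
 *
 * Both expand to nothing when CONFIG_SCHEDSTATS is off, so call sites
 * need no #ifdefs.
 */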
#else /* !CONFIG_SCHEDSTATS */
static inline void
rq_sched_info_arrive(struct rq *rq, unsigned long long delta)
{}
static inline void
+rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
+{}
+static inline void
rq_sched_info_depart(struct rq *rq, unsigned long long delta)
{}
# define schedstat_inc(rq, field) do { } while (0)
# define schedstat_add(rq, field, amt) do { } while (0)
# define schedstat_set(var, val) do { } while (0)
#endif
-#ifdef CONFIG_SCHEDSTATS
+#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
+static inline void sched_info_reset_dequeued(struct task_struct *t)
+{
+ t->sched_info.last_queued = 0;
+}
+
/*
- * Called when a process is dequeued from the active array and given
- * the cpu. We should note that with the exception of interactive
- * tasks, the expired queue will become the active queue after the active
- * queue is empty, without explicitly dequeuing and requeuing tasks in the
- * expired queue. (Interactive tasks may be requeued directly to the
- * active queue, thus delaying tasks in the expired queue from running;
- * see scheduler_tick()).
- *
- * This function is only called from sched_info_arrive(), rather than
- * dequeue_task(). Even though a task may be queued and dequeued multiple
- * times as it is shuffled about, we're really interested in knowing how
- * long it was from the *first* time it was queued to the time that it
- * finally hit a cpu.
+ * We are interested in knowing how long it was from the *first* time a
+ * task was queued to the time that it finally hit a cpu. We call this
+ * routine from dequeue_task() to account for possible rq->clock skew
+ * across cpus; the delta taken on each cpu would annul the skew.
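+ *
+ * For example: a task stamped last_queued at rq->clock == 1000 on cpu0
+ * and dequeued there at rq->clock == 1400 contributes exactly 400 ns of
+ * run_delay, however far another cpu's clock has drifted. On migration
+ * the interval is closed against the old cpu's clock and reopened
+ * against the new one, so cross-cpu drift never enters the sum.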
*/
static inline void sched_info_dequeued(struct task_struct *t)
{
- t->sched_info.last_queued = 0;
+ unsigned long long now = task_rq(t)->clock, delta = 0;
+
+ if (unlikely(sched_info_on()))
+ if (t->sched_info.last_queued)
+ delta = now - t->sched_info.last_queued;
+ sched_info_reset_dequeued(t);
+ t->sched_info.run_delay += delta;
+
+ rq_sched_info_dequeued(task_rq(t), delta);
}
/*
 * Called when a task finally hits the cpu. We can now calculate how
 * long it was waiting to run. We also note when it began so that we
 * can keep stats on how long its timeslice is.
 */
static void sched_info_arrive(struct task_struct *t)
{
- unsigned long long now = sched_clock(), delta = 0;
+ unsigned long long now = task_rq(t)->clock, delta = 0;
if (t->sched_info.last_queued)
delta = now - t->sched_info.last_queued;
- sched_info_dequeued(t);
+ sched_info_reset_dequeued(t);
t->sched_info.run_delay += delta;
t->sched_info.last_arrival = now;
- t->sched_info.pcnt++;
+ t->sched_info.pcount++;
rq_sched_info_arrive(task_rq(t), delta);
}
/*
- * Called when a process is queued into either the active or expired
- * array. The time is noted and later used to determine how long we
- * had to wait for us to reach the cpu. Since the expired queue will
- * become the active queue after active queue is empty, without dequeuing
- * and requeuing any tasks, we are interested in queuing to either. It
- * is unusual but not impossible for tasks to be dequeued and immediately
- * requeued in the same or another array: this can happen in sched_yield(),
- * set_user_nice(), and even load_balance() as it moves tasks from runqueue
- * to runqueue.
- *
* This function is only called from enqueue_task(), but also only updates
 * the timestamp if it is not already set. It's assumed that
* sched_info_dequeued() will clear that stamp when appropriate.
 */
static inline void sched_info_queued(struct task_struct *t)
{
if (unlikely(sched_info_on()))
if (!t->sched_info.last_queued)
- t->sched_info.last_queued = sched_clock();
+ t->sched_info.last_queued = task_rq(t)->clock;
}
/*
* Called when a process ceases being the active-running process, either
* voluntarily or involuntarily. Now we can calculate how long we ran.
+ * Also, if the process is still in the TASK_RUNNING state, call
+ * sched_info_queued() to mark that it has now again started waiting on
+ * the runqueue.
*/
static inline void sched_info_depart(struct task_struct *t)
{
- unsigned long long delta = sched_clock() - t->sched_info.last_arrival;
+ unsigned long long delta = task_rq(t)->clock -
+ t->sched_info.last_arrival;
- t->sched_info.cpu_time += delta;
rq_sched_info_depart(task_rq(t), delta);
+
+ if (t->state == TASK_RUNNING)
+ sched_info_queued(t);
}
/*
 * Called when tasks are switched involuntarily due, typically, to expiring
 * their time slice. (This may also be called when switching to or from
 * the idle task.) We are only called when prev != next; the wrapper below
 * gates __sched_info_switch() on sched_info_on().
 */
static inline void
sched_info_switch(struct task_struct *prev, struct task_struct *next)
{
	if (unlikely(sched_info_on()))
		__sched_info_switch(prev, next);
}
#else
-#define sched_info_queued(t) do { } while (0)
-#define sched_info_switch(t, next) do { } while (0)
-#endif /* CONFIG_SCHEDSTATS */
+#define sched_info_queued(t) do { } while (0)
+#define sched_info_reset_dequeued(t) do { } while (0)
+#define sched_info_dequeued(t) do { } while (0)
+#define sched_info_switch(t, next) do { } while (0)
+#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */
+/*
+ * The following are functions that support scheduler-internal time accounting.
+ * These functions are generally called at the timer tick. None of this depends
+ * on CONFIG_SCHEDSTATS.
+ */
+
+/**
+ * account_group_user_time - Maintain utime for a thread group.
+ *
+ * @tsk: Pointer to task structure.
+ * @cputime: Time value by which to increment the utime field of the
+ * thread_group_cputime structure.
+ *
+ * If group cputime accounting is active (cputimer->running), add @cputime
+ * to the shared cputime.utime field under cputimer->lock.
+ */
+static inline void account_group_user_time(struct task_struct *tsk,
+ cputime_t cputime)
+{
+ struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
+
+ if (!cputimer->running)
+ return;
+
+ spin_lock(&cputimer->lock);
+ cputimer->cputime.utime =
+ cputime_add(cputimer->cputime.utime, cputime);
+ spin_unlock(&cputimer->lock);
+}
+
+/**
+ * account_group_system_time - Maintain stime for a thread group.
+ *
+ * @tsk: Pointer to task structure.
+ * @cputime: Time value by which to increment the stime field of the
+ * thread_group_cputime structure.
+ *
+ * If group cputime accounting is active (cputimer->running), add @cputime
+ * to the shared cputime.stime field under cputimer->lock.
+ */
+static inline void account_group_system_time(struct task_struct *tsk,
+ cputime_t cputime)
+{
+ struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
+
+ if (!cputimer->running)
+ return;
+
+ spin_lock(&cputimer->lock);
+ cputimer->cputime.stime =
+ cputime_add(cputimer->cputime.stime, cputime);
+ spin_unlock(&cputimer->lock);
+}
+
+/**
+ * account_group_exec_runtime - Maintain exec runtime for a thread group.
+ *
+ * @tsk: Pointer to task structure.
+ * @ns: Time value by which to increment the sum_exec_runtime field
+ * of the thread_group_cputime structure.
+ *
+ * If group cputime accounting is active (cputimer->running), add @ns to
+ * the shared cputime.sum_exec_runtime field under cputimer->lock.
+ */
+static inline void account_group_exec_runtime(struct task_struct *tsk,
+ unsigned long long ns)
+{
+ struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
+
+ if (!cputimer->running)
+ return;
+
+ spin_lock(&cputimer->lock);
+ cputimer->cputime.sum_exec_runtime += ns;
+ spin_unlock(&cputimer->lock);
+}
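
/*
 * A hedged sketch (not part of the patch) of the intended call path: the
 * per-tick accounting code (account_user_time()/account_system_time() in
 * kernel/sched.c) reaches these helpers roughly as below; "charge_tick"
 * is an illustrative name, not a kernel function.
 */
static inline void charge_tick(struct task_struct *p, cputime_t one_tick,
			       int user_tick)
{
	if (user_tick)
		account_group_user_time(p, one_tick);	/* utime side */
	else
		account_group_system_time(p, one_tick);	/* stime side */
}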