UBUNTU: Ubuntu-2.6.38-12.51
[linux-flexiantxendom0-natty.git] / block / blk-throttle.c
index c1bc1b6..e36cc10 100644 (file)
@@ -20,6 +20,11 @@ static int throtl_quantum = 32;
 /* Throttling is performed over a 100ms slice, after which the slice is renewed */
 static unsigned long throtl_slice = HZ/10;     /* 100 ms */
 
+/* A workqueue to queue throttle related work */
+static struct workqueue_struct *kthrotld_workqueue;
+static void throtl_schedule_delayed_work(struct throtl_data *td,
+                               unsigned long delay);
+
 struct throtl_rb_root {
        struct rb_root rb;
        struct rb_node *left;
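
Annotation: the workqueue pointer is file-local, and the forward declaration is needed because throtl_schedule_next_dispatch(), which appears earlier in the file than the now-static scheduler, calls it. A minimal sketch of the same pattern, with illustrative names that are not from the patch:

    #include <linux/workqueue.h>

    /* Sketch: a file-local workqueue plus a forward declaration so that
     * earlier functions can call the scheduler before its definition. */
    struct my_data { struct delayed_work dwork; };

    static struct workqueue_struct *my_wq;
    static void my_schedule(struct my_data *d, unsigned long delay);

    static void my_early_caller(struct my_data *d)
    {
            my_schedule(d, 0);      /* resolved via the forward declaration */
    }

    static void my_schedule(struct my_data *d, unsigned long delay)
    {
            queue_delayed_work(my_wq, &d->dwork, delay);
    }
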
@@ -168,7 +173,15 @@ static struct throtl_grp * throtl_find_alloc_tg(struct throtl_data *td,
         * tree of blkg (instead of traversing through the hash list all
         * the time).
         */
-       tg = tg_of_blkg(blkiocg_lookup_group(blkcg, key));
+
+       /*
+        * This is the common case when there are no blkio cgroups.
+        * Avoid the lookup in this case.
+        */
+       if (blkcg == &blkio_root_cgroup)
+               tg = &td->root_tg;
+       else
+               tg = tg_of_blkg(blkiocg_lookup_group(blkcg, key));
 
        /* Fill in device details for root group */
        if (tg && !tg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
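
Annotation: the new branch is a pointer-equality fast path. While no blkio cgroups have been created, every bio maps to the root cgroup, so the group can be taken straight from throtl_data instead of going through blkiocg_lookup_group()'s hash walk. A hedged sketch of the idiom, with hypothetical names:

    /* Sketch: compare against the singleton root before paying for a
     * hash lookup. Illustrative names, not the patch's API. */
    static struct grp *find_grp(struct cg_state *cg, struct td *td, void *key)
    {
            if (cg == &root_cg)             /* common case: no cgroups */
                    return &td->root_grp;   /* cached, no lookup needed */
            return hash_lookup(cg, key);    /* slow path */
    }
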
@@ -337,10 +350,9 @@ static void throtl_schedule_next_dispatch(struct throtl_data *td)
        update_min_dispatch_time(st);
 
        if (time_before_eq(st->min_disptime, jiffies))
-               throtl_schedule_delayed_work(td->queue, 0);
+               throtl_schedule_delayed_work(td, 0);
        else
-               throtl_schedule_delayed_work(td->queue,
-                               (st->min_disptime - jiffies));
+               throtl_schedule_delayed_work(td, (st->min_disptime - jiffies));
 }
 
 static inline void
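
Annotation: both call sites now hand the scheduler a throtl_data pointer directly rather than the request queue. The deadline test uses the wrap-safe time_before_eq() from <linux/jiffies.h>; a small sketch of the idiom:

    #include <linux/jiffies.h>

    /* Sketch: wrap-safe deadline check. Returns 0 if the deadline has
     * already passed (dispatch now), else the remaining delay in jiffies. */
    static unsigned long delay_until(unsigned long deadline)
    {
            if (time_before_eq(deadline, jiffies))
                    return 0;
            return deadline - jiffies;  /* correct even across a jiffies wrap */
    }
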
@@ -355,6 +367,12 @@ throtl_start_new_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
                        tg->slice_end[rw], jiffies);
 }
 
+static inline void throtl_set_slice_end(struct throtl_data *td,
+               struct throtl_grp *tg, bool rw, unsigned long jiffy_end)
+{
+       tg->slice_end[rw] = roundup(jiffy_end, throtl_slice);
+}
+
 static inline void throtl_extend_slice(struct throtl_data *td,
                struct throtl_grp *tg, bool rw, unsigned long jiffy_end)
 {
@@ -391,6 +409,16 @@ throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
        if (throtl_slice_used(td, tg, rw))
                return;
 
+       /*
+        * A bio has been dispatched, so adjust slice_end as well. It can
+        * happen that the cgroup limit was initially very low, resulting
+        * in a high slice_end, but the limit was later bumped up and the
+        * bio was dispatched sooner; in that case slice_end needs to be
+        * reduced. A bogus, high slice_end is bad because it does not
+        * allow a new slice to start.
+        */
+
+       throtl_set_slice_end(td, tg, rw, jiffies + throtl_slice);
+
        time_elapsed = jiffies - tg->slice_start[rw];
 
        nr_slices = time_elapsed / throtl_slice;
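
Annotation, with concrete numbers: at 1 byte/s, a single 4KB bio extends slice_end roughly 4096 seconds into the future. If the limit is then raised so the bio dispatches at once, the stale slice_end would keep a fresh slice from starting; the new throtl_set_slice_end() call pulls it back to one throtl_slice past now. A userspace sketch of the numbers, assuming HZ = 1000:

    #include <stdio.h>

    int main(void)
    {
            unsigned long hz = 1000, throtl_slice = hz / 10;
            unsigned long now = 0, bio_bytes = 4096, old_bps = 1;

            /* at 1 byte/s, a 4KB bio needs ~4096s worth of slice */
            unsigned long stale_end = now + (bio_bytes / old_bps) * hz;
            /* after the limit bump the bio goes out now; reset the end */
            unsigned long fresh_end = now + throtl_slice;

            printf("stale=%lu fresh=%lu\n", stale_end, fresh_end);
            return 0;
    }
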
@@ -430,6 +458,7 @@ static bool tg_with_in_iops_limit(struct throtl_data *td, struct throtl_grp *tg,
        bool rw = bio_data_dir(bio);
        unsigned int io_allowed;
        unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
+       u64 tmp;
 
        jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];
 
@@ -439,7 +468,20 @@ static bool tg_with_in_iops_limit(struct throtl_data *td, struct throtl_grp *tg,
 
        jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice);
 
-       io_allowed = (tg->iops[rw] * jiffy_elapsed_rnd) / HZ;
+       /*
+        * jiffy_elapsed_rnd should not be a large value: the minimum iops
+        * is 1, so jiffy_elapsed_rnd is at most the equivalent of 1 second,
+        * since we allow a dispatch after 1 second and the slice should
+        * have been trimmed by then.
+        */
+
+       tmp = (u64)tg->iops[rw] * jiffy_elapsed_rnd;
+       do_div(tmp, HZ);
+
+       if (tmp > UINT_MAX)
+               io_allowed = UINT_MAX;
+       else
+               io_allowed = tmp;
 
        if (tg->io_disp[rw] + 1 <= io_allowed) {
                if (wait)
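
Annotation: the old io_allowed expression multiplied two 32-bit values, so with a large configured iops even a few elapsed jiffies overflow before the division by HZ. Widening to u64, dividing with do_div(), and clamping to UINT_MAX fixes it. A userspace sketch of the failure mode, assuming 32-bit unsigned int and HZ = 1000:

    #include <stdio.h>
    #include <stdint.h>
    #include <limits.h>

    int main(void)
    {
            unsigned int iops = UINT_MAX;   /* a huge configured limit */
            unsigned int elapsed = 4;       /* jiffies elapsed */

            unsigned int bad = iops * elapsed / 1000;   /* wraps at 2^32 */
            uint64_t tmp = (uint64_t)iops * elapsed;    /* widened multiply */
            tmp /= 1000;                                /* stands in for do_div() */
            unsigned int good = tmp > UINT_MAX ? UINT_MAX : (unsigned int)tmp;

            printf("bad=%u good=%u\n", bad, good);  /* bad=4294967 good=17179869 */
            return 0;
    }
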
@@ -631,7 +673,7 @@ static int throtl_dispatch_tg(struct throtl_data *td, struct throtl_grp *tg,
 {
        unsigned int nr_reads = 0, nr_writes = 0;
        unsigned int max_nr_reads = throtl_grp_quantum*3/4;
-       unsigned int max_nr_writes = throtl_grp_quantum - nr_reads;
+       unsigned int max_nr_writes = throtl_grp_quantum - max_nr_reads;
        struct bio *bio;
 
        /* Try to dispatch 75% READS and 25% WRITES */
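
Annotation: this one-identifier fix matters because nr_reads is still zero when max_nr_writes is initialized, so the old code allowed a full quantum of writes instead of the intended 25% share. With throtl_grp_quantum = 32 the split is 24 reads / 8 writes; a trivial check:

    #include <stdio.h>

    int main(void)
    {
            unsigned int quantum = 32;                      /* throtl_grp_quantum */
            unsigned int max_reads = quantum * 3 / 4;       /* 24 */
            unsigned int max_writes = quantum - max_reads;  /*  8 */
            /* the old code subtracted nr_reads (still 0), yielding 32 */
            printf("reads=%u writes=%u\n", max_reads, max_writes);
            return 0;
    }
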
@@ -695,26 +737,21 @@ static void throtl_process_limit_change(struct throtl_data *td)
        struct throtl_grp *tg;
        struct hlist_node *pos, *n;
 
-       /*
-        * Make sure atomic_inc() effects from
-        * throtl_update_blkio_group_read_bps(), group of functions are
-        * visible.
-        * Is this required or smp_mb__after_atomic_inc() was suffcient
-        * after the atomic_inc().
-        */
-       smp_rmb();
        if (!atomic_read(&td->limits_changed))
                return;
 
        throtl_log(td, "limit changed =%d", atomic_read(&td->limits_changed));
 
-       hlist_for_each_entry_safe(tg, pos, n, &td->tg_list, tg_node) {
-               /*
-                * Do I need an smp_rmb() here to make sure tg->limits_changed
-                * update is visible. I am relying on smp_rmb() at the
-                * beginning of function and not putting a new one here.
-                */
+       /*
+        * Make sure the updates to tg->limits_changed made by the
+        * throtl_update_blkio_group_read_bps() group of functions are
+        * visible. We must not see the update to td->limits_changed while
+        * the update to tg->limits_changed is not yet visible on this
+        * CPU. Hence the read barrier.
+        */
+       smp_rmb();
 
+       hlist_for_each_entry_safe(tg, pos, n, &td->tg_list, tg_node) {
                if (throtl_tg_on_rr(tg) && tg->limits_changed) {
                        throtl_log_tg(td, tg, "limit change rbps=%llu wbps=%llu"
                                " riops=%u wiops=%u", tg->bps[READ],
@@ -782,10 +819,10 @@ void blk_throtl_work(struct work_struct *work)
 }
 
 /* Call with queue lock held */
-void throtl_schedule_delayed_work(struct request_queue *q, unsigned long delay)
+static void
+throtl_schedule_delayed_work(struct throtl_data *td, unsigned long delay)
 {
 
-       struct throtl_data *td = q->td;
        struct delayed_work *dwork = &td->throtl_work;
 
        if (total_nr_queued(td) > 0) {
@@ -794,12 +831,11 @@ void throtl_schedule_delayed_work(struct request_queue *q, unsigned long delay)
                 * Cancel that and schedule a new one.
                 */
                __cancel_delayed_work(dwork);
-               kblockd_schedule_delayed_work(q, dwork, delay);
+               queue_delayed_work(kthrotld_workqueue, dwork, delay);
                throtl_log(td, "schedule work. delay=%lu jiffies=%lu",
                                delay, jiffies);
        }
 }
-EXPORT_SYMBOL(throtl_schedule_delayed_work);
 
 static void
 throtl_destroy_tg(struct throtl_data *td, struct throtl_grp *tg)
@@ -887,7 +923,7 @@ static void throtl_update_blkio_group_read_bps(void *key,
        smp_mb__after_atomic_inc();
 
        /* Schedule a work now to process the limit change */
-       throtl_schedule_delayed_work(td->queue, 0);
+       throtl_schedule_delayed_work(td, 0);
 }
 
 static void throtl_update_blkio_group_write_bps(void *key,
@@ -901,7 +937,7 @@ static void throtl_update_blkio_group_write_bps(void *key,
        smp_mb__before_atomic_inc();
        atomic_inc(&td->limits_changed);
        smp_mb__after_atomic_inc();
-       throtl_schedule_delayed_work(td->queue, 0);
+       throtl_schedule_delayed_work(td, 0);
 }
 
 static void throtl_update_blkio_group_read_iops(void *key,
@@ -915,7 +951,7 @@ static void throtl_update_blkio_group_read_iops(void *key,
        smp_mb__before_atomic_inc();
        atomic_inc(&td->limits_changed);
        smp_mb__after_atomic_inc();
-       throtl_schedule_delayed_work(td->queue, 0);
+       throtl_schedule_delayed_work(td, 0);
 }
 
 static void throtl_update_blkio_group_write_iops(void *key,
@@ -929,7 +965,7 @@ static void throtl_update_blkio_group_write_iops(void *key,
        smp_mb__before_atomic_inc();
        atomic_inc(&td->limits_changed);
        smp_mb__after_atomic_inc();
-       throtl_schedule_delayed_work(td->queue, 0);
+       throtl_schedule_delayed_work(td, 0);
 }
 
 void throtl_shutdown_timer_wq(struct request_queue *q)
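
Annotation: relocating the smp_rmb() in throtl_process_limit_change() to after the atomic_read() check settles the old comment's open question: the barrier pairs with the writers' smp_mb__after_atomic_inc() in the four update callbacks above. The writer publishes tg->limits_changed and then bumps td->limits_changed; the reader observes the counter, issues the read barrier, and is then guaranteed to see the per-group flag. A kernel-style sketch of the pairing, with a hypothetical helper:

    /* Writer side: publish the per-group flag, then the counter. */
    tg->limits_changed = true;
    smp_mb__before_atomic_inc();
    atomic_inc(&td->limits_changed);
    smp_mb__after_atomic_inc();

    /* Reader side: counter first, then barrier, then per-group flags. */
    if (!atomic_read(&td->limits_changed))
            return;
    smp_rmb();                              /* pairs with the writer's mb */
    if (tg->limits_changed)
            apply_new_limits(tg);           /* hypothetical helper */

Since all four callbacks end with this identical publish-and-kick sequence, the tail could plausibly be factored into one helper; that is not part of the patch, which only redirects the final call from the queue to throtl_data.
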
@@ -1102,6 +1138,10 @@ void blk_throtl_exit(struct request_queue *q)
 
 static int __init throtl_init(void)
 {
+       kthrotld_workqueue = alloc_workqueue("kthrotld", WQ_MEM_RECLAIM, 0);
+       if (!kthrotld_workqueue)
+               panic("Failed to create kthrotld\n");
+
        blkio_policy_register(&blkio_policy_throtl);
        return 0;
 }
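
Annotation: alloc_workqueue() with WQ_MEM_RECLAIM gives the new kthrotld workqueue a rescuer thread, so throttle work can make forward progress even under memory pressure, and it decouples throttling from whatever else kblockd is running; hence the switch away from kblockd_schedule_delayed_work(). A minimal sketch of the lifecycle, assuming the standard workqueue API (the teardown half is hypothetical here, since this code never unloads):

    #include <linux/workqueue.h>

    /* Sketch: dedicated reclaim-safe workqueue lifecycle. */
    static struct workqueue_struct *my_wq;

    static int __init my_init(void)
    {
            my_wq = alloc_workqueue("my_wq", WQ_MEM_RECLAIM, 0);
            if (!my_wq)
                    return -ENOMEM;     /* a module could fail gracefully */
            return 0;
    }

    static void __exit my_exit(void)
    {
            destroy_workqueue(my_wq);
    }
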