Update to 3.4-final.
[linux-flexiantxendom0-3.2.10.git] / block / blk-throttle.c
index bc2936b..f2ddb94 100644 (file)
@@ -10,6 +10,7 @@
 #include <linux/bio.h>
 #include <linux/blktrace_api.h>
 #include "blk-cgroup.h"
+#include "blk.h"
 
 /* Max dispatch from a group in 1 round */
 static int throtl_grp_quantum = 8;
@@ -20,6 +21,11 @@ static int throtl_quantum = 32;
 /* Throttling is performed over 100ms slice and after that slice is renewed */
 static unsigned long throtl_slice = HZ/10;     /* 100 ms */
 
+/* A workqueue to queue throttle related work */
+static struct workqueue_struct *kthrotld_workqueue;
+static void throtl_schedule_delayed_work(struct throtl_data *td,
+                               unsigned long delay);
+
 struct throtl_rb_root {
        struct rb_root rb;
        struct rb_node *left;
@@ -70,6 +76,11 @@ struct throtl_grp {
        /* When did we start a new slice */
        unsigned long slice_start[2];
        unsigned long slice_end[2];
+
+       /* Some throttle limits got updated for the group */
+       int limits_changed;
+
+       struct rcu_head rcu_head;
 };
 
 struct throtl_data
@@ -80,7 +91,7 @@ struct throtl_data
        /* service tree for active throtl groups */
        struct throtl_rb_root tg_service_tree;
 
-       struct throtl_grp root_tg;
+       struct throtl_grp *root_tg;
        struct request_queue *queue;
 
        /* Total Number of queued bios on READ and WRITE lists */
@@ -93,6 +104,8 @@ struct throtl_data
 
        /* Work for dispatching throttled bios */
        struct delayed_work throtl_work;
+
+       int limits_changed;
 };
 
 enum tg_state_flags {
@@ -130,9 +143,9 @@ static inline struct throtl_grp *tg_of_blkg(struct blkio_group *blkg)
        return NULL;
 }
 
-static inline int total_nr_queued(struct throtl_data *td)
+static inline unsigned int total_nr_queued(struct throtl_data *td)
 {
-       return (td->nr_queued[0] + td->nr_queued[1]);
+       return td->nr_queued[0] + td->nr_queued[1];
 }
 
 static inline struct throtl_grp *throtl_ref_get_tg(struct throtl_grp *tg)
@@ -141,48 +154,44 @@ static inline struct throtl_grp *throtl_ref_get_tg(struct throtl_grp *tg)
        return tg;
 }
 
-static void throtl_put_tg(struct throtl_grp *tg)
+static void throtl_free_tg(struct rcu_head *head)
 {
-       BUG_ON(atomic_read(&tg->ref) <= 0);
-       if (!atomic_dec_and_test(&tg->ref))
-               return;
+       struct throtl_grp *tg;
+
+       tg = container_of(head, struct throtl_grp, rcu_head);
+       free_percpu(tg->blkg.stats_cpu);
        kfree(tg);
 }
 
-static struct throtl_grp * throtl_find_alloc_tg(struct throtl_data *td,
-                       struct cgroup *cgroup)
+static void throtl_put_tg(struct throtl_grp *tg)
 {
-       struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
-       struct throtl_grp *tg = NULL;
-       void *key = td;
-       struct backing_dev_info *bdi = &td->queue->backing_dev_info;
-       unsigned int major, minor;
+       BUG_ON(atomic_read(&tg->ref) <= 0);
+       if (!atomic_dec_and_test(&tg->ref))
+               return;
 
        /*
-        * TODO: Speed up blkiocg_lookup_group() by maintaining a radix
-        * tree of blkg (instead of traversing through hash list all
-        * the time.
+        * A group is freed in rcu manner. But having an rcu lock does not
+        * mean that one can access all the fields of blkg and assume these
+        * are valid. For example, don't try to follow throtl_data and
+        * request queue links.
+        *
+        * Having a reference to blkg under an rcu allows access to only
+        * values local to groups like group stats and group rate limits
         */
-       tg = tg_of_blkg(blkiocg_lookup_group(blkcg, key));
-
-       /* Fill in device details for root group */
-       if (tg && !tg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
-               sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
-               tg->blkg.dev = MKDEV(major, minor);
-               goto done;
-       }
-
-       if (tg)
-               goto done;
-
-       tg = kzalloc_node(sizeof(*tg), GFP_ATOMIC, td->queue->node);
-       if (!tg)
-               goto done;
+       call_rcu(&tg->rcu_head, throtl_free_tg);
+}
 
+static void throtl_init_group(struct throtl_grp *tg)
+{
        INIT_HLIST_NODE(&tg->tg_node);
        RB_CLEAR_NODE(&tg->rb_node);
        bio_list_init(&tg->bio_lists[0]);
        bio_list_init(&tg->bio_lists[1]);
+       tg->limits_changed = false;
+
+       /* Practically unlimited BW */
+       tg->bps[0] = tg->bps[1] = -1;
+       tg->iops[0] = tg->iops[1] = -1;
 
        /*
         * Take the initial reference that will be released on destroy
@@ -191,33 +200,171 @@ static struct throtl_grp * throtl_find_alloc_tg(struct throtl_data *td,
         * exit or cgroup deletion path depending on who is exiting first.
         */
        atomic_set(&tg->ref, 1);
+}
+
+/* Link @tg into @td's list. Call with rcu read lock held (needed for blkcg) */
+static void
+throtl_add_group_to_td_list(struct throtl_data *td, struct throtl_grp *tg)
+{
+       td->nr_undestroyed_grps++;
+       hlist_add_head(&tg->tg_node, &td->tg_list);
+}
+
+/*
+ * Fill in tg->blkg.dev from the queue's bdi device name ("major:minor") if
+ * it is still unset. The group may have been created while the queue was
+ * being instantiated, before the driver attached a device. No locking is
+ * done here. The dev is left unset when the name does not parse, since
+ * major/minor would otherwise be used uninitialized.
+ */
+static void
+__throtl_tg_fill_dev_details(struct throtl_data *td, struct throtl_grp *tg)
+{
+       struct backing_dev_info *bdi = &td->queue->backing_dev_info;
+       unsigned int major, minor;
+
+       if (!tg || tg->blkg.dev || !bdi->dev || !dev_name(bdi->dev))
+               return;
+
+       if (sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor) == 2)
+               tg->blkg.dev = MKDEV(major, minor);
+}
+
+/*
+ * Locked wrapper around __throtl_tg_fill_dev_details(). Must be called
+ * without the queue lock held, since it takes the lock itself. The
+ * unlocked pre-check means the lock is taken rarely: at most until the
+ * group has a device number. The helper re-checks once the lock is held.
+ */
+static void
+throtl_tg_fill_dev_details(struct throtl_data *td, struct throtl_grp *tg)
+{
+       if (tg && !tg->blkg.dev) {
+               spin_lock_irq(td->queue->queue_lock);
+               __throtl_tg_fill_dev_details(td, tg);
+               spin_unlock_irq(td->queue->queue_lock);
+       }
+}
+
+static void throtl_init_add_tg_lists(struct throtl_data *td,
+                       struct throtl_grp *tg, struct blkio_cgroup *blkcg)
+{
+       __throtl_tg_fill_dev_details(td, tg);
 
        /* Add group onto cgroup list */
-       sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
        blkiocg_add_blkio_group(blkcg, &tg->blkg, (void *)td,
-                               MKDEV(major, minor), BLKIO_POLICY_THROTL);
+                               tg->blkg.dev, BLKIO_POLICY_THROTL);
 
        tg->bps[READ] = blkcg_get_read_bps(blkcg, tg->blkg.dev);
        tg->bps[WRITE] = blkcg_get_write_bps(blkcg, tg->blkg.dev);
        tg->iops[READ] = blkcg_get_read_iops(blkcg, tg->blkg.dev);
        tg->iops[WRITE] = blkcg_get_write_iops(blkcg, tg->blkg.dev);
 
-       hlist_add_head(&tg->tg_node, &td->tg_list);
-       td->nr_undestroyed_grps++;
-done:
+       throtl_add_group_to_td_list(td, tg);
+}
+
+/*
+ * Allocate and initialise a group together with its blkg stats; NULL on
+ * failure. Should be called without queue lock and outside of rcu period.
+ */
+static struct throtl_grp *throtl_alloc_tg(struct throtl_data *td)
+{
+       struct throtl_grp *tg;
+
+       tg = kzalloc_node(sizeof(*tg), GFP_ATOMIC, td->queue->node);
+       if (!tg)
+               return NULL;
+
+       if (blkio_alloc_blkg_stats(&tg->blkg)) {
+               kfree(tg);
+               return NULL;
+       }
+
+       throtl_init_group(tg);
        return tg;
 }
 
-static struct throtl_grp * throtl_get_tg(struct throtl_data *td)
+static struct
+throtl_grp *throtl_find_tg(struct throtl_data *td, struct blkio_cgroup *blkcg)
 {
-       struct cgroup *cgroup;
        struct throtl_grp *tg = NULL;
+       void *key = td;
+
+       /*
+        * This is the common case when there are no blkio cgroups.
+        * Avoid lookup in this case
+        */
+       if (blkcg == &blkio_root_cgroup)
+               tg = td->root_tg;
+       else
+               tg = tg_of_blkg(blkiocg_lookup_group(blkcg, key));
+
+       __throtl_tg_fill_dev_details(td, tg);
+       return tg;
+}
+
+/*
+ * Find the current task's group on @td, allocating it if needed. Called
+ * with queue_lock held; the lock is dropped and re-taken around allocation.
+ * Returns NULL for a dead queue and root_tg if allocation fails.
+ */
 static struct throtl_grp * throtl_get_tg(struct throtl_data *td)
 {
+       struct throtl_grp *tg = NULL, *__tg = NULL;
+       struct blkio_cgroup *blkcg;
+       struct request_queue *q = td->queue;
+
+       /* no throttling for dead queue */
+       if (unlikely(blk_queue_dead(q)))
+               return NULL;
 
        rcu_read_lock();
-       cgroup = task_cgroup(current, blkio_subsys_id);
-       tg = throtl_find_alloc_tg(td, cgroup);
-       if (!tg)
-               tg = &td->root_tg;
+       blkcg = task_blkio_cgroup(current);
+       tg = throtl_find_tg(td, blkcg);
+       if (tg) {
+               rcu_read_unlock();
+               return tg;
+       }
+
+       /*
+        * Need to allocate a group. Allocation of group also needs allocation
+        * of per cpu stats which in-turn takes a mutex() and can block. Hence
+        * we need to drop rcu lock and queue_lock before we call alloc.
+        */
+       rcu_read_unlock();
+       spin_unlock_irq(q->queue_lock);
+
+       tg = throtl_alloc_tg(td);
+
+       /* Re-take the lock. tg may be NULL and @q may have died meanwhile */
+       spin_lock_irq(q->queue_lock);
+       if (unlikely(blk_queue_dead(q))) {
+               /* Free the per cpu stats too; kfree() alone leaks them */
+               if (tg)
+                       throtl_free_tg(&tg->rcu_head);
+               return NULL;
+       }
+
+       /* After sleeping, read the blkcg again */
+       rcu_read_lock();
+       blkcg = task_blkio_cgroup(current);
+
+       /* Another thread may have added the group while we were unlocked */
+       __tg = throtl_find_tg(td, blkcg);
+       if (__tg) {
+               if (tg)
+                       throtl_free_tg(&tg->rcu_head);
+               rcu_read_unlock();
+               return __tg;
+       }
+
+       /* Group allocation failed. Account the IO to root group */
+       if (!tg) {
+               rcu_read_unlock();
+               return td->root_tg;
+       }
+
+       throtl_init_add_tg_lists(td, tg, blkcg);
        rcu_read_unlock();
        return tg;
 }
@@ -332,10 +479,9 @@ static void throtl_schedule_next_dispatch(struct throtl_data *td)
        update_min_dispatch_time(st);
 
        if (time_before_eq(st->min_disptime, jiffies))
-               throtl_schedule_delayed_work(td->queue, 0);
+               throtl_schedule_delayed_work(td, 0);
        else
-               throtl_schedule_delayed_work(td->queue,
-                               (st->min_disptime - jiffies));
+               throtl_schedule_delayed_work(td, (st->min_disptime - jiffies));
 }
 
 static inline void
@@ -350,6 +496,12 @@ throtl_start_new_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
                        tg->slice_end[rw], jiffies);
 }
 
+static inline void throtl_set_slice_end(struct throtl_data *td,
+               struct throtl_grp *tg, bool rw, unsigned long jiffy_end)
+{
+       tg->slice_end[rw] = roundup(jiffy_end, throtl_slice);
+}
+
 static inline void throtl_extend_slice(struct throtl_data *td,
                struct throtl_grp *tg, bool rw, unsigned long jiffy_end)
 {
@@ -373,7 +525,8 @@ throtl_slice_used(struct throtl_data *td, struct throtl_grp *tg, bool rw)
 static inline void
 throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
 {
-       unsigned long nr_slices, bytes_trim, time_elapsed, io_trim;
+       unsigned long nr_slices, time_elapsed, io_trim;
+       u64 bytes_trim, tmp;
 
        BUG_ON(time_before(tg->slice_end[rw], tg->slice_start[rw]));
 
@@ -385,14 +538,26 @@ throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
        if (throtl_slice_used(td, tg, rw))
                return;
 
+       /*
+        * A bio has been dispatched. Also adjust slice_end. It might happen
+        * that initially cgroup limit was very low resulting in high
+        * slice_end, but later limit was bumped up and bio was dispatched
+        * sooner, then we need to reduce slice_end. A high bogus slice_end
+        * is bad because it does not allow new slice to start.
+        */
+
+       throtl_set_slice_end(td, tg, rw, jiffies + throtl_slice);
+
        time_elapsed = jiffies - tg->slice_start[rw];
 
        nr_slices = time_elapsed / throtl_slice;
 
        if (!nr_slices)
                return;
+       tmp = tg->bps[rw] * throtl_slice * nr_slices;
+       do_div(tmp, HZ);
+       bytes_trim = tmp;
 
-       bytes_trim = (tg->bps[rw] * throtl_slice * nr_slices)/HZ;
        io_trim = (tg->iops[rw] * throtl_slice * nr_slices)/HZ;
 
        if (!bytes_trim && !io_trim)
@@ -410,7 +575,7 @@ throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
 
        tg->slice_start[rw] += nr_slices * throtl_slice;
 
-       throtl_log_tg(td, tg, "[%c] trim slice nr=%lu bytes=%lu io=%lu"
+       throtl_log_tg(td, tg, "[%c] trim slice nr=%lu bytes=%llu io=%lu"
                        " start=%lu end=%lu jiffies=%lu",
                        rw == READ ? 'R' : 'W', nr_slices, bytes_trim, io_trim,
                        tg->slice_start[rw], tg->slice_end[rw], jiffies);
@@ -422,6 +587,7 @@ static bool tg_with_in_iops_limit(struct throtl_data *td, struct throtl_grp *tg,
        bool rw = bio_data_dir(bio);
        unsigned int io_allowed;
        unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
+       u64 tmp;
 
        jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];
 
@@ -431,8 +597,20 @@ static bool tg_with_in_iops_limit(struct throtl_data *td, struct throtl_grp *tg,
 
        jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice);
 
-       io_allowed = (tg->iops[rw] * jiffies_to_msecs(jiffy_elapsed_rnd))
-                               / MSEC_PER_SEC;
+       /*
+        * jiffy_elapsed_rnd should not be a big value as minimum iops can be
+        * 1 then at max jiffy elapsed should be equivalent of 1 second as we
+        * will allow dispatch after 1 second and after that slice should
+        * have been trimmed.
+        */
+
+       tmp = (u64)tg->iops[rw] * jiffy_elapsed_rnd;
+       do_div(tmp, HZ);
+
+       if (tmp > UINT_MAX)
+               io_allowed = UINT_MAX;
+       else
+               io_allowed = tmp;
 
        if (tg->io_disp[rw] + 1 <= io_allowed) {
                if (wait)
@@ -457,7 +635,7 @@ static bool tg_with_in_bps_limit(struct throtl_data *td, struct throtl_grp *tg,
                struct bio *bio, unsigned long *wait)
 {
        bool rw = bio_data_dir(bio);
-       u64 bytes_allowed, extra_bytes;
+       u64 bytes_allowed, extra_bytes, tmp;
        unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
 
        jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];
@@ -468,8 +646,9 @@ static bool tg_with_in_bps_limit(struct throtl_data *td, struct throtl_grp *tg,
 
        jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice);
 
-       bytes_allowed = (tg->bps[rw] * jiffies_to_msecs(jiffy_elapsed_rnd))
-                               / MSEC_PER_SEC;
+       tmp = tg->bps[rw] * jiffy_elapsed_rnd;
+       do_div(tmp, HZ);
+       bytes_allowed = tmp;
 
        if (tg->bytes_disp[rw] + bio->bi_size <= bytes_allowed) {
                if (wait)
@@ -494,6 +673,12 @@ static bool tg_with_in_bps_limit(struct throtl_data *td, struct throtl_grp *tg,
        return 0;
 }
 
+static bool tg_no_rule_group(struct throtl_grp *tg, bool rw)
+{
+       /* -1 is the "practically unlimited" value set by throtl_init_group() */
+       return tg->bps[rw] == -1 && tg->iops[rw] == -1;
+}
+
 /*
  * Returns whether one can dispatch a bio or not. Also returns approx number
  * of jiffies to wait before this bio is with-in IO rate and can be dispatched
@@ -552,16 +737,12 @@ static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg,
 static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
 {
        bool rw = bio_data_dir(bio);
-       bool sync = bio->bi_rw & REQ_SYNC;
+       bool sync = rw_is_sync(bio->bi_rw);
 
        /* Charge the bio to the group */
        tg->bytes_disp[rw] += bio->bi_size;
        tg->io_disp[rw]++;
 
-       /*
-        * TODO: This will take blkg->stats_lock. Figure out a way
-        * to avoid this cost.
-        */
        blkiocg_update_dispatch_stats(&tg->blkg, bio->bi_size, rw, sync);
 }
 
@@ -592,15 +773,6 @@ static void tg_update_disptime(struct throtl_data *td, struct throtl_grp *tg)
        min_wait = min(read_wait, write_wait);
        disptime = jiffies + min_wait;
 
-       /*
-        * If group is already on active tree, then update dispatch time
-        * only if it is lesser than existing dispatch time. Otherwise
-        * always update the dispatch time
-        */
-
-       if (throtl_tg_on_rr(tg) && time_before(disptime, tg->disptime))
-               return;
-
        /* Update dispatch time */
        throtl_dequeue_tg(td, tg);
        tg->disptime = disptime;
@@ -632,7 +804,7 @@ static int throtl_dispatch_tg(struct throtl_data *td, struct throtl_grp *tg,
 {
        unsigned int nr_reads = 0, nr_writes = 0;
        unsigned int max_nr_reads = throtl_grp_quantum*3/4;
-       unsigned int max_nr_writes = throtl_grp_quantum - nr_reads;
+       unsigned int max_nr_writes = throtl_grp_quantum - max_nr_reads;
        struct bio *bio;
 
        /* Try to dispatch 75% READS and 25% WRITES */
@@ -691,6 +863,43 @@ static int throtl_select_dispatch(struct throtl_data *td, struct bio_list *bl)
        return nr_disp;
 }
 
+/*
+ * Restart the slices of every group on @td whose limits were changed, and
+ * clear the per-queue and per-group limits_changed flags on the way.
+ */
+static void throtl_process_limit_change(struct throtl_data *td)
+{
+       struct hlist_node *node, *next;
+       struct throtl_grp *tg;
+
+       if (!td->limits_changed)
+               return;
+
+       xchg(&td->limits_changed, false);
+       throtl_log(td, "limits changed");
+
+       hlist_for_each_entry_safe(tg, node, next, &td->tg_list, tg_node) {
+               /* Cheap read first; xchg() clears the flag and re-tests it */
+               if (!tg->limits_changed || !xchg(&tg->limits_changed, false))
+                       continue;
+
+               throtl_log_tg(td, tg, "limit change rbps=%llu wbps=%llu"
+                       " riops=%u wiops=%u", tg->bps[READ], tg->bps[WRITE],
+                       tg->iops[READ], tg->iops[WRITE]);
+
+               /*
+                * Begin fresh READ and WRITE slices. A group's limits may
+                * have dropped suddenly, and IO dispatched recently must
+                * not be accounted against the new low rate.
+                */
+               throtl_start_new_slice(td, tg, READ);
+               throtl_start_new_slice(td, tg, WRITE);
+
+               if (throtl_tg_on_rr(tg))
+                       tg_update_disptime(td, tg);
+       }
+}
+
 /* Dispatch throttled bios. Should be called without queue lock held. */
 static int throtl_dispatch(struct request_queue *q)
 {
@@ -698,15 +907,18 @@ static int throtl_dispatch(struct request_queue *q)
        unsigned int nr_disp = 0;
        struct bio_list bio_list_on_stack;
        struct bio *bio;
+       struct blk_plug plug;
 
        spin_lock_irq(q->queue_lock);
 
+       throtl_process_limit_change(td);
+
        if (!total_nr_queued(td))
                goto out;
 
        bio_list_init(&bio_list_on_stack);
 
-       throtl_log(td, "dispatch nr_queued=%lu read=%u write=%u",
+       throtl_log(td, "dispatch nr_queued=%u read=%u write=%u",
                        total_nr_queued(td), td->nr_queued[READ],
                        td->nr_queued[WRITE]);
 
@@ -724,9 +936,10 @@ out:
         * immediate dispatch
         */
        if (nr_disp) {
+               blk_start_plug(&plug);
                while((bio = bio_list_pop(&bio_list_on_stack)))
                        generic_make_request(bio);
-               blk_unplug(q);
+               blk_finish_plug(&plug);
        }
        return nr_disp;
 }
@@ -741,24 +954,24 @@ void blk_throtl_work(struct work_struct *work)
 }
 
 /* Call with queue lock held */
-void throtl_schedule_delayed_work(struct request_queue *q, unsigned long delay)
+static void
+throtl_schedule_delayed_work(struct throtl_data *td, unsigned long delay)
 {
 
-       struct throtl_data *td = q->td;
        struct delayed_work *dwork = &td->throtl_work;
 
-       if (total_nr_queued(td) > 0) {
+       /* schedule work if limits changed even if no bio is queued */
+       if (total_nr_queued(td) || td->limits_changed) {
                /*
                 * We might have a work scheduled to be executed in future.
                 * Cancel that and schedule a new one.
                 */
                __cancel_delayed_work(dwork);
-               kblockd_schedule_delayed_work(q, dwork, delay);
+               queue_delayed_work(kthrotld_workqueue, dwork, delay);
                throtl_log(td, "schedule work. delay=%lu jiffies=%lu",
                                delay, jiffies);
        }
 }
-EXPORT_SYMBOL(throtl_schedule_delayed_work);
 
 static void
 throtl_destroy_tg(struct throtl_data *td, struct throtl_grp *tg)
@@ -792,11 +1005,6 @@ static void throtl_release_tgs(struct throtl_data *td)
        }
 }
 
-static void throtl_td_free(struct throtl_data *td)
-{
-       kfree(td);
-}
-
 /*
  * Blk cgroup controller notification saying that blkio_group object is being
  * delinked as associated cgroup object is going away. That also means that
@@ -821,31 +1029,65 @@ void throtl_unlink_blkio_group(void *key, struct blkio_group *blkg)
        spin_unlock_irqrestore(td->queue->queue_lock, flags);
 }
 
-static void throtl_update_blkio_group_read_bps (struct blkio_group *blkg,
-                       u64 read_bps)
+static void throtl_update_blkio_group_common(struct throtl_data *td,
+                               struct throtl_grp *tg)
 {
-       tg_of_blkg(blkg)->bps[READ] = read_bps;
+       xchg(&tg->limits_changed, true);
+       xchg(&td->limits_changed, true);
+       /* Schedule a work now to process the limit change */
+       throtl_schedule_delayed_work(td, 0);
 }
 
-static void throtl_update_blkio_group_write_bps (struct blkio_group *blkg,
-                       u64 write_bps)
+/*
+ * For all update functions, key should be a valid pointer because these
+ * update functions are called under blkcg_lock, that means, blkg is
+ * valid and in turn key is valid. queue exit path can not race because
+ * of blkcg_lock
+ *
+ * Can not take queue lock in update functions as queue lock under blkcg_lock
+ * is not allowed. Under other paths we take blkcg_lock under queue_lock.
+ */
+static void throtl_update_blkio_group_read_bps(void *key,
+                               struct blkio_group *blkg, u64 read_bps)
 {
-       tg_of_blkg(blkg)->bps[WRITE] = write_bps;
+       struct throtl_data *td = key;
+       struct throtl_grp *tg = tg_of_blkg(blkg);
+
+       tg->bps[READ] = read_bps;
+       throtl_update_blkio_group_common(td, tg);
 }
 
-static void throtl_update_blkio_group_read_iops (struct blkio_group *blkg,
-                       unsigned int read_iops)
+static void throtl_update_blkio_group_write_bps(void *key,
+                               struct blkio_group *blkg, u64 write_bps)
 {
-       tg_of_blkg(blkg)->iops[READ] = read_iops;
+       struct throtl_data *td = key;
+       struct throtl_grp *tg = tg_of_blkg(blkg);
+
+       tg->bps[WRITE] = write_bps;
+       throtl_update_blkio_group_common(td, tg);
 }
 
-static void throtl_update_blkio_group_write_iops (struct blkio_group *blkg,
-                       unsigned int write_iops)
+static void throtl_update_blkio_group_read_iops(void *key,
+                       struct blkio_group *blkg, unsigned int read_iops)
 {
-       tg_of_blkg(blkg)->iops[WRITE] = write_iops;
+       struct throtl_data *td = key;
+       struct throtl_grp *tg = tg_of_blkg(blkg);
+
+       tg->iops[READ] = read_iops;
+       throtl_update_blkio_group_common(td, tg);
 }
 
-void throtl_shutdown_timer_wq(struct request_queue *q)
+static void throtl_update_blkio_group_write_iops(void *key,
+                       struct blkio_group *blkg, unsigned int write_iops)
+{
+       struct throtl_data *td = key;
+       struct throtl_grp *tg = tg_of_blkg(blkg);
+
+       tg->iops[WRITE] = write_iops;
+       throtl_update_blkio_group_common(td, tg);
+}
+
+static void throtl_shutdown_wq(struct request_queue *q)
 {
        struct throtl_data *td = q->td;
 
@@ -867,20 +1109,48 @@ static struct blkio_policy_type blkio_policy_throtl = {
        .plid = BLKIO_POLICY_THROTL,
 };
 
-int blk_throtl_bio(struct request_queue *q, struct bio **biop)
+bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
 {
        struct throtl_data *td = q->td;
        struct throtl_grp *tg;
-       struct bio *bio = *biop;
        bool rw = bio_data_dir(bio), update_disptime = true;
+       struct blkio_cgroup *blkcg;
+       bool throttled = false;
 
        if (bio->bi_rw & REQ_THROTTLED) {
                bio->bi_rw &= ~REQ_THROTTLED;
-               return 0;
+               goto out;
+       }
+
+       /*
+        * A throtl_grp pointer retrieved under rcu can be used to access
+        * basic fields like stats and io rates. If a group has no rules,
+        * just update the dispatch stats in lockless manner and return.
+        */
+
+       rcu_read_lock();
+       blkcg = task_blkio_cgroup(current);
+       tg = throtl_find_tg(td, blkcg);
+       if (tg) {
+               throtl_tg_fill_dev_details(td, tg);
+
+               if (tg_no_rule_group(tg, rw)) {
+                       blkiocg_update_dispatch_stats(&tg->blkg, bio->bi_size,
+                                       rw, rw_is_sync(bio->bi_rw));
+                       rcu_read_unlock();
+                       goto out;
+               }
        }
+       rcu_read_unlock();
 
+       /*
+        * Either group has not been allocated yet or it is not an unlimited
+        * IO group
+        */
        spin_lock_irq(q->queue_lock);
        tg = throtl_get_tg(td);
+       if (unlikely(!tg))
+               goto out_unlock;
 
        if (tg->nr_queued[rw]) {
                /*
@@ -889,16 +1159,30 @@ int blk_throtl_bio(struct request_queue *q, struct bio **biop)
                 */
                update_disptime = false;
                goto queue_bio;
+
        }
 
        /* Bio is with-in rate limit of group */
        if (tg_may_dispatch(td, tg, bio, NULL)) {
                throtl_charge_bio(tg, bio);
-               goto out;
+
+               /*
+                * We need to trim slice even when bios are not being queued
+                * otherwise it might happen that a bio is not queued for
+                * a long time and slice keeps on extending and trim is not
+                * called for a long time. Now if limits are reduced suddenly
+                * we take into account all the IO dispatched so far at new
+                * low rate and newly queued IO gets a really long dispatch
+                * time.
+                *
+                * So keep on trimming slice even if bio is not queued.
+                */
+               throtl_trim_slice(td, tg, rw);
+               goto out_unlock;
        }
 
 queue_bio:
-       throtl_log_tg(td, tg, "[%c] bio. bdisp=%u sz=%u bps=%llu"
+       throtl_log_tg(td, tg, "[%c] bio. bdisp=%llu sz=%u bps=%llu"
                        " iodisp=%u iops=%u queued=%d/%d",
                        rw == READ ? 'R' : 'W',
                        tg->bytes_disp[rw], bio->bi_size, tg->bps[rw],
@@ -906,16 +1190,52 @@ queue_bio:
                        tg->nr_queued[READ], tg->nr_queued[WRITE]);
 
        throtl_add_bio_tg(q->td, tg, bio);
-       *biop = NULL;
+       throttled = true;
 
        if (update_disptime) {
                tg_update_disptime(td, tg);
                throtl_schedule_next_dispatch(td);
        }
 
+out_unlock:
+       spin_unlock_irq(q->queue_lock);
 out:
+       return throttled;
+}
+
+/**
+ * blk_throtl_drain - drain throttled bios
+ * @q: request_queue to drain throttled bios for
+ *
+ * Dispatch all currently throttled bios on @q through ->make_request_fn().
+ */
+void blk_throtl_drain(struct request_queue *q)
+       __releases(q->queue_lock) __acquires(q->queue_lock)
+{
+       struct throtl_data *td = q->td;
+       struct throtl_rb_root *st = &td->tg_service_tree;
+       struct throtl_grp *tg;
+       struct bio_list bl;
+       struct bio *bio;
+
+       queue_lockdep_assert_held(q);
+
+       bio_list_init(&bl);
+       /* NOTE(review): loops assume tg_dispatch_one_bio() dequeues the bio */
+       while ((tg = throtl_rb_first(st))) {
+               throtl_dequeue_tg(td, tg);
+
+               while ((bio = bio_list_peek(&tg->bio_lists[READ])))
+                       tg_dispatch_one_bio(td, tg, bio_data_dir(bio), &bl);
+               while ((bio = bio_list_peek(&tg->bio_lists[WRITE])))
+                       tg_dispatch_one_bio(td, tg, bio_data_dir(bio), &bl);
+       }
        spin_unlock_irq(q->queue_lock);
-       return 0;
+       /* Resubmit with the queue lock dropped; it is re-taken before return */
+       while ((bio = bio_list_pop(&bl)))
+               generic_make_request(bio);
+
+       spin_lock_irq(q->queue_lock);
 }
 
 int blk_throtl_init(struct request_queue *q)
@@ -929,38 +1249,25 @@ int blk_throtl_init(struct request_queue *q)
 
        INIT_HLIST_HEAD(&td->tg_list);
        td->tg_service_tree = THROTL_RB_ROOT;
+       td->limits_changed = false;
+       INIT_DELAYED_WORK(&td->throtl_work, blk_throtl_work);
 
-       /* Init root group */
-       tg = &td->root_tg;
-       INIT_HLIST_NODE(&tg->tg_node);
-       RB_CLEAR_NODE(&tg->rb_node);
-       bio_list_init(&tg->bio_lists[0]);
-       bio_list_init(&tg->bio_lists[1]);
-
-       /* Practically unlimited BW */
-       tg->bps[0] = tg->bps[1] = -1;
-       tg->iops[0] = tg->iops[1] = -1;
+       /* alloc and Init root group. */
+       td->queue = q;
+       tg = throtl_alloc_tg(td);
 
-       /*
-        * Set root group reference to 2. One reference will be dropped when
-        * all groups on tg_list are being deleted during queue exit. Other
-        * reference will remain there as we don't want to delete this group
-        * as it is statically allocated and gets destroyed when throtl_data
-        * goes away.
-        */
-       atomic_set(&tg->ref, 2);
-       hlist_add_head(&tg->tg_node, &td->tg_list);
-       td->nr_undestroyed_grps++;
+       if (!tg) {
+               kfree(td);
+               return -ENOMEM;
+       }
 
-       INIT_DELAYED_WORK(&td->throtl_work, blk_throtl_work);
+       td->root_tg = tg;
 
        rcu_read_lock();
-       blkiocg_add_blkio_group(&blkio_root_cgroup, &tg->blkg, (void *)td,
-                                       0, BLKIO_POLICY_THROTL);
+       throtl_init_add_tg_lists(td, tg, &blkio_root_cgroup);
        rcu_read_unlock();
 
        /* Attach throtl data to request queue */
-       td->queue = q;
        q->td = td;
        return 0;
 }
@@ -972,7 +1279,7 @@ void blk_throtl_exit(struct request_queue *q)
 
        BUG_ON(!td);
 
-       throtl_shutdown_timer_wq(q);
+       throtl_shutdown_wq(q);
 
        spin_lock_irq(q->queue_lock);
        throtl_release_tgs(td);
@@ -996,11 +1303,26 @@ void blk_throtl_exit(struct request_queue *q)
         */
        if (wait)
                synchronize_rcu();
-       throtl_td_free(td);
+
+       /*
+        * Just being safe to make sure after previous flush if some body did
+        * update limits through cgroup and another work got queued, cancel
+        * it.
+        */
+       throtl_shutdown_wq(q);
+}
+
+void blk_throtl_release(struct request_queue *q)
+{
+       kfree(q->td);
 }
 
 static int __init throtl_init(void)
 {
+       kthrotld_workqueue = alloc_workqueue("kthrotld", WQ_MEM_RECLAIM, 0);
+       if (!kthrotld_workqueue)
+               panic("Failed to create kthrotld\n");
+
        blkio_policy_register(&blkio_policy_throtl);
        return 0;
 }