UBUNTU: Ubuntu-2.6.38-12.51

[linux-flexiantxendom0-natty.git] / block / cfq-iosched.c
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c

index 838834b..ea83a4f 100644 (file)
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -14,7 +14,7 @@
  #include <linux/rbtree.h>
  #include <linux/ioprio.h>
  #include <linux/blktrace_api.h>
-#include "blk-cgroup.h"
+#include "cfq.h"
  
  /*
   * tunables
@@ -30,6 +30,7 @@ static const int cfq_slice_sync = HZ / 10;
  static int cfq_slice_async = HZ / 25;
  static const int cfq_slice_async_rq = 2;
  static int cfq_slice_idle = HZ / 125;
+static int cfq_group_idle = HZ / 125;
  static const int cfq_target_latency = HZ * 3/10; /* 300 ms */
  static const int cfq_hist_divisor = 4;
  
@@ -55,6 +56,7 @@ static const int cfq_hist_divisor = 4;
  #define RQ_CIC(rq)             \
         ((struct cfq_io_context *) (rq)->elevator_private)
  #define RQ_CFQQ(rq)            (struct cfq_queue *) ((rq)->elevator_private2)
+#define RQ_CFQG(rq)            ((struct cfq_group *) ((rq)->elevator_private3))
  
  static struct kmem_cache *cfq_pool;
  static struct kmem_cache *cfq_ioc_pool;
@@ -63,6 +65,9 @@ static DEFINE_PER_CPU(unsigned long, cfq_ioc_count);
  static struct completion *ioc_gone;
  static DEFINE_SPINLOCK(ioc_gone_lock);
  
+static DEFINE_SPINLOCK(cic_index_lock);
+static DEFINE_IDA(cic_index_ida);
+
  #define CFQ_PRIO_LISTS         IOPRIO_BE_NR
  #define cfq_class_idle(cfqq)   ((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
  #define cfq_class_rt(cfqq)     ((cfqq)->ioprio_class == IOPRIO_CLASS_RT)
@@ -82,7 +87,6 @@ struct cfq_rb_root {
         unsigned count;
         unsigned total_weight;
         u64 min_vdisktime;
-       struct rb_node *active;
  };
  #define CFQ_RB_ROOT    (struct cfq_rb_root) { .rb = RB_ROOT, .left = NULL, \
                         .count = 0, .min_vdisktime = 0, }
@@ -92,7 +96,7 @@ struct cfq_rb_root {
   */
  struct cfq_queue {
         /* reference count */
-       atomic_t ref;
+       int ref;
         /* various state flags, see below */
         unsigned int flags;
         /* parent cfq_data */
@@ -143,7 +147,7 @@ struct cfq_queue {
         struct cfq_queue *new_cfqq;
         struct cfq_group *cfqg;
         struct cfq_group *orig_cfqg;
-       /* Sectors dispatched in current dispatch round */
+       /* Number of sectors dispatched from queue in single dispatch round */
         unsigned long nr_sectors;
  };
  
@@ -155,6 +159,7 @@ enum wl_prio_t {
         BE_WORKLOAD = 0,
         RT_WORKLOAD = 1,
         IDLE_WORKLOAD = 2,
+       CFQ_PRIO_NR,
  };
  
  /*
@@ -174,15 +179,23 @@ struct cfq_group {
         /* group service_tree key */
         u64 vdisktime;
         unsigned int weight;
-       bool on_st;
  
         /* number of cfqq currently on this group */
         int nr_cfqq;
  
-       /* Per group busy queus average. Useful for workload slice calc. */
-       unsigned int busy_queues_avg[2];
         /*
-        * rr lists of queues with requests, onle rr for each priority class.
+        * Per group busy queues average. Useful for workload slice calc. We
+        * create the array for each prio class but at run time it is used
+        * only for RT and BE class and slot for IDLE class remains unused.
+        * This is primarily done to avoid confusion and a gcc warning.
+        */
+       unsigned int busy_queues_avg[CFQ_PRIO_NR];
+       /*
+        * rr lists of queues with requests. We maintain service trees for
+        * RT and BE classes. These trees are subdivided in subclasses
+        * of SYNC, SYNC_NOIDLE and ASYNC based on workload type. For IDLE
+        * class there is no subclassification and all the cfq queues go on
+        * a single tree service_tree_idle.
          * Counts are embedded in the cfq_rb_root
          */
         struct cfq_rb_root service_trees[2][3];
@@ -194,8 +207,10 @@ struct cfq_group {
         struct blkio_group blkg;
  #ifdef CONFIG_CFQ_GROUP_IOSCHED
         struct hlist_node cfqd_node;
-       atomic_t ref;
+       int ref;
  #endif
+       /* number of requests that are on the dispatch list or inside driver */
+       int dispatched;
  };
  
  /*
@@ -214,7 +229,6 @@ struct cfq_data {
         enum wl_type_t serving_type;
         unsigned long workload_expires;
         struct cfq_group *serving_group;
-       bool noidle_tree_requires_idle;
  
         /*
          * Each priority tree is sorted by next_request position.  These
@@ -269,9 +283,11 @@ struct cfq_data {
         unsigned int cfq_slice[2];
         unsigned int cfq_slice_async_rq;
         unsigned int cfq_slice_idle;
+       unsigned int cfq_group_idle;
         unsigned int cfq_latency;
         unsigned int cfq_group_isolation;
  
+       unsigned int cic_index;
         struct list_head cic_list;
  
         /*
@@ -346,7 +362,7 @@ CFQ_CFQQ_FNS(deep);
  CFQ_CFQQ_FNS(wait_busy);
  #undef CFQ_CFQQ_FNS
  
-#ifdef CONFIG_DEBUG_CFQ_IOSCHED
+#ifdef CONFIG_CFQ_GROUP_IOSCHED
  #define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \
         blk_add_trace_msg((cfqd)->queue, "cfq%d%c %s " fmt, (cfqq)->pid, \
                         cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \
@@ -375,6 +391,21 @@ CFQ_CFQQ_FNS(wait_busy);
                         &cfqg->service_trees[i][j]: NULL) \
  
  
+static inline bool iops_mode(struct cfq_data *cfqd)
+{
+       /*
+        * Time-based accounting only makes sense while we idle on queues
+        * or the drive has no NCQ. Otherwise requests run in parallel and
+        * per-queue time cannot be measured in most cases, short of
+        * driving shallower queue depths, which becomes a performance
+        * bottleneck. Then fairness is provided in number of IOs instead.
+        */
+       if (cfqd->cfq_slice_idle || !cfqd->hw_tag)
+               return false;
+
+       return true;
+}
+
  static inline enum wl_prio_t cfqq_prio(struct cfq_queue *cfqq)
  {
         if (cfq_class_idle(cfqq))
@@ -431,13 +462,31 @@ static inline void cic_set_cfqq(struct cfq_io_context *cic,
         cic->cfqq[is_sync] = cfqq;
  }
  
+/* Bit 0 of cic->key marks a dead cic; the cfqd's cic_index sits above it. */
+#define CIC_DEAD_KEY   1ul
+#define CIC_DEAD_INDEX_SHIFT   1
+
+/* Build the dead-key value for @cfqd: its cic_index shifted up, bit 0 set. */
+static inline void *cfqd_dead_key(struct cfq_data *cfqd)
+{
+       return (void *)(CIC_DEAD_KEY | cfqd->cic_index << CIC_DEAD_INDEX_SHIFT);
+}
+
+/* Return @cic's cfq_data, or NULL once the key carries the dead mark. */
+static inline struct cfq_data *cic_to_cfqd(struct cfq_io_context *cic)
+{
+       void *key = cic->key;
+
+       return unlikely((unsigned long) key & CIC_DEAD_KEY) ? NULL : key;
+}
+
  /*
   * We regard a request as SYNC, if it's either a read or has the SYNC bit
   * set (in which case it could also be direct WRITE).
   */
  static inline bool cfq_bio_sync(struct bio *bio)
  {
-       return bio_data_dir(bio) == READ || bio_rw_flagged(bio, BIO_RW_SYNCIO);
+       return bio_data_dir(bio) == READ || (bio->bi_rw & REQ_SYNC);
  }
  
  /*
@@ -512,11 +561,6 @@ static void update_min_vdisktime(struct cfq_rb_root *st)
         u64 vdisktime = st->min_vdisktime;
         struct cfq_group *cfqg;
  
-       if (st->active) {
-               cfqg = rb_entry_cfqg(st->active);
-               vdisktime = cfqg->vdisktime;
-       }
-
         if (st->left) {
                 cfqg = rb_entry_cfqg(st->left);
                 vdisktime = min_vdisktime(vdisktime, cfqg->vdisktime);
@@ -554,8 +598,8 @@ cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg)
         return cfq_target_latency * cfqg->weight / st->total_weight;
  }
  
-static inline void
-cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
+static inline unsigned
+cfq_scaled_cfqq_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
  {
         unsigned slice = cfq_prio_to_slice(cfqd, cfqq);
         if (cfqd->cfq_latency) {
@@ -581,6 +625,14 @@ cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
                                     low_slice);
                 }
         }
+       return slice;
+}
+
+static inline void
+cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
+{
+       unsigned slice = cfq_scaled_cfqq_slice(cfqd, cfqq);
+
         cfqq->slice_start = jiffies;
         cfqq->slice_end = jiffies + slice;
         cfqq->allocated_slice = slice;
@@ -595,11 +647,11 @@ cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
  static inline bool cfq_slice_used(struct cfq_queue *cfqq)
  {
         if (cfq_cfqq_slice_new(cfqq))
-               return 0;
+               return false;
         if (time_before(jiffies, cfqq->slice_end))
-               return 0;
+               return false;
  
-       return 1;
+       return true;
  }
  
  /*
@@ -625,9 +677,10 @@ cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2,
                 return rq1;
         else if (rq_is_sync(rq2) && !rq_is_sync(rq1))
                 return rq2;
-       if (rq_is_meta(rq1) && !rq_is_meta(rq2))
+       if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META))
                 return rq1;
-       else if (rq_is_meta(rq2) && !rq_is_meta(rq1))
+       else if ((rq2->cmd_flags & REQ_META) &&
+                !(rq1->cmd_flags & REQ_META))
                 return rq2;
  
         s1 = blk_rq_pos(rq1);
@@ -817,7 +870,7 @@ cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
         struct rb_node *n;
  
         cfqg->nr_cfqq++;
-       if (cfqg->on_st)
+       if (!RB_EMPTY_NODE(&cfqg->rb_node))
                 return;
  
         /*
@@ -833,7 +886,6 @@ cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
                 cfqg->vdisktime = st->min_vdisktime;
  
         __cfq_group_service_tree_add(st, cfqg);
-       cfqg->on_st = true;
         st->total_weight += cfqg->weight;
  }
  
@@ -842,9 +894,6 @@ cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
  {
         struct cfq_rb_root *st = &cfqd->grp_service_tree;
  
-       if (st->active == &cfqg->rb_node)
-               st->active = NULL;
-
         BUG_ON(cfqg->nr_cfqq < 1);
         cfqg->nr_cfqq--;
  
@@ -853,12 +902,11 @@ cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
                 return;
  
         cfq_log_cfqg(cfqd, cfqg, "del_from_rr group");
-       cfqg->on_st = false;
         st->total_weight -= cfqg->weight;
         if (!RB_EMPTY_NODE(&cfqg->rb_node))
                 cfq_rb_erase(&cfqg->rb_node, st);
         cfqg->saved_workload_slice = 0;
-       blkiocg_update_blkio_group_dequeue_stats(&cfqg->blkg, 1);
+       cfq_blkiocg_update_dequeue_stats(&cfqg->blkg, 1);
  }
  
  static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq)
@@ -884,8 +932,6 @@ static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq)
                         slice_used = cfqq->allocated_slice;
         }
  
-       cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u sect=%lu", slice_used,
-                               cfqq->nr_sectors);
         return slice_used;
  }
  
@@ -893,19 +939,21 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
                                 struct cfq_queue *cfqq)
  {
         struct cfq_rb_root *st = &cfqd->grp_service_tree;
-       unsigned int used_sl, charge_sl;
+       unsigned int used_sl, charge;
         int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg)
                         - cfqg->service_tree_idle.count;
  
         BUG_ON(nr_sync < 0);
-       used_sl = charge_sl = cfq_cfqq_slice_usage(cfqq);
+       used_sl = charge = cfq_cfqq_slice_usage(cfqq);
  
-       if (!cfq_cfqq_sync(cfqq) && !nr_sync)
-               charge_sl = cfqq->allocated_slice;
+       if (iops_mode(cfqd))
+               charge = cfqq->slice_dispatch;
+       else if (!cfq_cfqq_sync(cfqq) && !nr_sync)
+               charge = cfqq->allocated_slice;
  
         /* Can't update vdisktime while group is on service tree */
         cfq_rb_erase(&cfqg->rb_node, st);
-       cfqg->vdisktime += cfq_scale_slice(charge_sl, cfqg);
+       cfqg->vdisktime += cfq_scale_slice(charge, cfqg);
         __cfq_group_service_tree_add(st, cfqg);
  
         /* This group is being expired. Save the context */
@@ -919,8 +967,11 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
  
         cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime,
                                         st->min_vdisktime);
-       blkiocg_update_blkio_group_stats(&cfqg->blkg, used_sl,
-                                               cfqq->nr_sectors);
+       cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u disp=%u charge=%u iops=%u"
+                       " sect=%lu", used_sl, cfqq->slice_dispatch, charge,
+                       iops_mode(cfqd), cfqq->nr_sectors);
+       cfq_blkiocg_update_timeslice_used(&cfqg->blkg, used_sl);
+       cfq_blkiocg_set_start_empty_time(&cfqg->blkg);
  }
  
  #ifdef CONFIG_CFQ_GROUP_IOSCHED
@@ -931,8 +982,8 @@ static inline struct cfq_group *cfqg_of_blkg(struct blkio_group *blkg)
         return NULL;
  }
  
-void
-cfq_update_blkio_group_weight(struct blkio_group *blkg, unsigned int weight)
+void cfq_update_blkio_group_weight(void *key, struct blkio_group *blkg,
+                                       unsigned int weight)
  {
         cfqg_of_blkg(blkg)->weight = weight;
  }
@@ -961,7 +1012,6 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
         if (!cfqg)
                 goto done;
  
-       cfqg->weight = blkcg->weight;
         for_each_cfqg_st(cfqg, i, j, st)
                 *st = CFQ_RB_ROOT;
         RB_CLEAR_NODE(&cfqg->rb_node);
@@ -972,12 +1022,23 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
          * elevator which will be dropped by either elevator exit
          * or cgroup deletion path depending on who is exiting first.
          */
-       atomic_set(&cfqg->ref, 1);
+       cfqg->ref = 1;
  
-       /* Add group onto cgroup list */
-       sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
-       blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
+       /*
+        * Add group onto cgroup list. It might happen that bdi->dev is
+        * not initialized yet. Initialize this new group without major
+        * and minor info and this info will be filled in once a new thread
+        * comes for IO. See code above.
+        */
+       if (bdi->dev) {
+               sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
+               cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
                                         MKDEV(major, minor));
+       } else
+               cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
+                                       0);
+
+       cfqg->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);
  
         /* Add group on cfqd list */
         hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
@@ -1004,6 +1065,12 @@ static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create)
         return cfqg;
  }
  
+static inline struct cfq_group *cfq_ref_get_cfqg(struct cfq_group *cfqg)
+{
+       cfqg->ref++;            /* take one more reference on the group */
+       return cfqg;            /* hand the same group back to the caller */
+}
+
  static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg)
  {
         /* Currently, all async queues are mapped to root group */
@@ -1012,7 +1079,7 @@ static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg)
  
         cfqq->cfqg = cfqg;
         /* cfqq reference on cfqg */
-       atomic_inc(&cfqq->cfqg->ref);
+       cfqq->cfqg->ref++;
  }
  
  static void cfq_put_cfqg(struct cfq_group *cfqg)
@@ -1020,11 +1087,12 @@ static void cfq_put_cfqg(struct cfq_group *cfqg)
         struct cfq_rb_root *st;
         int i, j;
  
-       BUG_ON(atomic_read(&cfqg->ref) <= 0);
-       if (!atomic_dec_and_test(&cfqg->ref))
+       BUG_ON(cfqg->ref <= 0);
+       cfqg->ref--;
+       if (cfqg->ref)
                 return;
         for_each_cfqg_st(cfqg, i, j, st)
-               BUG_ON(!RB_EMPTY_ROOT(&st->rb) || st->active != NULL);
+               BUG_ON(!RB_EMPTY_ROOT(&st->rb));
         kfree(cfqg);
  }
  
@@ -1053,7 +1121,7 @@ static void cfq_release_cfq_groups(struct cfq_data *cfqd)
                  * it from cgroup list, then it will take care of destroying
                  * cfqg also.
                  */
-               if (!blkiocg_del_blkio_group(&cfqg->blkg))
+               if (!cfq_blkiocg_del_blkio_group(&cfqg->blkg))
                         cfq_destroy_cfqg(cfqd, cfqg);
         }
  }
@@ -1087,6 +1155,12 @@ static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create)
  {
         return &cfqd->root_group;
  }
+
+static inline struct cfq_group *cfq_ref_get_cfqg(struct cfq_group *cfqg)
+{
+       return cfqg;    /* stub: @cfqg is returned as is, no refcount change */
+}
+
  static inline void
  cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) {
         cfqq->cfqg = cfqg;
@@ -1123,7 +1197,7 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
                         cfq_group_service_tree_del(cfqd, cfqq->cfqg);
                 cfqq->orig_cfqg = cfqq->cfqg;
                 cfqq->cfqg = &cfqd->root_group;
-               atomic_inc(&cfqd->root_group.ref);
+               cfqd->root_group.ref++;
                 group_changed = 1;
         } else if (!cfqd->cfq_group_isolation
                    && cfqq_type(cfqq) == SYNC_WORKLOAD && cfqq->orig_cfqg) {
@@ -1389,7 +1463,12 @@ static void cfq_reposition_rq_rb(struct cfq_queue *cfqq, struct request *rq)
  {
         elv_rb_del(&cfqq->sort_list, rq);
         cfqq->queued[rq_is_sync(rq)]--;
+       cfq_blkiocg_update_io_remove_stats(&(RQ_CFQG(rq))->blkg,
+                                       rq_data_dir(rq), rq_is_sync(rq));
         cfq_add_rq_rb(rq);
+       cfq_blkiocg_update_io_add_stats(&(RQ_CFQG(rq))->blkg,
+                       &cfqq->cfqd->serving_group->blkg, rq_data_dir(rq),
+                       rq_is_sync(rq));
  }
  
  static struct request *
@@ -1445,7 +1524,9 @@ static void cfq_remove_request(struct request *rq)
         cfq_del_rq_rb(rq);
  
         cfqq->cfqd->rq_queued--;
-       if (rq_is_meta(rq)) {
+       cfq_blkiocg_update_io_remove_stats(&(RQ_CFQG(rq))->blkg,
+                                       rq_data_dir(rq), rq_is_sync(rq));
+       if (rq->cmd_flags & REQ_META) {
                 WARN_ON(!cfqq->meta_pending);
                 cfqq->meta_pending--;
         }
@@ -1476,6 +1557,13 @@ static void cfq_merged_request(struct request_queue *q, struct request *req,
         }
  }
  
+static void
+cfq_bio_merged(struct request_queue *q, struct request *req, struct bio *bio)
+{
+       cfq_blkiocg_update_io_merged_stats(&(RQ_CFQG(req))->blkg,
+                       bio_data_dir(bio), cfq_bio_sync(bio));
+}
+
  static void
  cfq_merged_requests(struct request_queue *q, struct request *rq,
                     struct request *next)
@@ -1493,6 +1581,8 @@ cfq_merged_requests(struct request_queue *q, struct request *rq,
         if (cfqq->next_rq == next)
                 cfqq->next_rq = rq;
         cfq_remove_request(next);
+       cfq_blkiocg_update_io_merged_stats(&(RQ_CFQG(rq))->blkg,
+                                       rq_data_dir(next), rq_is_sync(next));
  }
  
  static int cfq_allow_merge(struct request_queue *q, struct request *rq,
@@ -1520,12 +1610,19 @@ static int cfq_allow_merge(struct request_queue *q, struct request *rq,
         return cfqq == RQ_CFQQ(rq);
  }
  
+static inline void cfq_del_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq)
+{
+       del_timer(&cfqd->idle_slice_timer);     /* deactivate the idle timer */
+       cfq_blkiocg_update_idle_time_stats(&cfqq->cfqg->blkg);
+}
+
  static void __cfq_set_active_queue(struct cfq_data *cfqd,
                                    struct cfq_queue *cfqq)
  {
         if (cfqq) {
                 cfq_log_cfqq(cfqd, cfqq, "set_active wl_prio:%d wl_type:%d",
                                 cfqd->serving_prio, cfqd->serving_type);
+               cfq_blkiocg_update_avg_queue_size_stats(&cfqq->cfqg->blkg);
                 cfqq->slice_start = 0;
                 cfqq->dispatch_start = jiffies;
                 cfqq->allocated_slice = 0;
@@ -1539,7 +1636,7 @@ static void __cfq_set_active_queue(struct cfq_data *cfqd,
                 cfq_clear_cfqq_fifo_expire(cfqq);
                 cfq_mark_cfqq_slice_new(cfqq);
  
-               del_timer(&cfqd->idle_slice_timer);
+               cfq_del_timer(cfqd, cfqq);
         }
  
         cfqd->active_queue = cfqq;
@@ -1555,7 +1652,7 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
         cfq_log_cfqq(cfqd, cfqq, "slice expired t=%d", timed_out);
  
         if (cfq_cfqq_wait_request(cfqq))
-               del_timer(&cfqd->idle_slice_timer);
+               cfq_del_timer(cfqd, cfqq);
  
         cfq_clear_cfqq_wait_request(cfqq);
         cfq_clear_cfqq_wait_busy(cfqq);
@@ -1572,8 +1669,11 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
         /*
          * store what was left of this slice, if the queue idled/timed out
          */
-       if (timed_out && !cfq_cfqq_slice_new(cfqq)) {
-               cfqq->slice_resid = cfqq->slice_end - jiffies;
+       if (timed_out) {
+               if (cfq_cfqq_slice_new(cfqq))
+                       cfqq->slice_resid = cfq_scaled_cfqq_slice(cfqd, cfqq);
+               else
+                       cfqq->slice_resid = cfqq->slice_end - jiffies;
                 cfq_log_cfqq(cfqd, cfqq, "resid=%ld", cfqq->slice_resid);
         }
  
@@ -1587,9 +1687,6 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
         if (cfqq == cfqd->active_queue)
                 cfqd->active_queue = NULL;
  
-       if (&cfqq->cfqg->rb_node == cfqd->grp_service_tree.active)
-               cfqd->grp_service_tree.active = NULL;
-
         if (cfqd->active_cic) {
                 put_io_context(cfqd->active_cic->ioc);
                 cfqd->active_cic = NULL;
@@ -1784,6 +1881,9 @@ static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq)
         BUG_ON(!service_tree);
         BUG_ON(!service_tree->count);
  
+       if (!cfqd->cfq_slice_idle)
+               return false;
+
         /* We never do for idle class queues. */
         if (prio == IDLE_WORKLOAD)
                 return false;
@@ -1798,17 +1898,17 @@ static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq)
          * in their service tree.
          */
         if (service_tree->count == 1 && cfq_cfqq_sync(cfqq))
-               return 1;
+               return true;
         cfq_log_cfqq(cfqd, cfqq, "Not idling. st->count:%d",
                         service_tree->count);
-       return 0;
+       return false;
  }
  
  static void cfq_arm_slice_timer(struct cfq_data *cfqd)
  {
         struct cfq_queue *cfqq = cfqd->active_queue;
         struct cfq_io_context *cic;
-       unsigned long sl;
+       unsigned long sl, group_idle = 0;
  
         /*
          * SSD device without seek penalty, disable idling. But only do so
@@ -1824,8 +1924,13 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd)
         /*
          * idle is disabled, either manually or by past process history
          */
-       if (!cfqd->cfq_slice_idle || !cfq_should_idle(cfqd, cfqq))
-               return;
+       if (!cfq_should_idle(cfqd, cfqq)) {
+               /* no queue idling. Check for group idling */
+               if (cfqd->cfq_group_idle)
+                       group_idle = cfqd->cfq_group_idle;
+               else
+                       return;
+       }
  
         /*
          * still active requests from this queue, don't idle
@@ -1852,12 +1957,21 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd)
                 return;
         }
  
+       /* There are other queues in the group, don't do group idle */
+       if (group_idle && cfqq->cfqg->nr_cfqq > 1)
+               return;
+
         cfq_mark_cfqq_wait_request(cfqq);
  
-       sl = cfqd->cfq_slice_idle;
+       if (group_idle)
+               sl = cfqd->cfq_group_idle;
+       else
+               sl = cfqd->cfq_slice_idle;
  
         mod_timer(&cfqd->idle_slice_timer, jiffies + sl);
-       cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu", sl);
+       cfq_blkiocg_update_set_idle_time_stats(&cfqq->cfqg->blkg);
+       cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu group_idle: %d", sl,
+                       group_idle ? 1 : 0);
  }
  
  /*
@@ -1873,10 +1987,13 @@ static void cfq_dispatch_insert(struct request_queue *q, struct request *rq)
         cfqq->next_rq = cfq_find_next_rq(cfqd, cfqq, rq);
         cfq_remove_request(rq);
         cfqq->dispatched++;
+       (RQ_CFQG(rq))->dispatched++;
         elv_dispatch_sort(q, rq);
  
         cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++;
         cfqq->nr_sectors += blk_rq_sectors(rq);
+       cfq_blkiocg_update_dispatch_stats(&cfqq->cfqg->blkg, blk_rq_bytes(rq),
+                                       rq_data_dir(rq), rq_is_sync(rq));
  }
  
  /*
@@ -1920,7 +2037,7 @@ static int cfqq_process_refs(struct cfq_queue *cfqq)
         int process_refs, io_refs;
  
         io_refs = cfqq->allocated[READ] + cfqq->allocated[WRITE];
-       process_refs = atomic_read(&cfqq->ref) - io_refs;
+       process_refs = cfqq->ref - io_refs;
         BUG_ON(process_refs < 0);
         return process_refs;
  }
@@ -1930,6 +2047,15 @@ static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq)
         int process_refs, new_process_refs;
         struct cfq_queue *__cfqq;
  
+       /*
+        * If there are no process references on the new_cfqq, then it is
+        * unsafe to follow the ->new_cfqq chain as other cfqq's in the
+        * chain may have dropped their last reference (not just their
+        * last process reference).
+        */
+       if (!cfqq_process_refs(new_cfqq))
+               return;
+
         /* Avoid a circular list and skip interim queue merges */
         while ((__cfqq = new_cfqq->new_cfqq)) {
                 if (__cfqq == cfqq)
@@ -1938,23 +2064,23 @@ static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq)
         }
  
         process_refs = cfqq_process_refs(cfqq);
+       new_process_refs = cfqq_process_refs(new_cfqq);
         /*
          * If the process for the cfqq has gone away, there is no
          * sense in merging the queues.
          */
-       if (process_refs == 0)
+       if (process_refs == 0 || new_process_refs == 0)
                 return;
  
         /*
          * Merge in the direction of the lesser amount of work.
          */
-       new_process_refs = cfqq_process_refs(new_cfqq);
         if (new_process_refs >= process_refs) {
                 cfqq->new_cfqq = new_cfqq;
-               atomic_add(process_refs, &new_cfqq->ref);
+               new_cfqq->ref += process_refs;
         } else {
                 new_cfqq->new_cfqq = cfqq;
-               atomic_add(new_process_refs, &cfqq->ref);
+               cfqq->ref += new_process_refs;
         }
  }
  
@@ -1987,12 +2113,7 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)
         unsigned count;
         struct cfq_rb_root *st;
         unsigned group_slice;
-
-       if (!cfqg) {
-               cfqd->serving_prio = IDLE_WORKLOAD;
-               cfqd->workload_expires = jiffies + 1;
-               return;
-       }
+       enum wl_prio_t original_prio = cfqd->serving_prio;
  
         /* Choose next priority. RT > BE > IDLE */
         if (cfq_group_busy_queues_wl(RT_WORKLOAD, cfqd, cfqg))
@@ -2005,6 +2126,9 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)
                 return;
         }
  
+       if (original_prio != cfqd->serving_prio)
+               goto new_workload;
+
         /*
          * For RT and BE, we have to choose also the type
          * (SYNC, SYNC_NOIDLE, ASYNC), and to compute a workload
@@ -2019,6 +2143,7 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)
         if (count && !time_after(jiffies, cfqd->workload_expires))
                 return;
  
+new_workload:
         /* otherwise select new workload type */
         cfqd->serving_type =
                 cfq_choose_wl(cfqd, cfqg, cfqd->serving_prio);
@@ -2060,7 +2185,6 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)
         slice = max_t(unsigned, slice, CFQ_MIN_TT);
         cfq_log(cfqd, "workload slice:%d", slice);
         cfqd->workload_expires = jiffies + slice;
-       cfqd->noidle_tree_requires_idle = false;
  }
  
  static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd)
@@ -2071,7 +2195,6 @@ static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd)
         if (RB_EMPTY_ROOT(&st->rb))
                 return NULL;
         cfqg = cfq_rb_first_group(st);
-       st->active = &cfqg->rb_node;
         update_min_vdisktime(st);
         return cfqg;
  }
@@ -2132,7 +2255,7 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
                         cfqq = NULL;
                         goto keep_queue;
                 } else
-                       goto expire;
+                       goto check_group_idle;
         }
  
         /*
@@ -2160,8 +2283,34 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
          * flight or is idling for a new request, allow either of these
          * conditions to happen (or time out) before selecting a new queue.
          */
-       if (timer_pending(&cfqd->idle_slice_timer) ||
-           (cfqq->dispatched && cfq_should_idle(cfqd, cfqq))) {
+       if (timer_pending(&cfqd->idle_slice_timer)) {
+               cfqq = NULL;
+               goto keep_queue;
+       }
+
+       /*
+        * This is a deep seek queue, but the device is much faster than
+        * the queue can deliver, don't idle
+        **/
+       if (CFQQ_SEEKY(cfqq) && cfq_cfqq_idle_window(cfqq) &&
+           (cfq_cfqq_slice_new(cfqq) ||
+           (cfqq->slice_end - jiffies > jiffies - cfqq->slice_start))) {
+               cfq_clear_cfqq_deep(cfqq);
+               cfq_clear_cfqq_idle_window(cfqq);
+       }
+
+       if (cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) {
+               cfqq = NULL;
+               goto keep_queue;
+       }
+
+       /*
+        * If group idle is enabled and there are requests dispatched from
+        * this group, wait for requests to complete.
+        */
+check_group_idle:
+       if (cfqd->cfq_group_idle && cfqq->cfqg->nr_cfqq == 1
+           && cfqq->cfqg->dispatched) {
                 cfqq = NULL;
                 goto keep_queue;
         }
@@ -2224,12 +2373,12 @@ static inline bool cfq_slice_used_soon(struct cfq_data *cfqd,
  {
         /* the queue hasn't finished any request, can't estimate */
         if (cfq_cfqq_slice_new(cfqq))
-               return 1;
+               return true;
         if (time_after(jiffies + cfqd->cfq_slice_idle * cfqq->dispatched,
                 cfqq->slice_end))
-               return 1;
+               return true;
  
-       return 0;
+       return false;
  }
  
  static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq)
@@ -2395,9 +2544,10 @@ static void cfq_put_queue(struct cfq_queue *cfqq)
         struct cfq_data *cfqd = cfqq->cfqd;
         struct cfq_group *cfqg, *orig_cfqg;
  
-       BUG_ON(atomic_read(&cfqq->ref) <= 0);
+       BUG_ON(cfqq->ref <= 0);
  
-       if (!atomic_dec_and_test(&cfqq->ref))
+       cfqq->ref--;
+       if (cfqq->ref)
                 return;
  
         cfq_log_cfqq(cfqd, cfqq, "put_queue");
@@ -2476,11 +2626,12 @@ static void cfq_cic_free(struct cfq_io_context *cic)
  static void cic_free_func(struct io_context *ioc, struct cfq_io_context *cic)
  {
         unsigned long flags;
+       unsigned long dead_key = (unsigned long) cic->key;
  
-       BUG_ON(!cic->dead_key);
+       BUG_ON(!(dead_key & CIC_DEAD_KEY));
  
         spin_lock_irqsave(&ioc->lock, flags);
-       radix_tree_delete(&ioc->radix_root, cic->dead_key);
+       radix_tree_delete(&ioc->radix_root, dead_key >> CIC_DEAD_INDEX_SHIFT);
         hlist_del_rcu(&cic->cic_list);
         spin_unlock_irqrestore(&ioc->lock, flags);
  
@@ -2503,15 +2654,10 @@ static void cfq_free_io_context(struct io_context *ioc)
         __call_for_each_cic(ioc, cic_free_func);
  }
  
-static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
+static void cfq_put_cooperator(struct cfq_queue *cfqq)
  {
         struct cfq_queue *__cfqq, *next;
  
-       if (unlikely(cfqq == cfqd->active_queue)) {
-               __cfq_slice_expired(cfqd, cfqq, 0);
-               cfq_schedule_dispatch(cfqd);
-       }
-
         /*
          * If this queue was scheduled to merge with another queue, be
          * sure to drop the reference taken on that queue (and others in
@@ -2527,6 +2673,16 @@ static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
                 cfq_put_queue(__cfqq);
                 __cfqq = next;
         }
+}
+
+static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
+{
+       if (unlikely(cfqq == cfqd->active_queue)) {
+               __cfq_slice_expired(cfqd, cfqq, 0);
+               cfq_schedule_dispatch(cfqd);
+       }
+
+       cfq_put_cooperator(cfqq);
  
         cfq_put_queue(cfqq);
  }
@@ -2539,11 +2695,10 @@ static void __cfq_exit_single_io_context(struct cfq_data *cfqd,
         list_del_init(&cic->queue_list);
  
         /*
-        * Make sure key == NULL is seen for dead queues
+        * Make sure dead mark is seen for dead queues
          */
         smp_wmb();
-       cic->dead_key = (unsigned long) cic->key;
-       cic->key = NULL;
+       cic->key = cfqd_dead_key(cfqd);
  
         if (ioc->ioc_data == cic)
                 rcu_assign_pointer(ioc->ioc_data, NULL);
@@ -2562,7 +2717,7 @@ static void __cfq_exit_single_io_context(struct cfq_data *cfqd,
  static void cfq_exit_single_io_context(struct io_context *ioc,
                                        struct cfq_io_context *cic)
  {
-       struct cfq_data *cfqd = cic->key;
+       struct cfq_data *cfqd = cic_to_cfqd(cic);
  
         if (cfqd) {
                 struct request_queue *q = cfqd->queue;
@@ -2575,7 +2730,7 @@ static void cfq_exit_single_io_context(struct io_context *ioc,
                  * race between exiting task and queue
                  */
                 smp_read_barrier_depends();
-               if (cic->key)
+               if (cic->key == cfqd)
                         __cfq_exit_single_io_context(cfqd, cic);
  
                 spin_unlock_irqrestore(q->queue_lock, flags);
@@ -2655,7 +2810,7 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc)
  
  static void changed_ioprio(struct io_context *ioc, struct cfq_io_context *cic)
  {
-       struct cfq_data *cfqd = cic->key;
+       struct cfq_data *cfqd = cic_to_cfqd(cic);
         struct cfq_queue *cfqq;
         unsigned long flags;
  
@@ -2695,7 +2850,7 @@ static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
         RB_CLEAR_NODE(&cfqq->p_node);
         INIT_LIST_HEAD(&cfqq->fifo);
  
-       atomic_set(&cfqq->ref, 0);
+       cfqq->ref = 0;
         cfqq->cfqd = cfqd;
  
         cfq_mark_cfqq_prio_changed(cfqq);
@@ -2712,7 +2867,7 @@ static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
  static void changed_cgroup(struct io_context *ioc, struct cfq_io_context *cic)
  {
         struct cfq_queue *sync_cfqq = cic_to_cfqq(cic, 1);
-       struct cfq_data *cfqd = cic->key;
+       struct cfq_data *cfqd = cic_to_cfqd(cic);
         unsigned long flags;
         struct request_queue *q;
  
@@ -2831,11 +2986,11 @@ cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct io_context *ioc,
          * pin the queue now that it's allocated, scheduler exit will prune it
          */
         if (!is_sync && !(*async_cfqq)) {
-               atomic_inc(&cfqq->ref);
+               cfqq->ref++;
                 *async_cfqq = cfqq;
         }
  
-       atomic_inc(&cfqq->ref);
+       cfqq->ref++;
         return cfqq;
  }
  
@@ -2849,12 +3004,13 @@ cfq_drop_dead_cic(struct cfq_data *cfqd, struct io_context *ioc,
         unsigned long flags;
  
         WARN_ON(!list_empty(&cic->queue_list));
+       BUG_ON(cic->key != cfqd_dead_key(cfqd));
  
         spin_lock_irqsave(&ioc->lock, flags);
  
         BUG_ON(ioc->ioc_data == cic);
  
-       radix_tree_delete(&ioc->radix_root, (unsigned long) cfqd);
+       radix_tree_delete(&ioc->radix_root, cfqd->cic_index);
         hlist_del_rcu(&cic->cic_list);
         spin_unlock_irqrestore(&ioc->lock, flags);
  
@@ -2866,7 +3022,6 @@ cfq_cic_lookup(struct cfq_data *cfqd, struct io_context *ioc)
  {
         struct cfq_io_context *cic;
         unsigned long flags;
-       void *k;
  
         if (unlikely(!ioc))
                 return NULL;
@@ -2883,13 +3038,11 @@ cfq_cic_lookup(struct cfq_data *cfqd, struct io_context *ioc)
         }
  
         do {
-               cic = radix_tree_lookup(&ioc->radix_root, (unsigned long) cfqd);
+               cic = radix_tree_lookup(&ioc->radix_root, cfqd->cic_index);
                 rcu_read_unlock();
                 if (!cic)
                         break;
-               /* ->key must be copied to avoid race with cfq_exit_queue() */
-               k = cic->key;
-               if (unlikely(!k)) {
+               if (unlikely(cic->key != cfqd)) {
                         cfq_drop_dead_cic(cfqd, ioc, cic);
                         rcu_read_lock();
                         continue;
@@ -2922,7 +3075,7 @@ static int cfq_cic_link(struct cfq_data *cfqd, struct io_context *ioc,
  
                 spin_lock_irqsave(&ioc->lock, flags);
                 ret = radix_tree_insert(&ioc->radix_root,
-                                               (unsigned long) cfqd, cic);
+                                               cfqd->cic_index, cic);
                 if (!ret)
                         hlist_add_head_rcu(&cic->cic_list, &ioc->cic_list);
                 spin_unlock_irqrestore(&ioc->lock, flags);
@@ -3039,7 +3192,9 @@ cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq,
         if (cfqq->queued[0] + cfqq->queued[1] >= 4)
                 cfq_mark_cfqq_deep(cfqq);
  
-       if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle ||
+       if (cfqq->next_rq && (cfqq->next_rq->cmd_flags & REQ_NOIDLE))
+               enable_idle = 0;
+       else if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle ||
             (!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq)))
                 enable_idle = 0;
         else if (sample_valid(cic->ttime_samples)) {
@@ -3108,7 +3263,7 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
          * So both queues are sync. Let the new request get disk time if
          * it's a metadata request and the current queue is doing regular IO.
          */
-       if (rq_is_meta(rq) && !cfqq->meta_pending)
+       if ((rq->cmd_flags & REQ_META) && !cfqq->meta_pending)
                 return true;
  
         /*
@@ -3117,6 +3272,10 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
         if (cfq_class_rt(new_cfqq) && !cfq_class_rt(cfqq))
                 return true;
  
+       /* The current queue is empty and should not idle, so preempt it */
+       if (RB_EMPTY_ROOT(&cfqq->sort_list) && !cfq_should_idle(cfqd, cfqq))
+               return true;
+
         if (!cfqd->active_cic || !cfq_cfqq_wait_request(cfqq))
                 return false;
  
@@ -3136,10 +3295,19 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
   */
  static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
  {
+       struct cfq_queue *old_cfqq = cfqd->active_queue;
+
         cfq_log_cfqq(cfqd, cfqq, "preempt");
         cfq_slice_expired(cfqd, 1);
  
         /*
+        * The workload type has changed, so don't keep the saved slice;
+        * otherwise the preemption does not happen.
+        */
+       if (cfqq_type(old_cfqq) != cfqq_type(cfqq))
+               cfqq->cfqg->saved_workload_slice = 0;
+
+       /*
          * Put the new queue at the front of the of the current list,
          * so we know that it will be selected next.
          */
@@ -3162,7 +3330,7 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
         struct cfq_io_context *cic = RQ_CIC(rq);
  
         cfqd->rq_queued++;
-       if (rq_is_meta(rq))
+       if (rq->cmd_flags & REQ_META)
                 cfqq->meta_pending++;
  
         cfq_update_io_thinktime(cfqd, cic);
@@ -3185,11 +3353,14 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
                 if (cfq_cfqq_wait_request(cfqq)) {
                         if (blk_rq_bytes(rq) > PAGE_CACHE_SIZE ||
                             cfqd->busy_queues > 1) {
-                               del_timer(&cfqd->idle_slice_timer);
+                               cfq_del_timer(cfqd, cfqq);
                                 cfq_clear_cfqq_wait_request(cfqq);
-                               __blk_run_queue(cfqd->queue);
-                       } else
+                               __blk_run_queue(cfqd->queue, false);
+                       } else {
+                               cfq_blkiocg_update_idle_time_stats(
+                                               &cfqq->cfqg->blkg);
                                 cfq_mark_cfqq_must_dispatch(cfqq);
+                       }
                 }
         } else if (cfq_should_preempt(cfqd, cfqq, rq)) {
                 /*
@@ -3199,7 +3370,7 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
                  * this new queue is RT and the current one is BE
                  */
                 cfq_preempt_queue(cfqd, cfqq);
-               __blk_run_queue(cfqd->queue);
+               __blk_run_queue(cfqd->queue, false);
         }
  }
  
@@ -3214,7 +3385,9 @@ static void cfq_insert_request(struct request_queue *q, struct request *rq)
         rq_set_fifo_time(rq, jiffies + cfqd->cfq_fifo_expire[rq_is_sync(rq)]);
         list_add_tail(&rq->queuelist, &cfqq->fifo);
         cfq_add_rq_rb(rq);
-
+       cfq_blkiocg_update_io_add_stats(&(RQ_CFQG(rq))->blkg,
+                       &cfqd->serving_group->blkg, rq_data_dir(rq),
+                       rq_is_sync(rq));
         cfq_rq_enqueued(cfqd, cfqq, rq);
  }
  
@@ -3259,6 +3432,10 @@ static bool cfq_should_wait_busy(struct cfq_data *cfqd, struct cfq_queue *cfqq)
  {
         struct cfq_io_context *cic = cfqd->active_cic;
  
+       /* If the queue already has requests, don't wait */
+       if (!RB_EMPTY_ROOT(&cfqq->sort_list))
+               return false;
+
         /* If there are other queues in the group, don't wait */
         if (cfqq->cfqg->nr_cfqq > 1)
                 return false;
@@ -3292,7 +3469,8 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
         unsigned long now;
  
         now = jiffies;
-       cfq_log_cfqq(cfqd, cfqq, "complete rqnoidle %d", !!rq_noidle(rq));
+       cfq_log_cfqq(cfqd, cfqq, "complete rqnoidle %d",
+                    !!(rq->cmd_flags & REQ_NOIDLE));
  
         cfq_update_hw_tag(cfqd);
  
@@ -3300,6 +3478,10 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
         WARN_ON(!cfqq->dispatched);
         cfqd->rq_in_driver--;
         cfqq->dispatched--;
+       (RQ_CFQG(rq))->dispatched--;
+       cfq_blkiocg_update_completion_stats(&cfqq->cfqg->blkg,
+                       rq_start_time_ns(rq), rq_io_start_time_ns(rq),
+                       rq_data_dir(rq), rq_is_sync(rq));
  
         cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--;
  
@@ -3326,7 +3508,10 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
                  * the queue.
                  */
                 if (cfq_should_wait_busy(cfqd, cfqq)) {
-                       cfqq->slice_end = jiffies + cfqd->cfq_slice_idle;
+                       unsigned long extend_sl = cfqd->cfq_slice_idle;
+                       if (!cfqd->cfq_slice_idle)
+                               extend_sl = cfqd->cfq_group_idle;
+                       cfqq->slice_end = jiffies + extend_sl;
                         cfq_mark_cfqq_wait_busy(cfqq);
                         cfq_log_cfqq(cfqd, cfqq, "will busy wait");
                 }
@@ -3343,16 +3528,7 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
                         cfq_slice_expired(cfqd, 1);
                 else if (sync && cfqq_empty &&
                          !cfq_close_cooperator(cfqd, cfqq)) {
-                       cfqd->noidle_tree_requires_idle |= !rq_noidle(rq);
-                       /*
-                        * Idling is enabled for SYNC_WORKLOAD.
-                        * SYNC_NOIDLE_WORKLOAD idles at the end of the tree
-                        * only if we processed at least one !rq_noidle request
-                        */
-                       if (cfqd->serving_type == SYNC_WORKLOAD
-                           || cfqd->noidle_tree_requires_idle
-                           || cfqq->cfqg->nr_cfqq == 1)
-                               cfq_arm_slice_timer(cfqd);
+                       cfq_arm_slice_timer(cfqd);
                 }
         }
  
@@ -3440,6 +3616,10 @@ static void cfq_put_request(struct request *rq)
                 rq->elevator_private = NULL;
                 rq->elevator_private2 = NULL;
  
+               /* Put down rq reference on cfqg */
+               cfq_put_cfqg(RQ_CFQG(rq));
+               rq->elevator_private3 = NULL;
+
                 cfq_put_queue(cfqq);
         }
  }
@@ -3470,6 +3650,9 @@ split_cfqq(struct cfq_io_context *cic, struct cfq_queue *cfqq)
         }
  
         cic_set_cfqq(cic, NULL, 1);
+
+       cfq_put_cooperator(cfqq);
+
         cfq_put_queue(cfqq);
         return NULL;
  }
@@ -3522,12 +3705,13 @@ new_queue:
         }
  
         cfqq->allocated[rw]++;
-       atomic_inc(&cfqq->ref);
+       cfqq->ref++;
+       rq->elevator_private = cic;
+       rq->elevator_private2 = cfqq;
+       rq->elevator_private3 = cfq_ref_get_cfqg(cfqq->cfqg);
  
         spin_unlock_irqrestore(q->queue_lock, flags);
  
-       rq->elevator_private = cic;
-       rq->elevator_private2 = cfqq;
         return 0;
  
  queue_fail:
@@ -3547,7 +3731,7 @@ static void cfq_kick_queue(struct work_struct *work)
         struct request_queue *q = cfqd->queue;
  
         spin_lock_irq(q->queue_lock);
-       __blk_run_queue(cfqd->queue);
+       __blk_run_queue(cfqd->queue, false);
         spin_unlock_irq(q->queue_lock);
  }
  
@@ -3655,16 +3839,38 @@ static void cfq_exit_queue(struct elevator_queue *e)
  
         cfq_put_async_queues(cfqd);
         cfq_release_cfq_groups(cfqd);
-       blkiocg_del_blkio_group(&cfqd->root_group.blkg);
+       cfq_blkiocg_del_blkio_group(&cfqd->root_group.blkg);
  
         spin_unlock_irq(q->queue_lock);
  
         cfq_shutdown_timer_wq(cfqd);
  
+       spin_lock(&cic_index_lock);
+       ida_remove(&cic_index_ida, cfqd->cic_index);
+       spin_unlock(&cic_index_lock);
+
         /* Wait for cfqg->blkg->key accessors to exit their grace periods. */
         call_rcu(&cfqd->rcu, cfq_cfqd_free);
  }
  
+static int cfq_alloc_cic_index(void)
+{
+       int index, error;
+
+       do {
+               if (!ida_pre_get(&cic_index_ida, GFP_KERNEL))
+                       return -ENOMEM;
+
+               spin_lock(&cic_index_lock);
+               error = ida_get_new(&cic_index_ida, &index);
+               spin_unlock(&cic_index_lock);
+               if (error && error != -EAGAIN)
+                       return error;
+       } while (error);
+
+       return index;
+}
+
  static void *cfq_init_queue(struct request_queue *q)
  {
         struct cfq_data *cfqd;
@@ -3672,10 +3878,20 @@ static void *cfq_init_queue(struct request_queue *q)
         struct cfq_group *cfqg;
         struct cfq_rb_root *st;
  
+       i = cfq_alloc_cic_index();
+       if (i < 0)
+               return NULL;
+
         cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node);
         if (!cfqd)
                 return NULL;
  
+       /*
+        * No need to take queue_lock in this routine, since we are
+        * initializing the I/O scheduler and nobody is using cfqd yet.
+        */
+       cfqd->cic_index = i;
+
         /* Init root service tree */
         cfqd->grp_service_tree = CFQ_RB_ROOT;
  
@@ -3693,9 +3909,11 @@ static void *cfq_init_queue(struct request_queue *q)
          * Take a reference to root group which we never drop. This is just
          * to make sure that cfq_put_cfqg() does not try to kfree root group
          */
-       atomic_set(&cfqg->ref, 1);
-       blkiocg_add_blkio_group(&blkio_root_cgroup, &cfqg->blkg, (void *)cfqd,
-                                       0);
+       cfqg->ref = 1;
+       rcu_read_lock();
+       cfq_blkiocg_add_blkio_group(&blkio_root_cgroup, &cfqg->blkg,
+                                       (void *)cfqd, 0);
+       rcu_read_unlock();
  #endif
         /*
          * Not strictly needed (since RB_ROOT just clears the node and we
@@ -3711,7 +3929,7 @@ static void *cfq_init_queue(struct request_queue *q)
          * will not attempt to free it.
          */
         cfq_init_cfqq(cfqd, &cfqd->oom_cfqq, 1, 0);
-       atomic_inc(&cfqd->oom_cfqq.ref);
+       cfqd->oom_cfqq.ref++;
         cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, &cfqd->root_group);
  
         INIT_LIST_HEAD(&cfqd->cic_list);
@@ -3733,6 +3951,7 @@ static void *cfq_init_queue(struct request_queue *q)
         cfqd->cfq_slice[1] = cfq_slice_sync;
         cfqd->cfq_slice_async_rq = cfq_slice_async_rq;
         cfqd->cfq_slice_idle = cfq_slice_idle;
+       cfqd->cfq_group_idle = cfq_group_idle;
         cfqd->cfq_latency = 1;
         cfqd->cfq_group_isolation = 0;
         cfqd->hw_tag = -1;
@@ -3741,7 +3960,6 @@ static void *cfq_init_queue(struct request_queue *q)
          * second, in order to have larger depth for async operations.
          */
         cfqd->last_delayed_sync = jiffies - HZ;
-       INIT_RCU_HEAD(&cfqd->rcu);
         return cfqd;
  }
  
@@ -3806,6 +4024,7 @@ SHOW_FUNCTION(cfq_fifo_expire_async_show, cfqd->cfq_fifo_expire[0], 1);
  SHOW_FUNCTION(cfq_back_seek_max_show, cfqd->cfq_back_max, 0);
  SHOW_FUNCTION(cfq_back_seek_penalty_show, cfqd->cfq_back_penalty, 0);
  SHOW_FUNCTION(cfq_slice_idle_show, cfqd->cfq_slice_idle, 1);
+SHOW_FUNCTION(cfq_group_idle_show, cfqd->cfq_group_idle, 1);
  SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1);
  SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1);
  SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0);
@@ -3838,6 +4057,7 @@ STORE_FUNCTION(cfq_back_seek_max_store, &cfqd->cfq_back_max, 0, UINT_MAX, 0);
  STORE_FUNCTION(cfq_back_seek_penalty_store, &cfqd->cfq_back_penalty, 1,
                 UINT_MAX, 0);
  STORE_FUNCTION(cfq_slice_idle_store, &cfqd->cfq_slice_idle, 0, UINT_MAX, 1);
+STORE_FUNCTION(cfq_group_idle_store, &cfqd->cfq_group_idle, 0, UINT_MAX, 1);
  STORE_FUNCTION(cfq_slice_sync_store, &cfqd->cfq_slice[1], 1, UINT_MAX, 1);
  STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1);
  STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1,
@@ -3859,6 +4079,7 @@ static struct elv_fs_entry cfq_attrs[] = {
         CFQ_ATTR(slice_async),
         CFQ_ATTR(slice_async_rq),
         CFQ_ATTR(slice_idle),
+       CFQ_ATTR(group_idle),
         CFQ_ATTR(low_latency),
         CFQ_ATTR(group_isolation),
         __ATTR_NULL
@@ -3870,6 +4091,7 @@ static struct elevator_type iosched_cfq = {
                 .elevator_merged_fn =           cfq_merged_request,
                 .elevator_merge_req_fn =        cfq_merged_requests,
                 .elevator_allow_merge_fn =      cfq_allow_merge,
+               .elevator_bio_merged_fn =       cfq_bio_merged,
                 .elevator_dispatch_fn =         cfq_dispatch_requests,
                 .elevator_add_req_fn =          cfq_insert_request,
                 .elevator_activate_req_fn =     cfq_activate_request,
@@ -3896,6 +4118,7 @@ static struct blkio_policy_type blkio_policy_cfq = {
                 .blkio_unlink_group_fn =        cfq_unlink_blkio_group,
                 .blkio_update_group_weight_fn = cfq_update_blkio_group_weight,
         },
+       .plid = BLKIO_POLICY_PROP,
  };
  #else
  static struct blkio_policy_type blkio_policy_cfq;
@@ -3911,6 +4134,12 @@ static int __init cfq_init(void)
         if (!cfq_slice_idle)
                 cfq_slice_idle = 1;
  
+#ifdef CONFIG_CFQ_GROUP_IOSCHED
+       if (!cfq_group_idle)
+               cfq_group_idle = 1;
+#else
+               cfq_group_idle = 0;
+#endif
         if (cfq_slab_setup())
                 return -ENOMEM;
  
@@ -3935,6 +4164,7 @@ static void __exit cfq_exit(void)
          */
         if (elv_ioc_count_read(cfq_ioc_count))
                 wait_for_completion(&all_gone);
+       ida_destroy(&cic_index_ida);
         cfq_slab_kill();
  }